arc.c revision 323667
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 *    Pages in its cache cannot be "locked" into memory. This makes
 *    the eviction algorithm simple: evict the last page in the list.
 *    This also makes the performance characteristics easy to reason
 *    about. Our cache is not so simple. At any given moment, some
 *    subset of the blocks in the cache are un-evictable because we
 *    have handed out a reference to them. Blocks are only evictable
 *    when there are no external references active. This makes
 *    eviction far more problematic: we choose to evict the evictable
 *    blocks that are the "lowest" in the list.
 *
 *    There are times when it is not possible to evict the requested
 *    space. In these circumstances we are unable to adjust the cache
 *    size. To prevent the cache growing unbounded at these times we
 *    implement a "cache throttle" that slows the flow of new data
 *    into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 *    Pages are evicted when the cache is full and there is a cache
 *    miss. Our model has a variable sized cache. It grows with
 *    high use, but also tries to react to memory pressure from the
 *    operating system: decreasing its size when system memory is
 *    tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 *    elements of the cache are therefore exactly the same size. So
 *    when adjusting the cache size following a cache miss, it's simply
 *    a matter of choosing a single page to evict. In our model, we
 *    have variable sized cache blocks (ranging from 512 bytes to
 *    128K bytes). We therefore choose a set of blocks to evict to make
 *    space for a cache miss that approximates as closely as possible
 *    the space used by the new block.
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */
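
/*
 * To make point 3 above concrete: with variable-sized blocks, eviction is
 * a loop rather than a single-page pop. A minimal sketch (illustrative
 * pseudocode only; the real logic lives in arc_evict_state() below and
 * must also honor hash locks, sublists, and reference counts):
 *
 *	uint64_t evicted = 0;
 *	while (evicted < bytes_needed) {
 *		hdr = tail_of_evictable_list();
 *		if (hdr == NULL)
 *			break;	(nothing evictable; throttle new data instead)
 *		evicted += size_of(hdr);
 *		evict(hdr);
 *	}
 */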

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists. The arc_read() interface
 * uses method 1, while the internal ARC algorithms for
 * adjusting the cache use method 2. We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * ARC list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table. It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each ARC state also has a mutex which is used to protect the
 * buffer list associated with the state. When attempting to
 * obtain a hash table lock while holding an ARC list lock you
 * must use mutex_tryenter() to avoid deadlock. Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2ad_mtx on each vdev for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */
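
/*
 * For example, the try-lock rule above shows up whenever an eviction pass,
 * which already holds an ARC list lock, needs a buffer's hash lock
 * (illustrative sketch; list_lock and hash_lock stand in for the real
 * variables):
 *
 *	ASSERT(MUTEX_HELD(list_lock));
 *	if (!mutex_tryenter(hash_lock)) {
 *		ARCSTAT_BUMP(arcstat_mutex_miss);
 *		continue;	(skip this buffer rather than risk deadlock)
 *	}
 */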

/*
 * ARC operation:
 *
 * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
 * This structure can point either to a block that is still in the cache or to
 * one that is only accessible in an L2 ARC device, or it can provide
 * information about a block that was recently evicted. If a block is
 * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
 * information to retrieve it from the L2ARC device. This information is
 * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
 * that is in this state cannot access the data directly.
 *
 * Blocks that are actively being referenced or have not been evicted
 * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
 * the arc_buf_hdr_t that will point to the data block in memory. A block can
 * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
 * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
 * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
 *
 * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
 * ability to store the physical data (b_pabd) associated with the DVA of the
 * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
 * it will match its on-disk compression characteristics. This behavior can be
 * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
 * compressed ARC functionality is disabled, the b_pabd will point to an
 * uncompressed version of the on-disk data.
 *
 * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
 * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
 * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
 * consumer. The ARC will provide references to this data and will keep it
 * cached until it is no longer in use. The ARC caches only the L1ARC's
 * physical data block and will evict any arc_buf_t that is no longer
 * referenced. The amount of memory consumed by the arc_buf_ts' data buffers
 * can be seen via the "overhead_size" kstat.
 *
 * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
 * compressed form. The typical case is that consumers will want uncompressed
 * data, and when that happens a new data buffer is allocated where the data is
 * decompressed for them to use. Currently the only consumer who wants
 * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
 * exists on disk. When this happens, the arc_buf_t's data buffer is shared
 * with the arc_buf_hdr_t.
 *
 * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's.
 * The first one is owned by a compressed send consumer (and therefore
 * references the same compressed data buffer as the arc_buf_hdr_t) and the
 * second could be used by any other consumer (and has its own uncompressed
 * copy of the data buffer).
 *
 *   arc_buf_hdr_t
 *   +-----------+
 *   | fields    |
 *   | common to |
 *   | L1- and   |
 *   | L2ARC     |
 *   +-----------+
 *   | l2arc_buf_hdr_t
 *   |           |
 *   +-----------+
 *   | l1arc_buf_hdr_t
 *   |           |              arc_buf_t
 *   | b_buf     +------------>+-----------+      arc_buf_t
 *   | b_pabd    +-+           |b_next     +---->+-----------+
 *   +-----------+ |           |-----------|     |b_next     +-->NULL
 *                 |           |b_comp = T |     +-----------+
 *                 |           |b_data     +-+   |b_comp = F |
 *                 |           +-----------+ |   |b_data     +-+
 *                 +->+------+               |   +-----------+ |
 *        compressed  |      |               |                 |
 *           data     |      |<--------------+                 |   uncompressed
 *                    +------+  compressed,                    |       data
 *                                shared                   +-->+------+
 *                                 data                        |      |
 *                                                             |      |
 *                                                             +------+
 *
 * When a consumer reads a block, the ARC must first look to see if the
 * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
 * arc_buf_t and either copies uncompressed data into a new data buffer from an
 * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
 * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
 * hdr is compressed and the desired compression characteristics of the
 * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
 * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
 * the last buffer in the hdr's b_buf list, however a shared compressed buf can
 * be anywhere in the hdr's list.
 *
 * The diagram below shows an example of an uncompressed ARC hdr that is
 * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
 * the last element in the buf list):
 *
 *                arc_buf_hdr_t
 *                +-----------+
 *                |           |
 *                |           |
 *                |           |
 *                +-----------+
 * l2arc_buf_hdr_t|           |
 *                |           |
 *                +-----------+
 * l1arc_buf_hdr_t|           |
 *                |           |                 arc_buf_t    (shared)
 *                |    b_buf  +------------>+---------+      arc_buf_t
 *                |           |             |b_next   +---->+---------+
 *                |  b_pabd   +-+           |---------|     |b_next   +-->NULL
 *                +-----------+ |           |         |     +---------+
 *                              |           |b_data   +-+   |         |
 *                              |           +---------+ |   |b_data   +-+
 *                              +->+------+             |   +---------+ |
 *                                 |      |             |               |
 *                   uncompressed  |      |             |               |
 *                        data     +------+             |               |
 *                                    ^                 +->+------+     |
 *                                    |    uncompressed    |      |     |
 *                                    |        data        |      |     |
 *                                    |                    +------+     |
 *                                    +---------------------------------+
 *
 * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
 * since the physical block is about to be rewritten. The new data contents
 * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
 * it may compress the data before writing it to disk. The ARC will be called
 * with the transformed data and will bcopy the transformed on-disk block into
 * a newly allocated b_pabd. Writes are always done into buffers which have
 * either been loaned (and hence are new and don't have other readers) or
 * buffers which have been released (and hence have their own hdr, if there
 * were originally other readers of the buf's original hdr). This ensures that
 * the ARC only needs to update a single buf and its hdr after a write occurs.
 *
 * When the L2ARC is in use, it will also take advantage of the b_pabd. The
 * L2ARC will always write the contents of b_pabd to the L2ARC. This means
 * that when compressed ARC is enabled that the L2ARC blocks are identical
 * to the on-disk block in the main data pool. This provides a significant
 * advantage since the ARC can leverage the bp's checksum when reading from the
 * L2ARC to determine if the contents are valid. However, if the compressed
 * ARC is disabled, then the L2ARC's block must be transformed to look
 * like the physical block in the main data pool before comparing the
 * checksum and determining its validity.
 */
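
/*
 * The read path described above therefore reduces to a three-way decision
 * for each new arc_buf_t (illustrative sketch of the rules only; the
 * in-file allocation code also handles byteswapping and checksum
 * verification):
 *
 *	if (consumer accepts compressed data && hdr's b_pabd is compressed)
 *		share b_pabd with the new arc_buf_t;	(zero-copy)
 *	else if (another uncompressed arc_buf_t already exists)
 *		bcopy() its contents into a new buffer;
 *	else
 *		decompress b_pabd into a new buffer;
 */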

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#include <sys/zio_checksum.h>
#include <sys/multilist.h>
#include <sys/abd.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#include <sys/racct.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <sys/trim_map.h>
#include <zfs_fletcher.h>
#include <sys/sdt.h>

#include <machine/vmparam.h>

#ifdef illumos
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif
#endif /* illumos */

static kmutex_t arc_reclaim_lock;
static kcondvar_t arc_reclaim_thread_cv;
static boolean_t arc_reclaim_thread_exit;
static kcondvar_t arc_reclaim_waiters_cv;

static kmutex_t arc_dnlc_evicts_lock;
static kcondvar_t arc_dnlc_evicts_cv;
static boolean_t arc_dnlc_evicts_thread_exit;

uint_t arc_reduce_dnlc_percent = 3;

/*
 * The number of headers to evict in arc_evict_state_impl() before
 * dropping the sublist lock and evicting from another sublist. A lower
 * value means we're more likely to evict the "correct" header (i.e. the
 * oldest header in the arc state), but comes with higher overhead
 * (i.e. more invocations of arc_evict_state_impl()).
 */
int zfs_arc_evict_batch_limit = 10;

/* number of seconds before growing cache again */
static int arc_grow_retry = 60;

/* shift of arc_c for calculating overflow limit in arc_get_data_impl */
int zfs_arc_overflow_shift = 8;

/* shift of arc_c for calculating both min and max arc_p */
static int arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim) */
static int arc_shrink_shift = 7;

/*
 * log2(fraction of ARC which must be free to allow growing).
 * I.e. if there is less than arc_c >> arc_no_grow_shift free memory,
 * when reading a new block into the ARC, we will evict an equal-sized block
 * from the ARC.
 *
 * This must be less than arc_shrink_shift, so that when we shrink the ARC,
 * we will still not allow it to grow.
 */
int arc_no_grow_shift = 5;


/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int arc_min_prefetch_lifespan;

/*
 * If this percent of memory is free, don't throttle.
 */
int arc_lotsfree_percent = 10;

static int arc_dead;
extern boolean_t zfs_prefetch_disable;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
uint64_t zfs_arc_meta_min = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_no_grow_shift = 0;
int zfs_arc_p_min_shift = 0;
uint64_t zfs_arc_average_blocksize = 8 * 1024;	/* 8KB */
u_int zfs_arc_free_target = 0;
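
/*
 * Worked example of the two shift tunables above, assuming arc_c is 4GB:
 * with arc_no_grow_shift = 5 the ARC stops growing once free memory falls
 * below arc_c >> 5 = 128MB, and with arc_shrink_shift = 7 each shrink step
 * reclaims arc_c >> 7 = 32MB. Because 5 < 7, one shrink step frees less
 * than the no-grow threshold, so shrinking cannot immediately flip the
 * ARC back into growing.
 */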

/* Absolute min for arc min / max is 16MB. */
static uint64_t arc_abs_min = 16 << 20;

boolean_t zfs_compressed_arc_enabled = B_TRUE;

static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS);

#if defined(__FreeBSD__) && defined(_KERNEL)
static void
arc_free_target_init(void *unused __unused)
{

    zfs_arc_free_target = vm_pageout_wakeup_thresh;
}
SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
    arc_free_target_init, NULL);

TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
TUNABLE_INT("vfs.zfs.arc_grow_retry", &zfs_arc_grow_retry);
TUNABLE_INT("vfs.zfs.arc_no_grow_shift", &zfs_arc_no_grow_shift);
SYSCTL_DECL(_vfs_zfs);
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 | CTLFLAG_RWTUN,
    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size");
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 | CTLFLAG_RWTUN,
    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size");
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_U32 | CTLFLAG_RWTUN,
    0, sizeof(uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U",
    "log2(fraction of ARC which must be free to allow growing)");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
    &zfs_arc_average_blocksize, 0,
    "ARC average blocksize");
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
    &arc_shrink_shift, 0,
    "log2(fraction of arc to reclaim)");
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_grow_retry, CTLFLAG_RW,
    &arc_grow_retry, 0,
    "Wait in seconds before considering growing ARC");
SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN,
    &zfs_compressed_arc_enabled, 0, "Enable compressed ARC");

/*
 * We don't have a tunable for arc_free_target due to the dependency on
 * pagedaemon initialisation.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
    sysctl_vfs_zfs_arc_free_target, "IU",
    "Desired number of free pages below which ARC triggers reclaim");

static int
sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
{
    u_int val;
    int err;

    val = zfs_arc_free_target;
    err = sysctl_handle_int(oidp, &val, 0, req);
    if (err != 0 || req->newptr == NULL)
        return (err);

    if (val < minfree)
        return (EINVAL);
    if (val > vm_cnt.v_page_count)
        return (EINVAL);

    zfs_arc_free_target = val;

    return (0);
}

/*
 * Must be declared here, before the corresponding kstat macro is defined;
 * that macro reuses the same name and would otherwise confuse the compiler.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_vfs_zfs_arc_meta_limit, "QU",
    "ARC metadata limit");
#endif

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states. These are
 * the only buffers that can be evicted or deleted. Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA. These are buffers that hold dirty block copies
 * before they are written to stable storage. By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed. Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists. The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places. The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */

typedef struct arc_state {
    /*
     * list of evictable buffers
     */
    multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
    /*
     * total amount of evictable data in this state
     */
    refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
    /*
     * total amount of data in this state; this includes: evictable,
     * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
     */
    refcount_t arcs_size;
} arc_state_t;

/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;

typedef struct arc_stats {
    kstat_named_t arcstat_hits;
    kstat_named_t arcstat_misses;
    kstat_named_t arcstat_demand_data_hits;
    kstat_named_t arcstat_demand_data_misses;
    kstat_named_t arcstat_demand_metadata_hits;
    kstat_named_t arcstat_demand_metadata_misses;
    kstat_named_t arcstat_prefetch_data_hits;
    kstat_named_t arcstat_prefetch_data_misses;
    kstat_named_t arcstat_prefetch_metadata_hits;
    kstat_named_t arcstat_prefetch_metadata_misses;
    kstat_named_t arcstat_mru_hits;
    kstat_named_t arcstat_mru_ghost_hits;
    kstat_named_t arcstat_mfu_hits;
    kstat_named_t arcstat_mfu_ghost_hits;
    kstat_named_t arcstat_allocated;
    kstat_named_t arcstat_deleted;
    /*
     * Number of buffers that could not be evicted because the hash lock
     * was held by another thread. The lock may not necessarily be held
     * by something using the same buffer, since hash locks are shared
     * by multiple buffers.
     */
    kstat_named_t arcstat_mutex_miss;
    /*
     * Number of buffers skipped because they have I/O in progress, are
     * indirect prefetch buffers that have not lived long enough, or are
     * not from the spa we're trying to evict from.
     */
    kstat_named_t arcstat_evict_skip;
    /*
     * Number of times arc_evict_state() was unable to evict enough
     * buffers to reach its target amount.
     */
    kstat_named_t arcstat_evict_not_enough;
    kstat_named_t arcstat_evict_l2_cached;
    kstat_named_t arcstat_evict_l2_eligible;
    kstat_named_t arcstat_evict_l2_ineligible;
    kstat_named_t arcstat_evict_l2_skip;
    kstat_named_t arcstat_hash_elements;
    kstat_named_t arcstat_hash_elements_max;
    kstat_named_t arcstat_hash_collisions;
    kstat_named_t arcstat_hash_chains;
    kstat_named_t arcstat_hash_chain_max;
    kstat_named_t arcstat_p;
    kstat_named_t arcstat_c;
    kstat_named_t arcstat_c_min;
    kstat_named_t arcstat_c_max;
    kstat_named_t arcstat_size;
    /*
     * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
     * Note that the compressed bytes may match the uncompressed bytes
     * if the block is either not compressed or compressed arc is disabled.
     */
    kstat_named_t arcstat_compressed_size;
    /*
     * Uncompressed size of the data stored in b_pabd. If compressed
     * arc is disabled then this value will be identical to the stat
     * above.
     */
    kstat_named_t arcstat_uncompressed_size;
    /*
     * Number of bytes stored in all the arc_buf_t's. This is classified
     * as "overhead" since this data is typically short-lived and will
     * be evicted from the arc when it becomes unreferenced unless the
     * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
     * values have been set (see comment in dbuf.c for more information).
     */
    kstat_named_t arcstat_overhead_size;
    /*
     * Number of bytes consumed by internal ARC structures necessary
     * for tracking purposes; these structures are not actually
     * backed by ARC buffers. This includes arc_buf_hdr_t structures
     * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
     * caches), and arc_buf_t structures (allocated via arc_buf_t
     * cache).
     */
    kstat_named_t arcstat_hdr_size;
    /*
     * Number of bytes consumed by ARC buffers of type equal to
     * ARC_BUFC_DATA. This is generally consumed by buffers backing
     * on disk user data (e.g. plain file contents).
     */
    kstat_named_t arcstat_data_size;
    /*
     * Number of bytes consumed by ARC buffers of type equal to
     * ARC_BUFC_METADATA. This is generally consumed by buffers
     * backing on disk data that is used for internal ZFS
     * structures (e.g. ZAP, dnode, indirect blocks, etc).
     */
    kstat_named_t arcstat_metadata_size;
    /*
     * Number of bytes consumed by various buffers and structures
     * not actually backed with ARC buffers. This includes bonus
     * buffers (allocated directly via zio_buf_* functions),
     * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
     * cache), and dnode_t structures (allocated via dnode_t cache).
     */
    kstat_named_t arcstat_other_size;
    /*
     * Total number of bytes consumed by ARC buffers residing in the
     * arc_anon state. This includes *all* buffers in the arc_anon
     * state; e.g. data, metadata, evictable, and unevictable buffers
     * are all included in this value.
     */
    kstat_named_t arcstat_anon_size;
    /*
     * Number of bytes consumed by ARC buffers that meet the
     * following criteria: backing buffers of type ARC_BUFC_DATA,
     * residing in the arc_anon state, and are eligible for eviction
     * (e.g. have no outstanding holds on the buffer).
     */
    kstat_named_t arcstat_anon_evictable_data;
    /*
     * Number of bytes consumed by ARC buffers that meet the
     * following criteria: backing buffers of type ARC_BUFC_METADATA,
     * residing in the arc_anon state, and are eligible for eviction
     * (e.g. have no outstanding holds on the buffer).
     */
    kstat_named_t arcstat_anon_evictable_metadata;
    /*
     * Total number of bytes consumed by ARC buffers residing in the
     * arc_mru state. This includes *all* buffers in the arc_mru
     * state; e.g. data, metadata, evictable, and unevictable buffers
     * are all included in this value.
     */
    kstat_named_t arcstat_mru_size;
    /*
     * Number of bytes consumed by ARC buffers that meet the
     * following criteria: backing buffers of type ARC_BUFC_DATA,
     * residing in the arc_mru state, and are eligible for eviction
     * (e.g. have no outstanding holds on the buffer).
     */
    kstat_named_t arcstat_mru_evictable_data;
    /*
     * Number of bytes consumed by ARC buffers that meet the
     * following criteria: backing buffers of type ARC_BUFC_METADATA,
     * residing in the arc_mru state, and are eligible for eviction
     * (e.g. have no outstanding holds on the buffer).
     */
    kstat_named_t arcstat_mru_evictable_metadata;
    /*
     * Total number of bytes that *would have been* consumed by ARC
     * buffers in the arc_mru_ghost state. The key thing to note here
     * is that this size doesn't actually indicate RAM consumption.
     * The ghost lists only consist of headers and don't actually have
     * ARC buffers linked off of these headers. Thus, *if* the headers
     * had associated ARC buffers, these buffers *would have* consumed
     * this number of bytes.
     */
    kstat_named_t arcstat_mru_ghost_size;
    /*
     * Number of bytes that *would have been* consumed by ARC
     * buffers that are eligible for eviction, of type
     * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
     */
    kstat_named_t arcstat_mru_ghost_evictable_data;
    /*
     * Number of bytes that *would have been* consumed by ARC
     * buffers that are eligible for eviction, of type
     * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
     */
    kstat_named_t arcstat_mru_ghost_evictable_metadata;
    /*
     * Total number of bytes consumed by ARC buffers residing in the
     * arc_mfu state. This includes *all* buffers in the arc_mfu
     * state; e.g. data, metadata, evictable, and unevictable buffers
     * are all included in this value.
     */
    kstat_named_t arcstat_mfu_size;
    /*
     * Number of bytes consumed by ARC buffers that are eligible for
     * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
     * state.
     */
    kstat_named_t arcstat_mfu_evictable_data;
    /*
     * Number of bytes consumed by ARC buffers that are eligible for
     * eviction, of type ARC_BUFC_METADATA, and reside in the
     * arc_mfu state.
     */
    kstat_named_t arcstat_mfu_evictable_metadata;
    /*
     * Total number of bytes that *would have been* consumed by ARC
     * buffers in the arc_mfu_ghost state. See the comment above
     * arcstat_mru_ghost_size for more details.
     */
    kstat_named_t arcstat_mfu_ghost_size;
    /*
     * Number of bytes that *would have been* consumed by ARC
     * buffers that are eligible for eviction, of type
     * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
     */
    kstat_named_t arcstat_mfu_ghost_evictable_data;
    /*
     * Number of bytes that *would have been* consumed by ARC
     * buffers that are eligible for eviction, of type
     * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
     */
    kstat_named_t arcstat_mfu_ghost_evictable_metadata;
    kstat_named_t arcstat_l2_hits;
    kstat_named_t arcstat_l2_misses;
    kstat_named_t arcstat_l2_feeds;
    kstat_named_t arcstat_l2_rw_clash;
    kstat_named_t arcstat_l2_read_bytes;
    kstat_named_t arcstat_l2_write_bytes;
    kstat_named_t arcstat_l2_writes_sent;
    kstat_named_t arcstat_l2_writes_done;
    kstat_named_t arcstat_l2_writes_error;
    kstat_named_t arcstat_l2_writes_lock_retry;
    kstat_named_t arcstat_l2_evict_lock_retry;
    kstat_named_t arcstat_l2_evict_reading;
    kstat_named_t arcstat_l2_evict_l1cached;
    kstat_named_t arcstat_l2_free_on_write;
    kstat_named_t arcstat_l2_abort_lowmem;
    kstat_named_t arcstat_l2_cksum_bad;
    kstat_named_t arcstat_l2_io_error;
    kstat_named_t arcstat_l2_size;
    kstat_named_t arcstat_l2_asize;
    kstat_named_t arcstat_l2_hdr_size;
    kstat_named_t arcstat_l2_write_trylock_fail;
    kstat_named_t arcstat_l2_write_passed_headroom;
    kstat_named_t arcstat_l2_write_spa_mismatch;
    kstat_named_t arcstat_l2_write_in_l2;
    kstat_named_t arcstat_l2_write_hdr_io_in_progress;
    kstat_named_t arcstat_l2_write_not_cacheable;
    kstat_named_t arcstat_l2_write_full;
    kstat_named_t arcstat_l2_write_buffer_iter;
    kstat_named_t arcstat_l2_write_pios;
    kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
    kstat_named_t arcstat_l2_write_buffer_list_iter;
    kstat_named_t arcstat_l2_write_buffer_list_null_iter;
    kstat_named_t arcstat_memory_throttle_count;
    kstat_named_t arcstat_meta_used;
    kstat_named_t arcstat_meta_limit;
    kstat_named_t arcstat_meta_max;
    kstat_named_t arcstat_meta_min;
    kstat_named_t arcstat_sync_wait_for_async;
    kstat_named_t arcstat_demand_hit_predictive_prefetch;
} arc_stats_t;

static arc_stats_t arc_stats = {
    { "hits", KSTAT_DATA_UINT64 },
    { "misses", KSTAT_DATA_UINT64 },
    { "demand_data_hits", KSTAT_DATA_UINT64 },
    { "demand_data_misses", KSTAT_DATA_UINT64 },
    { "demand_metadata_hits", KSTAT_DATA_UINT64 },
    { "demand_metadata_misses", KSTAT_DATA_UINT64 },
    { "prefetch_data_hits", KSTAT_DATA_UINT64 },
    { "prefetch_data_misses", KSTAT_DATA_UINT64 },
    { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
    { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
    { "mru_hits", KSTAT_DATA_UINT64 },
    { "mru_ghost_hits", KSTAT_DATA_UINT64 },
    { "mfu_hits", KSTAT_DATA_UINT64 },
    { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
    { "allocated", KSTAT_DATA_UINT64 },
    { "deleted", KSTAT_DATA_UINT64 },
    { "mutex_miss", KSTAT_DATA_UINT64 },
    { "evict_skip", KSTAT_DATA_UINT64 },
    { "evict_not_enough", KSTAT_DATA_UINT64 },
    { "evict_l2_cached", KSTAT_DATA_UINT64 },
    { "evict_l2_eligible", KSTAT_DATA_UINT64 },
    { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
    { "evict_l2_skip", KSTAT_DATA_UINT64 },
    { "hash_elements", KSTAT_DATA_UINT64 },
    { "hash_elements_max", KSTAT_DATA_UINT64 },
    { "hash_collisions", KSTAT_DATA_UINT64 },
    { "hash_chains", KSTAT_DATA_UINT64 },
"hash_chain_max", KSTAT_DATA_UINT64 }, 778 { "p", KSTAT_DATA_UINT64 }, 779 { "c", KSTAT_DATA_UINT64 }, 780 { "c_min", KSTAT_DATA_UINT64 }, 781 { "c_max", KSTAT_DATA_UINT64 }, 782 { "size", KSTAT_DATA_UINT64 }, 783 { "compressed_size", KSTAT_DATA_UINT64 }, 784 { "uncompressed_size", KSTAT_DATA_UINT64 }, 785 { "overhead_size", KSTAT_DATA_UINT64 }, 786 { "hdr_size", KSTAT_DATA_UINT64 }, 787 { "data_size", KSTAT_DATA_UINT64 }, 788 { "metadata_size", KSTAT_DATA_UINT64 }, 789 { "other_size", KSTAT_DATA_UINT64 }, 790 { "anon_size", KSTAT_DATA_UINT64 }, 791 { "anon_evictable_data", KSTAT_DATA_UINT64 }, 792 { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, 793 { "mru_size", KSTAT_DATA_UINT64 }, 794 { "mru_evictable_data", KSTAT_DATA_UINT64 }, 795 { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, 796 { "mru_ghost_size", KSTAT_DATA_UINT64 }, 797 { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, 798 { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 799 { "mfu_size", KSTAT_DATA_UINT64 }, 800 { "mfu_evictable_data", KSTAT_DATA_UINT64 }, 801 { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, 802 { "mfu_ghost_size", KSTAT_DATA_UINT64 }, 803 { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, 804 { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 805 { "l2_hits", KSTAT_DATA_UINT64 }, 806 { "l2_misses", KSTAT_DATA_UINT64 }, 807 { "l2_feeds", KSTAT_DATA_UINT64 }, 808 { "l2_rw_clash", KSTAT_DATA_UINT64 }, 809 { "l2_read_bytes", KSTAT_DATA_UINT64 }, 810 { "l2_write_bytes", KSTAT_DATA_UINT64 }, 811 { "l2_writes_sent", KSTAT_DATA_UINT64 }, 812 { "l2_writes_done", KSTAT_DATA_UINT64 }, 813 { "l2_writes_error", KSTAT_DATA_UINT64 }, 814 { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, 815 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 816 { "l2_evict_reading", KSTAT_DATA_UINT64 }, 817 { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, 818 { "l2_free_on_write", KSTAT_DATA_UINT64 }, 819 { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 820 { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 821 { "l2_io_error", KSTAT_DATA_UINT64 }, 822 { "l2_size", KSTAT_DATA_UINT64 }, 823 { "l2_asize", KSTAT_DATA_UINT64 }, 824 { "l2_hdr_size", KSTAT_DATA_UINT64 }, 825 { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, 826 { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, 827 { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, 828 { "l2_write_in_l2", KSTAT_DATA_UINT64 }, 829 { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, 830 { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, 831 { "l2_write_full", KSTAT_DATA_UINT64 }, 832 { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, 833 { "l2_write_pios", KSTAT_DATA_UINT64 }, 834 { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, 835 { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, 836 { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, 837 { "memory_throttle_count", KSTAT_DATA_UINT64 }, 838 { "arc_meta_used", KSTAT_DATA_UINT64 }, 839 { "arc_meta_limit", KSTAT_DATA_UINT64 }, 840 { "arc_meta_max", KSTAT_DATA_UINT64 }, 841 { "arc_meta_min", KSTAT_DATA_UINT64 }, 842 { "sync_wait_for_async", KSTAT_DATA_UINT64 }, 843 { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, 844}; 845 846#define ARCSTAT(stat) (arc_stats.stat.value.ui64) 847 848#define ARCSTAT_INCR(stat, val) \ 849 atomic_add_64(&arc_stats.stat.value.ui64, (val)) 850 851#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 852#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 853 854#define ARCSTAT_MAX(stat, val) { \ 855 uint64_t m; \ 856 while ((val) > (m = arc_stats.stat.value.ui64) && \ 857 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 858 
#define ARCSTAT_MAX(stat, val) { \
    uint64_t m; \
    while ((val) > (m = arc_stats.stat.value.ui64) && \
        (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
        continue; \
}

#define ARCSTAT_MAXSTAT(stat) \
    ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)

/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 */
#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
    if (cond1) { \
        if (cond2) { \
            ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
        } else { \
            ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
        } \
    } else { \
        if (cond2) { \
            ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
        } else { \
            ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
        } \
    }

kstat_t *arc_ksp;
static arc_state_t *arc_anon;
static arc_state_t *arc_mru;
static arc_state_t *arc_mru_ghost;
static arc_state_t *arc_mfu;
static arc_state_t *arc_mfu_ghost;
static arc_state_t *arc_l2c_only;

/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them. For these variables, we therefore define them to be in
 * terms of the statistic variable. This assures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
 */
#define arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
#define arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
#define arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
#define arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
#define arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
#define arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
#define arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
#define arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
#define arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */

/* compressed size of entire arc */
#define arc_compressed_size	ARCSTAT(arcstat_compressed_size)
/* uncompressed size of entire arc */
#define arc_uncompressed_size	ARCSTAT(arcstat_uncompressed_size)
/* number of bytes in the arc from arc_buf_t's */
#define arc_overhead_size	ARCSTAT(arcstat_overhead_size)

static int arc_no_grow;	/* Don't try to grow cache size */
static uint64_t arc_tempreserve;
static uint64_t arc_loaned_bytes;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
    void *acb_private;
    arc_done_func_t *acb_done;
    arc_buf_t *acb_buf;
    boolean_t acb_compressed;
    zio_t *acb_zio_dummy;
    arc_callback_t *acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

struct arc_write_callback {
    void *awcb_private;
    arc_done_func_t *awcb_ready;
    arc_done_func_t *awcb_children_ready;
    arc_done_func_t *awcb_physdone;
    arc_done_func_t *awcb_done;
    arc_buf_t *awcb_buf;
};
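
/*
 * Example use of ARCSTAT_CONDSTAT above, classifying a cache hit by
 * demand-vs-prefetch and data-vs-metadata in a single call (this is the
 * pattern the read path uses):
 *
 *	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch,
 *	    !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
 *
 * which bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits, or
 * arcstat_prefetch_metadata_hits.
 */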

/*
 * ARC buffers are separated into multiple structs as a memory saving measure:
 *   - Common fields struct, always defined, and embedded within it:
 *       - L2-only fields, always allocated but undefined when not in L2ARC
 *       - L1-only fields, only allocated when in L1ARC
 *
 *           Buffer in L1                     Buffer only in L2
 *    +------------------------+          +------------------------+
 *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    +------------------------+          +------------------------+
 *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
 *    | (undefined if L1-only) |          |                        |
 *    +------------------------+          +------------------------+
 *    | l1arc_buf_hdr_t        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    +------------------------+
 *
 * Because it's possible for the L2ARC to become extremely large, we can wind
 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 * is minimized by only allocating the fields necessary for an L1-cached buffer
 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr
 * and l2arc_buf_hdr) are embedded rather than allocated separately to save a
 * couple of words in pointers. arc_hdr_realloc() is used to switch a header
 * between these two allocation states.
 */
typedef struct l1arc_buf_hdr {
    kmutex_t b_freeze_lock;
    zio_cksum_t *b_freeze_cksum;
#ifdef ZFS_DEBUG
    /*
     * Used for debugging with kmem_flags - by allocating and freeing
     * b_thawed when the buffer is thawed, we get a record of the stack
     * trace that thawed it.
     */
    void *b_thawed;
#endif

    arc_buf_t *b_buf;
    uint32_t b_bufcnt;
    /* for waiting on writes to complete */
    kcondvar_t b_cv;
    uint8_t b_byteswap;

    /* protected by arc state mutex */
    arc_state_t *b_state;
    multilist_node_t b_arc_node;

    /* updated atomically */
    clock_t b_arc_access;

    /* self protecting */
    refcount_t b_refcnt;

    arc_callback_t *b_acb;
    abd_t *b_pabd;
} l1arc_buf_hdr_t;

typedef struct l2arc_dev l2arc_dev_t;

typedef struct l2arc_buf_hdr {
    /* protected by arc_buf_hdr mutex */
    l2arc_dev_t *b_dev;		/* L2ARC device */
    uint64_t b_daddr;		/* disk address, offset byte */

    list_node_t b_l2node;
} l2arc_buf_hdr_t;

struct arc_buf_hdr {
    /* protected by hash lock */
    dva_t b_dva;
    uint64_t b_birth;

    arc_buf_contents_t b_type;
    arc_buf_hdr_t *b_hash_next;
    arc_flags_t b_flags;

    /*
     * This field stores the size of the data buffer after
     * compression, and is set in the arc's zio completion handlers.
     * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
     *
     * While the block pointers can store up to 32MB in their psize
     * field, we can only store up to 32MB minus 512B. This is due
     * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
     * a field of zeros represents 512B in the bp). We can't use a
     * bias of 1 since we need to reserve a psize of zero, here, to
     * represent holes and embedded blocks.
     *
     * This isn't a problem in practice, since the maximum size of a
     * buffer is limited to 16MB, so we never need to store 32MB in
     * this field. Even in the upstream illumos code base, the
     * maximum size of a buffer is limited to 16MB.
     */
    uint16_t b_psize;

    /*
     * This field stores the size of the data buffer before
     * compression, and cannot change once set. It is in units
     * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
     */
    uint16_t b_lsize;	/* immutable */
    uint64_t b_spa;	/* immutable */

    /* L2ARC fields. Undefined when not in L2ARC. */
    l2arc_buf_hdr_t b_l2hdr;
    /* L1ARC fields. Undefined when in l2arc_only state */
    l1arc_buf_hdr_t b_l1hdr;
};

#if defined(__FreeBSD__) && defined(_KERNEL)
static int
sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
{
    uint64_t val;
    int err;

    val = arc_meta_limit;
    err = sysctl_handle_64(oidp, &val, 0, req);
    if (err != 0 || req->newptr == NULL)
        return (err);

    if (val <= 0 || val > arc_c_max)
        return (EINVAL);

    arc_meta_limit = val;
    return (0);
}

static int
sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
{
    uint32_t val;
    int err;

    val = arc_no_grow_shift;
    err = sysctl_handle_32(oidp, &val, 0, req);
    if (err != 0 || req->newptr == NULL)
        return (err);

    if (val >= arc_shrink_shift)
        return (EINVAL);

    arc_no_grow_shift = val;
    return (0);
}

static int
sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS)
{
    uint64_t val;
    int err;

    val = zfs_arc_max;
    err = sysctl_handle_64(oidp, &val, 0, req);
    if (err != 0 || req->newptr == NULL)
        return (err);

    if (zfs_arc_max == 0) {
        /* Loader tunable so blindly set */
        zfs_arc_max = val;
        return (0);
    }

    if (val < arc_abs_min || val > kmem_size())
        return (EINVAL);
    if (val < arc_c_min)
        return (EINVAL);
    if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit)
        return (EINVAL);

    arc_c_max = val;

    arc_c = arc_c_max;
    arc_p = (arc_c >> 1);

    if (zfs_arc_meta_limit == 0) {
        /* limit meta-data to 1/4 of the arc capacity */
        arc_meta_limit = arc_c_max / 4;
    }

    /* if kmem_flags are set, let's try to use less memory */
    if (kmem_debugging())
        arc_c = arc_c / 2;

    zfs_arc_max = arc_c;

    return (0);
}

static int
sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS)
{
    uint64_t val;
    int err;

    val = zfs_arc_min;
    err = sysctl_handle_64(oidp, &val, 0, req);
    if (err != 0 || req->newptr == NULL)
        return (err);

    if (zfs_arc_min == 0) {
        /* Loader tunable so blindly set */
        zfs_arc_min = val;
        return (0);
    }

    if (val < arc_abs_min || val > arc_c_max)
        return (EINVAL);

    arc_c_min = val;

    if (zfs_arc_meta_min == 0)
        arc_meta_min = arc_c_min / 2;

    if (arc_c < arc_c_min)
        arc_c = arc_c_min;

    zfs_arc_min = arc_c_min;

    return (0);
}
#endif
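
/*
 * Worked example of the b_psize/b_lsize encoding described in arc_buf_hdr
 * above (illustrative; the accessors are the HDR_GET_PSIZE()-style macros):
 *
 *	encoded = bytes >> SPA_MINBLOCKSHIFT;	(512B units, 0 == hole)
 *	bytes = (uint64_t)encoded << SPA_MINBLOCKSHIFT;
 *
 * E.g. a 128KB block stores b_psize = 256, and the 16-bit field tops out
 * at 65535 * 512B = 32MB - 512B, matching the comment on b_psize.
 */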

#define GHOST_STATE(state) \
    ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
    (state) == arc_l2c_only)

#define HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
#define HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
#define HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
#define HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
#define HDR_COMPRESSION_ENABLED(hdr) \
    ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)

#define HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
#define HDR_L2_READING(hdr) \
    (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
    ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
#define HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
#define HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
#define HDR_SHARED_DATA(hdr)	((hdr)->b_flags & ARC_FLAG_SHARED_DATA)

#define HDR_ISTYPE_METADATA(hdr) \
    ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
#define HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))

#define HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
#define HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)

/* For storing compression mode in b_flags */
#define HDR_COMPRESS_OFFSET	(highbit64(ARC_FLAG_COMPRESS_0) - 1)

#define HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET((hdr)->b_flags, \
    HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
    HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));

#define ARC_BUF_LAST(buf)	((buf)->b_next == NULL)
#define ARC_BUF_SHARED(buf)	((buf)->b_flags & ARC_BUF_FLAG_SHARED)
#define ARC_BUF_COMPRESSED(buf)	((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)

/*
 * Other sizes
 */

#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))

/*
 * Hash table routines
 */

#define HT_LOCK_PAD	CACHE_LINE_SIZE

struct ht_lock {
    kmutex_t ht_lock;
#ifdef _KERNEL
    unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

#define BUF_LOCKS 256
typedef struct buf_hash_table {
    uint64_t ht_mask;
    arc_buf_hdr_t **ht_table;
    struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;

#define BUF_HASH_INDEX(spa, dva, birth) \
    (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define HDR_LOCK(hdr) \
    (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))

uint64_t zfs_crc64_table[256];
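
/*
 * The hash table above stripes its locks: BUF_LOCKS (256) mutexes protect
 * ht_mask + 1 chains, so BUF_HASH_LOCK_NTRY() maps many chains onto each
 * lock, while the CACHE_LINE_SIZE padding in struct ht_lock keeps adjacent
 * locks from sharing a cache line. For example, with a 1M-entry table,
 * indices 5, 261, 517, ... (every 256th) all share ht_locks[5].
 */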

/*
 * Level 2 ARC
 */

#define L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
#define L2ARC_HEADROOM		2			/* num of writes */
/*
 * If we discover during ARC scan any buffers to be compressed, we boost
 * our headroom for the next scanning cycle by this percentage multiple.
 */
#define L2ARC_HEADROOM_BOOST	200
#define L2ARC_FEED_SECS		1		/* caching interval secs */
#define L2ARC_FEED_MIN_MS	200		/* min caching interval ms */

#define l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
#define l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)

/* L2ARC Performance Tunables */
uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
    &l2arc_write_max, 0, "max write size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
    &l2arc_write_boost, 0, "extra write during warmup");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
    &l2arc_headroom, 0, "number of dev writes");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
    &l2arc_feed_secs, 0, "interval seconds");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
    &l2arc_feed_min_ms, 0, "min interval milliseconds");

SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
    &l2arc_feed_again, 0, "turbo warmup");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
    &l2arc_norw, 0, "no reads during writes");
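
/*
 * Worked example of the L2ARC tunables above: a feed cycle writes at most
 * l2arc_write_max (8MB), plus l2arc_write_boost (another 8MB) while the
 * device is still warming up, and scans up to l2arc_headroom (2) times
 * that many bytes from the tail of the eviction lists. With
 * l2arc_headroom_boost at 200 (percent), finding compressible buffers
 * during a scan doubles the headroom for the next cycle.
 */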
1326 "size of metadata in mfu state"); 1327SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, 1328 &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, 1329 "size of data in mfu state"); 1330 1331SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, 1332 &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); 1333SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, 1334 &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, 1335 "size of metadata in mfu ghost state"); 1336SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, 1337 &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, 1338 "size of data in mfu ghost state"); 1339 1340SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, 1341 &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state"); 1342 1343/* 1344 * L2ARC Internals 1345 */ 1346struct l2arc_dev { 1347 vdev_t *l2ad_vdev; /* vdev */ 1348 spa_t *l2ad_spa; /* spa */ 1349 uint64_t l2ad_hand; /* next write location */ 1350 uint64_t l2ad_start; /* first addr on device */ 1351 uint64_t l2ad_end; /* last addr on device */ 1352 boolean_t l2ad_first; /* first sweep through */ 1353 boolean_t l2ad_writing; /* currently writing */ 1354 kmutex_t l2ad_mtx; /* lock for buffer list */ 1355 list_t l2ad_buflist; /* buffer list */ 1356 list_node_t l2ad_node; /* device list node */ 1357 refcount_t l2ad_alloc; /* allocated bytes */ 1358}; 1359 1360static list_t L2ARC_dev_list; /* device list */ 1361static list_t *l2arc_dev_list; /* device list pointer */ 1362static kmutex_t l2arc_dev_mtx; /* device list mutex */ 1363static l2arc_dev_t *l2arc_dev_last; /* last device used */ 1364static list_t L2ARC_free_on_write; /* free after write buf list */ 1365static list_t *l2arc_free_on_write; /* free after write list ptr */ 1366static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 1367static uint64_t l2arc_ndev; /* number of devices */ 1368 1369typedef struct l2arc_read_callback { 1370 arc_buf_hdr_t *l2rcb_hdr; /* read header */ 1371 blkptr_t l2rcb_bp; /* original blkptr */ 1372 zbookmark_phys_t l2rcb_zb; /* original bookmark */ 1373 int l2rcb_flags; /* original flags */ 1374 abd_t *l2rcb_abd; /* temporary buffer */ 1375} l2arc_read_callback_t; 1376 1377typedef struct l2arc_write_callback { 1378 l2arc_dev_t *l2wcb_dev; /* device info */ 1379 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 1380} l2arc_write_callback_t; 1381 1382typedef struct l2arc_data_free { 1383 /* protected by l2arc_free_on_write_mtx */ 1384 abd_t *l2df_abd; 1385 size_t l2df_size; 1386 arc_buf_contents_t l2df_type; 1387 list_node_t l2df_list_node; 1388} l2arc_data_free_t; 1389 1390static kmutex_t l2arc_feed_thr_lock; 1391static kcondvar_t l2arc_feed_thr_cv; 1392static uint8_t l2arc_thread_exit; 1393 1394static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *); 1395static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); 1396static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *); 1397static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); 1398static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); 1399static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); 1400static void arc_hdr_free_pabd(arc_buf_hdr_t *); 1401static void arc_hdr_alloc_pabd(arc_buf_hdr_t *); 1402static void arc_access(arc_buf_hdr_t *, kmutex_t *); 1403static boolean_t arc_is_overflowing(); 1404static void arc_buf_watch(arc_buf_t *); 1405 1406static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); 1407static 
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);

static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);

static void
l2arc_trim(const arc_buf_hdr_t *hdr)
{
    l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;

    ASSERT(HDR_HAS_L2HDR(hdr));
    ASSERT(MUTEX_HELD(&dev->l2ad_mtx));

    if (HDR_GET_PSIZE(hdr) != 0) {
        trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr,
            HDR_GET_PSIZE(hdr), 0);
    }
}

static uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
    uint8_t *vdva = (uint8_t *)dva;
    uint64_t crc = -1ULL;
    int i;

    ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

    for (i = 0; i < sizeof (dva_t); i++)
        crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];

    crc ^= (spa>>8) ^ birth;

    return (crc);
}

#define HDR_EMPTY(hdr) \
    ((hdr)->b_dva.dva_word[0] == 0 && \
    (hdr)->b_dva.dva_word[1] == 0)

#define HDR_EQUAL(spa, dva, birth, hdr) \
    ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
    ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
    ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa)

static void
buf_discard_identity(arc_buf_hdr_t *hdr)
{
    hdr->b_dva.dva_word[0] = 0;
    hdr->b_dva.dva_word[1] = 0;
    hdr->b_birth = 0;
}

static arc_buf_hdr_t *
buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
{
    const dva_t *dva = BP_IDENTITY(bp);
    uint64_t birth = BP_PHYSICAL_BIRTH(bp);
    uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
    kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
    arc_buf_hdr_t *hdr;

    mutex_enter(hash_lock);
    for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
        hdr = hdr->b_hash_next) {
        if (HDR_EQUAL(spa, dva, birth, hdr)) {
            *lockp = hash_lock;
            return (hdr);
        }
    }
    mutex_exit(hash_lock);
    *lockp = NULL;
    return (NULL);
}
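
/*
 * Typical buf_hash_find() usage follows the locking contract described at
 * the top of the file: on a hit the hash lock is returned held and the
 * caller must drop it (sketch; "guid" stands in for the spa's load guid):
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *hdr = buf_hash_find(guid, bp, &hash_lock);
 *	if (hdr != NULL) {
 *		(... inspect hdr while the lock pins its identity ...)
 *		mutex_exit(hash_lock);
 *	}
 */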
1490 */ 1491static arc_buf_hdr_t * 1492buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) 1493{ 1494 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1495 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1496 arc_buf_hdr_t *fhdr; 1497 uint32_t i; 1498 1499 ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); 1500 ASSERT(hdr->b_birth != 0); 1501 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1502 1503 if (lockp != NULL) { 1504 *lockp = hash_lock; 1505 mutex_enter(hash_lock); 1506 } else { 1507 ASSERT(MUTEX_HELD(hash_lock)); 1508 } 1509 1510 for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; 1511 fhdr = fhdr->b_hash_next, i++) { 1512 if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) 1513 return (fhdr); 1514 } 1515 1516 hdr->b_hash_next = buf_hash_table.ht_table[idx]; 1517 buf_hash_table.ht_table[idx] = hdr; 1518 arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); 1519 1520 /* collect some hash table performance data */ 1521 if (i > 0) { 1522 ARCSTAT_BUMP(arcstat_hash_collisions); 1523 if (i == 1) 1524 ARCSTAT_BUMP(arcstat_hash_chains); 1525 1526 ARCSTAT_MAX(arcstat_hash_chain_max, i); 1527 } 1528 1529 ARCSTAT_BUMP(arcstat_hash_elements); 1530 ARCSTAT_MAXSTAT(arcstat_hash_elements); 1531 1532 return (NULL); 1533} 1534 1535static void 1536buf_hash_remove(arc_buf_hdr_t *hdr) 1537{ 1538 arc_buf_hdr_t *fhdr, **hdrp; 1539 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1540 1541 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 1542 ASSERT(HDR_IN_HASH_TABLE(hdr)); 1543 1544 hdrp = &buf_hash_table.ht_table[idx]; 1545 while ((fhdr = *hdrp) != hdr) { 1546 ASSERT3P(fhdr, !=, NULL); 1547 hdrp = &fhdr->b_hash_next; 1548 } 1549 *hdrp = hdr->b_hash_next; 1550 hdr->b_hash_next = NULL; 1551 arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); 1552 1553 /* collect some hash table performance data */ 1554 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 1555 1556 if (buf_hash_table.ht_table[idx] && 1557 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 1558 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 1559} 1560 1561/* 1562 * Global data structures and functions for the buf kmem cache. 1563 */ 1564static kmem_cache_t *hdr_full_cache; 1565static kmem_cache_t *hdr_l2only_cache; 1566static kmem_cache_t *buf_cache; 1567 1568static void 1569buf_fini(void) 1570{ 1571 int i; 1572 1573 kmem_free(buf_hash_table.ht_table, 1574 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 1575 for (i = 0; i < BUF_LOCKS; i++) 1576 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 1577 kmem_cache_destroy(hdr_full_cache); 1578 kmem_cache_destroy(hdr_l2only_cache); 1579 kmem_cache_destroy(buf_cache); 1580} 1581 1582/* 1583 * Constructor callback - called when the cache is empty 1584 * and a new buf is requested. 
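 *
 * Note that with kmem object caching the constructor runs when a slab
 * object is created, not on every kmem_cache_alloc(); the locks and
 * condvars initialized here survive across alloc/free cycles until the
 * destructor is eventually invoked.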
1585 */ 1586/* ARGSUSED */ 1587static int 1588hdr_full_cons(void *vbuf, void *unused, int kmflag) 1589{ 1590 arc_buf_hdr_t *hdr = vbuf; 1591 1592 bzero(hdr, HDR_FULL_SIZE); 1593 cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); 1594 refcount_create(&hdr->b_l1hdr.b_refcnt); 1595 mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 1596 multilist_link_init(&hdr->b_l1hdr.b_arc_node); 1597 arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1598 1599 return (0); 1600} 1601 1602/* ARGSUSED */ 1603static int 1604hdr_l2only_cons(void *vbuf, void *unused, int kmflag) 1605{ 1606 arc_buf_hdr_t *hdr = vbuf; 1607 1608 bzero(hdr, HDR_L2ONLY_SIZE); 1609 arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1610 1611 return (0); 1612} 1613 1614/* ARGSUSED */ 1615static int 1616buf_cons(void *vbuf, void *unused, int kmflag) 1617{ 1618 arc_buf_t *buf = vbuf; 1619 1620 bzero(buf, sizeof (arc_buf_t)); 1621 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 1622 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1623 1624 return (0); 1625} 1626 1627/* 1628 * Destructor callback - called when a cached buf is 1629 * no longer required. 1630 */ 1631/* ARGSUSED */ 1632static void 1633hdr_full_dest(void *vbuf, void *unused) 1634{ 1635 arc_buf_hdr_t *hdr = vbuf; 1636 1637 ASSERT(HDR_EMPTY(hdr)); 1638 cv_destroy(&hdr->b_l1hdr.b_cv); 1639 refcount_destroy(&hdr->b_l1hdr.b_refcnt); 1640 mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); 1641 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 1642 arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1643} 1644 1645/* ARGSUSED */ 1646static void 1647hdr_l2only_dest(void *vbuf, void *unused) 1648{ 1649 arc_buf_hdr_t *hdr = vbuf; 1650 1651 ASSERT(HDR_EMPTY(hdr)); 1652 arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1653} 1654 1655/* ARGSUSED */ 1656static void 1657buf_dest(void *vbuf, void *unused) 1658{ 1659 arc_buf_t *buf = vbuf; 1660 1661 mutex_destroy(&buf->b_evict_lock); 1662 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1663} 1664 1665/* 1666 * Reclaim callback -- invoked when memory is low. 1667 */ 1668/* ARGSUSED */ 1669static void 1670hdr_recl(void *unused) 1671{ 1672 dprintf("hdr_recl called\n"); 1673 /* 1674 * umem calls the reclaim func when we destroy the buf cache, 1675 * which is after we do arc_fini(). 1676 */ 1677 if (!arc_dead) 1678 cv_signal(&arc_reclaim_thread_cv); 1679} 1680 1681static void 1682buf_init(void) 1683{ 1684 uint64_t *ct; 1685 uint64_t hsize = 1ULL << 12; 1686 int i, j; 1687 1688 /* 1689 * The hash table is big enough to fill all of physical memory 1690 * with an average block size of zfs_arc_average_blocksize (default 8K). 1691 * By default, the table will take up 1692 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 
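 *
 * Worked example (illustrative numbers): with 16GB of physical memory
 * and the default 8K average block size, the loop below stops at
 * hsize = 2^21, i.e. 2M buckets * 8 bytes = 16MB of table -- matching
 * the 1MB-per-GB rule of thumb above.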
1693 */ 1694 while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) 1695 hsize <<= 1; 1696retry: 1697 buf_hash_table.ht_mask = hsize - 1; 1698 buf_hash_table.ht_table = 1699 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 1700 if (buf_hash_table.ht_table == NULL) { 1701 ASSERT(hsize > (1ULL << 8)); 1702 hsize >>= 1; 1703 goto retry; 1704 } 1705 1706 hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 1707 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); 1708 hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", 1709 HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, 1710 NULL, NULL, 0); 1711 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 1712 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1713 1714 for (i = 0; i < 256; i++) 1715 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 1716 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 1717 1718 for (i = 0; i < BUF_LOCKS; i++) { 1719 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1720 NULL, MUTEX_DEFAULT, NULL); 1721 } 1722} 1723 1724/* 1725 * This is the size that the buf occupies in memory. If the buf is compressed, 1726 * it will correspond to the compressed size. You should use this method of 1727 * getting the buf size unless you explicitly need the logical size. 1728 */ 1729int32_t 1730arc_buf_size(arc_buf_t *buf) 1731{ 1732 return (ARC_BUF_COMPRESSED(buf) ? 1733 HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); 1734} 1735 1736int32_t 1737arc_buf_lsize(arc_buf_t *buf) 1738{ 1739 return (HDR_GET_LSIZE(buf->b_hdr)); 1740} 1741 1742enum zio_compress 1743arc_get_compression(arc_buf_t *buf) 1744{ 1745 return (ARC_BUF_COMPRESSED(buf) ? 1746 HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); 1747} 1748 1749#define ARC_MINTIME (hz>>4) /* 62 ms */ 1750 1751static inline boolean_t 1752arc_buf_is_shared(arc_buf_t *buf) 1753{ 1754 boolean_t shared = (buf->b_data != NULL && 1755 buf->b_hdr->b_l1hdr.b_pabd != NULL && 1756 abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && 1757 buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); 1758 IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); 1759 IMPLY(shared, ARC_BUF_SHARED(buf)); 1760 IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); 1761 1762 /* 1763 * It would be nice to assert arc_can_share() too, but the "hdr isn't 1764 * already being shared" requirement prevents us from doing that. 1765 */ 1766 1767 return (shared); 1768} 1769 1770/* 1771 * Free the checksum associated with this header. If there is no checksum, this 1772 * is a no-op. 1773 */ 1774static inline void 1775arc_cksum_free(arc_buf_hdr_t *hdr) 1776{ 1777 ASSERT(HDR_HAS_L1HDR(hdr)); 1778 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 1779 if (hdr->b_l1hdr.b_freeze_cksum != NULL) { 1780 kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); 1781 hdr->b_l1hdr.b_freeze_cksum = NULL; 1782 } 1783 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1784} 1785 1786/* 1787 * Return true iff at least one of the bufs on hdr is not compressed. 1788 */ 1789static boolean_t 1790arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) 1791{ 1792 for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { 1793 if (!ARC_BUF_COMPRESSED(b)) { 1794 return (B_TRUE); 1795 } 1796 } 1797 return (B_FALSE); 1798} 1799 1800/* 1801 * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data 1802 * matches the checksum that is stored in the hdr. If there is no checksum, 1803 * or if the buf is compressed, this is a no-op. 
1804 */ 1805static void 1806arc_cksum_verify(arc_buf_t *buf) 1807{ 1808 arc_buf_hdr_t *hdr = buf->b_hdr; 1809 zio_cksum_t zc; 1810 1811 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1812 return; 1813 1814 if (ARC_BUF_COMPRESSED(buf)) { 1815 ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || 1816 arc_hdr_has_uncompressed_buf(hdr)); 1817 return; 1818 } 1819 1820 ASSERT(HDR_HAS_L1HDR(hdr)); 1821 1822 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 1823 if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { 1824 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1825 return; 1826 } 1827 1828 fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); 1829 if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) 1830 panic("buffer modified while frozen!"); 1831 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1832} 1833 1834static boolean_t 1835arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) 1836{ 1837 enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); 1838 boolean_t valid_cksum; 1839 1840 ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); 1841 VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); 1842 1843 /* 1844 * We rely on the blkptr's checksum to determine if the block 1845 * is valid or not. When compressed arc is enabled, the l2arc 1846 * writes the block to the l2arc just as it appears in the pool. 1847 * This allows us to use the blkptr's checksum to validate the 1848 * data that we just read off of the l2arc without having to store 1849 * a separate checksum in the arc_buf_hdr_t. However, if compressed 1850 * arc is disabled, then the data written to the l2arc is always 1851 * uncompressed and won't match the block as it exists in the main 1852 * pool. When this is the case, we must first compress it if it is 1853 * compressed on the main pool before we can validate the checksum. 1854 */ 1855 if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { 1856 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); 1857 uint64_t lsize = HDR_GET_LSIZE(hdr); 1858 uint64_t csize; 1859 1860 void *cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr)); 1861 csize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); 1862 1863 ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); 1864 if (csize < HDR_GET_PSIZE(hdr)) { 1865 /* 1866 * Compressed blocks are always a multiple of the 1867 * smallest ashift in the pool. Ideally, we would 1868 * like to round up the csize to the next 1869 * spa_min_ashift but that value may have changed 1870 * since the block was last written. Instead, 1871 * we rely on the fact that the hdr's psize 1872 * was set to the psize of the block when it was 1873 * last written. We set the csize to that value 1874 * and zero out any part that should not contain 1875 * data. 1876 */ 1877 bzero((char *)cbuf + csize, HDR_GET_PSIZE(hdr) - csize); 1878 csize = HDR_GET_PSIZE(hdr); 1879 } 1880 zio_push_transform(zio, cbuf, csize, HDR_GET_PSIZE(hdr), NULL); 1881 } 1882 1883 /* 1884 * Block pointers always store the checksum for the logical data. 1885 * If the block pointer has the gang bit set, then the checksum 1886 * it represents is for the reconstituted data and not for an 1887 * individual gang member. The zio pipeline, however, must be able to 1888 * determine the checksum of each of the gang constituents so it 1889 * treats the checksum comparison differently than what we need 1890 * for l2arc blocks. This prevents us from using the 1891 * zio_checksum_error() interface directly. 
Instead we must call the 1892 * zio_checksum_error_impl() so that we can ensure the checksum is 1893 * generated using the correct checksum algorithm and accounts for the 1894 * logical I/O size and not just a gang fragment. 1895 */ 1896 valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, 1897 BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, 1898 zio->io_offset, NULL) == 0); 1899 zio_pop_transforms(zio); 1900 return (valid_cksum); 1901} 1902 1903/* 1904 * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a 1905 * checksum and attaches it to the buf's hdr so that we can ensure that the buf 1906 * isn't modified later on. If buf is compressed or there is already a checksum 1907 * on the hdr, this is a no-op (we only checksum uncompressed bufs). 1908 */ 1909static void 1910arc_cksum_compute(arc_buf_t *buf) 1911{ 1912 arc_buf_hdr_t *hdr = buf->b_hdr; 1913 1914 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1915 return; 1916 1917 ASSERT(HDR_HAS_L1HDR(hdr)); 1918 1919 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1920 if (hdr->b_l1hdr.b_freeze_cksum != NULL) { 1921 ASSERT(arc_hdr_has_uncompressed_buf(hdr)); 1922 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1923 return; 1924 } else if (ARC_BUF_COMPRESSED(buf)) { 1925 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1926 return; 1927 } 1928 1929 ASSERT(!ARC_BUF_COMPRESSED(buf)); 1930 hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), 1931 KM_SLEEP); 1932 fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, 1933 hdr->b_l1hdr.b_freeze_cksum); 1934 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1935#ifdef illumos 1936 arc_buf_watch(buf); 1937#endif 1938} 1939 1940#ifdef illumos 1941#ifndef _KERNEL 1942typedef struct procctl { 1943 long cmd; 1944 prwatch_t prwatch; 1945} procctl_t; 1946#endif 1947 1948/* ARGSUSED */ 1949static void 1950arc_buf_unwatch(arc_buf_t *buf) 1951{ 1952#ifndef _KERNEL 1953 if (arc_watch) { 1954 int result; 1955 procctl_t ctl; 1956 ctl.cmd = PCWATCH; 1957 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1958 ctl.prwatch.pr_size = 0; 1959 ctl.prwatch.pr_wflags = 0; 1960 result = write(arc_procfd, &ctl, sizeof (ctl)); 1961 ASSERT3U(result, ==, sizeof (ctl)); 1962 } 1963#endif 1964} 1965 1966/* ARGSUSED */ 1967static void 1968arc_buf_watch(arc_buf_t *buf) 1969{ 1970#ifndef _KERNEL 1971 if (arc_watch) { 1972 int result; 1973 procctl_t ctl; 1974 ctl.cmd = PCWATCH; 1975 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1976 ctl.prwatch.pr_size = arc_buf_size(buf); 1977 ctl.prwatch.pr_wflags = WA_WRITE; 1978 result = write(arc_procfd, &ctl, sizeof (ctl)); 1979 ASSERT3U(result, ==, sizeof (ctl)); 1980 } 1981#endif 1982} 1983#endif /* illumos */ 1984 1985static arc_buf_contents_t 1986arc_buf_type(arc_buf_hdr_t *hdr) 1987{ 1988 arc_buf_contents_t type; 1989 if (HDR_ISTYPE_METADATA(hdr)) { 1990 type = ARC_BUFC_METADATA; 1991 } else { 1992 type = ARC_BUFC_DATA; 1993 } 1994 VERIFY3U(hdr->b_type, ==, type); 1995 return (type); 1996} 1997 1998boolean_t 1999arc_is_metadata(arc_buf_t *buf) 2000{ 2001 return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); 2002} 2003 2004static uint32_t 2005arc_bufc_to_flags(arc_buf_contents_t type) 2006{ 2007 switch (type) { 2008 case ARC_BUFC_DATA: 2009 /* metadata field is 0 if buffer contains normal data */ 2010 return (0); 2011 case ARC_BUFC_METADATA: 2012 return (ARC_FLAG_BUFC_METADATA); 2013 default: 2014 break; 2015 } 2016 panic("undefined ARC buffer type!"); 2017 return ((uint32_t)-1); 2018} 2019 2020void 2021arc_buf_thaw(arc_buf_t *buf) 2022{ 2023 arc_buf_hdr_t *hdr = buf->b_hdr; 2024 
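	/* Thawing is only legal for anonymous bufs with no I/O in flight. */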
	ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
	ASSERT(!HDR_IO_IN_PROGRESS(hdr));

	arc_cksum_verify(buf);

	/*
	 * Compressed buffers do not manipulate the b_freeze_cksum or
	 * allocate b_thawed.
	 */
	if (ARC_BUF_COMPRESSED(buf)) {
		ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
		    arc_hdr_has_uncompressed_buf(hdr));
		return;
	}

	ASSERT(HDR_HAS_L1HDR(hdr));
	arc_cksum_free(hdr);

	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
#ifdef ZFS_DEBUG
	if (zfs_flags & ZFS_DEBUG_MODIFY) {
		if (hdr->b_l1hdr.b_thawed != NULL)
			kmem_free(hdr->b_l1hdr.b_thawed, 1);
		hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
	}
#endif

	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);

#ifdef illumos
	arc_buf_unwatch(buf);
#endif
}

void
arc_buf_freeze(arc_buf_t *buf)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock;

	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	if (ARC_BUF_COMPRESSED(buf)) {
		ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL ||
		    arc_hdr_has_uncompressed_buf(hdr));
		return;
	}

	hash_lock = HDR_LOCK(hdr);
	mutex_enter(hash_lock);

	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL ||
	    hdr->b_l1hdr.b_state == arc_anon);
	arc_cksum_compute(buf);
	mutex_exit(hash_lock);
}

/*
 * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
 * the following functions should be used to ensure that the flags are
 * updated in a thread-safe way. When manipulating the flags either
 * the hash_lock must be held or the hdr must be undiscoverable. This
 * ensures that we're not racing with any other threads when updating
 * the flags.
 */
static inline void
arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
{
	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
	hdr->b_flags |= flags;
}

static inline void
arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
{
	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
	hdr->b_flags &= ~flags;
}

/*
 * Setting the compression bits in the arc_buf_hdr_t's b_flags is
 * done in a special way since we have to clear and set bits
 * at the same time. Consumers that wish to set the compression bits
 * must use this function to ensure that the flags are updated in
 * a thread-safe manner.
 */
static void
arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
{
	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

	/*
	 * Holes and embedded blocks will always have a psize = 0 so
	 * we ignore the compression of the blkptr and set the
	 * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF.
	 * Holes and embedded blocks remain anonymous so we don't
	 * want to uncompress them. Mark them as uncompressed.
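	 *
	 * In short, an illustrative summary of the two branches below:
	 *	compressed arc enabled and psize != 0:
	 *		set ARC_FLAG_COMPRESSED_ARC and record the blkptr's
	 *		compression algorithm in b_flags
	 *	otherwise:
	 *		clear ARC_FLAG_COMPRESSED_ARC, force ZIO_COMPRESS_OFF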
2124 */ 2125 if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { 2126 arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); 2127 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); 2128 ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); 2129 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); 2130 } else { 2131 arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); 2132 HDR_SET_COMPRESS(hdr, cmp); 2133 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); 2134 ASSERT(HDR_COMPRESSION_ENABLED(hdr)); 2135 } 2136} 2137 2138/* 2139 * Looks for another buf on the same hdr which has the data decompressed, copies 2140 * from it, and returns true. If no such buf exists, returns false. 2141 */ 2142static boolean_t 2143arc_buf_try_copy_decompressed_data(arc_buf_t *buf) 2144{ 2145 arc_buf_hdr_t *hdr = buf->b_hdr; 2146 boolean_t copied = B_FALSE; 2147 2148 ASSERT(HDR_HAS_L1HDR(hdr)); 2149 ASSERT3P(buf->b_data, !=, NULL); 2150 ASSERT(!ARC_BUF_COMPRESSED(buf)); 2151 2152 for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL; 2153 from = from->b_next) { 2154 /* can't use our own data buffer */ 2155 if (from == buf) { 2156 continue; 2157 } 2158 2159 if (!ARC_BUF_COMPRESSED(from)) { 2160 bcopy(from->b_data, buf->b_data, arc_buf_size(buf)); 2161 copied = B_TRUE; 2162 break; 2163 } 2164 } 2165 2166 /* 2167 * There were no decompressed bufs, so there should not be a 2168 * checksum on the hdr either. 2169 */ 2170 EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); 2171 2172 return (copied); 2173} 2174 2175/* 2176 * Given a buf that has a data buffer attached to it, this function will 2177 * efficiently fill the buf with data of the specified compression setting from 2178 * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr 2179 * are already sharing a data buf, no copy is performed. 2180 * 2181 * If the buf is marked as compressed but uncompressed data was requested, this 2182 * will allocate a new data buffer for the buf, remove that flag, and fill the 2183 * buf with uncompressed data. You can't request a compressed buf on a hdr with 2184 * uncompressed data, and (since we haven't added support for it yet) if you 2185 * want compressed data your buf must already be marked as compressed and have 2186 * the correct-sized data buffer. 2187 */ 2188static int 2189arc_buf_fill(arc_buf_t *buf, boolean_t compressed) 2190{ 2191 arc_buf_hdr_t *hdr = buf->b_hdr; 2192 boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); 2193 dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; 2194 2195 ASSERT3P(buf->b_data, !=, NULL); 2196 IMPLY(compressed, hdr_compressed); 2197 IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); 2198 2199 if (hdr_compressed == compressed) { 2200 if (!arc_buf_is_shared(buf)) { 2201 abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, 2202 arc_buf_size(buf)); 2203 } 2204 } else { 2205 ASSERT(hdr_compressed); 2206 ASSERT(!compressed); 2207 ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); 2208 2209 /* 2210 * If the buf is sharing its data with the hdr, unlink it and 2211 * allocate a new data buffer for the buf. 
		 */
		if (arc_buf_is_shared(buf)) {
			ASSERT(ARC_BUF_COMPRESSED(buf));

			/* We need to give the buf its own b_data */
			buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
			buf->b_data =
			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);

			/* Previously overhead was 0; just add new overhead */
			ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
		} else if (ARC_BUF_COMPRESSED(buf)) {
			/* We need to reallocate the buf's b_data */
			arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
			    buf);
			buf->b_data =
			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);

			/* We increased the size of b_data; update overhead */
			ARCSTAT_INCR(arcstat_overhead_size,
			    HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
		}

		/*
		 * Regardless of the buf's previous compression settings, it
		 * should not be compressed at the end of this function.
		 */
		buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;

		/*
		 * Try copying the data from another buf which already has a
		 * decompressed version. If that's not possible, it's time to
		 * bite the bullet and decompress the data from the hdr.
		 */
		if (arc_buf_try_copy_decompressed_data(buf)) {
			/* Skip byteswapping and checksumming (already done) */
			ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL);
			return (0);
		} else {
			int error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
			    hdr->b_l1hdr.b_pabd, buf->b_data,
			    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));

			/*
			 * Absent hardware errors or software bugs, this should
			 * be impossible, but log it anyway so we can debug it.
			 */
			if (error != 0) {
				zfs_dbgmsg(
				    "hdr %p, compress %d, psize %d, lsize %d",
				    hdr, HDR_GET_COMPRESS(hdr),
				    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
				return (SET_ERROR(EIO));
			}
		}
	}

	/* Byteswap the buf's data if necessary */
	if (bswap != DMU_BSWAP_NUMFUNCS) {
		ASSERT(!HDR_SHARED_DATA(hdr));
		ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
		dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
	}

	/* Compute the hdr's checksum if necessary */
	arc_cksum_compute(buf);

	return (0);
}

int
arc_decompress(arc_buf_t *buf)
{
	return (arc_buf_fill(buf, B_FALSE));
}

/*
 * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
 */
static uint64_t
arc_hdr_size(arc_buf_hdr_t *hdr)
{
	uint64_t size;

	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
	    HDR_GET_PSIZE(hdr) > 0) {
		size = HDR_GET_PSIZE(hdr);
	} else {
		ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
		size = HDR_GET_LSIZE(hdr);
	}
	return (size);
}

/*
 * Increment the amount of evictable space in the arc_state_t's refcount.
 * We account for the space used by the hdr and the arc buf individually
 * so that we can add and remove them from the refcount individually.
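 *
 * Concretely: a ghost header contributes HDR_GET_LSIZE() under the hdr
 * tag only, while a resident header contributes arc_hdr_size() for its
 * b_pabd plus arc_buf_size() for every unshared buf on its list.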
2311 */ 2312static void 2313arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) 2314{ 2315 arc_buf_contents_t type = arc_buf_type(hdr); 2316 2317 ASSERT(HDR_HAS_L1HDR(hdr)); 2318 2319 if (GHOST_STATE(state)) { 2320 ASSERT0(hdr->b_l1hdr.b_bufcnt); 2321 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2322 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2323 (void) refcount_add_many(&state->arcs_esize[type], 2324 HDR_GET_LSIZE(hdr), hdr); 2325 return; 2326 } 2327 2328 ASSERT(!GHOST_STATE(state)); 2329 if (hdr->b_l1hdr.b_pabd != NULL) { 2330 (void) refcount_add_many(&state->arcs_esize[type], 2331 arc_hdr_size(hdr), hdr); 2332 } 2333 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2334 buf = buf->b_next) { 2335 if (arc_buf_is_shared(buf)) 2336 continue; 2337 (void) refcount_add_many(&state->arcs_esize[type], 2338 arc_buf_size(buf), buf); 2339 } 2340} 2341 2342/* 2343 * Decrement the amount of evictable space in the arc_state_t's refcount. 2344 * We account for the space used by the hdr and the arc buf individually 2345 * so that we can add and remove them from the refcount individually. 2346 */ 2347static void 2348arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) 2349{ 2350 arc_buf_contents_t type = arc_buf_type(hdr); 2351 2352 ASSERT(HDR_HAS_L1HDR(hdr)); 2353 2354 if (GHOST_STATE(state)) { 2355 ASSERT0(hdr->b_l1hdr.b_bufcnt); 2356 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2357 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2358 (void) refcount_remove_many(&state->arcs_esize[type], 2359 HDR_GET_LSIZE(hdr), hdr); 2360 return; 2361 } 2362 2363 ASSERT(!GHOST_STATE(state)); 2364 if (hdr->b_l1hdr.b_pabd != NULL) { 2365 (void) refcount_remove_many(&state->arcs_esize[type], 2366 arc_hdr_size(hdr), hdr); 2367 } 2368 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2369 buf = buf->b_next) { 2370 if (arc_buf_is_shared(buf)) 2371 continue; 2372 (void) refcount_remove_many(&state->arcs_esize[type], 2373 arc_buf_size(buf), buf); 2374 } 2375} 2376 2377/* 2378 * Add a reference to this hdr indicating that someone is actively 2379 * referencing that memory. When the refcount transitions from 0 to 1, 2380 * we remove it from the respective arc_state_t list to indicate that 2381 * it is not evictable. 2382 */ 2383static void 2384add_reference(arc_buf_hdr_t *hdr, void *tag) 2385{ 2386 ASSERT(HDR_HAS_L1HDR(hdr)); 2387 if (!MUTEX_HELD(HDR_LOCK(hdr))) { 2388 ASSERT(hdr->b_l1hdr.b_state == arc_anon); 2389 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2390 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2391 } 2392 2393 arc_state_t *state = hdr->b_l1hdr.b_state; 2394 2395 if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && 2396 (state != arc_anon)) { 2397 /* We don't use the L2-only state list. */ 2398 if (state != arc_l2c_only) { 2399 multilist_remove(state->arcs_list[arc_buf_type(hdr)], 2400 hdr); 2401 arc_evictable_space_decrement(hdr, state); 2402 } 2403 /* remove the prefetch flag if we get a reference */ 2404 arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); 2405 } 2406} 2407 2408/* 2409 * Remove a reference from this hdr. When the reference transitions from 2410 * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's 2411 * list making it eligible for eviction. 
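 *
 * A typical (hypothetical) consumer brackets its use of a cached block
 * roughly as follows, with the appropriate hash lock held:
 *
 *	add_reference(hdr, tag);	<- hdr leaves its eviction list
 *	... access buf->b_data ...
 *	(void) remove_reference(hdr, hash_lock, tag);	<- evictable again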
2412 */ 2413static int 2414remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 2415{ 2416 int cnt; 2417 arc_state_t *state = hdr->b_l1hdr.b_state; 2418 2419 ASSERT(HDR_HAS_L1HDR(hdr)); 2420 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 2421 ASSERT(!GHOST_STATE(state)); 2422 2423 /* 2424 * arc_l2c_only counts as a ghost state so we don't need to explicitly 2425 * check to prevent usage of the arc_l2c_only list. 2426 */ 2427 if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && 2428 (state != arc_anon)) { 2429 multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr); 2430 ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); 2431 arc_evictable_space_increment(hdr, state); 2432 } 2433 return (cnt); 2434} 2435 2436/* 2437 * Move the supplied buffer to the indicated state. The hash lock 2438 * for the buffer must be held by the caller. 2439 */ 2440static void 2441arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, 2442 kmutex_t *hash_lock) 2443{ 2444 arc_state_t *old_state; 2445 int64_t refcnt; 2446 uint32_t bufcnt; 2447 boolean_t update_old, update_new; 2448 arc_buf_contents_t buftype = arc_buf_type(hdr); 2449 2450 /* 2451 * We almost always have an L1 hdr here, since we call arc_hdr_realloc() 2452 * in arc_read() when bringing a buffer out of the L2ARC. However, the 2453 * L1 hdr doesn't always exist when we change state to arc_anon before 2454 * destroying a header, in which case reallocating to add the L1 hdr is 2455 * pointless. 2456 */ 2457 if (HDR_HAS_L1HDR(hdr)) { 2458 old_state = hdr->b_l1hdr.b_state; 2459 refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); 2460 bufcnt = hdr->b_l1hdr.b_bufcnt; 2461 update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL); 2462 } else { 2463 old_state = arc_l2c_only; 2464 refcnt = 0; 2465 bufcnt = 0; 2466 update_old = B_FALSE; 2467 } 2468 update_new = update_old; 2469 2470 ASSERT(MUTEX_HELD(hash_lock)); 2471 ASSERT3P(new_state, !=, old_state); 2472 ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); 2473 ASSERT(old_state != arc_anon || bufcnt <= 1); 2474 2475 /* 2476 * If this buffer is evictable, transfer it from the 2477 * old state list to the new state list. 2478 */ 2479 if (refcnt == 0) { 2480 if (old_state != arc_anon && old_state != arc_l2c_only) { 2481 ASSERT(HDR_HAS_L1HDR(hdr)); 2482 multilist_remove(old_state->arcs_list[buftype], hdr); 2483 2484 if (GHOST_STATE(old_state)) { 2485 ASSERT0(bufcnt); 2486 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2487 update_old = B_TRUE; 2488 } 2489 arc_evictable_space_decrement(hdr, old_state); 2490 } 2491 if (new_state != arc_anon && new_state != arc_l2c_only) { 2492 2493 /* 2494 * An L1 header always exists here, since if we're 2495 * moving to some L1-cached state (i.e. not l2c_only or 2496 * anonymous), we realloc the header to add an L1hdr 2497 * beforehand. 2498 */ 2499 ASSERT(HDR_HAS_L1HDR(hdr)); 2500 multilist_insert(new_state->arcs_list[buftype], hdr); 2501 2502 if (GHOST_STATE(new_state)) { 2503 ASSERT0(bufcnt); 2504 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2505 update_new = B_TRUE; 2506 } 2507 arc_evictable_space_increment(hdr, new_state); 2508 } 2509 } 2510 2511 ASSERT(!HDR_EMPTY(hdr)); 2512 if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) 2513 buf_hash_remove(hdr); 2514 2515 /* adjust state sizes (ignore arc_l2c_only) */ 2516 2517 if (update_new && new_state != arc_l2c_only) { 2518 ASSERT(HDR_HAS_L1HDR(hdr)); 2519 if (GHOST_STATE(new_state)) { 2520 ASSERT0(bufcnt); 2521 2522 /* 2523 * When moving a header to a ghost state, we first 2524 * remove all arc buffers. 
Thus, we'll have a 2525 * bufcnt of zero, and no arc buffer to use for 2526 * the reference. As a result, we use the arc 2527 * header pointer for the reference. 2528 */ 2529 (void) refcount_add_many(&new_state->arcs_size, 2530 HDR_GET_LSIZE(hdr), hdr); 2531 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2532 } else { 2533 uint32_t buffers = 0; 2534 2535 /* 2536 * Each individual buffer holds a unique reference, 2537 * thus we must remove each of these references one 2538 * at a time. 2539 */ 2540 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2541 buf = buf->b_next) { 2542 ASSERT3U(bufcnt, !=, 0); 2543 buffers++; 2544 2545 /* 2546 * When the arc_buf_t is sharing the data 2547 * block with the hdr, the owner of the 2548 * reference belongs to the hdr. Only 2549 * add to the refcount if the arc_buf_t is 2550 * not shared. 2551 */ 2552 if (arc_buf_is_shared(buf)) 2553 continue; 2554 2555 (void) refcount_add_many(&new_state->arcs_size, 2556 arc_buf_size(buf), buf); 2557 } 2558 ASSERT3U(bufcnt, ==, buffers); 2559 2560 if (hdr->b_l1hdr.b_pabd != NULL) { 2561 (void) refcount_add_many(&new_state->arcs_size, 2562 arc_hdr_size(hdr), hdr); 2563 } else { 2564 ASSERT(GHOST_STATE(old_state)); 2565 } 2566 } 2567 } 2568 2569 if (update_old && old_state != arc_l2c_only) { 2570 ASSERT(HDR_HAS_L1HDR(hdr)); 2571 if (GHOST_STATE(old_state)) { 2572 ASSERT0(bufcnt); 2573 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2574 2575 /* 2576 * When moving a header off of a ghost state, 2577 * the header will not contain any arc buffers. 2578 * We use the arc header pointer for the reference 2579 * which is exactly what we did when we put the 2580 * header on the ghost state. 2581 */ 2582 2583 (void) refcount_remove_many(&old_state->arcs_size, 2584 HDR_GET_LSIZE(hdr), hdr); 2585 } else { 2586 uint32_t buffers = 0; 2587 2588 /* 2589 * Each individual buffer holds a unique reference, 2590 * thus we must remove each of these references one 2591 * at a time. 2592 */ 2593 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2594 buf = buf->b_next) { 2595 ASSERT3U(bufcnt, !=, 0); 2596 buffers++; 2597 2598 /* 2599 * When the arc_buf_t is sharing the data 2600 * block with the hdr, the owner of the 2601 * reference belongs to the hdr. Only 2602 * add to the refcount if the arc_buf_t is 2603 * not shared. 2604 */ 2605 if (arc_buf_is_shared(buf)) 2606 continue; 2607 2608 (void) refcount_remove_many( 2609 &old_state->arcs_size, arc_buf_size(buf), 2610 buf); 2611 } 2612 ASSERT3U(bufcnt, ==, buffers); 2613 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 2614 (void) refcount_remove_many( 2615 &old_state->arcs_size, arc_hdr_size(hdr), hdr); 2616 } 2617 } 2618 2619 if (HDR_HAS_L1HDR(hdr)) 2620 hdr->b_l1hdr.b_state = new_state; 2621 2622 /* 2623 * L2 headers should never be on the L2 state list since they don't 2624 * have L1 headers allocated. 
2625 */ 2626 ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && 2627 multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); 2628} 2629 2630void 2631arc_space_consume(uint64_t space, arc_space_type_t type) 2632{ 2633 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 2634 2635 switch (type) { 2636 case ARC_SPACE_DATA: 2637 ARCSTAT_INCR(arcstat_data_size, space); 2638 break; 2639 case ARC_SPACE_META: 2640 ARCSTAT_INCR(arcstat_metadata_size, space); 2641 break; 2642 case ARC_SPACE_OTHER: 2643 ARCSTAT_INCR(arcstat_other_size, space); 2644 break; 2645 case ARC_SPACE_HDRS: 2646 ARCSTAT_INCR(arcstat_hdr_size, space); 2647 break; 2648 case ARC_SPACE_L2HDRS: 2649 ARCSTAT_INCR(arcstat_l2_hdr_size, space); 2650 break; 2651 } 2652 2653 if (type != ARC_SPACE_DATA) 2654 ARCSTAT_INCR(arcstat_meta_used, space); 2655 2656 atomic_add_64(&arc_size, space); 2657} 2658 2659void 2660arc_space_return(uint64_t space, arc_space_type_t type) 2661{ 2662 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 2663 2664 switch (type) { 2665 case ARC_SPACE_DATA: 2666 ARCSTAT_INCR(arcstat_data_size, -space); 2667 break; 2668 case ARC_SPACE_META: 2669 ARCSTAT_INCR(arcstat_metadata_size, -space); 2670 break; 2671 case ARC_SPACE_OTHER: 2672 ARCSTAT_INCR(arcstat_other_size, -space); 2673 break; 2674 case ARC_SPACE_HDRS: 2675 ARCSTAT_INCR(arcstat_hdr_size, -space); 2676 break; 2677 case ARC_SPACE_L2HDRS: 2678 ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 2679 break; 2680 } 2681 2682 if (type != ARC_SPACE_DATA) { 2683 ASSERT(arc_meta_used >= space); 2684 if (arc_meta_max < arc_meta_used) 2685 arc_meta_max = arc_meta_used; 2686 ARCSTAT_INCR(arcstat_meta_used, -space); 2687 } 2688 2689 ASSERT(arc_size >= space); 2690 atomic_add_64(&arc_size, -space); 2691} 2692 2693/* 2694 * Given a hdr and a buf, returns whether that buf can share its b_data buffer 2695 * with the hdr's b_pabd. 2696 */ 2697static boolean_t 2698arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) 2699{ 2700 /* 2701 * The criteria for sharing a hdr's data are: 2702 * 1. the hdr's compression matches the buf's compression 2703 * 2. the hdr doesn't need to be byteswapped 2704 * 3. the hdr isn't already being shared 2705 * 4. the buf is either compressed or it is the last buf in the hdr list 2706 * 2707 * Criterion #4 maintains the invariant that shared uncompressed 2708 * bufs must be the final buf in the hdr's b_buf list. Reading this, you 2709 * might ask, "if a compressed buf is allocated first, won't that be the 2710 * last thing in the list?", but in that case it's impossible to create 2711 * a shared uncompressed buf anyway (because the hdr must be compressed 2712 * to have the compressed buf). You might also think that #3 is 2713 * sufficient to make this guarantee, however it's possible 2714 * (specifically in the rare L2ARC write race mentioned in 2715 * arc_buf_alloc_impl()) there will be an existing uncompressed buf that 2716 * is sharable, but wasn't at the time of its allocation. Rather than 2717 * allow a new shared uncompressed buf to be created and then shuffle 2718 * the list around to make it the last element, this simply disallows 2719 * sharing if the new buf isn't the first to be added. 
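	 *
	 * For example, the first buf allocated on a compressed,
	 * non-byteswapped hdr satisfies all four criteria and can share the
	 * hdr's b_pabd; a later uncompressed buf on that same hdr fails #1
	 * (and #3 while the first share is live) and gets its own
	 * allocation instead.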
	 */
	ASSERT3P(buf->b_hdr, ==, hdr);
	boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF;
	boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
	return (buf_compressed == hdr_compressed &&
	    hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
	    !HDR_SHARED_DATA(hdr) &&
	    (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
}

/*
 * Allocate a buf for this hdr. If you care about the data that's in the hdr,
 * or if you want a compressed buffer, pass those flags in. Returns 0 if the
 * copy was made successfully, or an error code otherwise.
 */
static int
arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed,
    boolean_t fill, arc_buf_t **ret)
{
	arc_buf_t *buf;

	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
	VERIFY(hdr->b_type == ARC_BUFC_DATA ||
	    hdr->b_type == ARC_BUFC_METADATA);
	ASSERT3P(ret, !=, NULL);
	ASSERT3P(*ret, ==, NULL);

	buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
	buf->b_hdr = hdr;
	buf->b_data = NULL;
	buf->b_next = hdr->b_l1hdr.b_buf;
	buf->b_flags = 0;

	add_reference(hdr, tag);

	/*
	 * We're about to change the hdr's b_flags. We must either
	 * hold the hash_lock or be undiscoverable.
	 */
	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

	/*
	 * Only honor requests for compressed bufs if the hdr is actually
	 * compressed.
	 */
	if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
		buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;

	/*
	 * If the hdr's data can be shared then we share the data buffer and
	 * set the appropriate bit in the hdr's b_flags to indicate the hdr is
	 * sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new
	 * buffer to store the buf's data.
	 *
	 * There are two additional restrictions here because we're sharing
	 * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
	 * actively involved in an L2ARC write, because if this buf is used by
	 * an arc_write() then the hdr's data buffer will be released when the
	 * write completes, even though the L2ARC write might still be using
	 * it. Second, the hdr's ABD must be linear so that the buf's user
	 * doesn't need to be ABD-aware.
	 */
	boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
	    abd_is_linear(hdr->b_l1hdr.b_pabd);

	/* Set up b_data and sharing */
	if (can_share) {
		buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
		buf->b_flags |= ARC_BUF_FLAG_SHARED;
		arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
	} else {
		buf->b_data =
		    arc_get_data_buf(hdr, arc_buf_size(buf), buf);
		ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
	}
	VERIFY3P(buf->b_data, !=, NULL);

	hdr->b_l1hdr.b_buf = buf;
	hdr->b_l1hdr.b_bufcnt += 1;

	/*
	 * If the user wants the data from the hdr, we need to either copy or
	 * decompress the data.
2804 */ 2805 if (fill) { 2806 return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0)); 2807 } 2808 2809 return (0); 2810} 2811 2812static char *arc_onloan_tag = "onloan"; 2813 2814static inline void 2815arc_loaned_bytes_update(int64_t delta) 2816{ 2817 atomic_add_64(&arc_loaned_bytes, delta); 2818 2819 /* assert that it did not wrap around */ 2820 ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); 2821} 2822 2823/* 2824 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 2825 * flight data by arc_tempreserve_space() until they are "returned". Loaned 2826 * buffers must be returned to the arc before they can be used by the DMU or 2827 * freed. 2828 */ 2829arc_buf_t * 2830arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) 2831{ 2832 arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, 2833 is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); 2834 2835 arc_loaned_bytes_update(size); 2836 2837 return (buf); 2838} 2839 2840arc_buf_t * 2841arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, 2842 enum zio_compress compression_type) 2843{ 2844 arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, 2845 psize, lsize, compression_type); 2846 2847 arc_loaned_bytes_update(psize); 2848 2849 return (buf); 2850} 2851 2852 2853/* 2854 * Return a loaned arc buffer to the arc. 2855 */ 2856void 2857arc_return_buf(arc_buf_t *buf, void *tag) 2858{ 2859 arc_buf_hdr_t *hdr = buf->b_hdr; 2860 2861 ASSERT3P(buf->b_data, !=, NULL); 2862 ASSERT(HDR_HAS_L1HDR(hdr)); 2863 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 2864 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2865 2866 arc_loaned_bytes_update(-arc_buf_size(buf)); 2867} 2868 2869/* Detach an arc_buf from a dbuf (tag) */ 2870void 2871arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 2872{ 2873 arc_buf_hdr_t *hdr = buf->b_hdr; 2874 2875 ASSERT3P(buf->b_data, !=, NULL); 2876 ASSERT(HDR_HAS_L1HDR(hdr)); 2877 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2878 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); 2879 2880 arc_loaned_bytes_update(arc_buf_size(buf)); 2881} 2882 2883static void 2884l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) 2885{ 2886 l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); 2887 2888 df->l2df_abd = abd; 2889 df->l2df_size = size; 2890 df->l2df_type = type; 2891 mutex_enter(&l2arc_free_on_write_mtx); 2892 list_insert_head(l2arc_free_on_write, df); 2893 mutex_exit(&l2arc_free_on_write_mtx); 2894} 2895 2896static void 2897arc_hdr_free_on_write(arc_buf_hdr_t *hdr) 2898{ 2899 arc_state_t *state = hdr->b_l1hdr.b_state; 2900 arc_buf_contents_t type = arc_buf_type(hdr); 2901 uint64_t size = arc_hdr_size(hdr); 2902 2903 /* protected by hash lock, if in the hash table */ 2904 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 2905 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2906 ASSERT(state != arc_anon && state != arc_l2c_only); 2907 2908 (void) refcount_remove_many(&state->arcs_esize[type], 2909 size, hdr); 2910 } 2911 (void) refcount_remove_many(&state->arcs_size, size, hdr); 2912 if (type == ARC_BUFC_METADATA) { 2913 arc_space_return(size, ARC_SPACE_META); 2914 } else { 2915 ASSERT(type == ARC_BUFC_DATA); 2916 arc_space_return(size, ARC_SPACE_DATA); 2917 } 2918 2919 l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); 2920} 2921 2922/* 2923 * Share the arc_buf_t's data with the hdr. 
Whenever we are sharing the 2924 * data buffer, we transfer the refcount ownership to the hdr and update 2925 * the appropriate kstats. 2926 */ 2927static void 2928arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) 2929{ 2930 arc_state_t *state = hdr->b_l1hdr.b_state; 2931 2932 ASSERT(arc_can_share(hdr, buf)); 2933 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2934 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2935 2936 /* 2937 * Start sharing the data buffer. We transfer the 2938 * refcount ownership to the hdr since it always owns 2939 * the refcount whenever an arc_buf_t is shared. 2940 */ 2941 refcount_transfer_ownership(&state->arcs_size, buf, hdr); 2942 hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); 2943 abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, 2944 HDR_ISTYPE_METADATA(hdr)); 2945 arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); 2946 buf->b_flags |= ARC_BUF_FLAG_SHARED; 2947 2948 /* 2949 * Since we've transferred ownership to the hdr we need 2950 * to increment its compressed and uncompressed kstats and 2951 * decrement the overhead size. 2952 */ 2953 ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); 2954 ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); 2955 ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); 2956} 2957 2958static void 2959arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) 2960{ 2961 arc_state_t *state = hdr->b_l1hdr.b_state; 2962 2963 ASSERT(arc_buf_is_shared(buf)); 2964 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 2965 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2966 2967 /* 2968 * We are no longer sharing this buffer so we need 2969 * to transfer its ownership to the rightful owner. 2970 */ 2971 refcount_transfer_ownership(&state->arcs_size, hdr, buf); 2972 arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); 2973 abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); 2974 abd_put(hdr->b_l1hdr.b_pabd); 2975 hdr->b_l1hdr.b_pabd = NULL; 2976 buf->b_flags &= ~ARC_BUF_FLAG_SHARED; 2977 2978 /* 2979 * Since the buffer is no longer shared between 2980 * the arc buf and the hdr, count it as overhead. 2981 */ 2982 ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); 2983 ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); 2984 ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); 2985} 2986 2987/* 2988 * Remove an arc_buf_t from the hdr's buf list and return the last 2989 * arc_buf_t on the list. If no buffers remain on the list then return 2990 * NULL. 2991 */ 2992static arc_buf_t * 2993arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) 2994{ 2995 ASSERT(HDR_HAS_L1HDR(hdr)); 2996 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2997 2998 arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; 2999 arc_buf_t *lastbuf = NULL; 3000 3001 /* 3002 * Remove the buf from the hdr list and locate the last 3003 * remaining buffer on the list. 3004 */ 3005 while (*bufp != NULL) { 3006 if (*bufp == buf) 3007 *bufp = buf->b_next; 3008 3009 /* 3010 * If we've removed a buffer in the middle of 3011 * the list then update the lastbuf and update 3012 * bufp. 
		 */
		if (*bufp != NULL) {
			lastbuf = *bufp;
			bufp = &(*bufp)->b_next;
		}
	}
	buf->b_next = NULL;
	ASSERT3P(lastbuf, !=, buf);
	IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
	IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
	IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));

	return (lastbuf);
}

/*
 * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
 * list and free it.
 */
static void
arc_buf_destroy_impl(arc_buf_t *buf)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	/*
	 * Free up the data associated with the buf but only if we're not
	 * sharing this with the hdr. If we are sharing it with the hdr, the
	 * hdr is responsible for doing the free.
	 */
	if (buf->b_data != NULL) {
		/*
		 * We're about to change the hdr's b_flags. We must either
		 * hold the hash_lock or be undiscoverable.
		 */
		ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

		arc_cksum_verify(buf);
#ifdef illumos
		arc_buf_unwatch(buf);
#endif

		if (arc_buf_is_shared(buf)) {
			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
		} else {
			uint64_t size = arc_buf_size(buf);
			arc_free_data_buf(hdr, buf->b_data, size, buf);
			ARCSTAT_INCR(arcstat_overhead_size, -size);
		}
		buf->b_data = NULL;

		ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
		hdr->b_l1hdr.b_bufcnt -= 1;
	}

	arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);

	if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
		/*
		 * If the current arc_buf_t is sharing its data buffer with the
		 * hdr, then reassign the hdr's b_pabd to share it with the new
		 * buffer at the end of the list. The shared buffer is always
		 * the last one on the hdr's buffer list.
		 *
		 * There is an equivalent case for compressed bufs, but since
		 * they aren't guaranteed to be the last buf in the list and
		 * that is an exceedingly rare case, we just allow that space
		 * to be wasted temporarily.
		 */
		if (lastbuf != NULL) {
			/* Only one buf can be shared at once */
			VERIFY(!arc_buf_is_shared(lastbuf));
			/* hdr is uncompressed so can't have compressed buf */
			VERIFY(!ARC_BUF_COMPRESSED(lastbuf));

			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
			arc_hdr_free_pabd(hdr);

			/*
			 * We must setup a new shared block between the
			 * last buffer and the hdr. The data would have
			 * been allocated by the arc buf so we need to transfer
			 * ownership to the hdr since it's now being shared.
			 */
			arc_share_buf(hdr, lastbuf);
		}
	} else if (HDR_SHARED_DATA(hdr)) {
		/*
		 * Uncompressed shared buffers are always at the end
		 * of the list. Compressed buffers don't have the
		 * same requirements. This makes it hard to
		 * simply assert that the lastbuf is shared so
		 * we rely on the hdr's compression flags to determine
		 * if we have a compressed, shared buffer.
		 */
		ASSERT3P(lastbuf, !=, NULL);
		ASSERT(arc_buf_is_shared(lastbuf) ||
		    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF);
	}

	/*
	 * Free the checksum if we're removing the last uncompressed buf from
	 * this hdr.
3115 */ 3116 if (!arc_hdr_has_uncompressed_buf(hdr)) { 3117 arc_cksum_free(hdr); 3118 } 3119 3120 /* clean up the buf */ 3121 buf->b_hdr = NULL; 3122 kmem_cache_free(buf_cache, buf); 3123} 3124 3125static void 3126arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr) 3127{ 3128 ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); 3129 ASSERT(HDR_HAS_L1HDR(hdr)); 3130 ASSERT(!HDR_SHARED_DATA(hdr)); 3131 3132 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 3133 hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); 3134 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; 3135 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 3136 3137 ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); 3138 ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); 3139} 3140 3141static void 3142arc_hdr_free_pabd(arc_buf_hdr_t *hdr) 3143{ 3144 ASSERT(HDR_HAS_L1HDR(hdr)); 3145 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 3146 3147 /* 3148 * If the hdr is currently being written to the l2arc then 3149 * we defer freeing the data by adding it to the l2arc_free_on_write 3150 * list. The l2arc will free the data once it's finished 3151 * writing it to the l2arc device. 3152 */ 3153 if (HDR_L2_WRITING(hdr)) { 3154 arc_hdr_free_on_write(hdr); 3155 ARCSTAT_BUMP(arcstat_l2_free_on_write); 3156 } else { 3157 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, 3158 arc_hdr_size(hdr), hdr); 3159 } 3160 hdr->b_l1hdr.b_pabd = NULL; 3161 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; 3162 3163 ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); 3164 ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); 3165} 3166 3167static arc_buf_hdr_t * 3168arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, 3169 enum zio_compress compression_type, arc_buf_contents_t type) 3170{ 3171 arc_buf_hdr_t *hdr; 3172 3173 VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); 3174 3175 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 3176 ASSERT(HDR_EMPTY(hdr)); 3177 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); 3178 ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL); 3179 HDR_SET_PSIZE(hdr, psize); 3180 HDR_SET_LSIZE(hdr, lsize); 3181 hdr->b_spa = spa; 3182 hdr->b_type = type; 3183 hdr->b_flags = 0; 3184 arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); 3185 arc_hdr_set_compress(hdr, compression_type); 3186 3187 hdr->b_l1hdr.b_state = arc_anon; 3188 hdr->b_l1hdr.b_arc_access = 0; 3189 hdr->b_l1hdr.b_bufcnt = 0; 3190 hdr->b_l1hdr.b_buf = NULL; 3191 3192 /* 3193 * Allocate the hdr's buffer. This will contain either 3194 * the compressed or uncompressed data depending on the block 3195 * it references and compressed arc enablement. 3196 */ 3197 arc_hdr_alloc_pabd(hdr); 3198 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3199 3200 return (hdr); 3201} 3202 3203/* 3204 * Transition between the two allocation states for the arc_buf_hdr struct. 3205 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without 3206 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller 3207 * version is used when a cache buffer is only in the L2ARC in order to reduce 3208 * memory usage. 
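 *
 * The savings come from omitting the entire l1arc_buf_hdr_t (buf list,
 * refcount, freeze lock, condvar, etc.); HDR_L2ONLY_SIZE covers only the
 * fields up to, but not including, the b_l1hdr member.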
 */
static arc_buf_hdr_t *
arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
{
	ASSERT(HDR_HAS_L2HDR(hdr));

	arc_buf_hdr_t *nhdr;
	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;

	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
	    (old == hdr_l2only_cache && new == hdr_full_cache));

	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);

	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
	buf_hash_remove(hdr);

	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);

	if (new == hdr_full_cache) {
		arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
		/*
		 * arc_access and arc_change_state need to be aware that a
		 * header has just come out of L2ARC, so we set its state to
		 * l2c_only even though it's about to change.
		 */
		nhdr->b_l1hdr.b_state = arc_l2c_only;

		/* Verify previous threads set to NULL before freeing */
		ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
	} else {
		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
		ASSERT0(hdr->b_l1hdr.b_bufcnt);
		ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);

		/*
		 * If we've reached here, we must have been called from
		 * arc_evict_hdr(), as such we should have already been
		 * removed from any ghost list we were previously on
		 * (which protects us from racing with arc_evict_state),
		 * thus no locking is needed during this check.
		 */
		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));

		/*
		 * A buffer must not be moved into the arc_l2c_only
		 * state if it's not finished being written out to the
		 * l2arc device. Otherwise, the b_l1hdr.b_pabd field
		 * might try to be accessed, even though it was removed.
		 */
		VERIFY(!HDR_L2_WRITING(hdr));
		VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);

#ifdef ZFS_DEBUG
		if (hdr->b_l1hdr.b_thawed != NULL) {
			kmem_free(hdr->b_l1hdr.b_thawed, 1);
			hdr->b_l1hdr.b_thawed = NULL;
		}
#endif

		arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
	}
	/*
	 * The header has been reallocated so we need to re-insert it into any
	 * lists it was on.
	 */
	(void) buf_hash_insert(nhdr, NULL);

	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));

	mutex_enter(&dev->l2ad_mtx);

	/*
	 * We must place the realloc'ed header back into the list at
	 * the same spot. Otherwise, if it's placed earlier in the list,
	 * l2arc_write_buffers() could find it during the function's
	 * write phase, and try to write it out to the l2arc.
	 */
	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
	list_remove(&dev->l2ad_buflist, hdr);

	mutex_exit(&dev->l2ad_mtx);

	/*
	 * Since we're using the pointer address as the tag when
	 * incrementing and decrementing the l2ad_alloc refcount, we
	 * must remove the old pointer (that we're about to destroy) and
	 * add the new pointer to the refcount. Otherwise we'd remove
	 * the wrong pointer address when calling arc_hdr_destroy() later.
	 */

	(void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
	(void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr);

	buf_discard_identity(hdr);
	kmem_cache_free(old, hdr);

	return (nhdr);
}

/*
 * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
 * The buf is returned thawed since we expect the consumer to modify it.
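 *
 * A minimal illustrative caller (hypothetical, error handling omitted):
 *
 *	arc_buf_t *buf = arc_alloc_buf(spa, tag, ARC_BUFC_DATA, 4096);
 *	bcopy(src, buf->b_data, 4096);
 *	...
 *	arc_buf_destroy(buf, tag);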
3312 */ 3313arc_buf_t * 3314arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) 3315{ 3316 arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, 3317 ZIO_COMPRESS_OFF, type); 3318 ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); 3319 3320 arc_buf_t *buf = NULL; 3321 VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf)); 3322 arc_buf_thaw(buf); 3323 3324 return (buf); 3325} 3326 3327/* 3328 * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this 3329 * for bufs containing metadata. 3330 */ 3331arc_buf_t * 3332arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, 3333 enum zio_compress compression_type) 3334{ 3335 ASSERT3U(lsize, >, 0); 3336 ASSERT3U(lsize, >=, psize); 3337 ASSERT(compression_type > ZIO_COMPRESS_OFF); 3338 ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS); 3339 3340 arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, 3341 compression_type, ARC_BUFC_DATA); 3342 ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); 3343 3344 arc_buf_t *buf = NULL; 3345 VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf)); 3346 arc_buf_thaw(buf); 3347 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); 3348 3349 if (!arc_buf_is_shared(buf)) { 3350 /* 3351 * To ensure that the hdr has the correct data in it if we call 3352 * arc_decompress() on this buf before it's been written to 3353 * disk, it's easiest if we just set up sharing between the 3354 * buf and the hdr. 3355 */ 3356 ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd)); 3357 arc_hdr_free_pabd(hdr); 3358 arc_share_buf(hdr, buf); 3359 } 3360 3361 return (buf); 3362} 3363 3364static void 3365arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) 3366{ 3367 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 3368 l2arc_dev_t *dev = l2hdr->b_dev; 3369 uint64_t asize = arc_hdr_size(hdr); 3370 3371 ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); 3372 ASSERT(HDR_HAS_L2HDR(hdr)); 3373 3374 list_remove(&dev->l2ad_buflist, hdr); 3375 3376 ARCSTAT_INCR(arcstat_l2_asize, -asize); 3377 ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr)); 3378 3379 vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); 3380 3381 (void) refcount_remove_many(&dev->l2ad_alloc, asize, hdr); 3382 arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); 3383} 3384 3385static void 3386arc_hdr_destroy(arc_buf_hdr_t *hdr) 3387{ 3388 if (HDR_HAS_L1HDR(hdr)) { 3389 ASSERT(hdr->b_l1hdr.b_buf == NULL || 3390 hdr->b_l1hdr.b_bufcnt > 0); 3391 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3392 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 3393 } 3394 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3395 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 3396 3397 if (!HDR_EMPTY(hdr)) 3398 buf_discard_identity(hdr); 3399 3400 if (HDR_HAS_L2HDR(hdr)) { 3401 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 3402 boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); 3403 3404 if (!buflist_held) 3405 mutex_enter(&dev->l2ad_mtx); 3406 3407 /* 3408 * Even though we checked this conditional above, we 3409 * need to check this again now that we have the 3410 * l2ad_mtx. This is because we could be racing with 3411 * another thread calling l2arc_evict() which might have 3412 * destroyed this header's L2 portion as we were waiting 3413 * to acquire the l2ad_mtx. If that happens, we don't 3414 * want to re-destroy the header's L2 portion. 
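 * In other words, this is the classic double-checked pattern; a
 * condensed sketch of the code below (ignoring the buflist_held
 * case):
 *
 *	if (HDR_HAS_L2HDR(hdr)) {		(unlocked check, may go stale)
 *		mutex_enter(&dev->l2ad_mtx);
 *		if (HDR_HAS_L2HDR(hdr))		(authoritative re-check)
 *			arc_hdr_l2hdr_destroy(hdr);
 *		mutex_exit(&dev->l2ad_mtx);
 *	}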
3415 */ 3416 if (HDR_HAS_L2HDR(hdr)) { 3417 l2arc_trim(hdr); 3418 arc_hdr_l2hdr_destroy(hdr); 3419 } 3420 3421 if (!buflist_held) 3422 mutex_exit(&dev->l2ad_mtx); 3423 } 3424 3425 if (HDR_HAS_L1HDR(hdr)) { 3426 arc_cksum_free(hdr); 3427 3428 while (hdr->b_l1hdr.b_buf != NULL) 3429 arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); 3430 3431#ifdef ZFS_DEBUG 3432 if (hdr->b_l1hdr.b_thawed != NULL) { 3433 kmem_free(hdr->b_l1hdr.b_thawed, 1); 3434 hdr->b_l1hdr.b_thawed = NULL; 3435 } 3436#endif 3437 3438 if (hdr->b_l1hdr.b_pabd != NULL) { 3439 arc_hdr_free_pabd(hdr); 3440 } 3441 } 3442 3443 ASSERT3P(hdr->b_hash_next, ==, NULL); 3444 if (HDR_HAS_L1HDR(hdr)) { 3445 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 3446 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 3447 kmem_cache_free(hdr_full_cache, hdr); 3448 } else { 3449 kmem_cache_free(hdr_l2only_cache, hdr); 3450 } 3451} 3452 3453void 3454arc_buf_destroy(arc_buf_t *buf, void* tag) 3455{ 3456 arc_buf_hdr_t *hdr = buf->b_hdr; 3457 kmutex_t *hash_lock = HDR_LOCK(hdr); 3458 3459 if (hdr->b_l1hdr.b_state == arc_anon) { 3460 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); 3461 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3462 VERIFY0(remove_reference(hdr, NULL, tag)); 3463 arc_hdr_destroy(hdr); 3464 return; 3465 } 3466 3467 mutex_enter(hash_lock); 3468 ASSERT3P(hdr, ==, buf->b_hdr); 3469 ASSERT(hdr->b_l1hdr.b_bufcnt > 0); 3470 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 3471 ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); 3472 ASSERT3P(buf->b_data, !=, NULL); 3473 3474 (void) remove_reference(hdr, hash_lock, tag); 3475 arc_buf_destroy_impl(buf); 3476 mutex_exit(hash_lock); 3477} 3478 3479/* 3480 * Evict the arc_buf_hdr that is provided as a parameter. The resultant 3481 * state of the header is dependent on its state prior to entering this 3482 * function. The following transitions are possible: 3483 * 3484 * - arc_mru -> arc_mru_ghost 3485 * - arc_mfu -> arc_mfu_ghost 3486 * - arc_mru_ghost -> arc_l2c_only 3487 * - arc_mru_ghost -> deleted 3488 * - arc_mfu_ghost -> arc_l2c_only 3489 * - arc_mfu_ghost -> deleted 3490 */ 3491static int64_t 3492arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 3493{ 3494 arc_state_t *evicted_state, *state; 3495 int64_t bytes_evicted = 0; 3496 3497 ASSERT(MUTEX_HELD(hash_lock)); 3498 ASSERT(HDR_HAS_L1HDR(hdr)); 3499 3500 state = hdr->b_l1hdr.b_state; 3501 if (GHOST_STATE(state)) { 3502 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3503 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 3504 3505 /* 3506 * l2arc_write_buffers() relies on a header's L1 portion 3507 * (i.e. its b_pabd field) during its write phase. 3508 * Thus, we cannot push a header onto the arc_l2c_only 3509 * state (removing its L1 piece) until the header is 3510 * done being written to the l2arc. 3511 */ 3512 if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { 3513 ARCSTAT_BUMP(arcstat_evict_l2_skip); 3514 return (bytes_evicted); 3515 } 3516 3517 ARCSTAT_BUMP(arcstat_deleted); 3518 bytes_evicted += HDR_GET_LSIZE(hdr); 3519 3520 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); 3521 3522 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 3523 if (HDR_HAS_L2HDR(hdr)) { 3524 /* 3525 * This buffer is cached on the 2nd Level ARC; 3526 * don't destroy the header. 3527 */ 3528 arc_change_state(arc_l2c_only, hdr, hash_lock); 3529 /* 3530 * dropping from L1+L2 cached to L2-only, 3531 * realloc to remove the L1 header.
3532 */ 3533 hdr = arc_hdr_realloc(hdr, hdr_full_cache, 3534 hdr_l2only_cache); 3535 } else { 3536 arc_change_state(arc_anon, hdr, hash_lock); 3537 arc_hdr_destroy(hdr); 3538 } 3539 return (bytes_evicted); 3540 } 3541 3542 ASSERT(state == arc_mru || state == arc_mfu); 3543 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 3544 3545 /* prefetch buffers have a minimum lifespan */ 3546 if (HDR_IO_IN_PROGRESS(hdr) || 3547 ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && 3548 ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < 3549 arc_min_prefetch_lifespan)) { 3550 ARCSTAT_BUMP(arcstat_evict_skip); 3551 return (bytes_evicted); 3552 } 3553 3554 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 3555 while (hdr->b_l1hdr.b_buf) { 3556 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 3557 if (!mutex_tryenter(&buf->b_evict_lock)) { 3558 ARCSTAT_BUMP(arcstat_mutex_miss); 3559 break; 3560 } 3561 if (buf->b_data != NULL) 3562 bytes_evicted += HDR_GET_LSIZE(hdr); 3563 mutex_exit(&buf->b_evict_lock); 3564 arc_buf_destroy_impl(buf); 3565 } 3566 3567 if (HDR_HAS_L2HDR(hdr)) { 3568 ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); 3569 } else { 3570 if (l2arc_write_eligible(hdr->b_spa, hdr)) { 3571 ARCSTAT_INCR(arcstat_evict_l2_eligible, 3572 HDR_GET_LSIZE(hdr)); 3573 } else { 3574 ARCSTAT_INCR(arcstat_evict_l2_ineligible, 3575 HDR_GET_LSIZE(hdr)); 3576 } 3577 } 3578 3579 if (hdr->b_l1hdr.b_bufcnt == 0) { 3580 arc_cksum_free(hdr); 3581 3582 bytes_evicted += arc_hdr_size(hdr); 3583 3584 /* 3585 * If this hdr is being evicted and has a compressed 3586 * buffer then we discard it here before we change states. 3587 * This ensures that the accounting is updated correctly 3588 * in arc_free_data_impl(). 3589 */ 3590 arc_hdr_free_pabd(hdr); 3591 3592 arc_change_state(evicted_state, hdr, hash_lock); 3593 ASSERT(HDR_IN_HASH_TABLE(hdr)); 3594 arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); 3595 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); 3596 } 3597 3598 return (bytes_evicted); 3599} 3600 3601static uint64_t 3602arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, 3603 uint64_t spa, int64_t bytes) 3604{ 3605 multilist_sublist_t *mls; 3606 uint64_t bytes_evicted = 0; 3607 arc_buf_hdr_t *hdr; 3608 kmutex_t *hash_lock; 3609 int evict_count = 0; 3610 3611 ASSERT3P(marker, !=, NULL); 3612 IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); 3613 3614 mls = multilist_sublist_lock(ml, idx); 3615 3616 for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; 3617 hdr = multilist_sublist_prev(mls, marker)) { 3618 if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || 3619 (evict_count >= zfs_arc_evict_batch_limit)) 3620 break; 3621 3622 /* 3623 * To keep our iteration location, move the marker 3624 * forward. Since we're not holding hdr's hash lock, we 3625 * must be very careful and not remove 'hdr' from the 3626 * sublist. Otherwise, other consumers might mistake the 3627 * 'hdr' as not being on a sublist when they call the 3628 * multilist_link_active() function (they all rely on 3629 * the hash lock protecting concurrent insertions and 3630 * removals). multilist_sublist_move_forward() was 3631 * specifically implemented to ensure this is the case 3632 * (only 'marker' will be removed and re-inserted). 3633 */ 3634 multilist_sublist_move_forward(mls, marker); 3635 3636 /* 3637 * The only case where the b_spa field should ever be 3638 * zero is that of the marker headers inserted by 3639 * arc_evict_state().
It's possible for multiple threads 3640 * to be calling arc_evict_state() concurrently (e.g. 3641 * dsl_pool_close() and zio_inject_fault()), so we must 3642 * skip any markers we see from these other threads. 3643 */ 3644 if (hdr->b_spa == 0) 3645 continue; 3646 3647 /* we're only interested in evicting buffers of a certain spa */ 3648 if (spa != 0 && hdr->b_spa != spa) { 3649 ARCSTAT_BUMP(arcstat_evict_skip); 3650 continue; 3651 } 3652 3653 hash_lock = HDR_LOCK(hdr); 3654 3655 /* 3656 * We aren't calling this function from any code path 3657 * that would already be holding a hash lock, so we're 3658 * asserting on this assumption to be defensive in case 3659 * this ever changes. Without this check, it would be 3660 * possible to incorrectly increment arcstat_mutex_miss 3661 * below (e.g. if the code changed such that we called 3662 * this function with a hash lock held). 3663 */ 3664 ASSERT(!MUTEX_HELD(hash_lock)); 3665 3666 if (mutex_tryenter(hash_lock)) { 3667 uint64_t evicted = arc_evict_hdr(hdr, hash_lock); 3668 mutex_exit(hash_lock); 3669 3670 bytes_evicted += evicted; 3671 3672 /* 3673 * If evicted is zero, arc_evict_hdr() must have 3674 * decided to skip this header; don't increment 3675 * evict_count in this case. 3676 */ 3677 if (evicted != 0) 3678 evict_count++; 3679 3680 /* 3681 * If arc_size isn't overflowing, signal any 3682 * threads that might happen to be waiting. 3683 * 3684 * For each header evicted, we wake up a single 3685 * thread. If we used cv_broadcast, we could 3686 * wake up "too many" threads and cause arc_size 3687 * to significantly overflow arc_c, since 3688 * arc_get_data_impl() doesn't re-check for overflow 3689 * when it's woken up (deliberately so: it's 3690 * possible for the ARC to be overflowing while 3691 * full of un-evictable buffers, and the 3692 * function should proceed in this case). 3693 * 3694 * If threads are left sleeping, due to not 3695 * using cv_broadcast, they will be woken up 3696 * just before arc_reclaim_thread() sleeps. 3697 */ 3698 mutex_enter(&arc_reclaim_lock); 3699 if (!arc_is_overflowing()) 3700 cv_signal(&arc_reclaim_waiters_cv); 3701 mutex_exit(&arc_reclaim_lock); 3702 } else { 3703 ARCSTAT_BUMP(arcstat_mutex_miss); 3704 } 3705 } 3706 3707 multilist_sublist_unlock(mls); 3708 3709 return (bytes_evicted); 3710} 3711 3712/* 3713 * Evict buffers from the given arc state, until we've removed the 3714 * specified number of bytes. Move the removed buffers to the 3715 * appropriate evict state. 3716 * 3717 * This function makes a "best effort". It skips over any buffers 3718 * it can't get a hash_lock on, and so, may not catch all candidates. 3719 * It may also return without evicting as much space as requested. 3720 * 3721 * If bytes is specified using the special value ARC_EVICT_ALL, this 3722 * will evict all available (i.e. unlocked and evictable) buffers from 3723 * the given arc state, which is used by arc_flush(). 3724 */ 3725static uint64_t 3726arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, 3727 arc_buf_contents_t type) 3728{ 3729 uint64_t total_evicted = 0; 3730 multilist_t *ml = state->arcs_list[type]; 3731 int num_sublists; 3732 arc_buf_hdr_t **markers; 3733 3734 IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); 3735 3736 num_sublists = multilist_get_num_sublists(ml); 3737 3738 /* 3739 * If we've tried to evict from each sublist, made some 3740 * progress, but still have not hit the target number of bytes 3741 * to evict, we want to keep trying.
The markers allow us to 3742 * pick up where we left off for each individual sublist, rather 3743 * than starting from the tail each time. 3744 */ 3745 markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); 3746 for (int i = 0; i < num_sublists; i++) { 3747 markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); 3748 3749 /* 3750 * A b_spa of 0 is used to indicate that this header is 3751 * a marker. This fact is used in arc_adjust_type() and 3752 * arc_evict_state_impl(). 3753 */ 3754 markers[i]->b_spa = 0; 3755 3756 multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 3757 multilist_sublist_insert_tail(mls, markers[i]); 3758 multilist_sublist_unlock(mls); 3759 } 3760 3761 /* 3762 * While we haven't hit our target number of bytes to evict, or 3763 * we're evicting all available buffers. 3764 */ 3765 while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { 3766 /* 3767 * Start eviction using a randomly selected sublist, 3768 * this is to try and evenly balance eviction across all 3769 * sublists. Always starting at the same sublist 3770 * (e.g. index 0) would cause evictions to favor certain 3771 * sublists over others. 3772 */ 3773 int sublist_idx = multilist_get_random_index(ml); 3774 uint64_t scan_evicted = 0; 3775 3776 for (int i = 0; i < num_sublists; i++) { 3777 uint64_t bytes_remaining; 3778 uint64_t bytes_evicted; 3779 3780 if (bytes == ARC_EVICT_ALL) 3781 bytes_remaining = ARC_EVICT_ALL; 3782 else if (total_evicted < bytes) 3783 bytes_remaining = bytes - total_evicted; 3784 else 3785 break; 3786 3787 bytes_evicted = arc_evict_state_impl(ml, sublist_idx, 3788 markers[sublist_idx], spa, bytes_remaining); 3789 3790 scan_evicted += bytes_evicted; 3791 total_evicted += bytes_evicted; 3792 3793 /* we've reached the end, wrap to the beginning */ 3794 if (++sublist_idx >= num_sublists) 3795 sublist_idx = 0; 3796 } 3797 3798 /* 3799 * If we didn't evict anything during this scan, we have 3800 * no reason to believe we'll evict more during another 3801 * scan, so break the loop. 3802 */ 3803 if (scan_evicted == 0) { 3804 /* This isn't possible, let's make that obvious */ 3805 ASSERT3S(bytes, !=, 0); 3806 3807 /* 3808 * When bytes is ARC_EVICT_ALL, the only way to 3809 * break the loop is when scan_evicted is zero. 3810 * In that case, we actually have evicted enough, 3811 * so we don't want to increment the kstat. 3812 */ 3813 if (bytes != ARC_EVICT_ALL) { 3814 ASSERT3S(total_evicted, <, bytes); 3815 ARCSTAT_BUMP(arcstat_evict_not_enough); 3816 } 3817 3818 break; 3819 } 3820 } 3821 3822 for (int i = 0; i < num_sublists; i++) { 3823 multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 3824 multilist_sublist_remove(mls, markers[i]); 3825 multilist_sublist_unlock(mls); 3826 3827 kmem_cache_free(hdr_full_cache, markers[i]); 3828 } 3829 kmem_free(markers, sizeof (*markers) * num_sublists); 3830 3831 return (total_evicted); 3832} 3833 3834/* 3835 * Flush all "evictable" data of the given type from the arc state 3836 * specified. This will not evict any "active" buffers (i.e. referenced). 3837 * 3838 * When 'retry' is set to B_FALSE, the function will make a single pass 3839 * over the state and evict any buffers that it can. Since it doesn't 3840 * continually retry the eviction, it might end up leaving some buffers 3841 * in the ARC due to lock misses. 3842 * 3843 * When 'retry' is set to B_TRUE, the function will continually retry the 3844 * eviction until *all* evictable buffers have been removed from the 3845 * state. 
As a result, if concurrent insertions into the state are 3846 * allowed (e.g. if the ARC isn't shutting down), this function might 3847 * wind up in an infinite loop, continually trying to evict buffers. 3848 */ 3849static uint64_t 3850arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, 3851 boolean_t retry) 3852{ 3853 uint64_t evicted = 0; 3854 3855 while (refcount_count(&state->arcs_esize[type]) != 0) { 3856 evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); 3857 3858 if (!retry) 3859 break; 3860 } 3861 3862 return (evicted); 3863} 3864 3865/* 3866 * Evict the specified number of bytes from the state specified, 3867 * restricting eviction to the spa and type given. This function 3868 * prevents us from trying to evict more from a state's list than 3869 * is "evictable", and skips evicting altogether when passed a 3870 * negative value for "bytes". In contrast, arc_evict_state() will 3871 * evict everything it can when passed a negative value for "bytes". 3872 */ 3873static uint64_t 3874arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, 3875 arc_buf_contents_t type) 3876{ 3877 int64_t delta; 3878 3879 if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) { 3880 delta = MIN(refcount_count(&state->arcs_esize[type]), bytes); 3881 return (arc_evict_state(state, spa, delta, type)); 3882 } 3883 3884 return (0); 3885} 3886 3887/* 3888 * Evict metadata buffers from the cache, such that arc_meta_used is 3889 * capped by the arc_meta_limit tunable. 3890 */ 3891static uint64_t 3892arc_adjust_meta(void) 3893{ 3894 uint64_t total_evicted = 0; 3895 int64_t target; 3896 3897 /* 3898 * If we're over the meta limit, we want to evict enough 3899 * metadata to get back under the meta limit. We don't want to 3900 * evict so much that we drop the MRU below arc_p, though. If 3901 * we're over the meta limit more than we're over arc_p, we 3902 * evict some from the MRU here, and some from the MFU below. 3903 */ 3904 target = MIN((int64_t)(arc_meta_used - arc_meta_limit), 3905 (int64_t)(refcount_count(&arc_anon->arcs_size) + 3906 refcount_count(&arc_mru->arcs_size) - arc_p)); 3907 3908 total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 3909 3910 /* 3911 * Similar to the above, we want to evict enough bytes to get us 3912 * below the meta limit, but not so much as to drop us below the 3913 * space allotted to the MFU (which is defined as arc_c - arc_p). 3914 */ 3915 target = MIN((int64_t)(arc_meta_used - arc_meta_limit), 3916 (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p))); 3917 3918 total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 3919 3920 return (total_evicted); 3921} 3922 3923/* 3924 * Return the type of the oldest buffer in the given arc state 3925 * 3926 * This function will select a random sublist of type ARC_BUFC_DATA and 3927 * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist 3928 * is compared, and the type which contains the "older" buffer will be 3929 * returned.
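 *
 * For example (illustrative lbolt values only): if the tail of the
 * data sublist was last accessed at b_arc_access == 1000 and the tail
 * of the metadata sublist at b_arc_access == 1500, the data buffer is
 * older, so ARC_BUFC_DATA is returned and data becomes the preferred
 * eviction target.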
3930 */ 3931static arc_buf_contents_t 3932arc_adjust_type(arc_state_t *state) 3933{ 3934 multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA]; 3935 multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA]; 3936 int data_idx = multilist_get_random_index(data_ml); 3937 int meta_idx = multilist_get_random_index(meta_ml); 3938 multilist_sublist_t *data_mls; 3939 multilist_sublist_t *meta_mls; 3940 arc_buf_contents_t type; 3941 arc_buf_hdr_t *data_hdr; 3942 arc_buf_hdr_t *meta_hdr; 3943 3944 /* 3945 * We keep the sublist lock until we're finished, to prevent 3946 * the headers from being destroyed via arc_evict_state(). 3947 */ 3948 data_mls = multilist_sublist_lock(data_ml, data_idx); 3949 meta_mls = multilist_sublist_lock(meta_ml, meta_idx); 3950 3951 /* 3952 * These two loops are to ensure we skip any markers that 3953 * might be at the tail of the lists due to arc_evict_state(). 3954 */ 3955 3956 for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; 3957 data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { 3958 if (data_hdr->b_spa != 0) 3959 break; 3960 } 3961 3962 for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; 3963 meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { 3964 if (meta_hdr->b_spa != 0) 3965 break; 3966 } 3967 3968 if (data_hdr == NULL && meta_hdr == NULL) { 3969 type = ARC_BUFC_DATA; 3970 } else if (data_hdr == NULL) { 3971 ASSERT3P(meta_hdr, !=, NULL); 3972 type = ARC_BUFC_METADATA; 3973 } else if (meta_hdr == NULL) { 3974 ASSERT3P(data_hdr, !=, NULL); 3975 type = ARC_BUFC_DATA; 3976 } else { 3977 ASSERT3P(data_hdr, !=, NULL); 3978 ASSERT3P(meta_hdr, !=, NULL); 3979 3980 /* The headers can't be on the sublist without an L1 header */ 3981 ASSERT(HDR_HAS_L1HDR(data_hdr)); 3982 ASSERT(HDR_HAS_L1HDR(meta_hdr)); 3983 3984 if (data_hdr->b_l1hdr.b_arc_access < 3985 meta_hdr->b_l1hdr.b_arc_access) { 3986 type = ARC_BUFC_DATA; 3987 } else { 3988 type = ARC_BUFC_METADATA; 3989 } 3990 } 3991 3992 multilist_sublist_unlock(meta_mls); 3993 multilist_sublist_unlock(data_mls); 3994 3995 return (type); 3996} 3997 3998/* 3999 * Evict buffers from the cache, such that arc_size is capped by arc_c. 4000 */ 4001static uint64_t 4002arc_adjust(void) 4003{ 4004 uint64_t total_evicted = 0; 4005 uint64_t bytes; 4006 int64_t target; 4007 4008 /* 4009 * If we're over arc_meta_limit, we want to correct that before 4010 * potentially evicting data buffers below. 4011 */ 4012 total_evicted += arc_adjust_meta(); 4013 4014 /* 4015 * Adjust MRU size 4016 * 4017 * If we're over the target cache size, we want to evict enough 4018 * from the list to get back to our target size. We don't want 4019 * to evict too much from the MRU, such that it drops below 4020 * arc_p. So, if we're over our target cache size more than 4021 * the MRU is over arc_p, we'll evict enough to get back to 4022 * arc_p here, and then evict more from the MFU below. 4023 */ 4024 target = MIN((int64_t)(arc_size - arc_c), 4025 (int64_t)(refcount_count(&arc_anon->arcs_size) + 4026 refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p)); 4027 4028 /* 4029 * If we're below arc_meta_min, always prefer to evict data. 4030 * Otherwise, try to satisfy the requested number of bytes to 4031 * evict from the type which contains older buffers; in an 4032 * effort to keep newer buffers in the cache regardless of their 4033 * type. If we cannot satisfy the number of bytes from this 4034 * type, spill over into the next type. 
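 *
 * As a worked example of the target computed above (illustrative
 * numbers only): if arc_size is 100MB over arc_c, but anon + mru +
 * arc_meta_used is only 30MB over arc_p, the MIN() yields a 30MB
 * target here; the remaining 70MB is evicted from the MFU in the
 * "Adjust MFU size" step below.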
4035 */ 4036 if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && 4037 arc_meta_used > arc_meta_min) { 4038 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 4039 total_evicted += bytes; 4040 4041 /* 4042 * If we couldn't evict our target number of bytes from 4043 * metadata, we try to get the rest from data. 4044 */ 4045 target -= bytes; 4046 4047 total_evicted += 4048 arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 4049 } else { 4050 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 4051 total_evicted += bytes; 4052 4053 /* 4054 * If we couldn't evict our target number of bytes from 4055 * data, we try to get the rest from metadata. 4056 */ 4057 target -= bytes; 4058 4059 total_evicted += 4060 arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 4061 } 4062 4063 /* 4064 * Adjust MFU size 4065 * 4066 * Now that we've tried to evict enough from the MRU to get its 4067 * size back to arc_p, if we're still above the target cache 4068 * size, we evict the rest from the MFU. 4069 */ 4070 target = arc_size - arc_c; 4071 4072 if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA && 4073 arc_meta_used > arc_meta_min) { 4074 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 4075 total_evicted += bytes; 4076 4077 /* 4078 * If we couldn't evict our target number of bytes from 4079 * metadata, we try to get the rest from data. 4080 */ 4081 target -= bytes; 4082 4083 total_evicted += 4084 arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); 4085 } else { 4086 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); 4087 total_evicted += bytes; 4088 4089 /* 4090 * If we couldn't evict our target number of bytes from 4091 * data, we try to get the rest from metadata. 4092 */ 4093 target -= bytes; 4094 4095 total_evicted += 4096 arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 4097 } 4098 4099 /* 4100 * Adjust ghost lists 4101 * 4102 * In addition to the above, the ARC also defines target values 4103 * for the ghost lists. The sum of the mru list and mru ghost 4104 * list should never exceed the target size of the cache, and 4105 * the sum of the mru list, mfu list, mru ghost list, and mfu 4106 * ghost list should never exceed twice the target size of the 4107 * cache. The following logic enforces these limits on the ghost 4108 * caches, and evicts from them as needed.
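 *
 * For example (illustrative sizes only): with arc_c = 8GB, an mru
 * list of 5GB and an mru ghost list of 4GB, the first target below
 * works out to 5 + 4 - 8 = 1GB, so roughly 1GB is trimmed from the
 * mru ghost list to restore mru + mru ghost <= arc_c.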
4109 */ 4110 target = refcount_count(&arc_mru->arcs_size) + 4111 refcount_count(&arc_mru_ghost->arcs_size) - arc_c; 4112 4113 bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); 4114 total_evicted += bytes; 4115 4116 target -= bytes; 4117 4118 total_evicted += 4119 arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); 4120 4121 /* 4122 * We assume the sum of the mru list and mfu list is less than 4123 * or equal to arc_c (we enforced this above), which means we 4124 * can use the simpler of the two equations below: 4125 * 4126 * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c 4127 * mru ghost + mfu ghost <= arc_c 4128 */ 4129 target = refcount_count(&arc_mru_ghost->arcs_size) + 4130 refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; 4131 4132 bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); 4133 total_evicted += bytes; 4134 4135 target -= bytes; 4136 4137 total_evicted += 4138 arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); 4139 4140 return (total_evicted); 4141} 4142 4143void 4144arc_flush(spa_t *spa, boolean_t retry) 4145{ 4146 uint64_t guid = 0; 4147 4148 /* 4149 * If retry is B_TRUE, a spa must not be specified since we have 4150 * no good way to determine if all of a spa's buffers have been 4151 * evicted from an arc state. 4152 */ 4153 ASSERT(!retry || spa == 0); 4154 4155 if (spa != NULL) 4156 guid = spa_load_guid(spa); 4157 4158 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); 4159 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); 4160 4161 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); 4162 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); 4163 4164 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); 4165 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); 4166 4167 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); 4168 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); 4169} 4170 4171void 4172arc_shrink(int64_t to_free) 4173{ 4174 if (arc_c > arc_c_min) { 4175 DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, 4176 arc_c_min, uint64_t, arc_p, uint64_t, to_free); 4177 if (arc_c > arc_c_min + to_free) 4178 atomic_add_64(&arc_c, -to_free); 4179 else 4180 arc_c = arc_c_min; 4181 4182 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 4183 if (arc_c > arc_size) 4184 arc_c = MAX(arc_size, arc_c_min); 4185 if (arc_p > arc_c) 4186 arc_p = (arc_c >> 1); 4187 4188 DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, 4189 arc_p); 4190 4191 ASSERT(arc_c >= arc_c_min); 4192 ASSERT((int64_t)arc_p >= 0); 4193 } 4194 4195 if (arc_size > arc_c) { 4196 DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size, 4197 uint64_t, arc_c); 4198 (void) arc_adjust(); 4199 } 4200} 4201 4202static long needfree = 0; 4203 4204typedef enum free_memory_reason_t { 4205 FMR_UNKNOWN, 4206 FMR_NEEDFREE, 4207 FMR_LOTSFREE, 4208 FMR_SWAPFS_MINFREE, 4209 FMR_PAGES_PP_MAXIMUM, 4210 FMR_HEAP_ARENA, 4211 FMR_ZIO_ARENA, 4212 FMR_ZIO_FRAG, 4213} free_memory_reason_t; 4214 4215int64_t last_free_memory; 4216free_memory_reason_t last_free_reason; 4217 4218/* 4219 * Additional reserve of pages for pp_reserve. 4220 */ 4221int64_t arc_pages_pp_reserve = 64; 4222 4223/* 4224 * Additional reserve of pages for swapfs. 4225 */ 4226int64_t arc_swapfs_reserve = 64; 4227 4228/* 4229 * Return the amount of memory that can be consumed before reclaim will be 4230 * needed. 
Positive if there is sufficient free memory, negative indicates 4231 * the amount of memory that needs to be freed up. 4232 */ 4233static int64_t 4234arc_available_memory(void) 4235{ 4236 int64_t lowest = INT64_MAX; 4237 int64_t n; 4238 free_memory_reason_t r = FMR_UNKNOWN; 4239 4240#ifdef _KERNEL 4241 if (needfree > 0) { 4242 n = PAGESIZE * (-needfree); 4243 if (n < lowest) { 4244 lowest = n; 4245 r = FMR_NEEDFREE; 4246 } 4247 } 4248 4249 /* 4250 * Cooperate with pagedaemon when it's time for it to scan 4251 * and reclaim some pages. 4252 */ 4253 n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); 4254 if (n < lowest) { 4255 lowest = n; 4256 r = FMR_LOTSFREE; 4257 } 4258 4259#ifdef illumos 4260 /* 4261 * check that we're out of range of the pageout scanner. It starts to 4262 * schedule paging if freemem is less than lotsfree and needfree. 4263 * lotsfree is the high-water mark for pageout, and needfree is the 4264 * number of needed free pages. We add extra pages here to make sure 4265 * the scanner doesn't start up while we're freeing memory. 4266 */ 4267 n = PAGESIZE * (freemem - lotsfree - needfree - desfree); 4268 if (n < lowest) { 4269 lowest = n; 4270 r = FMR_LOTSFREE; 4271 } 4272 4273 /* 4274 * check to make sure that swapfs has enough space so that anon 4275 * reservations can still succeed. anon_resvmem() checks that the 4276 * availrmem is greater than swapfs_minfree, and the number of reserved 4277 * swap pages. We also add a bit of extra here just to prevent 4278 * circumstances from getting really dire. 4279 */ 4280 n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - 4281 desfree - arc_swapfs_reserve); 4282 if (n < lowest) { 4283 lowest = n; 4284 r = FMR_SWAPFS_MINFREE; 4285 } 4286 4287 4288 /* 4289 * Check that we have enough availrmem that memory locking (e.g., via 4290 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum 4291 * stores the number of pages that cannot be locked; when availrmem 4292 * drops below pages_pp_maximum, page locking mechanisms such as 4293 * page_pp_lock() will fail.) 4294 */ 4295 n = PAGESIZE * (availrmem - pages_pp_maximum - 4296 arc_pages_pp_reserve); 4297 if (n < lowest) { 4298 lowest = n; 4299 r = FMR_PAGES_PP_MAXIMUM; 4300 } 4301 4302#endif /* illumos */ 4303#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 4304 /* 4305 * If we're on an i386 platform, it's possible that we'll exhaust the 4306 * kernel heap space before we ever run out of available physical 4307 * memory. Most checks of the size of the heap_area compare against 4308 * tune.t_minarmem, which is the minimum available real memory that we 4309 * can have in the system. However, this is generally fixed at 25 pages 4310 * which is so low that it's useless. In this comparison, we seek to 4311 * calculate the total heap-size, and reclaim if more than 3/4ths of the 4312 * heap is allocated. (Or, in the calculation, if less than 1/4th is 4313 * free) 4314 */ 4315 n = (int64_t)vmem_size(heap_arena, VMEM_FREE) - 4316 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2); 4317 if (n < lowest) { 4318 lowest = n; 4319 r = FMR_HEAP_ARENA; 4320 } 4321#define zio_arena NULL 4322#else 4323#define zio_arena heap_arena 4324#endif 4325 4326 /* 4327 * If zio data pages are being allocated out of a separate heap segment, 4328 * then enforce that the size of available vmem for this arena remains 4329 * above about 1/16th free. 
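 * Concretely, the check below computes free - (allocated / 16), so
 * with, say, 60GB allocated from the arena it goes negative once less
 * than about 3.75GB of arena vmem remains free (numbers illustrative).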
4330 * 4331 * Note: The 1/16th arena free requirement was put in place 4332 * to aggressively evict memory from the arc in order to avoid 4333 * memory fragmentation issues. 4334 */ 4335 if (zio_arena != NULL) { 4336 n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - 4337 (vmem_size(zio_arena, VMEM_ALLOC) >> 4); 4338 if (n < lowest) { 4339 lowest = n; 4340 r = FMR_ZIO_ARENA; 4341 } 4342 } 4343 4344 /* 4345 * The above limits know nothing about the real level of KVA fragmentation. 4346 * Start aggressive reclamation if too little sequential KVA is left. 4347 */ 4348 if (lowest > 0) { 4349 n = (vmem_size(heap_arena, VMEM_MAXFREE) < SPA_MAXBLOCKSIZE) ? 4350 -((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) : 4351 INT64_MAX; 4352 if (n < lowest) { 4353 lowest = n; 4354 r = FMR_ZIO_FRAG; 4355 } 4356 } 4357 4358#else /* _KERNEL */ 4359 /* Every 100 calls, free a small amount */ 4360 if (spa_get_random(100) == 0) 4361 lowest = -1024; 4362#endif /* _KERNEL */ 4363 4364 last_free_memory = lowest; 4365 last_free_reason = r; 4366 DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); 4367 return (lowest); 4368} 4369 4370 4371/* 4372 * Determine if the system is under memory pressure and is asking 4373 * to reclaim memory. A return value of B_TRUE indicates that the system 4374 * is under memory pressure and that the arc should adjust accordingly. 4375 */ 4376static boolean_t 4377arc_reclaim_needed(void) 4378{ 4379 return (arc_available_memory() < 0); 4380} 4381 4382extern kmem_cache_t *zio_buf_cache[]; 4383extern kmem_cache_t *zio_data_buf_cache[]; 4384extern kmem_cache_t *range_seg_cache; 4385extern kmem_cache_t *abd_chunk_cache; 4386 4387static __noinline void 4388arc_kmem_reap_now(void) 4389{ 4390 size_t i; 4391 kmem_cache_t *prev_cache = NULL; 4392 kmem_cache_t *prev_data_cache = NULL; 4393 4394 DTRACE_PROBE(arc__kmem_reap_start); 4395#ifdef _KERNEL 4396 if (arc_meta_used >= arc_meta_limit) { 4397 /* 4398 * We are exceeding our meta-data cache limit. 4399 * Purge some DNLC entries to release holds on meta-data. 4400 */ 4401 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 4402 } 4403#if defined(__i386) 4404 /* 4405 * Reclaim unused memory from all kmem caches. 4406 */ 4407 kmem_reap(); 4408#endif 4409#endif 4410 4411 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 4412 if (zio_buf_cache[i] != prev_cache) { 4413 prev_cache = zio_buf_cache[i]; 4414 kmem_cache_reap_now(zio_buf_cache[i]); 4415 } 4416 if (zio_data_buf_cache[i] != prev_data_cache) { 4417 prev_data_cache = zio_data_buf_cache[i]; 4418 kmem_cache_reap_now(zio_data_buf_cache[i]); 4419 } 4420 } 4421 kmem_cache_reap_now(abd_chunk_cache); 4422 kmem_cache_reap_now(buf_cache); 4423 kmem_cache_reap_now(hdr_full_cache); 4424 kmem_cache_reap_now(hdr_l2only_cache); 4425 kmem_cache_reap_now(range_seg_cache); 4426 4427#ifdef illumos 4428 if (zio_arena != NULL) { 4429 /* 4430 * Ask the vmem arena to reclaim unused memory from its 4431 * quantum caches. 4432 */ 4433 vmem_qcache_reap(zio_arena); 4434 } 4435#endif 4436 DTRACE_PROBE(arc__kmem_reap_end); 4437} 4438 4439/* 4440 * Threads can block in arc_get_data_impl() waiting for this thread to evict 4441 * enough data and signal them to proceed. When this happens, the threads in 4442 * arc_get_data_impl() are sleeping while holding the hash lock for their 4443 * particular arc header. Thus, we must be careful to never sleep on a 4444 * hash lock in this thread.
This is to prevent the following deadlock: 4445 * 4446 * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L", 4447 * waiting for the reclaim thread to signal it. 4448 * 4449 * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, 4450 * fails, and goes to sleep forever. 4451 * 4452 * This possible deadlock is avoided by always acquiring a hash lock 4453 * using mutex_tryenter() from arc_reclaim_thread(). 4454 */ 4455static void 4456arc_reclaim_thread(void *dummy __unused) 4457{ 4458 hrtime_t growtime = 0; 4459 callb_cpr_t cpr; 4460 4461 CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); 4462 4463 mutex_enter(&arc_reclaim_lock); 4464 while (!arc_reclaim_thread_exit) { 4465 uint64_t evicted = 0; 4466 4467 /* 4468 * This is necessary in order for the mdb ::arc dcmd to 4469 * show up to date information. Since the ::arc command 4470 * does not call the kstat's update function, without 4471 * this call, the command may show stale stats for the 4472 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even 4473 * with this change, the data might be up to 1 second 4474 * out of date; but that should suffice. The arc_state_t 4475 * structures can be queried directly if more accurate 4476 * information is needed. 4477 */ 4478 if (arc_ksp != NULL) 4479 arc_ksp->ks_update(arc_ksp, KSTAT_READ); 4480 4481 mutex_exit(&arc_reclaim_lock); 4482 4483 /* 4484 * We call arc_adjust() before (possibly) calling 4485 * arc_kmem_reap_now(), so that we can wake up 4486 * arc_get_data_impl() sooner. 4487 */ 4488 evicted = arc_adjust(); 4489 4490 int64_t free_memory = arc_available_memory(); 4491 if (free_memory < 0) { 4492 4493 arc_no_grow = B_TRUE; 4494 arc_warm = B_TRUE; 4495 4496 /* 4497 * Wait at least zfs_grow_retry (default 60) seconds 4498 * before considering growing. 4499 */ 4500 growtime = gethrtime() + SEC2NSEC(arc_grow_retry); 4501 4502 arc_kmem_reap_now(); 4503 4504 /* 4505 * If we are still low on memory, shrink the ARC 4506 * so that we have arc_shrink_min free space. 4507 */ 4508 free_memory = arc_available_memory(); 4509 4510 int64_t to_free = 4511 (arc_c >> arc_shrink_shift) - free_memory; 4512 if (to_free > 0) { 4513#ifdef _KERNEL 4514 to_free = MAX(to_free, ptob(needfree)); 4515#endif 4516 arc_shrink(to_free); 4517 } 4518 } else if (free_memory < arc_c >> arc_no_grow_shift) { 4519 arc_no_grow = B_TRUE; 4520 } else if (gethrtime() >= growtime) { 4521 arc_no_grow = B_FALSE; 4522 } 4523 4524 mutex_enter(&arc_reclaim_lock); 4525 4526 /* 4527 * If evicted is zero, we couldn't evict anything via 4528 * arc_adjust(). This could be due to hash lock 4529 * collisions, but more likely due to the majority of 4530 * arc buffers being unevictable. Therefore, even if 4531 * arc_size is above arc_c, another pass is unlikely to 4532 * be helpful and could potentially cause us to enter an 4533 * infinite loop. 4534 */ 4535 if (arc_size <= arc_c || evicted == 0) { 4536#ifdef _KERNEL 4537 needfree = 0; 4538#endif 4539 /* 4540 * We're either no longer overflowing, or we 4541 * can't evict anything more, so we should wake 4542 * up any threads before we go to sleep. 
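 * Note that waiters are woken even if the ARC is still
 * overflowing: arc_get_data_impl() deliberately proceeds
 * once woken rather than re-checking, since when nothing
 * more can be evicted there is no point in waiting.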
4543 */ 4544 cv_broadcast(&arc_reclaim_waiters_cv); 4545 4546 /* 4547 * Block until signaled, or after one second (we 4548 * might need to perform arc_kmem_reap_now() 4549 * even if we aren't being signalled) 4550 */ 4551 CALLB_CPR_SAFE_BEGIN(&cpr); 4552 (void) cv_timedwait_hires(&arc_reclaim_thread_cv, 4553 &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); 4554 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); 4555 } 4556 } 4557 4558 arc_reclaim_thread_exit = B_FALSE; 4559 cv_broadcast(&arc_reclaim_thread_cv); 4560 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ 4561 thread_exit(); 4562} 4563 4564static u_int arc_dnlc_evicts_arg; 4565extern struct vfsops zfs_vfsops; 4566 4567static void 4568arc_dnlc_evicts_thread(void *dummy __unused) 4569{ 4570 callb_cpr_t cpr; 4571 u_int percent; 4572 4573 CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG); 4574 4575 mutex_enter(&arc_dnlc_evicts_lock); 4576 while (!arc_dnlc_evicts_thread_exit) { 4577 CALLB_CPR_SAFE_BEGIN(&cpr); 4578 (void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); 4579 CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock); 4580 if (arc_dnlc_evicts_arg != 0) { 4581 percent = arc_dnlc_evicts_arg; 4582 mutex_exit(&arc_dnlc_evicts_lock); 4583#ifdef _KERNEL 4584 vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops); 4585#endif 4586 mutex_enter(&arc_dnlc_evicts_lock); 4587 /* 4588 * Clear our token only after the vnlru_free() 4589 * pass is done, to avoid false queueing of 4590 * the requests. 4591 */ 4592 arc_dnlc_evicts_arg = 0; 4593 } 4594 } 4595 arc_dnlc_evicts_thread_exit = FALSE; 4596 cv_broadcast(&arc_dnlc_evicts_cv); 4597 CALLB_CPR_EXIT(&cpr); 4598 thread_exit(); 4599} 4600 4601void 4602dnlc_reduce_cache(void *arg) 4603{ 4604 u_int percent; 4605 4606 percent = (u_int)(uintptr_t)arg; 4607 mutex_enter(&arc_dnlc_evicts_lock); 4608 if (arc_dnlc_evicts_arg == 0) { 4609 arc_dnlc_evicts_arg = percent; 4610 cv_broadcast(&arc_dnlc_evicts_cv); 4611 } 4612 mutex_exit(&arc_dnlc_evicts_lock); 4613} 4614 4615/* 4616 * Adapt arc info given the number of bytes we are trying to add and 4617 * the state that we are coming from. This function is only called 4618 * when we are adding new content to the cache. 4619 */ 4620static void 4621arc_adapt(int bytes, arc_state_t *state) 4622{ 4623 int mult; 4624 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 4625 int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size); 4626 int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size); 4627 4628 if (state == arc_l2c_only) 4629 return; 4630 4631 ASSERT(bytes > 0); 4632 /* 4633 * Adapt the target size of the MRU list: 4634 * - if we just hit in the MRU ghost list, then increase 4635 * the target size of the MRU list. 4636 * - if we just hit in the MFU ghost list, then increase 4637 * the target size of the MFU list by decreasing the 4638 * target size of the MRU list. 4639 */ 4640 if (state == arc_mru_ghost) { 4641 mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); 4642 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 4643 4644 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 4645 } else if (state == arc_mfu_ghost) { 4646 uint64_t delta; 4647 4648 mult = (mfug_size >= mrug_size) ?
1 : (mrug_size / mfug_size); 4649 mult = MIN(mult, 10); 4650 4651 delta = MIN(bytes * mult, arc_p); 4652 arc_p = MAX(arc_p_min, arc_p - delta); 4653 } 4654 ASSERT((int64_t)arc_p >= 0); 4655 4656 if (arc_reclaim_needed()) { 4657 cv_signal(&arc_reclaim_thread_cv); 4658 return; 4659 } 4660 4661 if (arc_no_grow) 4662 return; 4663 4664 if (arc_c >= arc_c_max) 4665 return; 4666 4667 /* 4668 * If we're within (2 * maxblocksize) bytes of the target 4669 * cache size, increment the target cache size 4670 */ 4671 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 4672 DTRACE_PROBE1(arc__inc_adapt, int, bytes); 4673 atomic_add_64(&arc_c, (int64_t)bytes); 4674 if (arc_c > arc_c_max) 4675 arc_c = arc_c_max; 4676 else if (state == arc_anon) 4677 atomic_add_64(&arc_p, (int64_t)bytes); 4678 if (arc_p > arc_c) 4679 arc_p = arc_c; 4680 } 4681 ASSERT((int64_t)arc_p >= 0); 4682} 4683 4684/* 4685 * Check if arc_size has grown past our upper threshold, determined by 4686 * zfs_arc_overflow_shift. 4687 */ 4688static boolean_t 4689arc_is_overflowing(void) 4690{ 4691 /* Always allow at least one block of overflow */ 4692 uint64_t overflow = MAX(SPA_MAXBLOCKSIZE, 4693 arc_c >> zfs_arc_overflow_shift); 4694 4695 return (arc_size >= arc_c + overflow); 4696} 4697 4698static abd_t * 4699arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 4700{ 4701 arc_buf_contents_t type = arc_buf_type(hdr); 4702 4703 arc_get_data_impl(hdr, size, tag); 4704 if (type == ARC_BUFC_METADATA) { 4705 return (abd_alloc(size, B_TRUE)); 4706 } else { 4707 ASSERT(type == ARC_BUFC_DATA); 4708 return (abd_alloc(size, B_FALSE)); 4709 } 4710} 4711 4712static void * 4713arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 4714{ 4715 arc_buf_contents_t type = arc_buf_type(hdr); 4716 4717 arc_get_data_impl(hdr, size, tag); 4718 if (type == ARC_BUFC_METADATA) { 4719 return (zio_buf_alloc(size)); 4720 } else { 4721 ASSERT(type == ARC_BUFC_DATA); 4722 return (zio_data_buf_alloc(size)); 4723 } 4724} 4725 4726/* 4727 * Allocate a block and return it to the caller. If we are hitting the 4728 * hard limit for the cache size, we must sleep, waiting for the eviction 4729 * thread to catch up. If we're past the target size but below the hard 4730 * limit, we'll only signal the reclaim thread and continue on. 4731 */ 4732static void 4733arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 4734{ 4735 arc_state_t *state = hdr->b_l1hdr.b_state; 4736 arc_buf_contents_t type = arc_buf_type(hdr); 4737 4738 arc_adapt(size, state); 4739 4740 /* 4741 * If arc_size is currently overflowing, and has grown past our 4742 * upper limit, we must be adding data faster than the evict 4743 * thread can evict. Thus, to ensure we don't compound the 4744 * problem by adding more data and forcing arc_size to grow even 4745 * further past its target size, we halt and wait for the 4746 * eviction thread to catch up. 4747 * 4748 * It's also possible that the reclaim thread is unable to evict 4749 * enough buffers to get arc_size below the overflow limit (e.g. 4750 * due to buffers being un-evictable, or hash lock collisions). 4751 * In this case, we want to proceed regardless of whether we're 4752 * overflowing; thus we don't use a while loop here. 4753 */ 4754 if (arc_is_overflowing()) { 4755 mutex_enter(&arc_reclaim_lock); 4756 4757 /* 4758 * Now that we've acquired the lock, we may no longer be 4759 * over the overflow limit, let's check. 4760 * 4761 * We're ignoring the case of spurious wake ups.
If that 4762 * were to happen, it'd let this thread consume an ARC 4763 * buffer before it should have (i.e. before we're under 4764 * the overflow limit and were signalled by the reclaim 4765 * thread). As long as that is a rare occurrence, it 4766 * shouldn't cause any harm. 4767 */ 4768 if (arc_is_overflowing()) { 4769 cv_signal(&arc_reclaim_thread_cv); 4770 cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); 4771 } 4772 4773 mutex_exit(&arc_reclaim_lock); 4774 } 4775 4776 VERIFY3U(hdr->b_type, ==, type); 4777 if (type == ARC_BUFC_METADATA) { 4778 arc_space_consume(size, ARC_SPACE_META); 4779 } else { 4780 arc_space_consume(size, ARC_SPACE_DATA); 4781 } 4782 4783 /* 4784 * Update the state size. Note that ghost states have a 4785 * "ghost size" and so don't need to be updated. 4786 */ 4787 if (!GHOST_STATE(state)) { 4788 4789 (void) refcount_add_many(&state->arcs_size, size, tag); 4790 4791 /* 4792 * If this is reached via arc_read, the link is 4793 * protected by the hash lock. If reached via 4794 * arc_buf_alloc, the header should not be accessed by 4795 * any other thread. And, if reached via arc_read_done, 4796 * the hash lock will protect it if it's found in the 4797 * hash table; otherwise no other thread should be 4798 * trying to [add|remove]_reference it. 4799 */ 4800 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 4801 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4802 (void) refcount_add_many(&state->arcs_esize[type], 4803 size, tag); 4804 } 4805 4806 /* 4807 * If we are growing the cache, and we are adding anonymous 4808 * data, and we have outgrown arc_p, update arc_p 4809 */ 4810 if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && 4811 (refcount_count(&arc_anon->arcs_size) + 4812 refcount_count(&arc_mru->arcs_size) > arc_p)) 4813 arc_p = MIN(arc_c, arc_p + size); 4814 } 4815 ARCSTAT_BUMP(arcstat_allocated); 4816} 4817 4818static void 4819arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag) 4820{ 4821 arc_free_data_impl(hdr, size, tag); 4822 abd_free(abd); 4823} 4824 4825static void 4826arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) 4827{ 4828 arc_buf_contents_t type = arc_buf_type(hdr); 4829 4830 arc_free_data_impl(hdr, size, tag); 4831 if (type == ARC_BUFC_METADATA) { 4832 zio_buf_free(buf, size); 4833 } else { 4834 ASSERT(type == ARC_BUFC_DATA); 4835 zio_data_buf_free(buf, size); 4836 } 4837} 4838 4839/* 4840 * Free the arc data buffer. 4841 */ 4842static void 4843arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 4844{ 4845 arc_state_t *state = hdr->b_l1hdr.b_state; 4846 arc_buf_contents_t type = arc_buf_type(hdr); 4847 4848 /* protected by hash lock, if in the hash table */ 4849 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 4850 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4851 ASSERT(state != arc_anon && state != arc_l2c_only); 4852 4853 (void) refcount_remove_many(&state->arcs_esize[type], 4854 size, tag); 4855 } 4856 (void) refcount_remove_many(&state->arcs_size, size, tag); 4857 4858 VERIFY3U(hdr->b_type, ==, type); 4859 if (type == ARC_BUFC_METADATA) { 4860 arc_space_return(size, ARC_SPACE_META); 4861 } else { 4862 ASSERT(type == ARC_BUFC_DATA); 4863 arc_space_return(size, ARC_SPACE_DATA); 4864 } 4865} 4866 4867/* 4868 * This routine is called whenever a buffer is accessed. 4869 * NOTE: the hash lock is dropped in this function. 
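 *
 * A summary of the transitions handled below (prefetch cases noted):
 *
 * - arc_anon -> arc_mru
 * - arc_mru -> arc_mfu (after ARC_MINTIME; prefetches stay in arc_mru)
 * - arc_mru_ghost -> arc_mfu (arc_mru if the access was a prefetch)
 * - arc_mfu -> arc_mfu
 * - arc_mfu_ghost -> arc_mfu (arc_mru if the access was a prefetch)
 * - arc_l2c_only -> arc_mfu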
4870 */ 4871static void 4872arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 4873{ 4874 clock_t now; 4875 4876 ASSERT(MUTEX_HELD(hash_lock)); 4877 ASSERT(HDR_HAS_L1HDR(hdr)); 4878 4879 if (hdr->b_l1hdr.b_state == arc_anon) { 4880 /* 4881 * This buffer is not in the cache, and does not 4882 * appear in our "ghost" list. Add the new buffer 4883 * to the MRU state. 4884 */ 4885 4886 ASSERT0(hdr->b_l1hdr.b_arc_access); 4887 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4888 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 4889 arc_change_state(arc_mru, hdr, hash_lock); 4890 4891 } else if (hdr->b_l1hdr.b_state == arc_mru) { 4892 now = ddi_get_lbolt(); 4893 4894 /* 4895 * If this buffer is here because of a prefetch, then either: 4896 * - clear the flag if this is a "referencing" read 4897 * (any subsequent access will bump this into the MFU state). 4898 * or 4899 * - move the buffer to the head of the list if this is 4900 * another prefetch (to make it less likely to be evicted). 4901 */ 4902 if (HDR_PREFETCH(hdr)) { 4903 if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 4904 /* link protected by hash lock */ 4905 ASSERT(multilist_link_active( 4906 &hdr->b_l1hdr.b_arc_node)); 4907 } else { 4908 arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); 4909 ARCSTAT_BUMP(arcstat_mru_hits); 4910 } 4911 hdr->b_l1hdr.b_arc_access = now; 4912 return; 4913 } 4914 4915 /* 4916 * This buffer has been "accessed" only once so far, 4917 * but it is still in the cache. Move it to the MFU 4918 * state. 4919 */ 4920 if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { 4921 /* 4922 * More than 125ms have passed since we 4923 * instantiated this buffer. Move it to the 4924 * most frequently used state. 4925 */ 4926 hdr->b_l1hdr.b_arc_access = now; 4927 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4928 arc_change_state(arc_mfu, hdr, hash_lock); 4929 } 4930 ARCSTAT_BUMP(arcstat_mru_hits); 4931 } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { 4932 arc_state_t *new_state; 4933 /* 4934 * This buffer has been "accessed" recently, but 4935 * was evicted from the cache. Move it to the 4936 * MFU state. 4937 */ 4938 4939 if (HDR_PREFETCH(hdr)) { 4940 new_state = arc_mru; 4941 if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) 4942 arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); 4943 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 4944 } else { 4945 new_state = arc_mfu; 4946 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4947 } 4948 4949 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4950 arc_change_state(new_state, hdr, hash_lock); 4951 4952 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 4953 } else if (hdr->b_l1hdr.b_state == arc_mfu) { 4954 /* 4955 * This buffer has been accessed more than once and is 4956 * still in the cache. Keep it in the MFU state. 4957 * 4958 * NOTE: an add_reference() that occurred when we did 4959 * the arc_read() will have kicked this off the list. 4960 * If it was a prefetch, we will explicitly move it to 4961 * the head of the list now. 4962 */ 4963 if ((HDR_PREFETCH(hdr)) != 0) { 4964 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4965 /* link protected by hash_lock */ 4966 ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 4967 } 4968 ARCSTAT_BUMP(arcstat_mfu_hits); 4969 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4970 } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { 4971 arc_state_t *new_state = arc_mfu; 4972 /* 4973 * This buffer has been accessed more than once but has 4974 * been evicted from the cache. Move it back to the 4975 * MFU state. 
4976 */ 4977 4978 if (HDR_PREFETCH(hdr)) { 4979 /* 4980 * This is a prefetch access... 4981 * move this block back to the MRU state. 4982 */ 4983 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 4984 new_state = arc_mru; 4985 } 4986 4987 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4988 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4989 arc_change_state(new_state, hdr, hash_lock); 4990 4991 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 4992 } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { 4993 /* 4994 * This buffer is on the 2nd Level ARC. 4995 */ 4996 4997 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4998 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4999 arc_change_state(arc_mfu, hdr, hash_lock); 5000 } else { 5001 ASSERT(!"invalid arc state"); 5002 } 5003} 5004 5005/* a generic arc_done_func_t which you can use */ 5006/* ARGSUSED */ 5007void 5008arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 5009{ 5010 if (zio == NULL || zio->io_error == 0) 5011 bcopy(buf->b_data, arg, arc_buf_size(buf)); 5012 arc_buf_destroy(buf, arg); 5013} 5014 5015/* a generic arc_done_func_t */ 5016void 5017arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 5018{ 5019 arc_buf_t **bufp = arg; 5020 if (zio && zio->io_error) { 5021 arc_buf_destroy(buf, arg); 5022 *bufp = NULL; 5023 } else { 5024 *bufp = buf; 5025 ASSERT(buf->b_data); 5026 } 5027} 5028 5029static void 5030arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) 5031{ 5032 if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { 5033 ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); 5034 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); 5035 } else { 5036 if (HDR_COMPRESSION_ENABLED(hdr)) { 5037 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, 5038 BP_GET_COMPRESS(bp)); 5039 } 5040 ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); 5041 ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); 5042 } 5043} 5044 5045static void 5046arc_read_done(zio_t *zio) 5047{ 5048 arc_buf_hdr_t *hdr = zio->io_private; 5049 kmutex_t *hash_lock = NULL; 5050 arc_callback_t *callback_list; 5051 arc_callback_t *acb; 5052 boolean_t freeable = B_FALSE; 5053 boolean_t no_zio_error = (zio->io_error == 0); 5054 5055 /* 5056 * The hdr was inserted into hash-table and removed from lists 5057 * prior to starting I/O. We should find this header, since 5058 * it's in the hash table, and it should be legit since it's 5059 * not possible to evict it during the I/O. The only possible 5060 * reason for it not to be found is if we were freed during the 5061 * read. 
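 * In that case hash_lock remains NULL, the header must be in the
 * arc_anon state, and it is destroyed at the bottom of this function
 * (via 'freeable') once all callbacks have been dispatched.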
5062 */ 5063 if (HDR_IN_HASH_TABLE(hdr)) { 5064 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); 5065 ASSERT3U(hdr->b_dva.dva_word[0], ==, 5066 BP_IDENTITY(zio->io_bp)->dva_word[0]); 5067 ASSERT3U(hdr->b_dva.dva_word[1], ==, 5068 BP_IDENTITY(zio->io_bp)->dva_word[1]); 5069 5070 arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, 5071 &hash_lock); 5072 5073 ASSERT((found == hdr && 5074 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 5075 (found == hdr && HDR_L2_READING(hdr))); 5076 ASSERT3P(hash_lock, !=, NULL); 5077 } 5078 5079 if (no_zio_error) { 5080 /* byteswap if necessary */ 5081 if (BP_SHOULD_BYTESWAP(zio->io_bp)) { 5082 if (BP_GET_LEVEL(zio->io_bp) > 0) { 5083 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; 5084 } else { 5085 hdr->b_l1hdr.b_byteswap = 5086 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 5087 } 5088 } else { 5089 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; 5090 } 5091 } 5092 5093 arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); 5094 if (l2arc_noprefetch && HDR_PREFETCH(hdr)) 5095 arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); 5096 5097 callback_list = hdr->b_l1hdr.b_acb; 5098 ASSERT3P(callback_list, !=, NULL); 5099 5100 if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) { 5101 /* 5102 * Only call arc_access on anonymous buffers. This is because 5103 * if we've issued an I/O for an evicted buffer, we've already 5104 * called arc_access (to prevent any simultaneous readers from 5105 * getting confused). 5106 */ 5107 arc_access(hdr, hash_lock); 5108 } 5109 5110 /* 5111 * If a read request has a callback (i.e. acb_done is not NULL), then we 5112 * make a buf containing the data according to the parameters which were 5113 * passed in. The implementation of arc_buf_alloc_impl() ensures that we 5114 * aren't needlessly decompressing the data multiple times. 5115 */ 5116 int callback_cnt = 0; 5117 for (acb = callback_list; acb != NULL; acb = acb->acb_next) { 5118 if (!acb->acb_done) 5119 continue; 5120 5121 /* This is a demand read since prefetches don't use callbacks */ 5122 callback_cnt++; 5123 5124 int error = arc_buf_alloc_impl(hdr, acb->acb_private, 5125 acb->acb_compressed, no_zio_error, &acb->acb_buf); 5126 if (no_zio_error) { 5127 zio->io_error = error; 5128 } 5129 } 5130 hdr->b_l1hdr.b_acb = NULL; 5131 arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 5132 if (callback_cnt == 0) { 5133 ASSERT(HDR_PREFETCH(hdr)); 5134 ASSERT0(hdr->b_l1hdr.b_bufcnt); 5135 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 5136 } 5137 5138 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || 5139 callback_list != NULL); 5140 5141 if (no_zio_error) { 5142 arc_hdr_verify(hdr, zio->io_bp); 5143 } else { 5144 arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); 5145 if (hdr->b_l1hdr.b_state != arc_anon) 5146 arc_change_state(arc_anon, hdr, hash_lock); 5147 if (HDR_IN_HASH_TABLE(hdr)) 5148 buf_hash_remove(hdr); 5149 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 5150 } 5151 5152 /* 5153 * Broadcast before we drop the hash_lock to avoid the possibility 5154 * that the hdr (and hence the cv) might be freed before we get to 5155 * the cv_broadcast(). 5156 */ 5157 cv_broadcast(&hdr->b_l1hdr.b_cv); 5158 5159 if (hash_lock != NULL) { 5160 mutex_exit(hash_lock); 5161 } else { 5162 /* 5163 * This block was freed while we waited for the read to 5164 * complete. It has been removed from the hash table and 5165 * moved to the anonymous state (so that it won't show up 5166 * in the cache). 
5167 */ 5168 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 5169 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 5170 } 5171 5172 /* execute each callback and free its structure */ 5173 while ((acb = callback_list) != NULL) { 5174 if (acb->acb_done) 5175 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 5176 5177 if (acb->acb_zio_dummy != NULL) { 5178 acb->acb_zio_dummy->io_error = zio->io_error; 5179 zio_nowait(acb->acb_zio_dummy); 5180 } 5181 5182 callback_list = acb->acb_next; 5183 kmem_free(acb, sizeof (arc_callback_t)); 5184 } 5185 5186 if (freeable) 5187 arc_hdr_destroy(hdr); 5188} 5189 5190/* 5191 * "Read" the block at the specified DVA (in bp) via the 5192 * cache. If the block is found in the cache, invoke the provided 5193 * callback immediately and return. Note that the `zio' parameter 5194 * in the callback will be NULL in this case, since no IO was 5195 * required. If the block is not in the cache pass the read request 5196 * on to the spa with a substitute callback function, so that the 5197 * requested block will be added to the cache. 5198 * 5199 * If a read request arrives for a block that has a read in-progress, 5200 * either wait for the in-progress read to complete (and return the 5201 * results); or, if this is a read with a "done" func, add a record 5202 * to the read to invoke the "done" func when the read completes, 5203 * and return; or just return. 5204 * 5205 * arc_read_done() will invoke all the requested "done" functions 5206 * for readers of this block. 5207 */ 5208int 5209arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, 5210 void *private, zio_priority_t priority, int zio_flags, 5211 arc_flags_t *arc_flags, const zbookmark_phys_t *zb) 5212{ 5213 arc_buf_hdr_t *hdr = NULL; 5214 kmutex_t *hash_lock = NULL; 5215 zio_t *rzio; 5216 uint64_t guid = spa_load_guid(spa); 5217 boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0; 5218 5219 ASSERT(!BP_IS_EMBEDDED(bp) || 5220 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 5221 5222top: 5223 if (!BP_IS_EMBEDDED(bp)) { 5224 /* 5225 * Embedded BP's have no DVA and require no I/O to "read". 5226 * Create an anonymous arc buf to back it. 5227 */ 5228 hdr = buf_hash_find(guid, bp, &hash_lock); 5229 } 5230 5231 if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) { 5232 arc_buf_t *buf = NULL; 5233 *arc_flags |= ARC_FLAG_CACHED; 5234 5235 if (HDR_IO_IN_PROGRESS(hdr)) { 5236 5237 if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && 5238 priority == ZIO_PRIORITY_SYNC_READ) { 5239 /* 5240 * This sync read must wait for an 5241 * in-progress async read (e.g. a predictive 5242 * prefetch). Async reads are queued 5243 * separately at the vdev_queue layer, so 5244 * this is a form of priority inversion. 5245 * Ideally, we would "inherit" the demand 5246 * i/o's priority by moving the i/o from 5247 * the async queue to the synchronous queue, 5248 * but there is currently no mechanism to do 5249 * so. Track this so that we can evaluate 5250 * the magnitude of this potential performance 5251 * problem. 5252 * 5253 * Note that if the prefetch i/o is already 5254 * active (has been issued to the device), 5255 * the prefetch improved performance, because 5256 * we issued it sooner than we would have 5257 * without the prefetch. 
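 *
 * For context, a hypothetical caller-side sketch of the two
 * completion modes this branch must service (illustrative only,
 * not code lifted from a real consumer):
 *
 *	blocking lookup, e.g. with the generic arc_getbuf_func:
 *	    arc_flags_t flags = ARC_FLAG_WAIT;
 *	    error = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
 *		ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 *
 *	non-blocking lookup, completion runs from arc_read_done():
 *	    arc_flags_t flags = ARC_FLAG_NOWAIT;
 *	    error = arc_read(pio, spa, bp, my_done_func, my_arg,
 *		ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 *
 * Here my_done_func and my_arg stand in for a caller-supplied
 * arc_done_func_t and its private argument.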
5258 */ 5259 DTRACE_PROBE1(arc__sync__wait__for__async, 5260 arc_buf_hdr_t *, hdr); 5261 ARCSTAT_BUMP(arcstat_sync_wait_for_async); 5262 } 5263 if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 5264 arc_hdr_clear_flags(hdr, 5265 ARC_FLAG_PREDICTIVE_PREFETCH); 5266 } 5267 5268 if (*arc_flags & ARC_FLAG_WAIT) { 5269 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); 5270 mutex_exit(hash_lock); 5271 goto top; 5272 } 5273 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 5274 5275 if (done) { 5276 arc_callback_t *acb = NULL; 5277 5278 acb = kmem_zalloc(sizeof (arc_callback_t), 5279 KM_SLEEP); 5280 acb->acb_done = done; 5281 acb->acb_private = private; 5282 acb->acb_compressed = compressed_read; 5283 if (pio != NULL) 5284 acb->acb_zio_dummy = zio_null(pio, 5285 spa, NULL, NULL, NULL, zio_flags); 5286 5287 ASSERT3P(acb->acb_done, !=, NULL); 5288 acb->acb_next = hdr->b_l1hdr.b_acb; 5289 hdr->b_l1hdr.b_acb = acb; 5290 mutex_exit(hash_lock); 5291 return (0); 5292 } 5293 mutex_exit(hash_lock); 5294 return (0); 5295 } 5296 5297 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 5298 hdr->b_l1hdr.b_state == arc_mfu); 5299 5300 if (done) { 5301 if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 5302 /* 5303 * This is a demand read which does not have to 5304 * wait for i/o because we did a predictive 5305 * prefetch i/o for it, which has completed. 5306 */ 5307 DTRACE_PROBE1( 5308 arc__demand__hit__predictive__prefetch, 5309 arc_buf_hdr_t *, hdr); 5310 ARCSTAT_BUMP( 5311 arcstat_demand_hit_predictive_prefetch); 5312 arc_hdr_clear_flags(hdr, 5313 ARC_FLAG_PREDICTIVE_PREFETCH); 5314 } 5315 ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); 5316 5317 /* Get a buf with the desired data in it. */ 5318 VERIFY0(arc_buf_alloc_impl(hdr, private, 5319 compressed_read, B_TRUE, &buf)); 5320 } else if (*arc_flags & ARC_FLAG_PREFETCH && 5321 refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 5322 arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); 5323 } 5324 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 5325 arc_access(hdr, hash_lock); 5326 if (*arc_flags & ARC_FLAG_L2CACHE) 5327 arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); 5328 mutex_exit(hash_lock); 5329 ARCSTAT_BUMP(arcstat_hits); 5330 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 5331 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 5332 data, metadata, hits); 5333 5334 if (done) 5335 done(NULL, buf, private); 5336 } else { 5337 uint64_t lsize = BP_GET_LSIZE(bp); 5338 uint64_t psize = BP_GET_PSIZE(bp); 5339 arc_callback_t *acb; 5340 vdev_t *vd = NULL; 5341 uint64_t addr = 0; 5342 boolean_t devw = B_FALSE; 5343 uint64_t size; 5344 5345 if (hdr == NULL) { 5346 /* this block is not in the cache */ 5347 arc_buf_hdr_t *exists = NULL; 5348 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 5349 hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, 5350 BP_GET_COMPRESS(bp), type); 5351 5352 if (!BP_IS_EMBEDDED(bp)) { 5353 hdr->b_dva = *BP_IDENTITY(bp); 5354 hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 5355 exists = buf_hash_insert(hdr, &hash_lock); 5356 } 5357 if (exists != NULL) { 5358 /* somebody beat us to the hash insert */ 5359 mutex_exit(hash_lock); 5360 buf_discard_identity(hdr); 5361 arc_hdr_destroy(hdr); 5362 goto top; /* restart the IO request */ 5363 } 5364 } else { 5365 /* 5366 * This block is in the ghost cache. If it was L2-only 5367 * (and thus didn't have an L1 hdr), we realloc the 5368 * header to add an L1 hdr. 
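 * Ghost headers carry no data, so once the hdr has been moved off
 * the ghost list the arc_hdr_alloc_pabd() call below must
 * recreate the b_pabd that this read will fill in.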
5369			 */
5370			if (!HDR_HAS_L1HDR(hdr)) {
5371				hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
5372				    hdr_full_cache);
5373			}
5374			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
5375			ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
5376			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
5377			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
5378			ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
5379			ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
5380
5381			/*
5382			 * This is a delicate dance that we play here.
5383			 * This hdr is in the ghost list so we access it
5384			 * to move it out of the ghost list before we
5385			 * initiate the read. If it's a prefetch then
5386			 * it won't have a callback so we'll remove the
5387			 * reference that arc_buf_alloc_impl() created. We
5388			 * do this after we've called arc_access() to
5389			 * avoid hitting an assert in remove_reference().
5390			 */
5391			arc_access(hdr, hash_lock);
5392			arc_hdr_alloc_pabd(hdr);
5393		}
5394		ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
5395		size = arc_hdr_size(hdr);
5396
5397		/*
5398		 * If compression is enabled on the hdr, then we will do
5399		 * RAW I/O and will store the compressed data in the hdr's
5400		 * data block. Otherwise, the hdr's data block will contain
5401		 * the uncompressed data.
5402		 */
5403		if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
5404			zio_flags |= ZIO_FLAG_RAW;
5405		}
5406
5407		if (*arc_flags & ARC_FLAG_PREFETCH)
5408			arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
5409		if (*arc_flags & ARC_FLAG_L2CACHE)
5410			arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
5411		if (BP_GET_LEVEL(bp) > 0)
5412			arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
5413		if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
5414			arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
5415		ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
5416
5417		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
5418		acb->acb_done = done;
5419		acb->acb_private = private;
5420		acb->acb_compressed = compressed_read;
5421
5422		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
5423		hdr->b_l1hdr.b_acb = acb;
5424		arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
5425
5426		if (HDR_HAS_L2HDR(hdr) &&
5427		    (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
5428			devw = hdr->b_l2hdr.b_dev->l2ad_writing;
5429			addr = hdr->b_l2hdr.b_daddr;
5430			/*
5431			 * Lock out device removal.
5432			 */
5433			if (vdev_is_dead(vd) ||
5434			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
5435				vd = NULL;
5436		}
5437
5438		if (priority == ZIO_PRIORITY_ASYNC_READ)
5439			arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
5440		else
5441			arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
5442
5443		if (hash_lock != NULL)
5444			mutex_exit(hash_lock);
5445
5446		/*
5447		 * At this point, we have a level 1 cache miss. Try again in
5448		 * L2ARC if possible.
5449		 */
5450		ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
5451
5452		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
5453		    uint64_t, lsize, zbookmark_phys_t *, zb);
5454		ARCSTAT_BUMP(arcstat_misses);
5455		ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
5456		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
5457		    data, metadata, misses);
5458#ifdef _KERNEL
5459#ifdef RACCT
5460		if (racct_enable) {
5461			PROC_LOCK(curproc);
5462			racct_add_force(curproc, RACCT_READBPS, size);
5463			racct_add_force(curproc, RACCT_READIOPS, 1);
5464			PROC_UNLOCK(curproc);
5465		}
5466#endif /* RACCT */
5467		curthread->td_ru.ru_inblock++;
5468#endif
5469
5470		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
5471			/*
5472			 * Read from the L2ARC if the following are true:
5473			 * 1. The L2ARC vdev was previously cached.
5474			 * 2.
This buffer still has L2ARC metadata. 5475 * 3. This buffer isn't currently writing to the L2ARC. 5476 * 4. The L2ARC entry wasn't evicted, which may 5477 * also have invalidated the vdev. 5478 * 5. This isn't prefetch and l2arc_noprefetch is set. 5479 */ 5480 if (HDR_HAS_L2HDR(hdr) && 5481 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 5482 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 5483 l2arc_read_callback_t *cb; 5484 abd_t *abd; 5485 uint64_t asize; 5486 5487 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 5488 ARCSTAT_BUMP(arcstat_l2_hits); 5489 5490 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 5491 KM_SLEEP); 5492 cb->l2rcb_hdr = hdr; 5493 cb->l2rcb_bp = *bp; 5494 cb->l2rcb_zb = *zb; 5495 cb->l2rcb_flags = zio_flags; 5496 5497 asize = vdev_psize_to_asize(vd, size); 5498 if (asize != size) { 5499 abd = abd_alloc_for_io(asize, 5500 HDR_ISTYPE_METADATA(hdr)); 5501 cb->l2rcb_abd = abd; 5502 } else { 5503 abd = hdr->b_l1hdr.b_pabd; 5504 } 5505 5506 ASSERT(addr >= VDEV_LABEL_START_SIZE && 5507 addr + asize <= vd->vdev_psize - 5508 VDEV_LABEL_END_SIZE); 5509 5510 /* 5511 * l2arc read. The SCL_L2ARC lock will be 5512 * released by l2arc_read_done(). 5513 * Issue a null zio if the underlying buffer 5514 * was squashed to zero size by compression. 5515 */ 5516 ASSERT3U(HDR_GET_COMPRESS(hdr), !=, 5517 ZIO_COMPRESS_EMPTY); 5518 rzio = zio_read_phys(pio, vd, addr, 5519 asize, abd, 5520 ZIO_CHECKSUM_OFF, 5521 l2arc_read_done, cb, priority, 5522 zio_flags | ZIO_FLAG_DONT_CACHE | 5523 ZIO_FLAG_CANFAIL | 5524 ZIO_FLAG_DONT_PROPAGATE | 5525 ZIO_FLAG_DONT_RETRY, B_FALSE); 5526 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 5527 zio_t *, rzio); 5528 ARCSTAT_INCR(arcstat_l2_read_bytes, size); 5529 5530 if (*arc_flags & ARC_FLAG_NOWAIT) { 5531 zio_nowait(rzio); 5532 return (0); 5533 } 5534 5535 ASSERT(*arc_flags & ARC_FLAG_WAIT); 5536 if (zio_wait(rzio) == 0) 5537 return (0); 5538 5539 /* l2arc read error; goto zio_read() */ 5540 } else { 5541 DTRACE_PROBE1(l2arc__miss, 5542 arc_buf_hdr_t *, hdr); 5543 ARCSTAT_BUMP(arcstat_l2_misses); 5544 if (HDR_L2_WRITING(hdr)) 5545 ARCSTAT_BUMP(arcstat_l2_rw_clash); 5546 spa_config_exit(spa, SCL_L2ARC, vd); 5547 } 5548 } else { 5549 if (vd != NULL) 5550 spa_config_exit(spa, SCL_L2ARC, vd); 5551 if (l2arc_ndev != 0) { 5552 DTRACE_PROBE1(l2arc__miss, 5553 arc_buf_hdr_t *, hdr); 5554 ARCSTAT_BUMP(arcstat_l2_misses); 5555 } 5556 } 5557 5558 rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size, 5559 arc_read_done, hdr, priority, zio_flags, zb); 5560 5561 if (*arc_flags & ARC_FLAG_WAIT) 5562 return (zio_wait(rzio)); 5563 5564 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 5565 zio_nowait(rzio); 5566 } 5567 return (0); 5568} 5569 5570/* 5571 * Notify the arc that a block was freed, and thus will never be used again. 5572 */ 5573void 5574arc_freed(spa_t *spa, const blkptr_t *bp) 5575{ 5576 arc_buf_hdr_t *hdr; 5577 kmutex_t *hash_lock; 5578 uint64_t guid = spa_load_guid(spa); 5579 5580 ASSERT(!BP_IS_EMBEDDED(bp)); 5581 5582 hdr = buf_hash_find(guid, bp, &hash_lock); 5583 if (hdr == NULL) 5584 return; 5585 5586 /* 5587 * We might be trying to free a block that is still doing I/O 5588 * (i.e. prefetch) or has a reference (i.e. a dedup-ed, 5589 * dmu_sync-ed block). If this block is being prefetched, then it 5590 * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr 5591 * until the I/O completes. A block may also have a reference if it is 5592 * part of a dedup-ed, dmu_synced write. 
The dmu_sync() function would 5593 * have written the new block to its final resting place on disk but 5594 * without the dedup flag set. This would have left the hdr in the MRU 5595 * state and discoverable. When the txg finally syncs it detects that 5596 * the block was overridden in open context and issues an override I/O. 5597 * Since this is a dedup block, the override I/O will determine if the 5598 * block is already in the DDT. If so, then it will replace the io_bp 5599 * with the bp from the DDT and allow the I/O to finish. When the I/O 5600 * reaches the done callback, dbuf_write_override_done, it will 5601 * check to see if the io_bp and io_bp_override are identical. 5602 * If they are not, then it indicates that the bp was replaced with 5603 * the bp in the DDT and the override bp is freed. This allows 5604 * us to arrive here with a reference on a block that is being 5605 * freed. So if we have an I/O in progress, or a reference to 5606 * this hdr, then we don't destroy the hdr. 5607 */ 5608 if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && 5609 refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { 5610 arc_change_state(arc_anon, hdr, hash_lock); 5611 arc_hdr_destroy(hdr); 5612 mutex_exit(hash_lock); 5613 } else { 5614 mutex_exit(hash_lock); 5615 } 5616 5617} 5618 5619/* 5620 * Release this buffer from the cache, making it an anonymous buffer. This 5621 * must be done after a read and prior to modifying the buffer contents. 5622 * If the buffer has more than one reference, we must make 5623 * a new hdr for the buffer. 5624 */ 5625void 5626arc_release(arc_buf_t *buf, void *tag) 5627{ 5628 arc_buf_hdr_t *hdr = buf->b_hdr; 5629 5630 /* 5631 * It would be nice to assert that if it's DMU metadata (level > 5632 * 0 || it's the dnode file), then it must be syncing context. 5633 * But we don't know that information at this level. 5634 */ 5635 5636 mutex_enter(&buf->b_evict_lock); 5637 5638 ASSERT(HDR_HAS_L1HDR(hdr)); 5639 5640 /* 5641 * We don't grab the hash lock prior to this check, because if 5642 * the buffer's header is in the arc_anon state, it won't be 5643 * linked into the hash table. 5644 */ 5645 if (hdr->b_l1hdr.b_state == arc_anon) { 5646 mutex_exit(&buf->b_evict_lock); 5647 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 5648 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 5649 ASSERT(!HDR_HAS_L2HDR(hdr)); 5650 ASSERT(HDR_EMPTY(hdr)); 5651 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); 5652 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); 5653 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 5654 5655 hdr->b_l1hdr.b_arc_access = 0; 5656 5657 /* 5658 * If the buf is being overridden then it may already 5659 * have a hdr that is not empty. 5660 */ 5661 buf_discard_identity(hdr); 5662 arc_buf_thaw(buf); 5663 5664 return; 5665 } 5666 5667 kmutex_t *hash_lock = HDR_LOCK(hdr); 5668 mutex_enter(hash_lock); 5669 5670 /* 5671 * This assignment is only valid as long as the hash_lock is 5672 * held, we must be careful not to reference state or the 5673 * b_state field after dropping the lock. 
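 * (For example, the single-buf path below moves the hdr to
 * arc_anon before dropping the hash lock, at which point the old
 * state pointer is stale.)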
5674 */ 5675 arc_state_t *state = hdr->b_l1hdr.b_state; 5676 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 5677 ASSERT3P(state, !=, arc_anon); 5678 5679 /* this buffer is not on any list */ 5680 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); 5681 5682 if (HDR_HAS_L2HDR(hdr)) { 5683 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); 5684 5685 /* 5686 * We have to recheck this conditional again now that 5687 * we're holding the l2ad_mtx to prevent a race with 5688 * another thread which might be concurrently calling 5689 * l2arc_evict(). In that case, l2arc_evict() might have 5690 * destroyed the header's L2 portion as we were waiting 5691 * to acquire the l2ad_mtx. 5692 */ 5693 if (HDR_HAS_L2HDR(hdr)) { 5694 l2arc_trim(hdr); 5695 arc_hdr_l2hdr_destroy(hdr); 5696 } 5697 5698 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); 5699 } 5700 5701 /* 5702 * Do we have more than one buf? 5703 */ 5704 if (hdr->b_l1hdr.b_bufcnt > 1) { 5705 arc_buf_hdr_t *nhdr; 5706 uint64_t spa = hdr->b_spa; 5707 uint64_t psize = HDR_GET_PSIZE(hdr); 5708 uint64_t lsize = HDR_GET_LSIZE(hdr); 5709 enum zio_compress compress = HDR_GET_COMPRESS(hdr); 5710 arc_buf_contents_t type = arc_buf_type(hdr); 5711 VERIFY3U(hdr->b_type, ==, type); 5712 5713 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); 5714 (void) remove_reference(hdr, hash_lock, tag); 5715 5716 if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { 5717 ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); 5718 ASSERT(ARC_BUF_LAST(buf)); 5719 } 5720 5721 /* 5722 * Pull the data off of this hdr and attach it to 5723 * a new anonymous hdr. Also find the last buffer 5724 * in the hdr's buffer list. 5725 */ 5726 arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); 5727 ASSERT3P(lastbuf, !=, NULL); 5728 5729 /* 5730 * If the current arc_buf_t and the hdr are sharing their data 5731 * buffer, then we must stop sharing that block. 5732 */ 5733 if (arc_buf_is_shared(buf)) { 5734 VERIFY(!arc_buf_is_shared(lastbuf)); 5735 5736 /* 5737 * First, sever the block sharing relationship between 5738 * buf and the arc_buf_hdr_t. 5739 */ 5740 arc_unshare_buf(hdr, buf); 5741 5742 /* 5743 * Now we need to recreate the hdr's b_pabd. Since we 5744 * have lastbuf handy, we try to share with it, but if 5745 * we can't then we allocate a new b_pabd and copy the 5746 * data from buf into it. 5747 */ 5748 if (arc_can_share(hdr, lastbuf)) { 5749 arc_share_buf(hdr, lastbuf); 5750 } else { 5751 arc_hdr_alloc_pabd(hdr); 5752 abd_copy_from_buf(hdr->b_l1hdr.b_pabd, 5753 buf->b_data, psize); 5754 } 5755 VERIFY3P(lastbuf->b_data, !=, NULL); 5756 } else if (HDR_SHARED_DATA(hdr)) { 5757 /* 5758 * Uncompressed shared buffers are always at the end 5759 * of the list. Compressed buffers don't have the 5760 * same requirements. This makes it hard to 5761 * simply assert that the lastbuf is shared so 5762 * we rely on the hdr's compression flags to determine 5763 * if we have a compressed, shared buffer. 
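 * To summarize the two sharing cases handled here: if buf itself
 * shares b_pabd, we sever that link above and either re-share
 * with lastbuf or allocate a fresh b_pabd and copy; if some other
 * buf shares b_pabd, it must either be lastbuf or the hdr must be
 * compressed, which is exactly what the assert below checks.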
5764 */ 5765 ASSERT(arc_buf_is_shared(lastbuf) || 5766 HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); 5767 ASSERT(!ARC_BUF_SHARED(buf)); 5768 } 5769 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 5770 ASSERT3P(state, !=, arc_l2c_only); 5771 5772 (void) refcount_remove_many(&state->arcs_size, 5773 arc_buf_size(buf), buf); 5774 5775 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { 5776 ASSERT3P(state, !=, arc_l2c_only); 5777 (void) refcount_remove_many(&state->arcs_esize[type], 5778 arc_buf_size(buf), buf); 5779 } 5780 5781 hdr->b_l1hdr.b_bufcnt -= 1; 5782 arc_cksum_verify(buf); 5783#ifdef illumos 5784 arc_buf_unwatch(buf); 5785#endif 5786 5787 mutex_exit(hash_lock); 5788 5789 /* 5790 * Allocate a new hdr. The new hdr will contain a b_pabd 5791 * buffer which will be freed in arc_write(). 5792 */ 5793 nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); 5794 ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); 5795 ASSERT0(nhdr->b_l1hdr.b_bufcnt); 5796 ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt)); 5797 VERIFY3U(nhdr->b_type, ==, type); 5798 ASSERT(!HDR_SHARED_DATA(nhdr)); 5799 5800 nhdr->b_l1hdr.b_buf = buf; 5801 nhdr->b_l1hdr.b_bufcnt = 1; 5802 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); 5803 buf->b_hdr = nhdr; 5804 5805 mutex_exit(&buf->b_evict_lock); 5806 (void) refcount_add_many(&arc_anon->arcs_size, 5807 arc_buf_size(buf), buf); 5808 } else { 5809 mutex_exit(&buf->b_evict_lock); 5810 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); 5811 /* protected by hash lock, or hdr is on arc_anon */ 5812 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 5813 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 5814 arc_change_state(arc_anon, hdr, hash_lock); 5815 hdr->b_l1hdr.b_arc_access = 0; 5816 mutex_exit(hash_lock); 5817 5818 buf_discard_identity(hdr); 5819 arc_buf_thaw(buf); 5820 } 5821} 5822 5823int 5824arc_released(arc_buf_t *buf) 5825{ 5826 int released; 5827 5828 mutex_enter(&buf->b_evict_lock); 5829 released = (buf->b_data != NULL && 5830 buf->b_hdr->b_l1hdr.b_state == arc_anon); 5831 mutex_exit(&buf->b_evict_lock); 5832 return (released); 5833} 5834 5835#ifdef ZFS_DEBUG 5836int 5837arc_referenced(arc_buf_t *buf) 5838{ 5839 int referenced; 5840 5841 mutex_enter(&buf->b_evict_lock); 5842 referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); 5843 mutex_exit(&buf->b_evict_lock); 5844 return (referenced); 5845} 5846#endif 5847 5848static void 5849arc_write_ready(zio_t *zio) 5850{ 5851 arc_write_callback_t *callback = zio->io_private; 5852 arc_buf_t *buf = callback->awcb_buf; 5853 arc_buf_hdr_t *hdr = buf->b_hdr; 5854 uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); 5855 5856 ASSERT(HDR_HAS_L1HDR(hdr)); 5857 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); 5858 ASSERT(hdr->b_l1hdr.b_bufcnt > 0); 5859 5860 /* 5861 * If we're reexecuting this zio because the pool suspended, then 5862 * cleanup any state that was previously set the first time the 5863 * callback was invoked. 
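 * In particular, the hdr may still own a b_pabd from the first
 * pass; it is either unshared from the buf or freed outright so
 * that the asserts below see a clean, unshared hdr.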
5864 */ 5865 if (zio->io_flags & ZIO_FLAG_REEXECUTED) { 5866 arc_cksum_free(hdr); 5867#ifdef illumos 5868 arc_buf_unwatch(buf); 5869#endif 5870 if (hdr->b_l1hdr.b_pabd != NULL) { 5871 if (arc_buf_is_shared(buf)) { 5872 arc_unshare_buf(hdr, buf); 5873 } else { 5874 arc_hdr_free_pabd(hdr); 5875 } 5876 } 5877 } 5878 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 5879 ASSERT(!HDR_SHARED_DATA(hdr)); 5880 ASSERT(!arc_buf_is_shared(buf)); 5881 5882 callback->awcb_ready(zio, buf, callback->awcb_private); 5883 5884 if (HDR_IO_IN_PROGRESS(hdr)) 5885 ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); 5886 5887 arc_cksum_compute(buf); 5888 arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 5889 5890 enum zio_compress compress; 5891 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 5892 compress = ZIO_COMPRESS_OFF; 5893 } else { 5894 ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp)); 5895 compress = BP_GET_COMPRESS(zio->io_bp); 5896 } 5897 HDR_SET_PSIZE(hdr, psize); 5898 arc_hdr_set_compress(hdr, compress); 5899 5900 5901 /* 5902 * Fill the hdr with data. If the hdr is compressed, the data we want 5903 * is available from the zio, otherwise we can take it from the buf. 5904 * 5905 * We might be able to share the buf's data with the hdr here. However, 5906 * doing so would cause the ARC to be full of linear ABDs if we write a 5907 * lot of shareable data. As a compromise, we check whether scattered 5908 * ABDs are allowed, and assume that if they are then the user wants 5909 * the ARC to be primarily filled with them regardless of the data being 5910 * written. Therefore, if they're allowed then we allocate one and copy 5911 * the data into it; otherwise, we share the data directly if we can. 5912 */ 5913 if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { 5914 arc_hdr_alloc_pabd(hdr); 5915 5916 /* 5917 * Ideally, we would always copy the io_abd into b_pabd, but the 5918 * user may have disabled compressed ARC, thus we must check the 5919 * hdr's compression setting rather than the io_bp's. 5920 */ 5921 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { 5922 ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, 5923 ZIO_COMPRESS_OFF); 5924 ASSERT3U(psize, >, 0); 5925 5926 abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); 5927 } else { 5928 ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); 5929 5930 abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, 5931 arc_buf_size(buf)); 5932 } 5933 } else { 5934 ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); 5935 ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); 5936 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); 5937 5938 arc_share_buf(hdr, buf); 5939 } 5940 5941 arc_hdr_verify(hdr, zio->io_bp); 5942} 5943 5944static void 5945arc_write_children_ready(zio_t *zio) 5946{ 5947 arc_write_callback_t *callback = zio->io_private; 5948 arc_buf_t *buf = callback->awcb_buf; 5949 5950 callback->awcb_children_ready(zio, buf, callback->awcb_private); 5951} 5952 5953/* 5954 * The SPA calls this callback for each physical write that happens on behalf 5955 * of a logical write. See the comment in dbuf_write_physdone() for details. 
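 * The callback is optional: arc_write_physdone() below simply
 * forwards to awcb_physdone when the arc_write() caller supplied
 * one.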
5956 */ 5957static void 5958arc_write_physdone(zio_t *zio) 5959{ 5960 arc_write_callback_t *cb = zio->io_private; 5961 if (cb->awcb_physdone != NULL) 5962 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 5963} 5964 5965static void 5966arc_write_done(zio_t *zio) 5967{ 5968 arc_write_callback_t *callback = zio->io_private; 5969 arc_buf_t *buf = callback->awcb_buf; 5970 arc_buf_hdr_t *hdr = buf->b_hdr; 5971 5972 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 5973 5974 if (zio->io_error == 0) { 5975 arc_hdr_verify(hdr, zio->io_bp); 5976 5977 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 5978 buf_discard_identity(hdr); 5979 } else { 5980 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 5981 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 5982 } 5983 } else { 5984 ASSERT(HDR_EMPTY(hdr)); 5985 } 5986 5987 /* 5988 * If the block to be written was all-zero or compressed enough to be 5989 * embedded in the BP, no write was performed so there will be no 5990 * dva/birth/checksum. The buffer must therefore remain anonymous 5991 * (and uncached). 5992 */ 5993 if (!HDR_EMPTY(hdr)) { 5994 arc_buf_hdr_t *exists; 5995 kmutex_t *hash_lock; 5996 5997 ASSERT3U(zio->io_error, ==, 0); 5998 5999 arc_cksum_verify(buf); 6000 6001 exists = buf_hash_insert(hdr, &hash_lock); 6002 if (exists != NULL) { 6003 /* 6004 * This can only happen if we overwrite for 6005 * sync-to-convergence, because we remove 6006 * buffers from the hash table when we arc_free(). 6007 */ 6008 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 6009 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 6010 panic("bad overwrite, hdr=%p exists=%p", 6011 (void *)hdr, (void *)exists); 6012 ASSERT(refcount_is_zero( 6013 &exists->b_l1hdr.b_refcnt)); 6014 arc_change_state(arc_anon, exists, hash_lock); 6015 mutex_exit(hash_lock); 6016 arc_hdr_destroy(exists); 6017 exists = buf_hash_insert(hdr, &hash_lock); 6018 ASSERT3P(exists, ==, NULL); 6019 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 6020 /* nopwrite */ 6021 ASSERT(zio->io_prop.zp_nopwrite); 6022 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 6023 panic("bad nopwrite, hdr=%p exists=%p", 6024 (void *)hdr, (void *)exists); 6025 } else { 6026 /* Dedup */ 6027 ASSERT(hdr->b_l1hdr.b_bufcnt == 1); 6028 ASSERT(hdr->b_l1hdr.b_state == arc_anon); 6029 ASSERT(BP_GET_DEDUP(zio->io_bp)); 6030 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 6031 } 6032 } 6033 arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 6034 /* if it's not anon, we are doing a scrub */ 6035 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) 6036 arc_access(hdr, hash_lock); 6037 mutex_exit(hash_lock); 6038 } else { 6039 arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 6040 } 6041 6042 ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 6043 callback->awcb_done(zio, buf, callback->awcb_private); 6044 6045 abd_put(zio->io_abd); 6046 kmem_free(callback, sizeof (arc_write_callback_t)); 6047} 6048 6049zio_t * 6050arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, 6051 boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, 6052 arc_done_func_t *children_ready, arc_done_func_t *physdone, 6053 arc_done_func_t *done, void *private, zio_priority_t priority, 6054 int zio_flags, const zbookmark_phys_t *zb) 6055{ 6056 arc_buf_hdr_t *hdr = buf->b_hdr; 6057 arc_write_callback_t *callback; 6058 zio_t *zio; 6059 zio_prop_t localprop = *zp; 6060 6061 ASSERT3P(ready, !=, NULL); 6062 ASSERT3P(done, !=, NULL); 6063 ASSERT(!HDR_IO_ERROR(hdr)); 6064 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 6065 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 6066 
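	/*
	 * An arc_write() caller must pass a released, anonymous buffer:
	 * no prior I/O error, no I/O in flight, no queued read
	 * callbacks, and at least one arc_buf_t attached (asserted
	 * above and below).
	 */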
ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); 6067 if (l2arc) 6068 arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); 6069 if (ARC_BUF_COMPRESSED(buf)) { 6070 /* 6071 * We're writing a pre-compressed buffer. Make the 6072 * compression algorithm requested by the zio_prop_t match 6073 * the pre-compressed buffer's compression algorithm. 6074 */ 6075 localprop.zp_compress = HDR_GET_COMPRESS(hdr); 6076 6077 ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); 6078 zio_flags |= ZIO_FLAG_RAW; 6079 } 6080 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 6081 callback->awcb_ready = ready; 6082 callback->awcb_children_ready = children_ready; 6083 callback->awcb_physdone = physdone; 6084 callback->awcb_done = done; 6085 callback->awcb_private = private; 6086 callback->awcb_buf = buf; 6087 6088 /* 6089 * The hdr's b_pabd is now stale, free it now. A new data block 6090 * will be allocated when the zio pipeline calls arc_write_ready(). 6091 */ 6092 if (hdr->b_l1hdr.b_pabd != NULL) { 6093 /* 6094 * If the buf is currently sharing the data block with 6095 * the hdr then we need to break that relationship here. 6096 * The hdr will remain with a NULL data pointer and the 6097 * buf will take sole ownership of the block. 6098 */ 6099 if (arc_buf_is_shared(buf)) { 6100 arc_unshare_buf(hdr, buf); 6101 } else { 6102 arc_hdr_free_pabd(hdr); 6103 } 6104 VERIFY3P(buf->b_data, !=, NULL); 6105 arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); 6106 } 6107 ASSERT(!arc_buf_is_shared(buf)); 6108 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 6109 6110 zio = zio_write(pio, spa, txg, bp, 6111 abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), 6112 HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, 6113 (children_ready != NULL) ? arc_write_children_ready : NULL, 6114 arc_write_physdone, arc_write_done, callback, 6115 priority, zio_flags, zb); 6116 6117 return (zio); 6118} 6119 6120static int 6121arc_memory_throttle(uint64_t reserve, uint64_t txg) 6122{ 6123#ifdef _KERNEL 6124 uint64_t available_memory = ptob(freemem); 6125 static uint64_t page_load = 0; 6126 static uint64_t last_txg = 0; 6127 6128#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 6129 available_memory = 6130 MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); 6131#endif 6132 6133 if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) 6134 return (0); 6135 6136 if (txg > last_txg) { 6137 last_txg = txg; 6138 page_load = 0; 6139 } 6140 /* 6141 * If we are in pageout, we know that memory is already tight, 6142 * the arc is already going to be evicting, so we just want to 6143 * continue to let page writes occur as quickly as possible. 
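 * As a worked example, with 4GB of available memory (and
 * ptob(minfree) below that) the pageout thread is pushed back
 * with ERESTART once page_load, charged at reserve / 8 per call,
 * exceeds 1GB for the current txg.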
6144 */ 6145 if (curproc == pageproc) { 6146 if (page_load > MAX(ptob(minfree), available_memory) / 4) 6147 return (SET_ERROR(ERESTART)); 6148 /* Note: reserve is inflated, so we deflate */ 6149 page_load += reserve / 8; 6150 return (0); 6151 } else if (page_load > 0 && arc_reclaim_needed()) { 6152 /* memory is low, delay before restarting */ 6153 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 6154 return (SET_ERROR(EAGAIN)); 6155 } 6156 page_load = 0; 6157#endif 6158 return (0); 6159} 6160 6161void 6162arc_tempreserve_clear(uint64_t reserve) 6163{ 6164 atomic_add_64(&arc_tempreserve, -reserve); 6165 ASSERT((int64_t)arc_tempreserve >= 0); 6166} 6167 6168int 6169arc_tempreserve_space(uint64_t reserve, uint64_t txg) 6170{ 6171 int error; 6172 uint64_t anon_size; 6173 6174 if (reserve > arc_c/4 && !arc_no_grow) { 6175 arc_c = MIN(arc_c_max, reserve * 4); 6176 DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); 6177 } 6178 if (reserve > arc_c) 6179 return (SET_ERROR(ENOMEM)); 6180 6181 /* 6182 * Don't count loaned bufs as in flight dirty data to prevent long 6183 * network delays from blocking transactions that are ready to be 6184 * assigned to a txg. 6185 */ 6186 6187 /* assert that it has not wrapped around */ 6188 ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); 6189 6190 anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) - 6191 arc_loaned_bytes), 0); 6192 6193 /* 6194 * Writes will, almost always, require additional memory allocations 6195 * in order to compress/encrypt/etc the data. We therefore need to 6196 * make sure that there is sufficient available memory for this. 6197 */ 6198 error = arc_memory_throttle(reserve, txg); 6199 if (error != 0) 6200 return (error); 6201 6202 /* 6203 * Throttle writes when the amount of dirty data in the cache 6204 * gets too large. We try to keep the cache less than half full 6205 * of dirty blocks so that our sync times don't grow too large. 6206 * Note: if two requests come in concurrently, we might let them 6207 * both succeed, when one of them should fail. Not a huge deal. 
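 * As a worked example, with arc_c = 4GB a reservation fails with
 * ERESTART below once reserve + arc_tempreserve + anon_size would
 * exceed 2GB while anonymous data alone exceeds 1GB.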
6208	 */
6209
6210	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
6211	    anon_size > arc_c / 4) {
6212		uint64_t meta_esize =
6213		    refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
6214		uint64_t data_esize =
6215		    refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
6216		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
6217		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
6218		    arc_tempreserve >> 10, meta_esize >> 10,
6219		    data_esize >> 10, reserve >> 10, arc_c >> 10);
6220		return (SET_ERROR(ERESTART));
6221	}
6222	atomic_add_64(&arc_tempreserve, reserve);
6223	return (0);
6224}
6225
6226static void
6227arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
6228    kstat_named_t *evict_data, kstat_named_t *evict_metadata)
6229{
6230	size->value.ui64 = refcount_count(&state->arcs_size);
6231	evict_data->value.ui64 =
6232	    refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
6233	evict_metadata->value.ui64 =
6234	    refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
6235}
6236
6237static int
6238arc_kstat_update(kstat_t *ksp, int rw)
6239{
6240	arc_stats_t *as = ksp->ks_data;
6241
6242	if (rw == KSTAT_WRITE) {
6243		return (EACCES);
6244	} else {
6245		arc_kstat_update_state(arc_anon,
6246		    &as->arcstat_anon_size,
6247		    &as->arcstat_anon_evictable_data,
6248		    &as->arcstat_anon_evictable_metadata);
6249		arc_kstat_update_state(arc_mru,
6250		    &as->arcstat_mru_size,
6251		    &as->arcstat_mru_evictable_data,
6252		    &as->arcstat_mru_evictable_metadata);
6253		arc_kstat_update_state(arc_mru_ghost,
6254		    &as->arcstat_mru_ghost_size,
6255		    &as->arcstat_mru_ghost_evictable_data,
6256		    &as->arcstat_mru_ghost_evictable_metadata);
6257		arc_kstat_update_state(arc_mfu,
6258		    &as->arcstat_mfu_size,
6259		    &as->arcstat_mfu_evictable_data,
6260		    &as->arcstat_mfu_evictable_metadata);
6261		arc_kstat_update_state(arc_mfu_ghost,
6262		    &as->arcstat_mfu_ghost_size,
6263		    &as->arcstat_mfu_ghost_evictable_data,
6264		    &as->arcstat_mfu_ghost_evictable_metadata);
6265	}
6266
6267	return (0);
6268}
6269
6270/*
6271 * This function *must* return indices evenly distributed between all
6272 * sublists of the multilist. This is needed due to how the ARC eviction
6273 * code is laid out; arc_evict_state() assumes ARC buffers are evenly
6274 * distributed between all sublists and uses this assumption when
6275 * deciding which sublist to evict from and how much to evict from it.
6276 */
6277unsigned int
6278arc_state_multilist_index_func(multilist_t *ml, void *obj)
6279{
6280	arc_buf_hdr_t *hdr = obj;
6281
6282	/*
6283	 * We rely on b_dva to generate evenly distributed index
6284	 * numbers using buf_hash below. So, as an added precaution,
6285	 * let's make sure we never add empty buffers to the arc lists.
6286	 */
6287	ASSERT(!HDR_EMPTY(hdr));
6288
6289	/*
6290	 * The assumption here is that the hash value for a given
6291	 * arc_buf_hdr_t will remain constant throughout its lifetime
6292	 * (i.e. its b_spa, b_dva, and b_birth fields don't change).
6293	 * Thus, we don't need to store the header's sublist index
6294	 * on insertion, as this index can be recalculated on removal.
6295	 *
6296	 * Also, the low order bits of the hash value are thought to be
6297	 * distributed evenly. Otherwise, in the case that the multilist
6298	 * has a power of two number of sublists, each sublist's usage
6299	 * would not be evenly distributed.
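	 * (With 8 sublists, for instance, the index below is simply
	 * buf_hash(b_spa, &b_dva, b_birth) % 8, so uniform low-order
	 * hash bits translate directly into uniform sublist usage.)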
6300 */ 6301 return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % 6302 multilist_get_num_sublists(ml)); 6303} 6304 6305#ifdef _KERNEL 6306static eventhandler_tag arc_event_lowmem = NULL; 6307 6308static void 6309arc_lowmem(void *arg __unused, int howto __unused) 6310{ 6311 6312 mutex_enter(&arc_reclaim_lock); 6313 /* XXX: Memory deficit should be passed as argument. */ 6314 needfree = btoc(arc_c >> arc_shrink_shift); 6315 DTRACE_PROBE(arc__needfree); 6316 cv_signal(&arc_reclaim_thread_cv); 6317 6318 /* 6319 * It is unsafe to block here in arbitrary threads, because we can come 6320 * here from ARC itself and may hold ARC locks and thus risk a deadlock 6321 * with ARC reclaim thread. 6322 */ 6323 if (curproc == pageproc) 6324 (void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); 6325 mutex_exit(&arc_reclaim_lock); 6326} 6327#endif 6328 6329static void 6330arc_state_init(void) 6331{ 6332 arc_anon = &ARC_anon; 6333 arc_mru = &ARC_mru; 6334 arc_mru_ghost = &ARC_mru_ghost; 6335 arc_mfu = &ARC_mfu; 6336 arc_mfu_ghost = &ARC_mfu_ghost; 6337 arc_l2c_only = &ARC_l2c_only; 6338 6339 arc_mru->arcs_list[ARC_BUFC_METADATA] = 6340 multilist_create(sizeof (arc_buf_hdr_t), 6341 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6342 arc_state_multilist_index_func); 6343 arc_mru->arcs_list[ARC_BUFC_DATA] = 6344 multilist_create(sizeof (arc_buf_hdr_t), 6345 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6346 arc_state_multilist_index_func); 6347 arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] = 6348 multilist_create(sizeof (arc_buf_hdr_t), 6349 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6350 arc_state_multilist_index_func); 6351 arc_mru_ghost->arcs_list[ARC_BUFC_DATA] = 6352 multilist_create(sizeof (arc_buf_hdr_t), 6353 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6354 arc_state_multilist_index_func); 6355 arc_mfu->arcs_list[ARC_BUFC_METADATA] = 6356 multilist_create(sizeof (arc_buf_hdr_t), 6357 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6358 arc_state_multilist_index_func); 6359 arc_mfu->arcs_list[ARC_BUFC_DATA] = 6360 multilist_create(sizeof (arc_buf_hdr_t), 6361 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6362 arc_state_multilist_index_func); 6363 arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] = 6364 multilist_create(sizeof (arc_buf_hdr_t), 6365 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6366 arc_state_multilist_index_func); 6367 arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] = 6368 multilist_create(sizeof (arc_buf_hdr_t), 6369 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6370 arc_state_multilist_index_func); 6371 arc_l2c_only->arcs_list[ARC_BUFC_METADATA] = 6372 multilist_create(sizeof (arc_buf_hdr_t), 6373 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6374 arc_state_multilist_index_func); 6375 arc_l2c_only->arcs_list[ARC_BUFC_DATA] = 6376 multilist_create(sizeof (arc_buf_hdr_t), 6377 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6378 arc_state_multilist_index_func); 6379 6380 refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); 6381 refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); 6382 refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); 6383 refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); 6384 refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); 6385 refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); 6386 refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); 6387 refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); 6388 refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); 6389 refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); 6390 
refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); 6391 refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); 6392 6393 refcount_create(&arc_anon->arcs_size); 6394 refcount_create(&arc_mru->arcs_size); 6395 refcount_create(&arc_mru_ghost->arcs_size); 6396 refcount_create(&arc_mfu->arcs_size); 6397 refcount_create(&arc_mfu_ghost->arcs_size); 6398 refcount_create(&arc_l2c_only->arcs_size); 6399} 6400 6401static void 6402arc_state_fini(void) 6403{ 6404 refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); 6405 refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); 6406 refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); 6407 refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); 6408 refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); 6409 refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); 6410 refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); 6411 refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); 6412 refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); 6413 refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); 6414 refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); 6415 refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); 6416 6417 refcount_destroy(&arc_anon->arcs_size); 6418 refcount_destroy(&arc_mru->arcs_size); 6419 refcount_destroy(&arc_mru_ghost->arcs_size); 6420 refcount_destroy(&arc_mfu->arcs_size); 6421 refcount_destroy(&arc_mfu_ghost->arcs_size); 6422 refcount_destroy(&arc_l2c_only->arcs_size); 6423 6424 multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]); 6425 multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 6426 multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]); 6427 multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 6428 multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]); 6429 multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 6430 multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]); 6431 multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 6432} 6433 6434uint64_t 6435arc_max_bytes(void) 6436{ 6437 return (arc_c_max); 6438} 6439 6440void 6441arc_init(void) 6442{ 6443 int i, prefetch_tunable_set = 0; 6444 6445 /* 6446 * allmem is "all memory that we could possibly use". 6447 */ 6448#ifdef illumos 6449#ifdef _KERNEL 6450 uint64_t allmem = ptob(physmem - swapfs_minfree); 6451#else 6452 uint64_t allmem = (physmem * PAGESIZE) / 2; 6453#endif 6454#else 6455 uint64_t allmem = kmem_size(); 6456#endif 6457 6458 6459 mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); 6460 cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); 6461 cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); 6462 6463 mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL); 6464 cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL); 6465 6466 /* Convert seconds to clock ticks */ 6467 arc_min_prefetch_lifespan = 1 * hz; 6468 6469 /* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */ 6470 arc_c_min = MAX(allmem / 32, arc_abs_min); 6471 /* set max to 5/8 of all memory, or all but 1GB, whichever is more */ 6472 if (allmem >= 1 << 30) 6473 arc_c_max = allmem - (1 << 30); 6474 else 6475 arc_c_max = arc_c_min; 6476 arc_c_max = MAX(allmem * 5 / 8, arc_c_max); 6477 6478 /* 6479 * In userland, there's only the memory pressure that we artificially 6480 * create (see arc_available_memory()). 
Don't let arc_c get too 6481 * small, because it can cause transactions to be larger than 6482 * arc_c, causing arc_tempreserve_space() to fail. 6483 */ 6484#ifndef _KERNEL 6485 arc_c_min = arc_c_max / 2; 6486#endif 6487 6488#ifdef _KERNEL 6489 /* 6490 * Allow the tunables to override our calculations if they are 6491 * reasonable. 6492 */ 6493 if (zfs_arc_max > arc_abs_min && zfs_arc_max < allmem) { 6494 arc_c_max = zfs_arc_max; 6495 arc_c_min = MIN(arc_c_min, arc_c_max); 6496 } 6497 if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max) 6498 arc_c_min = zfs_arc_min; 6499#endif 6500 6501 arc_c = arc_c_max; 6502 arc_p = (arc_c >> 1); 6503 arc_size = 0; 6504 6505 /* limit meta-data to 1/4 of the arc capacity */ 6506 arc_meta_limit = arc_c_max / 4; 6507 6508#ifdef _KERNEL 6509 /* 6510 * Metadata is stored in the kernel's heap. Don't let us 6511 * use more than half the heap for the ARC. 6512 */ 6513 arc_meta_limit = MIN(arc_meta_limit, 6514 vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2); 6515#endif 6516 6517 /* Allow the tunable to override if it is reasonable */ 6518 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 6519 arc_meta_limit = zfs_arc_meta_limit; 6520 6521 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 6522 arc_c_min = arc_meta_limit / 2; 6523 6524 if (zfs_arc_meta_min > 0) { 6525 arc_meta_min = zfs_arc_meta_min; 6526 } else { 6527 arc_meta_min = arc_c_min / 2; 6528 } 6529 6530 if (zfs_arc_grow_retry > 0) 6531 arc_grow_retry = zfs_arc_grow_retry; 6532 6533 if (zfs_arc_shrink_shift > 0) 6534 arc_shrink_shift = zfs_arc_shrink_shift; 6535 6536 if (zfs_arc_no_grow_shift > 0) 6537 arc_no_grow_shift = zfs_arc_no_grow_shift; 6538 /* 6539 * Ensure that arc_no_grow_shift is less than arc_shrink_shift. 6540 */ 6541 if (arc_no_grow_shift >= arc_shrink_shift) 6542 arc_no_grow_shift = arc_shrink_shift - 1; 6543 6544 if (zfs_arc_p_min_shift > 0) 6545 arc_p_min_shift = zfs_arc_p_min_shift; 6546 6547 /* if kmem_flags are set, lets try to use less memory */ 6548 if (kmem_debugging()) 6549 arc_c = arc_c / 2; 6550 if (arc_c < arc_c_min) 6551 arc_c = arc_c_min; 6552 6553 zfs_arc_min = arc_c_min; 6554 zfs_arc_max = arc_c_max; 6555 6556 arc_state_init(); 6557 buf_init(); 6558 6559 arc_reclaim_thread_exit = B_FALSE; 6560 arc_dnlc_evicts_thread_exit = FALSE; 6561 6562 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 6563 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 6564 6565 if (arc_ksp != NULL) { 6566 arc_ksp->ks_data = &arc_stats; 6567 arc_ksp->ks_update = arc_kstat_update; 6568 kstat_install(arc_ksp); 6569 } 6570 6571 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 6572 TS_RUN, minclsyspri); 6573 6574#ifdef _KERNEL 6575 arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, 6576 EVENTHANDLER_PRI_FIRST); 6577#endif 6578 6579 (void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0, 6580 TS_RUN, minclsyspri); 6581 6582 arc_dead = B_FALSE; 6583 arc_warm = B_FALSE; 6584 6585 /* 6586 * Calculate maximum amount of dirty data per pool. 6587 * 6588 * If it has been set by /etc/system, take that. 6589 * Otherwise, use a percentage of physical memory defined by 6590 * zfs_dirty_data_max_percent (default 10%) with a cap at 6591 * zfs_dirty_data_max_max (default 4GB). 
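 * For example, a machine with 16GB of physical memory gets a
 * default of 1.6GB, while a 64GB machine computes 6.4GB and is
 * clamped to the 4GB cap.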
6592 */ 6593 if (zfs_dirty_data_max == 0) { 6594 zfs_dirty_data_max = ptob(physmem) * 6595 zfs_dirty_data_max_percent / 100; 6596 zfs_dirty_data_max = MIN(zfs_dirty_data_max, 6597 zfs_dirty_data_max_max); 6598 } 6599 6600#ifdef _KERNEL 6601 if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 6602 prefetch_tunable_set = 1; 6603 6604#ifdef __i386__ 6605 if (prefetch_tunable_set == 0) { 6606 printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 6607 "-- to enable,\n"); 6608 printf(" add \"vfs.zfs.prefetch_disable=0\" " 6609 "to /boot/loader.conf.\n"); 6610 zfs_prefetch_disable = 1; 6611 } 6612#else 6613 if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 6614 prefetch_tunable_set == 0) { 6615 printf("ZFS NOTICE: Prefetch is disabled by default if less " 6616 "than 4GB of RAM is present;\n" 6617 " to enable, add \"vfs.zfs.prefetch_disable=0\" " 6618 "to /boot/loader.conf.\n"); 6619 zfs_prefetch_disable = 1; 6620 } 6621#endif 6622 /* Warn about ZFS memory and address space requirements. */ 6623 if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { 6624 printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " 6625 "expect unstable behavior.\n"); 6626 } 6627 if (allmem < 512 * (1 << 20)) { 6628 printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " 6629 "expect unstable behavior.\n"); 6630 printf(" Consider tuning vm.kmem_size and " 6631 "vm.kmem_size_max\n"); 6632 printf(" in /boot/loader.conf.\n"); 6633 } 6634#endif 6635} 6636 6637void 6638arc_fini(void) 6639{ 6640 mutex_enter(&arc_reclaim_lock); 6641 arc_reclaim_thread_exit = B_TRUE; 6642 /* 6643 * The reclaim thread will set arc_reclaim_thread_exit back to 6644 * B_FALSE when it is finished exiting; we're waiting for that. 6645 */ 6646 while (arc_reclaim_thread_exit) { 6647 cv_signal(&arc_reclaim_thread_cv); 6648 cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock); 6649 } 6650 mutex_exit(&arc_reclaim_lock); 6651 6652 /* Use B_TRUE to ensure *all* buffers are evicted */ 6653 arc_flush(NULL, B_TRUE); 6654 6655 mutex_enter(&arc_dnlc_evicts_lock); 6656 arc_dnlc_evicts_thread_exit = TRUE; 6657 /* 6658 * The user evicts thread will set arc_user_evicts_thread_exit 6659 * to FALSE when it is finished exiting; we're waiting for that. 6660 */ 6661 while (arc_dnlc_evicts_thread_exit) { 6662 cv_signal(&arc_dnlc_evicts_cv); 6663 cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); 6664 } 6665 mutex_exit(&arc_dnlc_evicts_lock); 6666 6667 arc_dead = B_TRUE; 6668 6669 if (arc_ksp != NULL) { 6670 kstat_delete(arc_ksp); 6671 arc_ksp = NULL; 6672 } 6673 6674 mutex_destroy(&arc_reclaim_lock); 6675 cv_destroy(&arc_reclaim_thread_cv); 6676 cv_destroy(&arc_reclaim_waiters_cv); 6677 6678 mutex_destroy(&arc_dnlc_evicts_lock); 6679 cv_destroy(&arc_dnlc_evicts_cv); 6680 6681 arc_state_fini(); 6682 buf_fini(); 6683 6684 ASSERT0(arc_loaned_bytes); 6685 6686#ifdef _KERNEL 6687 if (arc_event_lowmem != NULL) 6688 EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); 6689#endif 6690} 6691 6692/* 6693 * Level 2 ARC 6694 * 6695 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 6696 * It uses dedicated storage devices to hold cached data, which are populated 6697 * using large infrequent writes. The main role of this cache is to boost 6698 * the performance of random read workloads. The intended L2ARC devices 6699 * include short-stroked disks, solid state disks, and other media with 6700 * substantially faster read latency than disk. 
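 * Note that in this implementation the L2ARC is not persistent:
 * device contents are not reused across a reboot or pool
 * re-import, so the cache must be rewarmed from scratch.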
6701 * 6702 * +-----------------------+ 6703 * | ARC | 6704 * +-----------------------+ 6705 * | ^ ^ 6706 * | | | 6707 * l2arc_feed_thread() arc_read() 6708 * | | | 6709 * | l2arc read | 6710 * V | | 6711 * +---------------+ | 6712 * | L2ARC | | 6713 * +---------------+ | 6714 * | ^ | 6715 * l2arc_write() | | 6716 * | | | 6717 * V | | 6718 * +-------+ +-------+ 6719 * | vdev | | vdev | 6720 * | cache | | cache | 6721 * +-------+ +-------+ 6722 * +=========+ .-----. 6723 * : L2ARC : |-_____-| 6724 * : devices : | Disks | 6725 * +=========+ `-_____-' 6726 * 6727 * Read requests are satisfied from the following sources, in order: 6728 * 6729 * 1) ARC 6730 * 2) vdev cache of L2ARC devices 6731 * 3) L2ARC devices 6732 * 4) vdev cache of disks 6733 * 5) disks 6734 * 6735 * Some L2ARC device types exhibit extremely slow write performance. 6736 * To accommodate for this there are some significant differences between 6737 * the L2ARC and traditional cache design: 6738 * 6739 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 6740 * the ARC behave as usual, freeing buffers and placing headers on ghost 6741 * lists. The ARC does not send buffers to the L2ARC during eviction as 6742 * this would add inflated write latencies for all ARC memory pressure. 6743 * 6744 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 6745 * It does this by periodically scanning buffers from the eviction-end of 6746 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 6747 * not already there. It scans until a headroom of buffers is satisfied, 6748 * which itself is a buffer for ARC eviction. If a compressible buffer is 6749 * found during scanning and selected for writing to an L2ARC device, we 6750 * temporarily boost scanning headroom during the next scan cycle to make 6751 * sure we adapt to compression effects (which might significantly reduce 6752 * the data volume we write to L2ARC). The thread that does this is 6753 * l2arc_feed_thread(), illustrated below; example sizes are included to 6754 * provide a better sense of ratio than this diagram: 6755 * 6756 * head --> tail 6757 * +---------------------+----------+ 6758 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 6759 * +---------------------+----------+ | o L2ARC eligible 6760 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 6761 * +---------------------+----------+ | 6762 * 15.9 Gbytes ^ 32 Mbytes | 6763 * headroom | 6764 * l2arc_feed_thread() 6765 * | 6766 * l2arc write hand <--[oooo]--' 6767 * | 8 Mbyte 6768 * | write max 6769 * V 6770 * +==============================+ 6771 * L2ARC dev |####|#|###|###| |####| ... | 6772 * +==============================+ 6773 * 32 Gbytes 6774 * 6775 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 6776 * evicted, then the L2ARC has cached a buffer much sooner than it probably 6777 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 6778 * safe to say that this is an uncommon case, since buffers at the end of 6779 * the ARC lists have moved there due to inactivity. 6780 * 6781 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 6782 * then the L2ARC simply misses copying some buffers. This serves as a 6783 * pressure valve to prevent heavy read workloads from both stalling the ARC 6784 * with waits and clogging the L2ARC with writes. 
This also helps prevent 6785 * the potential for the L2ARC to churn if it attempts to cache content too 6786 * quickly, such as during backups of the entire pool. 6787 * 6788 * 5. After system boot and before the ARC has filled main memory, there are 6789 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 6790 * lists can remain mostly static. Instead of searching from tail of these 6791 * lists as pictured, the l2arc_feed_thread() will search from the list heads 6792 * for eligible buffers, greatly increasing its chance of finding them. 6793 * 6794 * The L2ARC device write speed is also boosted during this time so that 6795 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 6796 * there are no L2ARC reads, and no fear of degrading read performance 6797 * through increased writes. 6798 * 6799 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 6800 * the vdev queue can aggregate them into larger and fewer writes. Each 6801 * device is written to in a rotor fashion, sweeping writes through 6802 * available space then repeating. 6803 * 6804 * 7. The L2ARC does not store dirty content. It never needs to flush 6805 * write buffers back to disk based storage. 6806 * 6807 * 8. If an ARC buffer is written (and dirtied) which also exists in the 6808 * L2ARC, the now stale L2ARC buffer is immediately dropped. 6809 * 6810 * The performance of the L2ARC can be tweaked by a number of tunables, which 6811 * may be necessary for different workloads: 6812 * 6813 * l2arc_write_max max write bytes per interval 6814 * l2arc_write_boost extra write bytes during device warmup 6815 * l2arc_noprefetch skip caching prefetched buffers 6816 * l2arc_headroom number of max device writes to precache 6817 * l2arc_headroom_boost when we find compressed buffers during ARC 6818 * scanning, we multiply headroom by this 6819 * percentage factor for the next scan cycle, 6820 * since more compressed buffers are likely to 6821 * be present 6822 * l2arc_feed_secs seconds between L2ARC writing 6823 * 6824 * Tunables may be removed or added as future performance improvements are 6825 * integrated, and also may become zpool properties. 6826 * 6827 * There are three key functions that control how the L2ARC warms up: 6828 * 6829 * l2arc_write_eligible() check if a buffer is eligible to cache 6830 * l2arc_write_size() calculate how much to write 6831 * l2arc_write_interval() calculate sleep delay between writes 6832 * 6833 * These three functions determine what to write, how much, and how quickly 6834 * to send writes. 6835 */ 6836 6837static boolean_t 6838l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) 6839{ 6840 /* 6841 * A buffer is *not* eligible for the L2ARC if it: 6842 * 1. belongs to a different spa. 6843 * 2. is already cached on the L2ARC. 6844 * 3. has an I/O in progress (it may be an incomplete read). 6845 * 4. is flagged not eligible (zfs property). 
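	 * Each test below maps onto one of these conditions and bumps a
	 * dedicated arcstat counter when it rejects a buffer, so skipped
	 * writes remain visible in the statistics.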
6846	 */
6847	if (hdr->b_spa != spa_guid) {
6848		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
6849		return (B_FALSE);
6850	}
6851	if (HDR_HAS_L2HDR(hdr)) {
6852		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
6853		return (B_FALSE);
6854	}
6855	if (HDR_IO_IN_PROGRESS(hdr)) {
6856		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
6857		return (B_FALSE);
6858	}
6859	if (!HDR_L2CACHE(hdr)) {
6860		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
6861		return (B_FALSE);
6862	}
6863
6864	return (B_TRUE);
6865}
6866
6867static uint64_t
6868l2arc_write_size(void)
6869{
6870	uint64_t size;
6871
6872	/*
6873	 * Make sure our globals have meaningful values in case the user
6874	 * altered them.
6875	 */
6876	size = l2arc_write_max;
6877	if (size == 0) {
6878		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
6879		    "be greater than zero, resetting it to the default (%d)",
6880		    L2ARC_WRITE_SIZE);
6881		size = l2arc_write_max = L2ARC_WRITE_SIZE;
6882	}
6883
6884	if (arc_warm == B_FALSE)
6885		size += l2arc_write_boost;
6886
6887	return (size);
6888
6889}
6890
6891static clock_t
6892l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
6893{
6894	clock_t interval, next, now;
6895
6896	/*
6897	 * If the ARC lists are busy, increase our write rate; if the
6898	 * lists are stale, idle back. This is achieved by checking
6899	 * how much we previously wrote -- if it was more than half of
6900	 * what we wanted, schedule the next write much sooner.
	 * E.g. with the default tunables (l2arc_feed_secs=1,
	 * l2arc_feed_min_ms=200), a productive pass pulls the next
	 * write in to ~200ms after the previous one instead of a
	 * full second.
6901	 */
6902	if (l2arc_feed_again && wrote > (wanted / 2))
6903		interval = (hz * l2arc_feed_min_ms) / 1000;
6904	else
6905		interval = hz * l2arc_feed_secs;
6906
6907	now = ddi_get_lbolt();
6908	next = MAX(now, MIN(now + interval, began + interval));
6909
6910	return (next);
6911}
6912
6913/*
6914 * Cycle through L2ARC devices. This is how L2ARC load balances.
6915 * If a device is returned, this also returns holding the spa config lock.
6916 */
6917static l2arc_dev_t *
6918l2arc_dev_get_next(void)
6919{
6920	l2arc_dev_t *first, *next = NULL;
6921
6922	/*
6923	 * Lock out the removal of spas (spa_namespace_lock), then removal
6924	 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
6925	 * both locks will be dropped and a spa config lock held instead.
6926	 */
6927	mutex_enter(&spa_namespace_lock);
6928	mutex_enter(&l2arc_dev_mtx);
6929
6930	/* if there are no vdevs, there is nothing to do */
6931	if (l2arc_ndev == 0)
6932		goto out;
6933
6934	first = NULL;
6935	next = l2arc_dev_last;
6936	do {
6937		/* loop around the list looking for a non-faulted vdev */
6938		if (next == NULL) {
6939			next = list_head(l2arc_dev_list);
6940		} else {
6941			next = list_next(l2arc_dev_list, next);
6942			if (next == NULL)
6943				next = list_head(l2arc_dev_list);
6944		}
6945
6946		/* if we have come back to the start, bail out */
6947		if (first == NULL)
6948			first = next;
6949		else if (next == first)
6950			break;
6951
6952	} while (vdev_is_dead(next->l2ad_vdev));
6953
6954	/* if we were unable to find any usable vdevs, return NULL */
6955	if (vdev_is_dead(next->l2ad_vdev))
6956		next = NULL;
6957
6958	l2arc_dev_last = next;
6959
6960out:
6961	mutex_exit(&l2arc_dev_mtx);
6962
6963	/*
6964	 * Grab the config lock to prevent the 'next' device from being
6965	 * removed while we are writing to it.
6966	 */
6967	if (next != NULL)
6968		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
6969	mutex_exit(&spa_namespace_lock);
6970
6971	return (next);
6972}
6973
6974/*
6975 * Free buffers that were tagged for destruction.
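 * These are the temporary copies queued on l2arc_free_on_write by
 * l2arc_write_buffers(); they are released here once the write ZIOs
 * that used them have completed (see l2arc_write_done()).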
6976 */ 6977static void 6978l2arc_do_free_on_write() 6979{ 6980 list_t *buflist; 6981 l2arc_data_free_t *df, *df_prev; 6982 6983 mutex_enter(&l2arc_free_on_write_mtx); 6984 buflist = l2arc_free_on_write; 6985 6986 for (df = list_tail(buflist); df; df = df_prev) { 6987 df_prev = list_prev(buflist, df); 6988 ASSERT3P(df->l2df_abd, !=, NULL); 6989 abd_free(df->l2df_abd); 6990 list_remove(buflist, df); 6991 kmem_free(df, sizeof (l2arc_data_free_t)); 6992 } 6993 6994 mutex_exit(&l2arc_free_on_write_mtx); 6995} 6996 6997/* 6998 * A write to a cache device has completed. Update all headers to allow 6999 * reads from these buffers to begin. 7000 */ 7001static void 7002l2arc_write_done(zio_t *zio) 7003{ 7004 l2arc_write_callback_t *cb; 7005 l2arc_dev_t *dev; 7006 list_t *buflist; 7007 arc_buf_hdr_t *head, *hdr, *hdr_prev; 7008 kmutex_t *hash_lock; 7009 int64_t bytes_dropped = 0; 7010 7011 cb = zio->io_private; 7012 ASSERT3P(cb, !=, NULL); 7013 dev = cb->l2wcb_dev; 7014 ASSERT3P(dev, !=, NULL); 7015 head = cb->l2wcb_head; 7016 ASSERT3P(head, !=, NULL); 7017 buflist = &dev->l2ad_buflist; 7018 ASSERT3P(buflist, !=, NULL); 7019 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 7020 l2arc_write_callback_t *, cb); 7021 7022 if (zio->io_error != 0) 7023 ARCSTAT_BUMP(arcstat_l2_writes_error); 7024 7025 /* 7026 * All writes completed, or an error was hit. 7027 */ 7028top: 7029 mutex_enter(&dev->l2ad_mtx); 7030 for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { 7031 hdr_prev = list_prev(buflist, hdr); 7032 7033 hash_lock = HDR_LOCK(hdr); 7034 7035 /* 7036 * We cannot use mutex_enter or else we can deadlock 7037 * with l2arc_write_buffers (due to swapping the order 7038 * the hash lock and l2ad_mtx are taken). 7039 */ 7040 if (!mutex_tryenter(hash_lock)) { 7041 /* 7042 * Missed the hash lock. We must retry so we 7043 * don't leave the ARC_FLAG_L2_WRITING bit set. 7044 */ 7045 ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); 7046 7047 /* 7048 * We don't want to rescan the headers we've 7049 * already marked as having been written out, so 7050 * we reinsert the head node so we can pick up 7051 * where we left off. 7052 */ 7053 list_remove(buflist, head); 7054 list_insert_after(buflist, hdr, head); 7055 7056 mutex_exit(&dev->l2ad_mtx); 7057 7058 /* 7059 * We wait for the hash lock to become available 7060 * to try and prevent busy waiting, and increase 7061 * the chance we'll be able to acquire the lock 7062 * the next time around. 7063 */ 7064 mutex_enter(hash_lock); 7065 mutex_exit(hash_lock); 7066 goto top; 7067 } 7068 7069 /* 7070 * We could not have been moved into the arc_l2c_only 7071 * state while in-flight due to our ARC_FLAG_L2_WRITING 7072 * bit being set. Let's just ensure that's being enforced. 7073 */ 7074 ASSERT(HDR_HAS_L1HDR(hdr)); 7075 7076 if (zio->io_error != 0) { 7077 /* 7078 * Error - drop L2ARC entry. 7079 */ 7080 list_remove(buflist, hdr); 7081 l2arc_trim(hdr); 7082 arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); 7083 7084 ARCSTAT_INCR(arcstat_l2_asize, -arc_hdr_size(hdr)); 7085 ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr)); 7086 7087 bytes_dropped += arc_hdr_size(hdr); 7088 (void) refcount_remove_many(&dev->l2ad_alloc, 7089 arc_hdr_size(hdr), hdr); 7090 } 7091 7092 /* 7093 * Allow ARC to begin reads and ghost list evictions to 7094 * this L2ARC entry. 
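 * This is why a missed hash lock above forces a retry instead of a skip:
 * every header marked in l2arc_write_buffers() must have its
 * ARC_FLAG_L2_WRITING bit cleared here.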
7095 */ 7096 arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); 7097 7098 mutex_exit(hash_lock); 7099 } 7100 7101 atomic_inc_64(&l2arc_writes_done); 7102 list_remove(buflist, head); 7103 ASSERT(!HDR_HAS_L1HDR(head)); 7104 kmem_cache_free(hdr_l2only_cache, head); 7105 mutex_exit(&dev->l2ad_mtx); 7106 7107 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); 7108 7109 l2arc_do_free_on_write(); 7110 7111 kmem_free(cb, sizeof (l2arc_write_callback_t)); 7112} 7113 7114/* 7115 * A read to a cache device completed. Validate buffer contents before 7116 * handing over to the regular ARC routines. 7117 */ 7118static void 7119l2arc_read_done(zio_t *zio) 7120{ 7121 l2arc_read_callback_t *cb; 7122 arc_buf_hdr_t *hdr; 7123 kmutex_t *hash_lock; 7124 boolean_t valid_cksum; 7125 7126 ASSERT3P(zio->io_vd, !=, NULL); 7127 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 7128 7129 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 7130 7131 cb = zio->io_private; 7132 ASSERT3P(cb, !=, NULL); 7133 hdr = cb->l2rcb_hdr; 7134 ASSERT3P(hdr, !=, NULL); 7135 7136 hash_lock = HDR_LOCK(hdr); 7137 mutex_enter(hash_lock); 7138 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 7139 7140 /* 7141 * If the data was read into a temporary buffer, 7142 * move it and free the buffer. 7143 */ 7144 if (cb->l2rcb_abd != NULL) { 7145 ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); 7146 if (zio->io_error == 0) { 7147 abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd, 7148 arc_hdr_size(hdr)); 7149 } 7150 7151 /* 7152 * The following must be done regardless of whether 7153 * there was an error: 7154 * - free the temporary buffer 7155 * - point zio to the real ARC buffer 7156 * - set zio size accordingly 7157 * These are required because zio is either re-used for 7158 * an I/O of the block in the case of the error 7159 * or the zio is passed to arc_read_done() and it 7160 * needs real data. 7161 */ 7162 abd_free(cb->l2rcb_abd); 7163 zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); 7164 zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; 7165 } 7166 7167 ASSERT3P(zio->io_abd, !=, NULL); 7168 7169 /* 7170 * Check this survived the L2ARC journey. 7171 */ 7172 ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd); 7173 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 7174 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 7175 7176 valid_cksum = arc_cksum_is_equal(hdr, zio); 7177 if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 7178 mutex_exit(hash_lock); 7179 zio->io_private = hdr; 7180 arc_read_done(zio); 7181 } else { 7182 mutex_exit(hash_lock); 7183 /* 7184 * Buffer didn't survive caching. Increment stats and 7185 * reissue to the original storage device. 7186 */ 7187 if (zio->io_error != 0) { 7188 ARCSTAT_BUMP(arcstat_l2_io_error); 7189 } else { 7190 zio->io_error = SET_ERROR(EIO); 7191 } 7192 if (!valid_cksum) 7193 ARCSTAT_BUMP(arcstat_l2_cksum_bad); 7194 7195 /* 7196 * If there's no waiter, issue an async i/o to the primary 7197 * storage now. If there *is* a waiter, the caller must 7198 * issue the i/o in a context where it's OK to block. 
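 * Either way the block is re-read from the original storage, so a failed
 * or corrupt L2ARC read costs extra latency but never hands bad data to
 * the consumer.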
7199 */ 7200 if (zio->io_waiter == NULL) { 7201 zio_t *pio = zio_unique_parent(zio); 7202 7203 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 7204 7205 zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, 7206 hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done, 7207 hdr, zio->io_priority, cb->l2rcb_flags, 7208 &cb->l2rcb_zb)); 7209 } 7210 } 7211 7212 kmem_free(cb, sizeof (l2arc_read_callback_t)); 7213} 7214 7215/* 7216 * This is the list priority from which the L2ARC will search for pages to 7217 * cache. This is used within loops (0..3) to cycle through lists in the 7218 * desired order. This order can have a significant effect on cache 7219 * performance. 7220 * 7221 * Currently the metadata lists are hit first, MFU then MRU, followed by 7222 * the data lists. This function returns a locked list, and also returns 7223 * the lock pointer. 7224 */ 7225static multilist_sublist_t * 7226l2arc_sublist_lock(int list_num) 7227{ 7228 multilist_t *ml = NULL; 7229 unsigned int idx; 7230 7231 ASSERT(list_num >= 0 && list_num <= 3); 7232 7233 switch (list_num) { 7234 case 0: 7235 ml = arc_mfu->arcs_list[ARC_BUFC_METADATA]; 7236 break; 7237 case 1: 7238 ml = arc_mru->arcs_list[ARC_BUFC_METADATA]; 7239 break; 7240 case 2: 7241 ml = arc_mfu->arcs_list[ARC_BUFC_DATA]; 7242 break; 7243 case 3: 7244 ml = arc_mru->arcs_list[ARC_BUFC_DATA]; 7245 break; 7246 } 7247 7248 /* 7249 * Return a randomly-selected sublist. This is acceptable 7250 * because the caller feeds only a little bit of data for each 7251 * call (8MB). Subsequent calls will result in different 7252 * sublists being selected. 7253 */ 7254 idx = multilist_get_random_index(ml); 7255 return (multilist_sublist_lock(ml, idx)); 7256} 7257 7258/* 7259 * Evict buffers from the device write hand to the distance specified in 7260 * bytes. This distance may span populated buffers, it may span nothing. 7261 * This is clearing a region on the L2ARC device ready for writing. 7262 * If the 'all' boolean is set, every buffer is evicted. 7263 */ 7264static void 7265l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 7266{ 7267 list_t *buflist; 7268 arc_buf_hdr_t *hdr, *hdr_prev; 7269 kmutex_t *hash_lock; 7270 uint64_t taddr; 7271 7272 buflist = &dev->l2ad_buflist; 7273 7274 if (!all && dev->l2ad_first) { 7275 /* 7276 * This is the first sweep through the device. There is 7277 * nothing to evict. 7278 */ 7279 return; 7280 } 7281 7282 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 7283 /* 7284 * When nearing the end of the device, evict to the end 7285 * before the device write hand jumps to the start. 7286 */ 7287 taddr = dev->l2ad_end; 7288 } else { 7289 taddr = dev->l2ad_hand + distance; 7290 } 7291 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 7292 uint64_t, taddr, boolean_t, all); 7293 7294top: 7295 mutex_enter(&dev->l2ad_mtx); 7296 for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { 7297 hdr_prev = list_prev(buflist, hdr); 7298 7299 hash_lock = HDR_LOCK(hdr); 7300 7301 /* 7302 * We cannot use mutex_enter or else we can deadlock 7303 * with l2arc_write_buffers (due to swapping the order 7304 * the hash lock and l2ad_mtx are taken). 7305 */ 7306 if (!mutex_tryenter(hash_lock)) { 7307 /* 7308 * Missed the hash lock. Retry. 7309 */ 7310 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 7311 mutex_exit(&dev->l2ad_mtx); 7312 mutex_enter(hash_lock); 7313 mutex_exit(hash_lock); 7314 goto top; 7315 } 7316 7317 if (HDR_L2_WRITE_HEAD(hdr)) { 7318 /* 7319 * We hit a write head node. Leave it for 7320 * l2arc_write_done(). 
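 * All that is done here is unlinking it from this device's buflist;
 * the node itself is allocated and freed by the write path.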
7321		 */
7322			list_remove(buflist, hdr);
7323			mutex_exit(hash_lock);
7324			continue;
7325		}
7326
7327		if (!all && HDR_HAS_L2HDR(hdr) &&
7328		    (hdr->b_l2hdr.b_daddr >= taddr ||
7329		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
7330			/*
7331			 * We've evicted to the target address,
7332			 * or the end of the device.
7333			 */
7334			mutex_exit(hash_lock);
7335			break;
7336		}
7337
7338		ASSERT(HDR_HAS_L2HDR(hdr));
7339		if (!HDR_HAS_L1HDR(hdr)) {
7340			ASSERT(!HDR_L2_READING(hdr));
7341			/*
7342			 * This doesn't exist in the ARC. Destroy.
7343			 * arc_hdr_destroy() will call list_remove()
7344			 * and decrement arcstat_l2_size.
7345			 */
7346			arc_change_state(arc_anon, hdr, hash_lock);
7347			arc_hdr_destroy(hdr);
7348		} else {
7349			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
7350			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
7351			/*
7352			 * Invalidate issued or about to be issued
7353			 * reads, since we may be about to write
7354			 * over this location.
7355			 */
7356			if (HDR_L2_READING(hdr)) {
7357				ARCSTAT_BUMP(arcstat_l2_evict_reading);
7358				arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
7359			}
7360
7361			/* Ensure this header has finished being written */
7362			ASSERT(!HDR_L2_WRITING(hdr));
7363
7364			arc_hdr_l2hdr_destroy(hdr);
7365		}
7366		mutex_exit(hash_lock);
7367	}
7368	mutex_exit(&dev->l2ad_mtx);
7369}
7370
7371/*
7372 * Find and write ARC buffers to the L2ARC device.
7373 *
7374 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
7375 * for reading until they have completed writing.
7376 * The headroom boost (l2arc_headroom_boost) is applied to the scan headroom
7377 * inside this function when compressed ARC is enabled.
7378 *
7379 * Returns the number of bytes actually written (which may be smaller than
7380 * the delta by which the device hand has changed due to alignment).
7381 */
7382static uint64_t
7383l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
7384{
7385	arc_buf_hdr_t *hdr, *hdr_prev, *head;
7386	uint64_t write_asize, write_psize, write_sz, headroom;
7387	boolean_t full;
7388	l2arc_write_callback_t *cb;
7389	zio_t *pio, *wzio;
7390	uint64_t guid = spa_load_guid(spa);
7391	int try;
7392
7393	ASSERT3P(dev->l2ad_vdev, !=, NULL);
7394
7395	pio = NULL;
7396	write_sz = write_asize = write_psize = 0;
7397	full = B_FALSE;
7398	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
7399	arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
7400
7401	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
7402	/*
7403	 * Copy buffers for L2ARC writing.
7404	 */
7405	for (try = 0; try <= 3; try++) {
7406		multilist_sublist_t *mls = l2arc_sublist_lock(try);
7407		uint64_t passed_sz = 0;
7408
7409		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
7410
7411		/*
7412		 * L2ARC fast warmup.
7413		 *
7414		 * Until the ARC is warm and starts to evict, read from the
7415		 * head of the ARC lists rather than the tail.
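		 * (See item 5 in the L2ARC design comment above.)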
7416 */ 7417 if (arc_warm == B_FALSE) 7418 hdr = multilist_sublist_head(mls); 7419 else 7420 hdr = multilist_sublist_tail(mls); 7421 if (hdr == NULL) 7422 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); 7423 7424 headroom = target_sz * l2arc_headroom; 7425 if (zfs_compressed_arc_enabled) 7426 headroom = (headroom * l2arc_headroom_boost) / 100; 7427 7428 for (; hdr; hdr = hdr_prev) { 7429 kmutex_t *hash_lock; 7430 7431 if (arc_warm == B_FALSE) 7432 hdr_prev = multilist_sublist_next(mls, hdr); 7433 else 7434 hdr_prev = multilist_sublist_prev(mls, hdr); 7435 ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, 7436 HDR_GET_LSIZE(hdr)); 7437 7438 hash_lock = HDR_LOCK(hdr); 7439 if (!mutex_tryenter(hash_lock)) { 7440 ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); 7441 /* 7442 * Skip this buffer rather than waiting. 7443 */ 7444 continue; 7445 } 7446 7447 passed_sz += HDR_GET_LSIZE(hdr); 7448 if (passed_sz > headroom) { 7449 /* 7450 * Searched too far. 7451 */ 7452 mutex_exit(hash_lock); 7453 ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); 7454 break; 7455 } 7456 7457 if (!l2arc_write_eligible(guid, hdr)) { 7458 mutex_exit(hash_lock); 7459 continue; 7460 } 7461 7462 /* 7463 * We rely on the L1 portion of the header below, so 7464 * it's invalid for this header to have been evicted out 7465 * of the ghost cache, prior to being written out. The 7466 * ARC_FLAG_L2_WRITING bit ensures this won't happen. 7467 */ 7468 ASSERT(HDR_HAS_L1HDR(hdr)); 7469 7470 ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); 7471 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 7472 ASSERT3U(arc_hdr_size(hdr), >, 0); 7473 uint64_t size = arc_hdr_size(hdr); 7474 uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, 7475 size); 7476 7477 if ((write_psize + asize) > target_sz) { 7478 full = B_TRUE; 7479 mutex_exit(hash_lock); 7480 ARCSTAT_BUMP(arcstat_l2_write_full); 7481 break; 7482 } 7483 7484 if (pio == NULL) { 7485 /* 7486 * Insert a dummy header on the buflist so 7487 * l2arc_write_done() can find where the 7488 * write buffers begin without searching. 7489 */ 7490 mutex_enter(&dev->l2ad_mtx); 7491 list_insert_head(&dev->l2ad_buflist, head); 7492 mutex_exit(&dev->l2ad_mtx); 7493 7494 cb = kmem_alloc( 7495 sizeof (l2arc_write_callback_t), KM_SLEEP); 7496 cb->l2wcb_dev = dev; 7497 cb->l2wcb_head = head; 7498 pio = zio_root(spa, l2arc_write_done, cb, 7499 ZIO_FLAG_CANFAIL); 7500 ARCSTAT_BUMP(arcstat_l2_write_pios); 7501 } 7502 7503 hdr->b_l2hdr.b_dev = dev; 7504 hdr->b_l2hdr.b_daddr = dev->l2ad_hand; 7505 arc_hdr_set_flags(hdr, 7506 ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR); 7507 7508 mutex_enter(&dev->l2ad_mtx); 7509 list_insert_head(&dev->l2ad_buflist, hdr); 7510 mutex_exit(&dev->l2ad_mtx); 7511 7512 (void) refcount_add_many(&dev->l2ad_alloc, size, hdr); 7513 7514 /* 7515 * Normally the L2ARC can use the hdr's data, but if 7516 * we're sharing data between the hdr and one of its 7517 * bufs, L2ARC needs its own copy of the data so that 7518 * the ZIO below can't race with the buf consumer. 7519 * Another case where we need to create a copy of the 7520 * data is when the buffer size is not device-aligned 7521 * and we need to pad the block to make it such. 7522 * That also keeps the clock hand suitably aligned. 7523 * 7524 * To ensure that the copy will be available for the 7525 * lifetime of the ZIO and be cleaned up afterwards, we 7526 * add it to the l2arc_free_on_write queue. 
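 * (The queue is drained by l2arc_do_free_on_write(), called from
 * l2arc_write_done() after the writes have completed.)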
7527 */ 7528 abd_t *to_write; 7529 if (!HDR_SHARED_DATA(hdr) && size == asize) { 7530 to_write = hdr->b_l1hdr.b_pabd; 7531 } else { 7532 to_write = abd_alloc_for_io(asize, 7533 HDR_ISTYPE_METADATA(hdr)); 7534 abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); 7535 if (asize != size) { 7536 abd_zero_off(to_write, size, 7537 asize - size); 7538 } 7539 l2arc_free_abd_on_write(to_write, asize, 7540 arc_buf_type(hdr)); 7541 } 7542 wzio = zio_write_phys(pio, dev->l2ad_vdev, 7543 hdr->b_l2hdr.b_daddr, asize, to_write, 7544 ZIO_CHECKSUM_OFF, NULL, hdr, 7545 ZIO_PRIORITY_ASYNC_WRITE, 7546 ZIO_FLAG_CANFAIL, B_FALSE); 7547 7548 write_sz += HDR_GET_LSIZE(hdr); 7549 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 7550 zio_t *, wzio); 7551 7552 write_asize += size; 7553 write_psize += asize; 7554 dev->l2ad_hand += asize; 7555 7556 mutex_exit(hash_lock); 7557 7558 (void) zio_nowait(wzio); 7559 } 7560 7561 multilist_sublist_unlock(mls); 7562 7563 if (full == B_TRUE) 7564 break; 7565 } 7566 7567 /* No buffers selected for writing? */ 7568 if (pio == NULL) { 7569 ASSERT0(write_sz); 7570 ASSERT(!HDR_HAS_L1HDR(head)); 7571 kmem_cache_free(hdr_l2only_cache, head); 7572 return (0); 7573 } 7574 7575 ASSERT3U(write_psize, <=, target_sz); 7576 ARCSTAT_BUMP(arcstat_l2_writes_sent); 7577 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); 7578 ARCSTAT_INCR(arcstat_l2_size, write_sz); 7579 ARCSTAT_INCR(arcstat_l2_asize, write_asize); 7580 vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0); 7581 7582 /* 7583 * Bump device hand to the device start if it is approaching the end. 7584 * l2arc_evict() will already have evicted ahead for this case. 7585 */ 7586 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 7587 dev->l2ad_hand = dev->l2ad_start; 7588 dev->l2ad_first = B_FALSE; 7589 } 7590 7591 dev->l2ad_writing = B_TRUE; 7592 (void) zio_wait(pio); 7593 dev->l2ad_writing = B_FALSE; 7594 7595 return (write_asize); 7596} 7597 7598/* 7599 * This thread feeds the L2ARC at regular intervals. This is the beating 7600 * heart of the L2ARC. 7601 */ 7602static void 7603l2arc_feed_thread(void *dummy __unused) 7604{ 7605 callb_cpr_t cpr; 7606 l2arc_dev_t *dev; 7607 spa_t *spa; 7608 uint64_t size, wrote; 7609 clock_t begin, next = ddi_get_lbolt(); 7610 7611 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 7612 7613 mutex_enter(&l2arc_feed_thr_lock); 7614 7615 while (l2arc_thread_exit == 0) { 7616 CALLB_CPR_SAFE_BEGIN(&cpr); 7617 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 7618 next - ddi_get_lbolt()); 7619 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 7620 next = ddi_get_lbolt() + hz; 7621 7622 /* 7623 * Quick check for L2ARC devices. 7624 */ 7625 mutex_enter(&l2arc_dev_mtx); 7626 if (l2arc_ndev == 0) { 7627 mutex_exit(&l2arc_dev_mtx); 7628 continue; 7629 } 7630 mutex_exit(&l2arc_dev_mtx); 7631 begin = ddi_get_lbolt(); 7632 7633 /* 7634 * This selects the next l2arc device to write to, and in 7635 * doing so the next spa to feed from: dev->l2ad_spa. This 7636 * will return NULL if there are now no l2arc devices or if 7637 * they are all faulted. 7638 * 7639 * If a device is returned, its spa's config lock is also 7640 * held to prevent device removal. l2arc_dev_get_next() 7641 * will grab and release l2arc_dev_mtx. 7642 */ 7643 if ((dev = l2arc_dev_get_next()) == NULL) 7644 continue; 7645 7646 spa = dev->l2ad_spa; 7647 ASSERT3P(spa, !=, NULL); 7648 7649 /* 7650 * If the pool is read-only then force the feed thread to 7651 * sleep a little longer. 
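 * (Five times the normal l2arc_feed_secs interval.)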
7652 */ 7653 if (!spa_writeable(spa)) { 7654 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; 7655 spa_config_exit(spa, SCL_L2ARC, dev); 7656 continue; 7657 } 7658 7659 /* 7660 * Avoid contributing to memory pressure. 7661 */ 7662 if (arc_reclaim_needed()) { 7663 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 7664 spa_config_exit(spa, SCL_L2ARC, dev); 7665 continue; 7666 } 7667 7668 ARCSTAT_BUMP(arcstat_l2_feeds); 7669 7670 size = l2arc_write_size(); 7671 7672 /* 7673 * Evict L2ARC buffers that will be overwritten. 7674 */ 7675 l2arc_evict(dev, size, B_FALSE); 7676 7677 /* 7678 * Write ARC buffers. 7679 */ 7680 wrote = l2arc_write_buffers(spa, dev, size); 7681 7682 /* 7683 * Calculate interval between writes. 7684 */ 7685 next = l2arc_write_interval(begin, size, wrote); 7686 spa_config_exit(spa, SCL_L2ARC, dev); 7687 } 7688 7689 l2arc_thread_exit = 0; 7690 cv_broadcast(&l2arc_feed_thr_cv); 7691 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ 7692 thread_exit(); 7693} 7694 7695boolean_t 7696l2arc_vdev_present(vdev_t *vd) 7697{ 7698 l2arc_dev_t *dev; 7699 7700 mutex_enter(&l2arc_dev_mtx); 7701 for (dev = list_head(l2arc_dev_list); dev != NULL; 7702 dev = list_next(l2arc_dev_list, dev)) { 7703 if (dev->l2ad_vdev == vd) 7704 break; 7705 } 7706 mutex_exit(&l2arc_dev_mtx); 7707 7708 return (dev != NULL); 7709} 7710 7711/* 7712 * Add a vdev for use by the L2ARC. By this point the spa has already 7713 * validated the vdev and opened it. 7714 */ 7715void 7716l2arc_add_vdev(spa_t *spa, vdev_t *vd) 7717{ 7718 l2arc_dev_t *adddev; 7719 7720 ASSERT(!l2arc_vdev_present(vd)); 7721 7722 vdev_ashift_optimize(vd); 7723 7724 /* 7725 * Create a new l2arc device entry. 7726 */ 7727 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 7728 adddev->l2ad_spa = spa; 7729 adddev->l2ad_vdev = vd; 7730 adddev->l2ad_start = VDEV_LABEL_START_SIZE; 7731 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); 7732 adddev->l2ad_hand = adddev->l2ad_start; 7733 adddev->l2ad_first = B_TRUE; 7734 adddev->l2ad_writing = B_FALSE; 7735 7736 mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); 7737 /* 7738 * This is a list of all ARC buffers that are still valid on the 7739 * device. 7740 */ 7741 list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 7742 offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); 7743 7744 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); 7745 refcount_create(&adddev->l2ad_alloc); 7746 7747 /* 7748 * Add device to global list 7749 */ 7750 mutex_enter(&l2arc_dev_mtx); 7751 list_insert_head(l2arc_dev_list, adddev); 7752 atomic_inc_64(&l2arc_ndev); 7753 mutex_exit(&l2arc_dev_mtx); 7754} 7755 7756/* 7757 * Remove a vdev from the L2ARC. 7758 */ 7759void 7760l2arc_remove_vdev(vdev_t *vd) 7761{ 7762 l2arc_dev_t *dev, *nextdev, *remdev = NULL; 7763 7764 /* 7765 * Find the device by vdev 7766 */ 7767 mutex_enter(&l2arc_dev_mtx); 7768 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 7769 nextdev = list_next(l2arc_dev_list, dev); 7770 if (vd == dev->l2ad_vdev) { 7771 remdev = dev; 7772 break; 7773 } 7774 } 7775 ASSERT3P(remdev, !=, NULL); 7776 7777 /* 7778 * Remove device from global list 7779 */ 7780 list_remove(l2arc_dev_list, remdev); 7781 l2arc_dev_last = NULL; /* may have been invalidated */ 7782 atomic_dec_64(&l2arc_ndev); 7783 mutex_exit(&l2arc_dev_mtx); 7784 7785 /* 7786 * Clear all buflists and ARC references. L2ARC device flush. 
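 * Passing 'all' (B_TRUE) to l2arc_evict() below drops every buffer on
 * the device regardless of the write hand position.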
7787 */ 7788 l2arc_evict(remdev, 0, B_TRUE); 7789 list_destroy(&remdev->l2ad_buflist); 7790 mutex_destroy(&remdev->l2ad_mtx); 7791 refcount_destroy(&remdev->l2ad_alloc); 7792 kmem_free(remdev, sizeof (l2arc_dev_t)); 7793} 7794 7795void 7796l2arc_init(void) 7797{ 7798 l2arc_thread_exit = 0; 7799 l2arc_ndev = 0; 7800 l2arc_writes_sent = 0; 7801 l2arc_writes_done = 0; 7802 7803 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 7804 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 7805 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 7806 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 7807 7808 l2arc_dev_list = &L2ARC_dev_list; 7809 l2arc_free_on_write = &L2ARC_free_on_write; 7810 list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 7811 offsetof(l2arc_dev_t, l2ad_node)); 7812 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 7813 offsetof(l2arc_data_free_t, l2df_list_node)); 7814} 7815 7816void 7817l2arc_fini(void) 7818{ 7819 /* 7820 * This is called from dmu_fini(), which is called from spa_fini(); 7821 * Because of this, we can assume that all l2arc devices have 7822 * already been removed when the pools themselves were removed. 7823 */ 7824 7825 l2arc_do_free_on_write(); 7826 7827 mutex_destroy(&l2arc_feed_thr_lock); 7828 cv_destroy(&l2arc_feed_thr_cv); 7829 mutex_destroy(&l2arc_dev_mtx); 7830 mutex_destroy(&l2arc_free_on_write_mtx); 7831 7832 list_destroy(l2arc_dev_list); 7833 list_destroy(l2arc_free_on_write); 7834} 7835 7836void 7837l2arc_start(void) 7838{ 7839 if (!(spa_mode_global & FWRITE)) 7840 return; 7841 7842 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 7843 TS_RUN, minclsyspri); 7844} 7845 7846void 7847l2arc_stop(void) 7848{ 7849 if (!(spa_mode_global & FWRITE)) 7850 return; 7851 7852 mutex_enter(&l2arc_feed_thr_lock); 7853 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ 7854 l2arc_thread_exit = 1; 7855 while (l2arc_thread_exit != 0) 7856 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 7857 mutex_exit(&l2arc_feed_thr_lock); 7858} 7859
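
/*
 * Illustrative sketch only -- not part of the original source. It strings
 * together the public entry points of this module in their expected calling
 * order, per the comments above; the wrapper function and its arguments are
 * hypothetical and exist purely to show the sequence.
 */
#if 0
static void
l2arc_lifecycle_sketch(spa_t *spa, vdev_t *cache_vd)
{
	l2arc_init();			/* module setup, mirrored by l2arc_fini() */
	l2arc_start();			/* spawns l2arc_feed_thread() */

	/* A cache vdev, already validated and opened by the spa: */
	l2arc_add_vdev(spa, cache_vd);
	ASSERT(l2arc_vdev_present(cache_vd));

	/*
	 * The feed thread now cycles on its own: l2arc_dev_get_next()
	 * picks a device, l2arc_evict() clears ahead of the write hand,
	 * and l2arc_write_buffers() fills the cleared region.
	 */

	l2arc_remove_vdev(cache_vd);	/* on cache vdev removal */

	l2arc_stop();			/* joins the feed thread */
	l2arc_fini();			/* from dmu_fini(), per the comment above */
}
#endif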