arc.c revision 339141
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2018, Joyent, Inc.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 *    Pages in its cache cannot be "locked" into memory.  This makes
 *    the eviction algorithm simple: evict the last page in the list.
 *    This also makes the performance characteristics easy to reason
 *    about.  Our cache is not so simple.  At any given moment, some
 *    subset of the blocks in the cache are un-evictable because we
 *    have handed out a reference to them.  Blocks are only evictable
 *    when there are no external references active.  This makes
 *    eviction far more problematic: we choose to evict the evictable
 *    blocks that are the "lowest" in the list.
 *
 *    There are times when it is not possible to evict the requested
 *    space.  In these circumstances we are unable to adjust the cache
 *    size.  To prevent the cache growing unbounded at these times we
 *    implement a "cache throttle" that slows the flow of new data
 *    into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 *    Pages are evicted when the cache is full and there is a cache
 *    miss.  Our model has a variable sized cache.  It grows with
 *    high use, but also tries to react to memory pressure from the
 *    operating system: decreasing its size when system memory is
 *    tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size.  All
 *    elements of the cache are therefore exactly the same size.  So
 *    when adjusting the cache size following a cache miss, it's simply
 *    a matter of choosing a single page to evict.  In our model, we
 *    have variable sized cache blocks (ranging from 512 bytes to
 *    128K bytes).  We therefore choose a set of blocks to evict to make
 *    space for a cache miss that approximates as closely as possible
 *    the space used by the new block.
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal ARC algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * ARC list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each ARC state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an ARC list lock you
 * must use: mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2ad_mtx on each vdev for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */
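
/*
 * Illustrative sketch (not part of the original comment): because of the
 * lock-ordering rule above, a thread that already holds an ARC list lock
 * never blocks on a hash lock; it uses mutex_tryenter() and skips the
 * header on failure, e.g.:
 *
 *	if (!mutex_tryenter(hash_lock)) {
 *		ARCSTAT_BUMP(arcstat_mutex_miss);
 *		continue;
 *	}
 */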

/*
 * ARC operation:
 *
 * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
 * This structure can point either to a block that is still in the cache or to
 * one that is only accessible in an L2 ARC device, or it can provide
 * information about a block that was recently evicted.  If a block is
 * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
 * information to retrieve it from the L2ARC device.  This information is
 * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t.  A block
 * that is in this state cannot access the data directly.
 *
 * Blocks that are actively being referenced or have not been evicted
 * are cached in the L1ARC.  The L1ARC (l1arc_buf_hdr_t) is a structure within
 * the arc_buf_hdr_t that will point to the data block in memory.  A block can
 * only be read by a consumer if it has an l1arc_buf_hdr_t.  The L1ARC
 * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
 * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
 *
 * The L1ARC's data pointer may or may not be uncompressed.  The ARC has the
 * ability to store the physical data (b_pabd) associated with the DVA of the
 * arc_buf_hdr_t.  Since the b_pabd is a copy of the on-disk physical block,
 * it will match its on-disk compression characteristics.  This behavior can be
 * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE.  When the
 * compressed ARC functionality is disabled, the b_pabd will point to an
 * uncompressed version of the on-disk data.
 *
 * Data in the L1ARC is not accessed by consumers of the ARC directly.  Each
 * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
 * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
 * consumer.  The ARC will provide references to this data and will keep it
 * cached until it is no longer in use.  The ARC caches only the L1ARC's
 * physical data block and will evict any arc_buf_t that is no longer
 * referenced.  The amount of memory consumed by the arc_buf_ts' data buffers
 * can be seen via the "overhead_size" kstat.
 *
 * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
 * compressed form.  The typical case is that consumers will want uncompressed
 * data, and when that happens a new data buffer is allocated where the data is
 * decompressed for them to use.  Currently the only consumer who wants
 * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
 * exists on disk.  When this happens, the arc_buf_t's data buffer is shared
 * with the arc_buf_hdr_t.
 *
 * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's.
 * The first one is owned by a compressed send consumer (and therefore
 * references the same compressed data buffer as the arc_buf_hdr_t) and the
 * second could be used by any other consumer (and has its own uncompressed
 * copy of the data buffer).
 *
 *   arc_buf_hdr_t
 *   +-----------+
 *   | fields    |
 *   | common to |
 *   | L1- and   |
 *   | L2ARC     |
 *   +-----------+
 *   | l2arc_buf_hdr_t
 *   |           |
 *   +-----------+
 *   | l1arc_buf_hdr_t
 *   |           |              arc_buf_t
 *   | b_buf     +------------->+-----------+      arc_buf_t
 *   | b_pabd    +-+            |b_next     +---->+-----------+
 *   +-----------+ |            |-----------|     |b_next     +-->NULL
 *                 |            |b_comp = T |     +-----------+
 *                 |            |b_data     +-+   |b_comp = F |
 *                 |            +-----------+ |   |b_data     +-+
 *                 +->+------+                |   +-----------+ |
 *        compressed  |      |                |                 |
 *           data     |      |<---------------+                 | uncompressed
 *                    +------+  compressed,                     |     data
 *                                shared                        +-->+------+
 *                                 data                             |      |
 *                                                                  |      |
 *                                                                  +------+
 *
 * When a consumer reads a block, the ARC must first look to see if the
 * arc_buf_hdr_t is cached.  If the hdr is cached then the ARC allocates a new
 * arc_buf_t and either copies uncompressed data into a new data buffer from an
 * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
 * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
 * hdr is compressed and the desired compression characteristics of the
 * arc_buf_t consumer.  If the arc_buf_t ends up sharing data with the
 * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
 * the last buffer in the hdr's b_buf list, however a shared compressed buf can
 * be anywhere in the hdr's list.
 *
 * The diagram below shows an example of an uncompressed ARC hdr that is
 * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
 * the last element in the buf list):
 *
 *                arc_buf_hdr_t
 *                +-----------+
 *                |           |
 *                |           |
 *                |           |
 *                +-----------+
 * l2arc_buf_hdr_t|           |
 *                |           |
 *                +-----------+
 * l1arc_buf_hdr_t|           |
 *                |           |                 arc_buf_t    (shared)
 *                |    b_buf  +------------>+---------+      arc_buf_t
 *                |           |             |b_next   +---->+---------+
 *                |  b_pabd   +-+           |---------|     |b_next   +-->NULL
 *                +-----------+ |           |         |     +---------+
 *                              |           |b_data   +-+   |         |
 *                              |           +---------+ |   |b_data   +-+
 *                              +->+------+             |   +---------+ |
 *                                 |      |             |               |
 *                   uncompressed  |      |             |               |
 *                        data     +------+             |               |
 *                                    ^                 +->+------+     |
 *                                    |    uncompressed    |      |     |
 *                                    |        data        |      |     |
 *                                    |                     +------+    |
 *                                    +---------------------------------+
 *
 * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
 * since the physical block is about to be rewritten. The new data contents
 * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
 * it may compress the data before writing it to disk. The ARC will be called
 * with the transformed data and will bcopy the transformed on-disk block into
 * a newly allocated b_pabd. Writes are always done into buffers which have
 * either been loaned (and hence are new and don't have other readers) or
 * buffers which have been released (and hence have their own hdr, if there
 * were originally other readers of the buf's original hdr). This ensures that
 * the ARC only needs to update a single buf and its hdr after a write occurs.
 *
 * When the L2ARC is in use, it will also take advantage of the b_pabd. The
 * L2ARC will always write the contents of b_pabd to the L2ARC. This means
 * that when compressed ARC is enabled that the L2ARC blocks are identical
 * to the on-disk block in the main data pool. This provides a significant
 * advantage since the ARC can leverage the bp's checksum when reading from the
 * L2ARC to determine if the contents are valid. However, if the compressed
 * ARC is disabled, then the L2ARC's block must be transformed to look
 * like the physical block in the main data pool before comparing the
 * checksum and determining its validity.
 */

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#include <sys/zio_checksum.h>
#include <sys/multilist.h>
#include <sys/abd.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#include <sys/racct.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <sys/trim_map.h>
#include <zfs_fletcher.h>
#include <sys/sdt.h>
#include <sys/aggsum.h>
#include <sys/cityhash.h>

#include <machine/vmparam.h>

#ifdef illumos
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif
#endif /* illumos */

static kmutex_t		arc_reclaim_lock;
static kcondvar_t	arc_reclaim_thread_cv;
static boolean_t	arc_reclaim_thread_exit;
static kcondvar_t	arc_reclaim_waiters_cv;

static kmutex_t		arc_dnlc_evicts_lock;
static kcondvar_t	arc_dnlc_evicts_cv;
static boolean_t	arc_dnlc_evicts_thread_exit;

uint_t arc_reduce_dnlc_percent = 3;

/*
 * The number of headers to evict in arc_evict_state_impl() before
 * dropping the sublist lock and evicting from another sublist. A lower
 * value means we're more likely to evict the "correct" header (i.e. the
 * oldest header in the arc state), but comes with higher overhead
 * (i.e. more invocations of arc_evict_state_impl()).
 */
int zfs_arc_evict_batch_limit = 10;

/* number of seconds before growing cache again */
static int	arc_grow_retry = 60;

/* number of milliseconds before attempting a kmem-cache-reap */
static int	arc_kmem_cache_reap_retry_ms = 0;

/* shift of arc_c for calculating overflow limit in arc_get_data_impl */
int		zfs_arc_overflow_shift = 8;

/* shift of arc_c for calculating both min and max arc_p */
static int	arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim) */
static int	arc_shrink_shift = 7;

/*
 * log2(fraction of ARC which must be free to allow growing).
 * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
 * when reading a new block into the ARC, we will evict an equal-sized block
 * from the ARC.
 *
 * This must be less than arc_shrink_shift, so that when we shrink the ARC,
 * we will still not allow it to grow.
 */
int		arc_no_grow_shift = 5;


/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int	zfs_arc_min_prefetch_ms = 1;
static int	zfs_arc_min_prescient_prefetch_ms = 6;

/*
 * If this percent of memory is free, don't throttle.
 */
int arc_lotsfree_percent = 10;

static int arc_dead;
extern boolean_t zfs_prefetch_disable;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;

/*
 * log2 fraction of the zio arena to keep free.
 */
int arc_zio_arena_free_shift = 2;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
uint64_t zfs_arc_meta_min = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_no_grow_shift = 0;
int zfs_arc_p_min_shift = 0;
uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
u_int zfs_arc_free_target = 0;

/* Absolute min for arc min / max is 16MB. */
static uint64_t arc_abs_min = 16 << 20;

/*
 * ARC dirty data constraints for arc_tempreserve_space() throttle
 */
uint_t zfs_arc_dirty_limit_percent = 50;	/* total dirty data limit */
uint_t zfs_arc_anon_limit_percent = 25;		/* anon block dirty limit */
uint_t zfs_arc_pool_dirty_percent = 20;		/* each pool's anon allowance */

boolean_t zfs_compressed_arc_enabled = B_TRUE;

static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS);

#if defined(__FreeBSD__) && defined(_KERNEL)
static void
arc_free_target_init(void *unused __unused)
{

	zfs_arc_free_target = vm_pageout_wakeup_thresh;
}
SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
    arc_free_target_init, NULL);

TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
TUNABLE_INT("vfs.zfs.arc_grow_retry", &zfs_arc_grow_retry);
TUNABLE_INT("vfs.zfs.arc_no_grow_shift", &zfs_arc_no_grow_shift);
SYSCTL_DECL(_vfs_zfs);
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 | CTLFLAG_RWTUN,
    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size");
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 | CTLFLAG_RWTUN,
    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size");
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_U32 | CTLFLAG_RWTUN,
    0, sizeof(uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U",
    "log2(fraction of ARC which must be free to allow growing)");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
    &zfs_arc_average_blocksize, 0,
    "ARC average blocksize");
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
    &arc_shrink_shift, 0,
    "log2(fraction of arc to reclaim)");
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_grow_retry, CTLFLAG_RW,
    &arc_grow_retry, 0,
    "Wait in seconds before considering growing ARC");
SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN,
    &zfs_compressed_arc_enabled, 0,
    "Enable compressed ARC");
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_kmem_cache_reap_retry_ms, CTLFLAG_RWTUN,
    &arc_kmem_cache_reap_retry_ms, 0,
    "Interval between ARC kmem_cache reapings");
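
/*
 * Note (illustrative, not in the original source): the OIDs declared above
 * surface as loader and runtime tunables under vfs.zfs, e.g.
 *
 *	sysctl vfs.zfs.arc_max			- read the current limit
 *	sysctl vfs.zfs.arc_max=8589934592	- resize at runtime (8 GiB),
 *						  validated by
 *						  sysctl_vfs_zfs_arc_max() below
 */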

/*
 * We don't have a tunable for arc_free_target due to the dependency on
 * pagedaemon initialisation.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
    sysctl_vfs_zfs_arc_free_target, "IU",
    "Desired number of free pages below which ARC triggers reclaim");

static int
sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
{
	u_int val;
	int err;

	val = zfs_arc_free_target;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < minfree)
		return (EINVAL);
	if (val > vm_cnt.v_page_count)
		return (EINVAL);

	zfs_arc_free_target = val;

	return (0);
}

/*
 * This must be declared here, before the definition of the corresponding
 * kstat macro; that macro uses the same name and would otherwise confuse
 * the compiler.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_vfs_zfs_arc_meta_limit, "QU",
    "ARC metadata limit");
#endif

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states.  These are
 * the only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */

typedef struct arc_state {
	/*
	 * list of evictable buffers
	 */
	multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
	/*
	 * total amount of evictable data in this state
	 */
	refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
	/*
	 * total amount of data in this state; this includes: evictable,
	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
	 */
	refcount_t arcs_size;
} arc_state_t;

/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;

typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_allocated;
	kstat_named_t arcstat_deleted;
	/*
	 * Number of buffers that could not be evicted because the hash lock
	 * was held by another thread.  The lock may not necessarily be held
	 * by something using the same buffer, since hash locks are shared
	 * by multiple buffers.
	 */
	kstat_named_t arcstat_mutex_miss;
	/*
	 * Number of buffers skipped when updating the access state due to the
	 * header having already been released after acquiring the hash lock.
	 */
	kstat_named_t arcstat_access_skip;
	/*
	 * Number of buffers skipped because they have I/O in progress, are
	 * indirect prefetch buffers that have not lived long enough, or are
	 * not from the spa we're trying to evict from.
	 */
	kstat_named_t arcstat_evict_skip;
	/*
	 * Number of times arc_evict_state() was unable to evict enough
	 * buffers to reach its target amount.
	 */
	kstat_named_t arcstat_evict_not_enough;
	kstat_named_t arcstat_evict_l2_cached;
	kstat_named_t arcstat_evict_l2_eligible;
	kstat_named_t arcstat_evict_l2_ineligible;
	kstat_named_t arcstat_evict_l2_skip;
	kstat_named_t arcstat_hash_elements;
	kstat_named_t arcstat_hash_elements_max;
	kstat_named_t arcstat_hash_collisions;
	kstat_named_t arcstat_hash_chains;
	kstat_named_t arcstat_hash_chain_max;
	kstat_named_t arcstat_p;
	kstat_named_t arcstat_c;
	kstat_named_t arcstat_c_min;
	kstat_named_t arcstat_c_max;
	/* Not updated directly; only synced in arc_kstat_update. */
	kstat_named_t arcstat_size;
	/*
	 * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
	 * Note that the compressed bytes may match the uncompressed bytes
	 * if the block is either not compressed or compressed arc is disabled.
	 */
	kstat_named_t arcstat_compressed_size;
	/*
	 * Uncompressed size of the data stored in b_pabd.  If compressed
	 * arc is disabled then this value will be identical to the stat
	 * above.
	 */
	kstat_named_t arcstat_uncompressed_size;
	/*
	 * Number of bytes stored in all the arc_buf_t's.  This is classified
	 * as "overhead" since this data is typically short-lived and will
	 * be evicted from the arc when it becomes unreferenced unless the
	 * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
	 * values have been set (see comment in dbuf.c for more information).
	 */
	kstat_named_t arcstat_overhead_size;
	/*
	 * Number of bytes consumed by internal ARC structures necessary
	 * for tracking purposes; these structures are not actually
	 * backed by ARC buffers. This includes arc_buf_hdr_t structures
	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
	 * caches), and arc_buf_t structures (allocated via arc_buf_t
	 * cache).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_hdr_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_DATA. This is generally consumed by buffers backing
	 * on disk user data (e.g. plain file contents).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_data_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_METADATA. This is generally consumed by buffers
	 * backing on disk data that is used for internal ZFS
	 * structures (e.g. ZAP, dnode, indirect blocks, etc).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_metadata_size;
	/*
	 * Number of bytes consumed by various buffers and structures
	 * not actually backed with ARC buffers. This includes bonus
	 * buffers (allocated directly via zio_buf_* functions),
	 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
	 * cache), and dnode_t structures (allocated via dnode_t cache).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_other_size;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_anon state. This includes *all* buffers in the arc_anon
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_anon_size;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_DATA,
	 * residing in the arc_anon state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_anon_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
	 * residing in the arc_anon state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_anon_evictable_metadata;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_mru state. This includes *all* buffers in the arc_mru
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mru_size;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_DATA,
	 * residing in the arc_mru state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mru_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
	 * residing in the arc_mru state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mru_evictable_metadata;
	/*
	 * Total number of bytes that *would have been* consumed by ARC
	 * buffers in the arc_mru_ghost state. The key thing to note
	 * here, is the fact that this size doesn't actually indicate
	 * RAM consumption. The ghost lists only consist of headers and
	 * don't actually have ARC buffers linked off of these headers.
	 * Thus, *if* the headers had associated ARC buffers, these
	 * buffers *would have* consumed this number of bytes.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mru_ghost_size;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mru_ghost_evictable_data;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mru_ghost_evictable_metadata;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_mfu state. This includes *all* buffers in the arc_mfu
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_size;
	/*
	 * Number of bytes consumed by ARC buffers that are eligible for
	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
	 * state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that are eligible for
	 * eviction, of type ARC_BUFC_METADATA, and reside in the
	 * arc_mfu state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_evictable_metadata;
	/*
	 * Total number of bytes that *would have been* consumed by ARC
	 * buffers in the arc_mfu_ghost state. See the comment above
	 * arcstat_mru_ghost_size for more details.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_ghost_size;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_ghost_evictable_data;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
	 * Not updated directly; only synced in arc_kstat_update.
	 */
	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
	kstat_named_t arcstat_l2_hits;
	kstat_named_t arcstat_l2_misses;
	kstat_named_t arcstat_l2_feeds;
	kstat_named_t arcstat_l2_rw_clash;
	kstat_named_t arcstat_l2_read_bytes;
	kstat_named_t arcstat_l2_write_bytes;
	kstat_named_t arcstat_l2_writes_sent;
	kstat_named_t arcstat_l2_writes_done;
	kstat_named_t arcstat_l2_writes_error;
	kstat_named_t arcstat_l2_writes_lock_retry;
	kstat_named_t arcstat_l2_evict_lock_retry;
	kstat_named_t arcstat_l2_evict_reading;
	kstat_named_t arcstat_l2_evict_l1cached;
	kstat_named_t arcstat_l2_free_on_write;
	kstat_named_t arcstat_l2_abort_lowmem;
	kstat_named_t arcstat_l2_cksum_bad;
	kstat_named_t arcstat_l2_io_error;
	kstat_named_t arcstat_l2_lsize;
	kstat_named_t arcstat_l2_psize;
	/* Not updated directly; only synced in arc_kstat_update. */
	kstat_named_t arcstat_l2_hdr_size;
	kstat_named_t arcstat_l2_write_trylock_fail;
	kstat_named_t arcstat_l2_write_passed_headroom;
	kstat_named_t arcstat_l2_write_spa_mismatch;
	kstat_named_t arcstat_l2_write_in_l2;
	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
	kstat_named_t arcstat_l2_write_not_cacheable;
	kstat_named_t arcstat_l2_write_full;
	kstat_named_t arcstat_l2_write_buffer_iter;
	kstat_named_t arcstat_l2_write_pios;
	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
	kstat_named_t arcstat_l2_write_buffer_list_iter;
	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
	kstat_named_t arcstat_memory_throttle_count;
	/* Not updated directly; only synced in arc_kstat_update. */
	kstat_named_t arcstat_meta_used;
	kstat_named_t arcstat_meta_limit;
	kstat_named_t arcstat_meta_max;
	kstat_named_t arcstat_meta_min;
	kstat_named_t arcstat_async_upgrade_sync;
	kstat_named_t arcstat_demand_hit_predictive_prefetch;
	kstat_named_t arcstat_demand_hit_prescient_prefetch;
} arc_stats_t;

static arc_stats_t arc_stats = {
	{ "hits",			KSTAT_DATA_UINT64 },
	{ "misses",			KSTAT_DATA_UINT64 },
	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "mru_hits",			KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "mfu_hits",			KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "allocated",			KSTAT_DATA_UINT64 },
	{ "deleted",			KSTAT_DATA_UINT64 },
	{ "mutex_miss",			KSTAT_DATA_UINT64 },
	{ "access_skip",		KSTAT_DATA_UINT64 },
	{ "evict_skip",			KSTAT_DATA_UINT64 },
	{ "evict_not_enough",		KSTAT_DATA_UINT64 },
	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
	{ "evict_l2_skip",		KSTAT_DATA_UINT64 },
	{ "hash_elements",		KSTAT_DATA_UINT64 },
	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
	{ "hash_collisions",		KSTAT_DATA_UINT64 },
	{ "hash_chains",		KSTAT_DATA_UINT64 },
	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
	{ "p",				KSTAT_DATA_UINT64 },
	{ "c",				KSTAT_DATA_UINT64 },
	{ "c_min",			KSTAT_DATA_UINT64 },
	{ "c_max",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 },
	{ "compressed_size",		KSTAT_DATA_UINT64 },
	{ "uncompressed_size",		KSTAT_DATA_UINT64 },
	{ "overhead_size",		KSTAT_DATA_UINT64 },
	{ "hdr_size",			KSTAT_DATA_UINT64 },
	{ "data_size",			KSTAT_DATA_UINT64 },
	{ "metadata_size",		KSTAT_DATA_UINT64 },
	{ "other_size",			KSTAT_DATA_UINT64 },
	{ "anon_size",			KSTAT_DATA_UINT64 },
	{ "anon_evictable_data",	KSTAT_DATA_UINT64 },
	{ "anon_evictable_metadata",	KSTAT_DATA_UINT64 },
	{ "mru_size",			KSTAT_DATA_UINT64 },
	{ "mru_evictable_data",		KSTAT_DATA_UINT64 },
	{ "mru_evictable_metadata",	KSTAT_DATA_UINT64 },
	{ "mru_ghost_size",		KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_data",	KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_size",			KSTAT_DATA_UINT64 },
	{ "mfu_evictable_data",		KSTAT_DATA_UINT64 },
	{ "mfu_evictable_metadata",	KSTAT_DATA_UINT64 },
	{ "mfu_ghost_size",		KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_data",	KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "l2_hits",			KSTAT_DATA_UINT64 },
	{ "l2_misses",			KSTAT_DATA_UINT64 },
	{ "l2_feeds",			KSTAT_DATA_UINT64 },
	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
	{ "l2_writes_lock_retry",	KSTAT_DATA_UINT64 },
	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
	{ "l2_io_error",		KSTAT_DATA_UINT64 },
	{ "l2_size",			KSTAT_DATA_UINT64 },
	{ "l2_asize",			KSTAT_DATA_UINT64 },
	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
	{ "l2_write_full",		KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
	{ "arc_meta_min",		KSTAT_DATA_UINT64 },
	{ "async_upgrade_sync",		KSTAT_DATA_UINT64 },
	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
	{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
};

#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val))

#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

#define	ARCSTAT_MAX(stat, val) {					\
	uint64_t m;							\
	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
		continue;						\
}

#define	ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)

/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		}							\
	} else {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		}							\
	}

/*
 * Illustrative example (not in the original source): a demand read of a
 * metadata block that hits in the cache can be accounted for with
 *
 *	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch,
 *	    !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
 *
 * which expands to ARCSTAT_BUMP(arcstat_demand_metadata_hits) when the
 * header is neither a prefetch nor of data type.
 */

kstat_t			*arc_ksp;
static arc_state_t	*arc_anon;
static arc_state_t	*arc_mru;
static arc_state_t	*arc_mru_ghost;
static arc_state_t	*arc_mfu;
static arc_state_t	*arc_mfu_ghost;
static arc_state_t	*arc_l2c_only;

/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them.  For these variables, we therefore define them to be in
 * terms of the statistic variable.  This assures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
 */
#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
#define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
#define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */

/* compressed size of entire arc */
#define	arc_compressed_size	ARCSTAT(arcstat_compressed_size)
/* uncompressed size of entire arc */
#define	arc_uncompressed_size	ARCSTAT(arcstat_uncompressed_size)
/* number of bytes in the arc from arc_buf_t's */
#define	arc_overhead_size	ARCSTAT(arcstat_overhead_size)

/*
 * There are also some ARC variables that we want to export, but that are
 * updated so often that having the canonical representation be the statistic
 * variable causes a performance bottleneck.  We want to use aggsum_t's for
 * these instead, but still be able to export the kstat in the same way as
 * before.  The solution is to always use the aggsum version, except in the
 * kstat update callback.
 */
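/*
 * Illustrative sketch (not part of the original comment) of the pattern
 * described above: hot paths update the aggsum, e.g.
 *
 *	aggsum_add(&arc_size, space);		on allocation
 *	aggsum_add(&arc_size, -space);		on free
 *
 * and only the kstat update callback reads an exact value back with
 * aggsum_value(&arc_size).
 */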
aggsum_t arc_size;
aggsum_t arc_meta_used;
aggsum_t astat_data_size;
aggsum_t astat_metadata_size;
aggsum_t astat_hdr_size;
aggsum_t astat_other_size;
aggsum_t astat_l2_hdr_size;

static int		arc_no_grow;	/* Don't try to grow cache size */
static uint64_t		arc_tempreserve;
static uint64_t		arc_loaned_bytes;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
	void			*acb_private;
	arc_read_done_func_t	*acb_done;
	arc_buf_t		*acb_buf;
	boolean_t		acb_compressed;
	zio_t			*acb_zio_dummy;
	zio_t			*acb_zio_head;
	arc_callback_t		*acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

struct arc_write_callback {
	void			*awcb_private;
	arc_write_done_func_t	*awcb_ready;
	arc_write_done_func_t	*awcb_children_ready;
	arc_write_done_func_t	*awcb_physdone;
	arc_write_done_func_t	*awcb_done;
	arc_buf_t		*awcb_buf;
};

/*
 * ARC buffers are separated into multiple structs as a memory saving measure:
 *   - Common fields struct, always defined, and embedded within it:
 *       - L2-only fields, always allocated but undefined when not in L2ARC
 *       - L1-only fields, only allocated when in L1ARC
 *
 *           Buffer in L1                     Buffer only in L2
 *    +------------------------+          +------------------------+
 *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    +------------------------+          +------------------------+
 *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
 *    | (undefined if L1-only) |          |                        |
 *    +------------------------+          +------------------------+
 *    | l1arc_buf_hdr_t        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    +------------------------+
 *
 * Because it's possible for the L2ARC to become extremely large, we can wind
 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 * is minimized by only allocating the fields necessary for an L1-cached buffer
 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr
 * and l2arc_buf_hdr) are embedded rather than allocated separately to save a
 * couple words in pointers. arc_hdr_realloc() is used to switch a header
 * between these two allocation states.
 */
typedef struct l1arc_buf_hdr {
	kmutex_t		b_freeze_lock;
	zio_cksum_t		*b_freeze_cksum;
#ifdef ZFS_DEBUG
	/*
	 * Used for debugging with kmem_flags - by allocating and freeing
	 * b_thawed when the buffer is thawed, we get a record of the stack
	 * trace that thawed it.
	 */
	void			*b_thawed;
#endif

	arc_buf_t		*b_buf;
	uint32_t		b_bufcnt;
	/* for waiting on writes to complete */
	kcondvar_t		b_cv;
	uint8_t			b_byteswap;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	multilist_node_t	b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;

	/* self protecting */
	refcount_t		b_refcnt;

	arc_callback_t		*b_acb;
	abd_t			*b_pabd;
} l1arc_buf_hdr_t;

typedef struct l2arc_dev l2arc_dev_t;

typedef struct l2arc_buf_hdr {
	/* protected by arc_buf_hdr mutex */
	l2arc_dev_t		*b_dev;		/* L2ARC device */
	uint64_t		b_daddr;	/* disk address, offset byte */

	list_node_t		b_l2node;
} l2arc_buf_hdr_t;

struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;

	arc_buf_contents_t	b_type;
	arc_buf_hdr_t		*b_hash_next;
	arc_flags_t		b_flags;

	/*
	 * This field stores the size of the data buffer after
	 * compression, and is set in the arc's zio completion handlers.
	 * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
	 *
	 * While the block pointers can store up to 32MB in their psize
	 * field, we can only store up to 32MB minus 512B. This is due
	 * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
	 * a field of zeros represents 512B in the bp). We can't use a
	 * bias of 1 since we need to reserve a psize of zero, here, to
	 * represent holes and embedded blocks.
	 *
	 * This isn't a problem in practice, since the maximum size of a
	 * buffer is limited to 16MB, so we never need to store 32MB in
	 * this field. Even in the upstream illumos code base, the
	 * maximum size of a buffer is limited to 16MB.
	 */
	uint16_t		b_psize;

	/*
	 * This field stores the size of the data buffer before
	 * compression, and cannot change once set. It is in units
	 * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
	 */
	uint16_t		b_lsize;	/* immutable */
	uint64_t		b_spa;		/* immutable */

	/* L2ARC fields. Undefined when not in L2ARC. */
	l2arc_buf_hdr_t		b_l2hdr;
	/* L1ARC fields. Undefined when in l2arc_only state */
	l1arc_buf_hdr_t		b_l1hdr;
};
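
/*
 * Worked example (illustrative, not from the original source) of the
 * SPA_MINBLOCKSIZE units used for b_lsize/b_psize above: a 128KB logical
 * block that compresses to 16KB on disk would be stored with
 * b_lsize == 128K / 512 == 256 and b_psize == 16K / 512 == 32.
 */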

#if defined(__FreeBSD__) && defined(_KERNEL)
static int
sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = arc_meta_limit;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val <= 0 || val > arc_c_max)
		return (EINVAL);

	arc_meta_limit = val;
	return (0);
}

static int
sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
{
	uint32_t val;
	int err;

	val = arc_no_grow_shift;
	err = sysctl_handle_32(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val >= arc_shrink_shift)
		return (EINVAL);

	arc_no_grow_shift = val;
	return (0);
}

static int
sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = zfs_arc_max;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (zfs_arc_max == 0) {
		/* Loader tunable so blindly set */
		zfs_arc_max = val;
		return (0);
	}

	if (val < arc_abs_min || val > kmem_size())
		return (EINVAL);
	if (val < arc_c_min)
		return (EINVAL);
	if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit)
		return (EINVAL);

	arc_c_max = val;

	arc_c = arc_c_max;
	arc_p = (arc_c >> 1);

	if (zfs_arc_meta_limit == 0) {
		/* limit meta-data to 1/4 of the arc capacity */
		arc_meta_limit = arc_c_max / 4;
	}

	/* if kmem_flags are set, let's try to use less memory */
	if (kmem_debugging())
		arc_c = arc_c / 2;

	zfs_arc_max = arc_c;

	return (0);
}

static int
sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = zfs_arc_min;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (zfs_arc_min == 0) {
		/* Loader tunable so blindly set */
		zfs_arc_min = val;
		return (0);
	}

	if (val < arc_abs_min || val > arc_c_max)
		return (EINVAL);

	arc_c_min = val;

	if (zfs_arc_meta_min == 0)
		arc_meta_min = arc_c_min / 2;

	if (arc_c < arc_c_min)
		arc_c = arc_c_min;

	zfs_arc_min = arc_c_min;

	return (0);
}
#endif

#define	GHOST_STATE(state)	\
	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
	(state) == arc_l2c_only)

#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
#define	HDR_PRESCIENT_PREFETCH(hdr)	\
	((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
#define	HDR_COMPRESSION_ENABLED(hdr)	\
	((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)

#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
#define	HDR_L2_READING(hdr)	\
	(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
#define	HDR_SHARED_DATA(hdr)	((hdr)->b_flags & ARC_FLAG_SHARED_DATA)

#define	HDR_ISTYPE_METADATA(hdr)	\
	((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
#define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))

#define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
#define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)

/* For storing compression mode in b_flags */
#define	HDR_COMPRESS_OFFSET	(highbit64(ARC_FLAG_COMPRESS_0) - 1)

#define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET((hdr)->b_flags, \
	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
#define	HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));

#define	ARC_BUF_LAST(buf)	((buf)->b_next == NULL)
#define	ARC_BUF_SHARED(buf)	((buf)->b_flags & ARC_BUF_FLAG_SHARED)
#define	ARC_BUF_COMPRESSED(buf)	((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)

/*
 * Other sizes
 */

#define	HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
#define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))

/*
 * Hash table routines
 */

#define	HT_LOCK_PAD	CACHE_LINE_SIZE

struct ht_lock {
	kmutex_t	ht_lock;
#ifdef _KERNEL
	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

#define	BUF_LOCKS 256
typedef struct buf_hash_table {
	uint64_t ht_mask;
	arc_buf_hdr_t **ht_table;
	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;

#define	BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define	HDR_LOCK(hdr) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
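
/*
 * Illustrative sketch (not part of the original source): code that already
 * holds a header can look up and take its hash lock directly, e.g.
 *
 *	kmutex_t *hash_lock = HDR_LOCK(hdr);
 *	mutex_enter(hash_lock);
 *	... manipulate hdr fields protected by the hash lock ...
 *	mutex_exit(hash_lock);
 */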

uint64_t zfs_crc64_table[256];

/*
 * Level 2 ARC
 */

#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
#define	L2ARC_HEADROOM		2			/* num of writes */
/*
 * If we discover during ARC scan any buffers to be compressed, we boost
 * our headroom for the next scanning cycle by this percentage multiple.
 */
#define	L2ARC_HEADROOM_BOOST	200
#define	L2ARC_FEED_SECS		1		/* caching interval secs */
#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */

#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)

/* L2ARC Performance Tunables */
uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
    &l2arc_write_max, 0, "max write size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
    &l2arc_write_boost, 0, "extra write during warmup");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
    &l2arc_headroom, 0, "number of dev writes");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
    &l2arc_feed_secs, 0, "interval seconds");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
    &l2arc_feed_min_ms, 0, "min interval milliseconds");

SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
    &l2arc_feed_again, 0, "turbo warmup");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
    &l2arc_norw, 0, "no reads during writes");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
    &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
    &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
    "size of metadata in anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
    &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
    "size of data in anonymous state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
    &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
    &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
    "size of metadata in mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
    &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
    "size of data in mru state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
    "size of metadata in mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
    "size of data in mru ghost state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
    &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
    &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
    "size of metadata in mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
    &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
    "size of data in mfu state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
    &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
    &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
    "size of metadata in mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
    &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
    "size of data in mfu ghost state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
    &ARC_l2c_only.arcs_size.rc_count, 0, "size of l2c_only state");

SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prefetch_ms, CTLFLAG_RW,
    &zfs_arc_min_prefetch_ms, 0, "Min life of prefetch block in ms");
SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prescient_prefetch_ms, CTLFLAG_RW,
    &zfs_arc_min_prescient_prefetch_ms, 0,
    "Min life of prescient prefetched block in ms");

/*
 * L2ARC Internals
 */
struct l2arc_dev {
	vdev_t			*l2ad_vdev;	/* vdev */
	spa_t			*l2ad_spa;	/* spa */
	uint64_t		l2ad_hand;	/* next write location */
	uint64_t		l2ad_start;	/* first addr on device */
	uint64_t		l2ad_end;	/* last addr on device */
	boolean_t		l2ad_first;	/* first sweep through */
	boolean_t		l2ad_writing;	/* currently writing */
	kmutex_t		l2ad_mtx;	/* lock for buffer list */
	list_t			l2ad_buflist;	/* buffer list */
	list_node_t		l2ad_node;	/* device list node */
	refcount_t		l2ad_alloc;	/* allocated bytes */
};

static list_t L2ARC_dev_list;			/* device list */
static list_t *l2arc_dev_list;			/* device list pointer */
static kmutex_t l2arc_dev_mtx;			/* device list mutex */
static l2arc_dev_t *l2arc_dev_last;		/* last device used */
static list_t L2ARC_free_on_write;		/* free after write buf list */
static list_t *l2arc_free_on_write;		/* free after write list ptr */
static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
static uint64_t l2arc_ndev;			/* number of devices */

typedef struct l2arc_read_callback {
	arc_buf_hdr_t		*l2rcb_hdr;	/* read header */
	blkptr_t		l2rcb_bp;	/* original blkptr */
	zbookmark_phys_t	l2rcb_zb;	/* original bookmark */
	int			l2rcb_flags;	/* original flags */
	abd_t			*l2rcb_abd;	/* temporary buffer */
} l2arc_read_callback_t;

typedef struct l2arc_write_callback {
	l2arc_dev_t	*l2wcb_dev;		/* device info */
	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
} l2arc_write_callback_t;

typedef struct l2arc_data_free {
	/* protected by l2arc_free_on_write_mtx */
	abd_t		*l2df_abd;
	size_t		l2df_size;
	arc_buf_contents_t l2df_type;
	list_node_t	l2df_list_node;
} l2arc_data_free_t;

static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;

static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *);
static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *);
static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
1474static void arc_hdr_free_pabd(arc_buf_hdr_t *); 1475static void arc_hdr_alloc_pabd(arc_buf_hdr_t *); 1476static void arc_access(arc_buf_hdr_t *, kmutex_t *); 1477static boolean_t arc_is_overflowing(); 1478static void arc_buf_watch(arc_buf_t *); 1479 1480static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); 1481static uint32_t arc_bufc_to_flags(arc_buf_contents_t); 1482static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); 1483static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); 1484 1485static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); 1486static void l2arc_read_done(zio_t *); 1487 1488static void 1489l2arc_trim(const arc_buf_hdr_t *hdr) 1490{ 1491 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 1492 1493 ASSERT(HDR_HAS_L2HDR(hdr)); 1494 ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); 1495 1496 if (HDR_GET_PSIZE(hdr) != 0) { 1497 trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, 1498 HDR_GET_PSIZE(hdr), 0); 1499 } 1500} 1501 1502/* 1503 * We use Cityhash for this. It's fast, and has good hash properties without 1504 * requiring any large static buffers. 1505 */ 1506static uint64_t 1507buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 1508{ 1509 return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth)); 1510} 1511 1512#define HDR_EMPTY(hdr) \ 1513 ((hdr)->b_dva.dva_word[0] == 0 && \ 1514 (hdr)->b_dva.dva_word[1] == 0) 1515 1516#define HDR_EQUAL(spa, dva, birth, hdr) \ 1517 ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 1518 ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 1519 ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) 1520 1521static void 1522buf_discard_identity(arc_buf_hdr_t *hdr) 1523{ 1524 hdr->b_dva.dva_word[0] = 0; 1525 hdr->b_dva.dva_word[1] = 0; 1526 hdr->b_birth = 0; 1527} 1528 1529static arc_buf_hdr_t * 1530buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) 1531{ 1532 const dva_t *dva = BP_IDENTITY(bp); 1533 uint64_t birth = BP_PHYSICAL_BIRTH(bp); 1534 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 1535 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1536 arc_buf_hdr_t *hdr; 1537 1538 mutex_enter(hash_lock); 1539 for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; 1540 hdr = hdr->b_hash_next) { 1541 if (HDR_EQUAL(spa, dva, birth, hdr)) { 1542 *lockp = hash_lock; 1543 return (hdr); 1544 } 1545 } 1546 mutex_exit(hash_lock); 1547 *lockp = NULL; 1548 return (NULL); 1549} 1550 1551/* 1552 * Insert an entry into the hash table. If there is already an element 1553 * equal to elem in the hash table, then the already existing element 1554 * will be returned and the new element will not be inserted. 1555 * Otherwise returns NULL. 1556 * If lockp == NULL, the caller is assumed to already hold the hash lock. 
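 *
 * For example, a caller that does not yet hold the hash lock would let
 * buf_hash_insert() take it, along the lines of:
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *exists = buf_hash_insert(hdr, &hash_lock);
 *	if (exists != NULL)
 *		... an equal header was already present, hdr not inserted ...
 *	mutex_exit(hash_lock);
 *
 * whereas arc_hdr_realloc() below passes a NULL lockp because the caller
 * already holds the header's hash lock.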
1557 */ 1558static arc_buf_hdr_t * 1559buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) 1560{ 1561 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1562 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1563 arc_buf_hdr_t *fhdr; 1564 uint32_t i; 1565 1566 ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); 1567 ASSERT(hdr->b_birth != 0); 1568 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1569 1570 if (lockp != NULL) { 1571 *lockp = hash_lock; 1572 mutex_enter(hash_lock); 1573 } else { 1574 ASSERT(MUTEX_HELD(hash_lock)); 1575 } 1576 1577 for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; 1578 fhdr = fhdr->b_hash_next, i++) { 1579 if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) 1580 return (fhdr); 1581 } 1582 1583 hdr->b_hash_next = buf_hash_table.ht_table[idx]; 1584 buf_hash_table.ht_table[idx] = hdr; 1585 arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); 1586 1587 /* collect some hash table performance data */ 1588 if (i > 0) { 1589 ARCSTAT_BUMP(arcstat_hash_collisions); 1590 if (i == 1) 1591 ARCSTAT_BUMP(arcstat_hash_chains); 1592 1593 ARCSTAT_MAX(arcstat_hash_chain_max, i); 1594 } 1595 1596 ARCSTAT_BUMP(arcstat_hash_elements); 1597 ARCSTAT_MAXSTAT(arcstat_hash_elements); 1598 1599 return (NULL); 1600} 1601 1602static void 1603buf_hash_remove(arc_buf_hdr_t *hdr) 1604{ 1605 arc_buf_hdr_t *fhdr, **hdrp; 1606 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1607 1608 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 1609 ASSERT(HDR_IN_HASH_TABLE(hdr)); 1610 1611 hdrp = &buf_hash_table.ht_table[idx]; 1612 while ((fhdr = *hdrp) != hdr) { 1613 ASSERT3P(fhdr, !=, NULL); 1614 hdrp = &fhdr->b_hash_next; 1615 } 1616 *hdrp = hdr->b_hash_next; 1617 hdr->b_hash_next = NULL; 1618 arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); 1619 1620 /* collect some hash table performance data */ 1621 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 1622 1623 if (buf_hash_table.ht_table[idx] && 1624 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 1625 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 1626} 1627 1628/* 1629 * Global data structures and functions for the buf kmem cache. 1630 */ 1631static kmem_cache_t *hdr_full_cache; 1632static kmem_cache_t *hdr_l2only_cache; 1633static kmem_cache_t *buf_cache; 1634 1635static void 1636buf_fini(void) 1637{ 1638 int i; 1639 1640 kmem_free(buf_hash_table.ht_table, 1641 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 1642 for (i = 0; i < BUF_LOCKS; i++) 1643 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 1644 kmem_cache_destroy(hdr_full_cache); 1645 kmem_cache_destroy(hdr_l2only_cache); 1646 kmem_cache_destroy(buf_cache); 1647} 1648 1649/* 1650 * Constructor callback - called when the cache is empty 1651 * and a new buf is requested. 
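 *
 * For reference, buf_init() below registers these callbacks roughly as:
 *
 *	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full",
 *	    HDR_FULL_SIZE, 0, hdr_full_cons, hdr_full_dest, hdr_recl,
 *	    NULL, NULL, 0);
 *
 * so the constructor runs whenever a new object is constructed for the
 * cache and the matching destructor runs when the object is destroyed.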
1652 */ 1653/* ARGSUSED */ 1654static int 1655hdr_full_cons(void *vbuf, void *unused, int kmflag) 1656{ 1657 arc_buf_hdr_t *hdr = vbuf; 1658 1659 bzero(hdr, HDR_FULL_SIZE); 1660 cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); 1661 refcount_create(&hdr->b_l1hdr.b_refcnt); 1662 mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 1663 multilist_link_init(&hdr->b_l1hdr.b_arc_node); 1664 arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1665 1666 return (0); 1667} 1668 1669/* ARGSUSED */ 1670static int 1671hdr_l2only_cons(void *vbuf, void *unused, int kmflag) 1672{ 1673 arc_buf_hdr_t *hdr = vbuf; 1674 1675 bzero(hdr, HDR_L2ONLY_SIZE); 1676 arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1677 1678 return (0); 1679} 1680 1681/* ARGSUSED */ 1682static int 1683buf_cons(void *vbuf, void *unused, int kmflag) 1684{ 1685 arc_buf_t *buf = vbuf; 1686 1687 bzero(buf, sizeof (arc_buf_t)); 1688 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 1689 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1690 1691 return (0); 1692} 1693 1694/* 1695 * Destructor callback - called when a cached buf is 1696 * no longer required. 1697 */ 1698/* ARGSUSED */ 1699static void 1700hdr_full_dest(void *vbuf, void *unused) 1701{ 1702 arc_buf_hdr_t *hdr = vbuf; 1703 1704 ASSERT(HDR_EMPTY(hdr)); 1705 cv_destroy(&hdr->b_l1hdr.b_cv); 1706 refcount_destroy(&hdr->b_l1hdr.b_refcnt); 1707 mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); 1708 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 1709 arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1710} 1711 1712/* ARGSUSED */ 1713static void 1714hdr_l2only_dest(void *vbuf, void *unused) 1715{ 1716 arc_buf_hdr_t *hdr = vbuf; 1717 1718 ASSERT(HDR_EMPTY(hdr)); 1719 arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1720} 1721 1722/* ARGSUSED */ 1723static void 1724buf_dest(void *vbuf, void *unused) 1725{ 1726 arc_buf_t *buf = vbuf; 1727 1728 mutex_destroy(&buf->b_evict_lock); 1729 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1730} 1731 1732/* 1733 * Reclaim callback -- invoked when memory is low. 1734 */ 1735/* ARGSUSED */ 1736static void 1737hdr_recl(void *unused) 1738{ 1739 dprintf("hdr_recl called\n"); 1740 /* 1741 * umem calls the reclaim func when we destroy the buf cache, 1742 * which is after we do arc_fini(). 1743 */ 1744 if (!arc_dead) 1745 cv_signal(&arc_reclaim_thread_cv); 1746} 1747 1748static void 1749buf_init(void) 1750{ 1751 uint64_t *ct; 1752 uint64_t hsize = 1ULL << 12; 1753 int i, j; 1754 1755 /* 1756 * The hash table is big enough to fill all of physical memory 1757 * with an average block size of zfs_arc_average_blocksize (default 8K). 1758 * By default, the table will take up 1759 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 
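	 *
	 * As a worked example, assume the default 8K average block size
	 * and 8-byte pointers on a machine with 8GB of physical memory:
	 * the loop below doubles hsize until hsize * 8K covers 8GB,
	 * i.e. hsize = 2^20 buckets, for a table of 2^20 * 8 bytes = 8MB,
	 * matching the 1MB per GB estimate above.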
1760 */ 1761 while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) 1762 hsize <<= 1; 1763retry: 1764 buf_hash_table.ht_mask = hsize - 1; 1765 buf_hash_table.ht_table = 1766 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 1767 if (buf_hash_table.ht_table == NULL) { 1768 ASSERT(hsize > (1ULL << 8)); 1769 hsize >>= 1; 1770 goto retry; 1771 } 1772 1773 hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 1774 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); 1775 hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", 1776 HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, 1777 NULL, NULL, 0); 1778 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 1779 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1780 1781 for (i = 0; i < 256; i++) 1782 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 1783 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 1784 1785 for (i = 0; i < BUF_LOCKS; i++) { 1786 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1787 NULL, MUTEX_DEFAULT, NULL); 1788 } 1789} 1790 1791/* 1792 * This is the size that the buf occupies in memory. If the buf is compressed, 1793 * it will correspond to the compressed size. You should use this method of 1794 * getting the buf size unless you explicitly need the logical size. 1795 */ 1796int32_t 1797arc_buf_size(arc_buf_t *buf) 1798{ 1799 return (ARC_BUF_COMPRESSED(buf) ? 1800 HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); 1801} 1802 1803int32_t 1804arc_buf_lsize(arc_buf_t *buf) 1805{ 1806 return (HDR_GET_LSIZE(buf->b_hdr)); 1807} 1808 1809enum zio_compress 1810arc_get_compression(arc_buf_t *buf) 1811{ 1812 return (ARC_BUF_COMPRESSED(buf) ? 1813 HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); 1814} 1815 1816#define ARC_MINTIME (hz>>4) /* 62 ms */ 1817 1818static inline boolean_t 1819arc_buf_is_shared(arc_buf_t *buf) 1820{ 1821 boolean_t shared = (buf->b_data != NULL && 1822 buf->b_hdr->b_l1hdr.b_pabd != NULL && 1823 abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && 1824 buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); 1825 IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); 1826 IMPLY(shared, ARC_BUF_SHARED(buf)); 1827 IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); 1828 1829 /* 1830 * It would be nice to assert arc_can_share() too, but the "hdr isn't 1831 * already being shared" requirement prevents us from doing that. 1832 */ 1833 1834 return (shared); 1835} 1836 1837/* 1838 * Free the checksum associated with this header. If there is no checksum, this 1839 * is a no-op. 1840 */ 1841static inline void 1842arc_cksum_free(arc_buf_hdr_t *hdr) 1843{ 1844 ASSERT(HDR_HAS_L1HDR(hdr)); 1845 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 1846 if (hdr->b_l1hdr.b_freeze_cksum != NULL) { 1847 kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); 1848 hdr->b_l1hdr.b_freeze_cksum = NULL; 1849 } 1850 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1851} 1852 1853/* 1854 * Return true iff at least one of the bufs on hdr is not compressed. 1855 */ 1856static boolean_t 1857arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) 1858{ 1859 for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { 1860 if (!ARC_BUF_COMPRESSED(b)) { 1861 return (B_TRUE); 1862 } 1863 } 1864 return (B_FALSE); 1865} 1866 1867/* 1868 * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data 1869 * matches the checksum that is stored in the hdr. If there is no checksum, 1870 * or if the buf is compressed, this is a no-op. 
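 *
 * Together with arc_cksum_compute() and arc_cksum_free() this implements
 * the debug-only freeze/thaw checks; the expected flow is roughly:
 *
 *	arc_buf_thaw(buf);	(checksum discarded, buf may be modified)
 *	... modify buf->b_data ...
 *	arc_buf_freeze(buf);	(checksum recomputed and stored)
 *	arc_cksum_verify(buf);	(panics if the data changed while frozen)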
1871 */ 1872static void 1873arc_cksum_verify(arc_buf_t *buf) 1874{ 1875 arc_buf_hdr_t *hdr = buf->b_hdr; 1876 zio_cksum_t zc; 1877 1878 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1879 return; 1880 1881 if (ARC_BUF_COMPRESSED(buf)) { 1882 ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || 1883 arc_hdr_has_uncompressed_buf(hdr)); 1884 return; 1885 } 1886 1887 ASSERT(HDR_HAS_L1HDR(hdr)); 1888 1889 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 1890 if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { 1891 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1892 return; 1893 } 1894 1895 fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); 1896 if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) 1897 panic("buffer modified while frozen!"); 1898 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1899} 1900 1901static boolean_t 1902arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) 1903{ 1904 enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); 1905 boolean_t valid_cksum; 1906 1907 ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); 1908 VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); 1909 1910 /* 1911 * We rely on the blkptr's checksum to determine if the block 1912 * is valid or not. When compressed arc is enabled, the l2arc 1913 * writes the block to the l2arc just as it appears in the pool. 1914 * This allows us to use the blkptr's checksum to validate the 1915 * data that we just read off of the l2arc without having to store 1916 * a separate checksum in the arc_buf_hdr_t. However, if compressed 1917 * arc is disabled, then the data written to the l2arc is always 1918 * uncompressed and won't match the block as it exists in the main 1919 * pool. When this is the case, we must first compress it if it is 1920 * compressed on the main pool before we can validate the checksum. 1921 */ 1922 if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { 1923 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); 1924 uint64_t lsize = HDR_GET_LSIZE(hdr); 1925 uint64_t csize; 1926 1927 abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE); 1928 csize = zio_compress_data(compress, zio->io_abd, 1929 abd_to_buf(cdata), lsize); 1930 1931 ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); 1932 if (csize < HDR_GET_PSIZE(hdr)) { 1933 /* 1934 * Compressed blocks are always a multiple of the 1935 * smallest ashift in the pool. Ideally, we would 1936 * like to round up the csize to the next 1937 * spa_min_ashift but that value may have changed 1938 * since the block was last written. Instead, 1939 * we rely on the fact that the hdr's psize 1940 * was set to the psize of the block when it was 1941 * last written. We set the csize to that value 1942 * and zero out any part that should not contain 1943 * data. 1944 */ 1945 abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize); 1946 csize = HDR_GET_PSIZE(hdr); 1947 } 1948 zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL); 1949 } 1950 1951 /* 1952 * Block pointers always store the checksum for the logical data. 1953 * If the block pointer has the gang bit set, then the checksum 1954 * it represents is for the reconstituted data and not for an 1955 * individual gang member. The zio pipeline, however, must be able to 1956 * determine the checksum of each of the gang constituents so it 1957 * treats the checksum comparison differently than what we need 1958 * for l2arc blocks. This prevents us from using the 1959 * zio_checksum_error() interface directly. 
Instead we must call the 1960 * zio_checksum_error_impl() so that we can ensure the checksum is 1961 * generated using the correct checksum algorithm and accounts for the 1962 * logical I/O size and not just a gang fragment. 1963 */ 1964 valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, 1965 BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, 1966 zio->io_offset, NULL) == 0); 1967 zio_pop_transforms(zio); 1968 return (valid_cksum); 1969} 1970 1971/* 1972 * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a 1973 * checksum and attaches it to the buf's hdr so that we can ensure that the buf 1974 * isn't modified later on. If buf is compressed or there is already a checksum 1975 * on the hdr, this is a no-op (we only checksum uncompressed bufs). 1976 */ 1977static void 1978arc_cksum_compute(arc_buf_t *buf) 1979{ 1980 arc_buf_hdr_t *hdr = buf->b_hdr; 1981 1982 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1983 return; 1984 1985 ASSERT(HDR_HAS_L1HDR(hdr)); 1986 1987 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1988 if (hdr->b_l1hdr.b_freeze_cksum != NULL) { 1989 ASSERT(arc_hdr_has_uncompressed_buf(hdr)); 1990 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1991 return; 1992 } else if (ARC_BUF_COMPRESSED(buf)) { 1993 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1994 return; 1995 } 1996 1997 ASSERT(!ARC_BUF_COMPRESSED(buf)); 1998 hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), 1999 KM_SLEEP); 2000 fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, 2001 hdr->b_l1hdr.b_freeze_cksum); 2002 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 2003#ifdef illumos 2004 arc_buf_watch(buf); 2005#endif 2006} 2007 2008#ifdef illumos 2009#ifndef _KERNEL 2010typedef struct procctl { 2011 long cmd; 2012 prwatch_t prwatch; 2013} procctl_t; 2014#endif 2015 2016/* ARGSUSED */ 2017static void 2018arc_buf_unwatch(arc_buf_t *buf) 2019{ 2020#ifndef _KERNEL 2021 if (arc_watch) { 2022 int result; 2023 procctl_t ctl; 2024 ctl.cmd = PCWATCH; 2025 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 2026 ctl.prwatch.pr_size = 0; 2027 ctl.prwatch.pr_wflags = 0; 2028 result = write(arc_procfd, &ctl, sizeof (ctl)); 2029 ASSERT3U(result, ==, sizeof (ctl)); 2030 } 2031#endif 2032} 2033 2034/* ARGSUSED */ 2035static void 2036arc_buf_watch(arc_buf_t *buf) 2037{ 2038#ifndef _KERNEL 2039 if (arc_watch) { 2040 int result; 2041 procctl_t ctl; 2042 ctl.cmd = PCWATCH; 2043 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 2044 ctl.prwatch.pr_size = arc_buf_size(buf); 2045 ctl.prwatch.pr_wflags = WA_WRITE; 2046 result = write(arc_procfd, &ctl, sizeof (ctl)); 2047 ASSERT3U(result, ==, sizeof (ctl)); 2048 } 2049#endif 2050} 2051#endif /* illumos */ 2052 2053static arc_buf_contents_t 2054arc_buf_type(arc_buf_hdr_t *hdr) 2055{ 2056 arc_buf_contents_t type; 2057 if (HDR_ISTYPE_METADATA(hdr)) { 2058 type = ARC_BUFC_METADATA; 2059 } else { 2060 type = ARC_BUFC_DATA; 2061 } 2062 VERIFY3U(hdr->b_type, ==, type); 2063 return (type); 2064} 2065 2066boolean_t 2067arc_is_metadata(arc_buf_t *buf) 2068{ 2069 return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); 2070} 2071 2072static uint32_t 2073arc_bufc_to_flags(arc_buf_contents_t type) 2074{ 2075 switch (type) { 2076 case ARC_BUFC_DATA: 2077 /* metadata field is 0 if buffer contains normal data */ 2078 return (0); 2079 case ARC_BUFC_METADATA: 2080 return (ARC_FLAG_BUFC_METADATA); 2081 default: 2082 break; 2083 } 2084 panic("undefined ARC buffer type!"); 2085 return ((uint32_t)-1); 2086} 2087 2088void 2089arc_buf_thaw(arc_buf_t *buf) 2090{ 2091 arc_buf_hdr_t *hdr = buf->b_hdr; 2092 
2093 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 2094 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2095 2096 arc_cksum_verify(buf); 2097 2098 /* 2099 * Compressed buffers do not manipulate the b_freeze_cksum or 2100 * allocate b_thawed. 2101 */ 2102 if (ARC_BUF_COMPRESSED(buf)) { 2103 ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || 2104 arc_hdr_has_uncompressed_buf(hdr)); 2105 return; 2106 } 2107 2108 ASSERT(HDR_HAS_L1HDR(hdr)); 2109 arc_cksum_free(hdr); 2110 2111 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 2112#ifdef ZFS_DEBUG 2113 if (zfs_flags & ZFS_DEBUG_MODIFY) { 2114 if (hdr->b_l1hdr.b_thawed != NULL) 2115 kmem_free(hdr->b_l1hdr.b_thawed, 1); 2116 hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); 2117 } 2118#endif 2119 2120 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 2121 2122#ifdef illumos 2123 arc_buf_unwatch(buf); 2124#endif 2125} 2126 2127void 2128arc_buf_freeze(arc_buf_t *buf) 2129{ 2130 arc_buf_hdr_t *hdr = buf->b_hdr; 2131 kmutex_t *hash_lock; 2132 2133 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 2134 return; 2135 2136 if (ARC_BUF_COMPRESSED(buf)) { 2137 ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || 2138 arc_hdr_has_uncompressed_buf(hdr)); 2139 return; 2140 } 2141 2142 hash_lock = HDR_LOCK(hdr); 2143 mutex_enter(hash_lock); 2144 2145 ASSERT(HDR_HAS_L1HDR(hdr)); 2146 ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL || 2147 hdr->b_l1hdr.b_state == arc_anon); 2148 arc_cksum_compute(buf); 2149 mutex_exit(hash_lock); 2150} 2151 2152/* 2153 * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, 2154 * the following functions should be used to ensure that the flags are 2155 * updated in a thread-safe way. When manipulating the flags either 2156 * the hash_lock must be held or the hdr must be undiscoverable. This 2157 * ensures that we're not racing with any other threads when updating 2158 * the flags. 2159 */ 2160static inline void 2161arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) 2162{ 2163 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2164 hdr->b_flags |= flags; 2165} 2166 2167static inline void 2168arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) 2169{ 2170 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2171 hdr->b_flags &= ~flags; 2172} 2173 2174/* 2175 * Setting the compression bits in the arc_buf_hdr_t's b_flags is 2176 * done in a special way since we have to clear and set bits 2177 * at the same time. Consumers that wish to set the compression bits 2178 * must use this function to ensure that the flags are updated in 2179 * thread-safe manner. 2180 */ 2181static void 2182arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) 2183{ 2184 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2185 2186 /* 2187 * Holes and embedded blocks will always have a psize = 0 so 2188 * we ignore the compression of the blkptr and set the 2189 * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF. 2190 * Holes and embedded blocks remain anonymous so we don't 2191 * want to uncompress them. Mark them as uncompressed. 
2192 */ 2193 if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { 2194 arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); 2195 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); 2196 ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); 2197 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); 2198 } else { 2199 arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); 2200 HDR_SET_COMPRESS(hdr, cmp); 2201 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); 2202 ASSERT(HDR_COMPRESSION_ENABLED(hdr)); 2203 } 2204} 2205 2206/* 2207 * Looks for another buf on the same hdr which has the data decompressed, copies 2208 * from it, and returns true. If no such buf exists, returns false. 2209 */ 2210static boolean_t 2211arc_buf_try_copy_decompressed_data(arc_buf_t *buf) 2212{ 2213 arc_buf_hdr_t *hdr = buf->b_hdr; 2214 boolean_t copied = B_FALSE; 2215 2216 ASSERT(HDR_HAS_L1HDR(hdr)); 2217 ASSERT3P(buf->b_data, !=, NULL); 2218 ASSERT(!ARC_BUF_COMPRESSED(buf)); 2219 2220 for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL; 2221 from = from->b_next) { 2222 /* can't use our own data buffer */ 2223 if (from == buf) { 2224 continue; 2225 } 2226 2227 if (!ARC_BUF_COMPRESSED(from)) { 2228 bcopy(from->b_data, buf->b_data, arc_buf_size(buf)); 2229 copied = B_TRUE; 2230 break; 2231 } 2232 } 2233 2234 /* 2235 * There were no decompressed bufs, so there should not be a 2236 * checksum on the hdr either. 2237 */ 2238 EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); 2239 2240 return (copied); 2241} 2242 2243/* 2244 * Given a buf that has a data buffer attached to it, this function will 2245 * efficiently fill the buf with data of the specified compression setting from 2246 * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr 2247 * are already sharing a data buf, no copy is performed. 2248 * 2249 * If the buf is marked as compressed but uncompressed data was requested, this 2250 * will allocate a new data buffer for the buf, remove that flag, and fill the 2251 * buf with uncompressed data. You can't request a compressed buf on a hdr with 2252 * uncompressed data, and (since we haven't added support for it yet) if you 2253 * want compressed data your buf must already be marked as compressed and have 2254 * the correct-sized data buffer. 2255 */ 2256static int 2257arc_buf_fill(arc_buf_t *buf, boolean_t compressed) 2258{ 2259 arc_buf_hdr_t *hdr = buf->b_hdr; 2260 boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); 2261 dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; 2262 2263 ASSERT3P(buf->b_data, !=, NULL); 2264 IMPLY(compressed, hdr_compressed); 2265 IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); 2266 2267 if (hdr_compressed == compressed) { 2268 if (!arc_buf_is_shared(buf)) { 2269 abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, 2270 arc_buf_size(buf)); 2271 } 2272 } else { 2273 ASSERT(hdr_compressed); 2274 ASSERT(!compressed); 2275 ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); 2276 2277 /* 2278 * If the buf is sharing its data with the hdr, unlink it and 2279 * allocate a new data buffer for the buf. 
2280 */ 2281 if (arc_buf_is_shared(buf)) { 2282 ASSERT(ARC_BUF_COMPRESSED(buf)); 2283 2284 /* We need to give the buf it's own b_data */ 2285 buf->b_flags &= ~ARC_BUF_FLAG_SHARED; 2286 buf->b_data = 2287 arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); 2288 arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); 2289 2290 /* Previously overhead was 0; just add new overhead */ 2291 ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); 2292 } else if (ARC_BUF_COMPRESSED(buf)) { 2293 /* We need to reallocate the buf's b_data */ 2294 arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), 2295 buf); 2296 buf->b_data = 2297 arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); 2298 2299 /* We increased the size of b_data; update overhead */ 2300 ARCSTAT_INCR(arcstat_overhead_size, 2301 HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); 2302 } 2303 2304 /* 2305 * Regardless of the buf's previous compression settings, it 2306 * should not be compressed at the end of this function. 2307 */ 2308 buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; 2309 2310 /* 2311 * Try copying the data from another buf which already has a 2312 * decompressed version. If that's not possible, it's time to 2313 * bite the bullet and decompress the data from the hdr. 2314 */ 2315 if (arc_buf_try_copy_decompressed_data(buf)) { 2316 /* Skip byteswapping and checksumming (already done) */ 2317 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL); 2318 return (0); 2319 } else { 2320 int error = zio_decompress_data(HDR_GET_COMPRESS(hdr), 2321 hdr->b_l1hdr.b_pabd, buf->b_data, 2322 HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); 2323 2324 /* 2325 * Absent hardware errors or software bugs, this should 2326 * be impossible, but log it anyway so we can debug it. 2327 */ 2328 if (error != 0) { 2329 zfs_dbgmsg( 2330 "hdr %p, compress %d, psize %d, lsize %d", 2331 hdr, HDR_GET_COMPRESS(hdr), 2332 HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); 2333 return (SET_ERROR(EIO)); 2334 } 2335 } 2336 } 2337 2338 /* Byteswap the buf's data if necessary */ 2339 if (bswap != DMU_BSWAP_NUMFUNCS) { 2340 ASSERT(!HDR_SHARED_DATA(hdr)); 2341 ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); 2342 dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); 2343 } 2344 2345 /* Compute the hdr's checksum if necessary */ 2346 arc_cksum_compute(buf); 2347 2348 return (0); 2349} 2350 2351int 2352arc_decompress(arc_buf_t *buf) 2353{ 2354 return (arc_buf_fill(buf, B_FALSE)); 2355} 2356 2357/* 2358 * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. 2359 */ 2360static uint64_t 2361arc_hdr_size(arc_buf_hdr_t *hdr) 2362{ 2363 uint64_t size; 2364 2365 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && 2366 HDR_GET_PSIZE(hdr) > 0) { 2367 size = HDR_GET_PSIZE(hdr); 2368 } else { 2369 ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); 2370 size = HDR_GET_LSIZE(hdr); 2371 } 2372 return (size); 2373} 2374 2375/* 2376 * Increment the amount of evictable space in the arc_state_t's refcount. 2377 * We account for the space used by the hdr and the arc buf individually 2378 * so that we can add and remove them from the refcount individually. 
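 *
 * Roughly, for a header in a non-ghost state this charges
 *
 *	arc_hdr_size(hdr)	when b_pabd is allocated, plus
 *	arc_buf_size(buf)	for every buf not sharing b_pabd
 *
 * while a header on a ghost list is charged HDR_GET_LSIZE(hdr) only,
 * since ghost headers retain no data buffers.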
2379 */ 2380static void 2381arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) 2382{ 2383 arc_buf_contents_t type = arc_buf_type(hdr); 2384 2385 ASSERT(HDR_HAS_L1HDR(hdr)); 2386 2387 if (GHOST_STATE(state)) { 2388 ASSERT0(hdr->b_l1hdr.b_bufcnt); 2389 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2390 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2391 (void) refcount_add_many(&state->arcs_esize[type], 2392 HDR_GET_LSIZE(hdr), hdr); 2393 return; 2394 } 2395 2396 ASSERT(!GHOST_STATE(state)); 2397 if (hdr->b_l1hdr.b_pabd != NULL) { 2398 (void) refcount_add_many(&state->arcs_esize[type], 2399 arc_hdr_size(hdr), hdr); 2400 } 2401 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2402 buf = buf->b_next) { 2403 if (arc_buf_is_shared(buf)) 2404 continue; 2405 (void) refcount_add_many(&state->arcs_esize[type], 2406 arc_buf_size(buf), buf); 2407 } 2408} 2409 2410/* 2411 * Decrement the amount of evictable space in the arc_state_t's refcount. 2412 * We account for the space used by the hdr and the arc buf individually 2413 * so that we can add and remove them from the refcount individually. 2414 */ 2415static void 2416arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) 2417{ 2418 arc_buf_contents_t type = arc_buf_type(hdr); 2419 2420 ASSERT(HDR_HAS_L1HDR(hdr)); 2421 2422 if (GHOST_STATE(state)) { 2423 ASSERT0(hdr->b_l1hdr.b_bufcnt); 2424 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2425 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2426 (void) refcount_remove_many(&state->arcs_esize[type], 2427 HDR_GET_LSIZE(hdr), hdr); 2428 return; 2429 } 2430 2431 ASSERT(!GHOST_STATE(state)); 2432 if (hdr->b_l1hdr.b_pabd != NULL) { 2433 (void) refcount_remove_many(&state->arcs_esize[type], 2434 arc_hdr_size(hdr), hdr); 2435 } 2436 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2437 buf = buf->b_next) { 2438 if (arc_buf_is_shared(buf)) 2439 continue; 2440 (void) refcount_remove_many(&state->arcs_esize[type], 2441 arc_buf_size(buf), buf); 2442 } 2443} 2444 2445/* 2446 * Add a reference to this hdr indicating that someone is actively 2447 * referencing that memory. When the refcount transitions from 0 to 1, 2448 * we remove it from the respective arc_state_t list to indicate that 2449 * it is not evictable. 2450 */ 2451static void 2452add_reference(arc_buf_hdr_t *hdr, void *tag) 2453{ 2454 ASSERT(HDR_HAS_L1HDR(hdr)); 2455 if (!MUTEX_HELD(HDR_LOCK(hdr))) { 2456 ASSERT(hdr->b_l1hdr.b_state == arc_anon); 2457 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2458 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2459 } 2460 2461 arc_state_t *state = hdr->b_l1hdr.b_state; 2462 2463 if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && 2464 (state != arc_anon)) { 2465 /* We don't use the L2-only state list. */ 2466 if (state != arc_l2c_only) { 2467 multilist_remove(state->arcs_list[arc_buf_type(hdr)], 2468 hdr); 2469 arc_evictable_space_decrement(hdr, state); 2470 } 2471 /* remove the prefetch flag if we get a reference */ 2472 arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); 2473 } 2474} 2475 2476/* 2477 * Remove a reference from this hdr. When the reference transitions from 2478 * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's 2479 * list making it eligible for eviction. 
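 *
 * Callers pair this with add_reference() above using the same tag; a
 * minimal sketch, with my_tag standing in for the caller's tag, is:
 *
 *	add_reference(hdr, my_tag);
 *	... the header is un-evictable while the work proceeds ...
 *	(void) remove_reference(hdr, hash_lock, my_tag);
 *
 * where hash_lock must be held unless the header is anonymous.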
2480 */ 2481static int 2482remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 2483{ 2484 int cnt; 2485 arc_state_t *state = hdr->b_l1hdr.b_state; 2486 2487 ASSERT(HDR_HAS_L1HDR(hdr)); 2488 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 2489 ASSERT(!GHOST_STATE(state)); 2490 2491 /* 2492 * arc_l2c_only counts as a ghost state so we don't need to explicitly 2493 * check to prevent usage of the arc_l2c_only list. 2494 */ 2495 if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && 2496 (state != arc_anon)) { 2497 multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr); 2498 ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); 2499 arc_evictable_space_increment(hdr, state); 2500 } 2501 return (cnt); 2502} 2503 2504/* 2505 * Move the supplied buffer to the indicated state. The hash lock 2506 * for the buffer must be held by the caller. 2507 */ 2508static void 2509arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, 2510 kmutex_t *hash_lock) 2511{ 2512 arc_state_t *old_state; 2513 int64_t refcnt; 2514 uint32_t bufcnt; 2515 boolean_t update_old, update_new; 2516 arc_buf_contents_t buftype = arc_buf_type(hdr); 2517 2518 /* 2519 * We almost always have an L1 hdr here, since we call arc_hdr_realloc() 2520 * in arc_read() when bringing a buffer out of the L2ARC. However, the 2521 * L1 hdr doesn't always exist when we change state to arc_anon before 2522 * destroying a header, in which case reallocating to add the L1 hdr is 2523 * pointless. 2524 */ 2525 if (HDR_HAS_L1HDR(hdr)) { 2526 old_state = hdr->b_l1hdr.b_state; 2527 refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); 2528 bufcnt = hdr->b_l1hdr.b_bufcnt; 2529 update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL); 2530 } else { 2531 old_state = arc_l2c_only; 2532 refcnt = 0; 2533 bufcnt = 0; 2534 update_old = B_FALSE; 2535 } 2536 update_new = update_old; 2537 2538 ASSERT(MUTEX_HELD(hash_lock)); 2539 ASSERT3P(new_state, !=, old_state); 2540 ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); 2541 ASSERT(old_state != arc_anon || bufcnt <= 1); 2542 2543 /* 2544 * If this buffer is evictable, transfer it from the 2545 * old state list to the new state list. 2546 */ 2547 if (refcnt == 0) { 2548 if (old_state != arc_anon && old_state != arc_l2c_only) { 2549 ASSERT(HDR_HAS_L1HDR(hdr)); 2550 multilist_remove(old_state->arcs_list[buftype], hdr); 2551 2552 if (GHOST_STATE(old_state)) { 2553 ASSERT0(bufcnt); 2554 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2555 update_old = B_TRUE; 2556 } 2557 arc_evictable_space_decrement(hdr, old_state); 2558 } 2559 if (new_state != arc_anon && new_state != arc_l2c_only) { 2560 2561 /* 2562 * An L1 header always exists here, since if we're 2563 * moving to some L1-cached state (i.e. not l2c_only or 2564 * anonymous), we realloc the header to add an L1hdr 2565 * beforehand. 2566 */ 2567 ASSERT(HDR_HAS_L1HDR(hdr)); 2568 multilist_insert(new_state->arcs_list[buftype], hdr); 2569 2570 if (GHOST_STATE(new_state)) { 2571 ASSERT0(bufcnt); 2572 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2573 update_new = B_TRUE; 2574 } 2575 arc_evictable_space_increment(hdr, new_state); 2576 } 2577 } 2578 2579 ASSERT(!HDR_EMPTY(hdr)); 2580 if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) 2581 buf_hash_remove(hdr); 2582 2583 /* adjust state sizes (ignore arc_l2c_only) */ 2584 2585 if (update_new && new_state != arc_l2c_only) { 2586 ASSERT(HDR_HAS_L1HDR(hdr)); 2587 if (GHOST_STATE(new_state)) { 2588 ASSERT0(bufcnt); 2589 2590 /* 2591 * When moving a header to a ghost state, we first 2592 * remove all arc buffers. 
Thus, we'll have a 2593 * bufcnt of zero, and no arc buffer to use for 2594 * the reference. As a result, we use the arc 2595 * header pointer for the reference. 2596 */ 2597 (void) refcount_add_many(&new_state->arcs_size, 2598 HDR_GET_LSIZE(hdr), hdr); 2599 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2600 } else { 2601 uint32_t buffers = 0; 2602 2603 /* 2604 * Each individual buffer holds a unique reference, 2605 * thus we must remove each of these references one 2606 * at a time. 2607 */ 2608 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2609 buf = buf->b_next) { 2610 ASSERT3U(bufcnt, !=, 0); 2611 buffers++; 2612 2613 /* 2614 * When the arc_buf_t is sharing the data 2615 * block with the hdr, the owner of the 2616 * reference belongs to the hdr. Only 2617 * add to the refcount if the arc_buf_t is 2618 * not shared. 2619 */ 2620 if (arc_buf_is_shared(buf)) 2621 continue; 2622 2623 (void) refcount_add_many(&new_state->arcs_size, 2624 arc_buf_size(buf), buf); 2625 } 2626 ASSERT3U(bufcnt, ==, buffers); 2627 2628 if (hdr->b_l1hdr.b_pabd != NULL) { 2629 (void) refcount_add_many(&new_state->arcs_size, 2630 arc_hdr_size(hdr), hdr); 2631 } else { 2632 ASSERT(GHOST_STATE(old_state)); 2633 } 2634 } 2635 } 2636 2637 if (update_old && old_state != arc_l2c_only) { 2638 ASSERT(HDR_HAS_L1HDR(hdr)); 2639 if (GHOST_STATE(old_state)) { 2640 ASSERT0(bufcnt); 2641 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2642 2643 /* 2644 * When moving a header off of a ghost state, 2645 * the header will not contain any arc buffers. 2646 * We use the arc header pointer for the reference 2647 * which is exactly what we did when we put the 2648 * header on the ghost state. 2649 */ 2650 2651 (void) refcount_remove_many(&old_state->arcs_size, 2652 HDR_GET_LSIZE(hdr), hdr); 2653 } else { 2654 uint32_t buffers = 0; 2655 2656 /* 2657 * Each individual buffer holds a unique reference, 2658 * thus we must remove each of these references one 2659 * at a time. 2660 */ 2661 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2662 buf = buf->b_next) { 2663 ASSERT3U(bufcnt, !=, 0); 2664 buffers++; 2665 2666 /* 2667 * When the arc_buf_t is sharing the data 2668 * block with the hdr, the owner of the 2669 * reference belongs to the hdr. Only 2670 * add to the refcount if the arc_buf_t is 2671 * not shared. 2672 */ 2673 if (arc_buf_is_shared(buf)) 2674 continue; 2675 2676 (void) refcount_remove_many( 2677 &old_state->arcs_size, arc_buf_size(buf), 2678 buf); 2679 } 2680 ASSERT3U(bufcnt, ==, buffers); 2681 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 2682 (void) refcount_remove_many( 2683 &old_state->arcs_size, arc_hdr_size(hdr), hdr); 2684 } 2685 } 2686 2687 if (HDR_HAS_L1HDR(hdr)) 2688 hdr->b_l1hdr.b_state = new_state; 2689 2690 /* 2691 * L2 headers should never be on the L2 state list since they don't 2692 * have L1 headers allocated. 
2693 */ 2694 ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && 2695 multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); 2696} 2697 2698void 2699arc_space_consume(uint64_t space, arc_space_type_t type) 2700{ 2701 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 2702 2703 switch (type) { 2704 case ARC_SPACE_DATA: 2705 aggsum_add(&astat_data_size, space); 2706 break; 2707 case ARC_SPACE_META: 2708 aggsum_add(&astat_metadata_size, space); 2709 break; 2710 case ARC_SPACE_OTHER: 2711 aggsum_add(&astat_other_size, space); 2712 break; 2713 case ARC_SPACE_HDRS: 2714 aggsum_add(&astat_hdr_size, space); 2715 break; 2716 case ARC_SPACE_L2HDRS: 2717 aggsum_add(&astat_l2_hdr_size, space); 2718 break; 2719 } 2720 2721 if (type != ARC_SPACE_DATA) 2722 aggsum_add(&arc_meta_used, space); 2723 2724 aggsum_add(&arc_size, space); 2725} 2726 2727void 2728arc_space_return(uint64_t space, arc_space_type_t type) 2729{ 2730 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 2731 2732 switch (type) { 2733 case ARC_SPACE_DATA: 2734 aggsum_add(&astat_data_size, -space); 2735 break; 2736 case ARC_SPACE_META: 2737 aggsum_add(&astat_metadata_size, -space); 2738 break; 2739 case ARC_SPACE_OTHER: 2740 aggsum_add(&astat_other_size, -space); 2741 break; 2742 case ARC_SPACE_HDRS: 2743 aggsum_add(&astat_hdr_size, -space); 2744 break; 2745 case ARC_SPACE_L2HDRS: 2746 aggsum_add(&astat_l2_hdr_size, -space); 2747 break; 2748 } 2749 2750 if (type != ARC_SPACE_DATA) { 2751 ASSERT(aggsum_compare(&arc_meta_used, space) >= 0); 2752 /* 2753 * We use the upper bound here rather than the precise value 2754 * because the arc_meta_max value doesn't need to be 2755 * precise. It's only consumed by humans via arcstats. 2756 */ 2757 if (arc_meta_max < aggsum_upper_bound(&arc_meta_used)) 2758 arc_meta_max = aggsum_upper_bound(&arc_meta_used); 2759 aggsum_add(&arc_meta_used, -space); 2760 } 2761 2762 ASSERT(aggsum_compare(&arc_size, space) >= 0); 2763 aggsum_add(&arc_size, -space); 2764} 2765 2766/* 2767 * Given a hdr and a buf, returns whether that buf can share its b_data buffer 2768 * with the hdr's b_pabd. 2769 */ 2770static boolean_t 2771arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) 2772{ 2773 /* 2774 * The criteria for sharing a hdr's data are: 2775 * 1. the hdr's compression matches the buf's compression 2776 * 2. the hdr doesn't need to be byteswapped 2777 * 3. the hdr isn't already being shared 2778 * 4. the buf is either compressed or it is the last buf in the hdr list 2779 * 2780 * Criterion #4 maintains the invariant that shared uncompressed 2781 * bufs must be the final buf in the hdr's b_buf list. Reading this, you 2782 * might ask, "if a compressed buf is allocated first, won't that be the 2783 * last thing in the list?", but in that case it's impossible to create 2784 * a shared uncompressed buf anyway (because the hdr must be compressed 2785 * to have the compressed buf). You might also think that #3 is 2786 * sufficient to make this guarantee, however it's possible 2787 * (specifically in the rare L2ARC write race mentioned in 2788 * arc_buf_alloc_impl()) there will be an existing uncompressed buf that 2789 * is sharable, but wasn't at the time of its allocation. Rather than 2790 * allow a new shared uncompressed buf to be created and then shuffle 2791 * the list around to make it the last element, this simply disallows 2792 * sharing if the new buf isn't the first to be added. 
2793 */ 2794 ASSERT3P(buf->b_hdr, ==, hdr); 2795 boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF; 2796 boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; 2797 return (buf_compressed == hdr_compressed && 2798 hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && 2799 !HDR_SHARED_DATA(hdr) && 2800 (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); 2801} 2802 2803/* 2804 * Allocate a buf for this hdr. If you care about the data that's in the hdr, 2805 * or if you want a compressed buffer, pass those flags in. Returns 0 if the 2806 * copy was made successfully, or an error code otherwise. 2807 */ 2808static int 2809arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, 2810 boolean_t fill, arc_buf_t **ret) 2811{ 2812 arc_buf_t *buf; 2813 2814 ASSERT(HDR_HAS_L1HDR(hdr)); 2815 ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); 2816 VERIFY(hdr->b_type == ARC_BUFC_DATA || 2817 hdr->b_type == ARC_BUFC_METADATA); 2818 ASSERT3P(ret, !=, NULL); 2819 ASSERT3P(*ret, ==, NULL); 2820 2821 buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2822 buf->b_hdr = hdr; 2823 buf->b_data = NULL; 2824 buf->b_next = hdr->b_l1hdr.b_buf; 2825 buf->b_flags = 0; 2826 2827 add_reference(hdr, tag); 2828 2829 /* 2830 * We're about to change the hdr's b_flags. We must either 2831 * hold the hash_lock or be undiscoverable. 2832 */ 2833 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2834 2835 /* 2836 * Only honor requests for compressed bufs if the hdr is actually 2837 * compressed. 2838 */ 2839 if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) 2840 buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; 2841 2842 /* 2843 * If the hdr's data can be shared then we share the data buffer and 2844 * set the appropriate bit in the hdr's b_flags to indicate the hdr is 2845 * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new 2846 * buffer to store the buf's data. 2847 * 2848 * There are two additional restrictions here because we're sharing 2849 * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be 2850 * actively involved in an L2ARC write, because if this buf is used by 2851 * an arc_write() then the hdr's data buffer will be released when the 2852 * write completes, even though the L2ARC write might still be using it. 2853 * Second, the hdr's ABD must be linear so that the buf's user doesn't 2854 * need to be ABD-aware. 2855 */ 2856 boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && 2857 abd_is_linear(hdr->b_l1hdr.b_pabd); 2858 2859 /* Set up b_data and sharing */ 2860 if (can_share) { 2861 buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); 2862 buf->b_flags |= ARC_BUF_FLAG_SHARED; 2863 arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); 2864 } else { 2865 buf->b_data = 2866 arc_get_data_buf(hdr, arc_buf_size(buf), buf); 2867 ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); 2868 } 2869 VERIFY3P(buf->b_data, !=, NULL); 2870 2871 hdr->b_l1hdr.b_buf = buf; 2872 hdr->b_l1hdr.b_bufcnt += 1; 2873 2874 /* 2875 * If the user wants the data from the hdr, we need to either copy or 2876 * decompress the data. 
2877 */ 2878 if (fill) { 2879 return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0)); 2880 } 2881 2882 return (0); 2883} 2884 2885static char *arc_onloan_tag = "onloan"; 2886 2887static inline void 2888arc_loaned_bytes_update(int64_t delta) 2889{ 2890 atomic_add_64(&arc_loaned_bytes, delta); 2891 2892 /* assert that it did not wrap around */ 2893 ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); 2894} 2895 2896/* 2897 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 2898 * flight data by arc_tempreserve_space() until they are "returned". Loaned 2899 * buffers must be returned to the arc before they can be used by the DMU or 2900 * freed. 2901 */ 2902arc_buf_t * 2903arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) 2904{ 2905 arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, 2906 is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); 2907 2908 arc_loaned_bytes_update(arc_buf_size(buf)); 2909 2910 return (buf); 2911} 2912 2913arc_buf_t * 2914arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, 2915 enum zio_compress compression_type) 2916{ 2917 arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, 2918 psize, lsize, compression_type); 2919 2920 arc_loaned_bytes_update(arc_buf_size(buf)); 2921 2922 return (buf); 2923} 2924 2925 2926/* 2927 * Return a loaned arc buffer to the arc. 2928 */ 2929void 2930arc_return_buf(arc_buf_t *buf, void *tag) 2931{ 2932 arc_buf_hdr_t *hdr = buf->b_hdr; 2933 2934 ASSERT3P(buf->b_data, !=, NULL); 2935 ASSERT(HDR_HAS_L1HDR(hdr)); 2936 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 2937 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2938 2939 arc_loaned_bytes_update(-arc_buf_size(buf)); 2940} 2941 2942/* Detach an arc_buf from a dbuf (tag) */ 2943void 2944arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 2945{ 2946 arc_buf_hdr_t *hdr = buf->b_hdr; 2947 2948 ASSERT3P(buf->b_data, !=, NULL); 2949 ASSERT(HDR_HAS_L1HDR(hdr)); 2950 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2951 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); 2952 2953 arc_loaned_bytes_update(arc_buf_size(buf)); 2954} 2955 2956static void 2957l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) 2958{ 2959 l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); 2960 2961 df->l2df_abd = abd; 2962 df->l2df_size = size; 2963 df->l2df_type = type; 2964 mutex_enter(&l2arc_free_on_write_mtx); 2965 list_insert_head(l2arc_free_on_write, df); 2966 mutex_exit(&l2arc_free_on_write_mtx); 2967} 2968 2969static void 2970arc_hdr_free_on_write(arc_buf_hdr_t *hdr) 2971{ 2972 arc_state_t *state = hdr->b_l1hdr.b_state; 2973 arc_buf_contents_t type = arc_buf_type(hdr); 2974 uint64_t size = arc_hdr_size(hdr); 2975 2976 /* protected by hash lock, if in the hash table */ 2977 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 2978 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2979 ASSERT(state != arc_anon && state != arc_l2c_only); 2980 2981 (void) refcount_remove_many(&state->arcs_esize[type], 2982 size, hdr); 2983 } 2984 (void) refcount_remove_many(&state->arcs_size, size, hdr); 2985 if (type == ARC_BUFC_METADATA) { 2986 arc_space_return(size, ARC_SPACE_META); 2987 } else { 2988 ASSERT(type == ARC_BUFC_DATA); 2989 arc_space_return(size, ARC_SPACE_DATA); 2990 } 2991 2992 l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); 2993} 2994 2995/* 2996 * Share the arc_buf_t's data with the hdr. 
Whenever we are sharing the 2997 * data buffer, we transfer the refcount ownership to the hdr and update 2998 * the appropriate kstats. 2999 */ 3000static void 3001arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) 3002{ 3003 arc_state_t *state = hdr->b_l1hdr.b_state; 3004 3005 ASSERT(arc_can_share(hdr, buf)); 3006 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 3007 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 3008 3009 /* 3010 * Start sharing the data buffer. We transfer the 3011 * refcount ownership to the hdr since it always owns 3012 * the refcount whenever an arc_buf_t is shared. 3013 */ 3014 refcount_transfer_ownership(&state->arcs_size, buf, hdr); 3015 hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); 3016 abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, 3017 HDR_ISTYPE_METADATA(hdr)); 3018 arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); 3019 buf->b_flags |= ARC_BUF_FLAG_SHARED; 3020 3021 /* 3022 * Since we've transferred ownership to the hdr we need 3023 * to increment its compressed and uncompressed kstats and 3024 * decrement the overhead size. 3025 */ 3026 ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); 3027 ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); 3028 ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); 3029} 3030 3031static void 3032arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) 3033{ 3034 arc_state_t *state = hdr->b_l1hdr.b_state; 3035 3036 ASSERT(arc_buf_is_shared(buf)); 3037 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 3038 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 3039 3040 /* 3041 * We are no longer sharing this buffer so we need 3042 * to transfer its ownership to the rightful owner. 3043 */ 3044 refcount_transfer_ownership(&state->arcs_size, hdr, buf); 3045 arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); 3046 abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); 3047 abd_put(hdr->b_l1hdr.b_pabd); 3048 hdr->b_l1hdr.b_pabd = NULL; 3049 buf->b_flags &= ~ARC_BUF_FLAG_SHARED; 3050 3051 /* 3052 * Since the buffer is no longer shared between 3053 * the arc buf and the hdr, count it as overhead. 3054 */ 3055 ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); 3056 ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); 3057 ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); 3058} 3059 3060/* 3061 * Remove an arc_buf_t from the hdr's buf list and return the last 3062 * arc_buf_t on the list. If no buffers remain on the list then return 3063 * NULL. 3064 */ 3065static arc_buf_t * 3066arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) 3067{ 3068 ASSERT(HDR_HAS_L1HDR(hdr)); 3069 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 3070 3071 arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; 3072 arc_buf_t *lastbuf = NULL; 3073 3074 /* 3075 * Remove the buf from the hdr list and locate the last 3076 * remaining buffer on the list. 3077 */ 3078 while (*bufp != NULL) { 3079 if (*bufp == buf) 3080 *bufp = buf->b_next; 3081 3082 /* 3083 * If we've removed a buffer in the middle of 3084 * the list then update the lastbuf and update 3085 * bufp. 
3086 */ 3087 if (*bufp != NULL) { 3088 lastbuf = *bufp; 3089 bufp = &(*bufp)->b_next; 3090 } 3091 } 3092 buf->b_next = NULL; 3093 ASSERT3P(lastbuf, !=, buf); 3094 IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); 3095 IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); 3096 IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); 3097 3098 return (lastbuf); 3099} 3100 3101/* 3102 * Free up buf->b_data and pull the arc_buf_t off of the the arc_buf_hdr_t's 3103 * list and free it. 3104 */ 3105static void 3106arc_buf_destroy_impl(arc_buf_t *buf) 3107{ 3108 arc_buf_hdr_t *hdr = buf->b_hdr; 3109 3110 /* 3111 * Free up the data associated with the buf but only if we're not 3112 * sharing this with the hdr. If we are sharing it with the hdr, the 3113 * hdr is responsible for doing the free. 3114 */ 3115 if (buf->b_data != NULL) { 3116 /* 3117 * We're about to change the hdr's b_flags. We must either 3118 * hold the hash_lock or be undiscoverable. 3119 */ 3120 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 3121 3122 arc_cksum_verify(buf); 3123#ifdef illumos 3124 arc_buf_unwatch(buf); 3125#endif 3126 3127 if (arc_buf_is_shared(buf)) { 3128 arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); 3129 } else { 3130 uint64_t size = arc_buf_size(buf); 3131 arc_free_data_buf(hdr, buf->b_data, size, buf); 3132 ARCSTAT_INCR(arcstat_overhead_size, -size); 3133 } 3134 buf->b_data = NULL; 3135 3136 ASSERT(hdr->b_l1hdr.b_bufcnt > 0); 3137 hdr->b_l1hdr.b_bufcnt -= 1; 3138 } 3139 3140 arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); 3141 3142 if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { 3143 /* 3144 * If the current arc_buf_t is sharing its data buffer with the 3145 * hdr, then reassign the hdr's b_pabd to share it with the new 3146 * buffer at the end of the list. The shared buffer is always 3147 * the last one on the hdr's buffer list. 3148 * 3149 * There is an equivalent case for compressed bufs, but since 3150 * they aren't guaranteed to be the last buf in the list and 3151 * that is an exceedingly rare case, we just allow that space be 3152 * wasted temporarily. 3153 */ 3154 if (lastbuf != NULL) { 3155 /* Only one buf can be shared at once */ 3156 VERIFY(!arc_buf_is_shared(lastbuf)); 3157 /* hdr is uncompressed so can't have compressed buf */ 3158 VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); 3159 3160 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 3161 arc_hdr_free_pabd(hdr); 3162 3163 /* 3164 * We must setup a new shared block between the 3165 * last buffer and the hdr. The data would have 3166 * been allocated by the arc buf so we need to transfer 3167 * ownership to the hdr since it's now being shared. 3168 */ 3169 arc_share_buf(hdr, lastbuf); 3170 } 3171 } else if (HDR_SHARED_DATA(hdr)) { 3172 /* 3173 * Uncompressed shared buffers are always at the end 3174 * of the list. Compressed buffers don't have the 3175 * same requirements. This makes it hard to 3176 * simply assert that the lastbuf is shared so 3177 * we rely on the hdr's compression flags to determine 3178 * if we have a compressed, shared buffer. 3179 */ 3180 ASSERT3P(lastbuf, !=, NULL); 3181 ASSERT(arc_buf_is_shared(lastbuf) || 3182 HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); 3183 } 3184 3185 /* 3186 * Free the checksum if we're removing the last uncompressed buf from 3187 * this hdr. 
3188 */ 3189 if (!arc_hdr_has_uncompressed_buf(hdr)) { 3190 arc_cksum_free(hdr); 3191 } 3192 3193 /* clean up the buf */ 3194 buf->b_hdr = NULL; 3195 kmem_cache_free(buf_cache, buf); 3196} 3197 3198static void 3199arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr) 3200{ 3201 ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); 3202 ASSERT(HDR_HAS_L1HDR(hdr)); 3203 ASSERT(!HDR_SHARED_DATA(hdr)); 3204 3205 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 3206 hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); 3207 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; 3208 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 3209 3210 ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); 3211 ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); 3212} 3213 3214static void 3215arc_hdr_free_pabd(arc_buf_hdr_t *hdr) 3216{ 3217 ASSERT(HDR_HAS_L1HDR(hdr)); 3218 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 3219 3220 /* 3221 * If the hdr is currently being written to the l2arc then 3222 * we defer freeing the data by adding it to the l2arc_free_on_write 3223 * list. The l2arc will free the data once it's finished 3224 * writing it to the l2arc device. 3225 */ 3226 if (HDR_L2_WRITING(hdr)) { 3227 arc_hdr_free_on_write(hdr); 3228 ARCSTAT_BUMP(arcstat_l2_free_on_write); 3229 } else { 3230 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, 3231 arc_hdr_size(hdr), hdr); 3232 } 3233 hdr->b_l1hdr.b_pabd = NULL; 3234 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; 3235 3236 ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); 3237 ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); 3238} 3239 3240static arc_buf_hdr_t * 3241arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, 3242 enum zio_compress compression_type, arc_buf_contents_t type) 3243{ 3244 arc_buf_hdr_t *hdr; 3245 3246 VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); 3247 3248 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 3249 ASSERT(HDR_EMPTY(hdr)); 3250 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); 3251 ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL); 3252 HDR_SET_PSIZE(hdr, psize); 3253 HDR_SET_LSIZE(hdr, lsize); 3254 hdr->b_spa = spa; 3255 hdr->b_type = type; 3256 hdr->b_flags = 0; 3257 arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); 3258 arc_hdr_set_compress(hdr, compression_type); 3259 3260 hdr->b_l1hdr.b_state = arc_anon; 3261 hdr->b_l1hdr.b_arc_access = 0; 3262 hdr->b_l1hdr.b_bufcnt = 0; 3263 hdr->b_l1hdr.b_buf = NULL; 3264 3265 /* 3266 * Allocate the hdr's buffer. This will contain either 3267 * the compressed or uncompressed data depending on the block 3268 * it references and compressed arc enablement. 3269 */ 3270 arc_hdr_alloc_pabd(hdr); 3271 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3272 3273 return (hdr); 3274} 3275 3276/* 3277 * Transition between the two allocation states for the arc_buf_hdr struct. 3278 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without 3279 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller 3280 * version is used when a cache buffer is only in the L2ARC in order to reduce 3281 * memory usage. 
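 *
 * For example, bringing a header back from the L2ARC into the L1 cache
 * (as arc_read() does) goes in this direction:
 *
 *	hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache);
 *
 * while eviction to an L2-only header passes hdr_full_cache as the old
 * cache and hdr_l2only_cache as the new one.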
3282 */ 3283static arc_buf_hdr_t * 3284arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) 3285{ 3286 ASSERT(HDR_HAS_L2HDR(hdr)); 3287 3288 arc_buf_hdr_t *nhdr; 3289 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 3290 3291 ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || 3292 (old == hdr_l2only_cache && new == hdr_full_cache)); 3293 3294 nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); 3295 3296 ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); 3297 buf_hash_remove(hdr); 3298 3299 bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); 3300 3301 if (new == hdr_full_cache) { 3302 arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); 3303 /* 3304 * arc_access and arc_change_state need to be aware that a 3305 * header has just come out of L2ARC, so we set its state to 3306 * l2c_only even though it's about to change. 3307 */ 3308 nhdr->b_l1hdr.b_state = arc_l2c_only; 3309 3310 /* Verify previous threads set to NULL before freeing */ 3311 ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); 3312 } else { 3313 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 3314 ASSERT0(hdr->b_l1hdr.b_bufcnt); 3315 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); 3316 3317 /* 3318 * If we've reached here, We must have been called from 3319 * arc_evict_hdr(), as such we should have already been 3320 * removed from any ghost list we were previously on 3321 * (which protects us from racing with arc_evict_state), 3322 * thus no locking is needed during this check. 3323 */ 3324 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 3325 3326 /* 3327 * A buffer must not be moved into the arc_l2c_only 3328 * state if it's not finished being written out to the 3329 * l2arc device. Otherwise, the b_l1hdr.b_pabd field 3330 * might try to be accessed, even though it was removed. 3331 */ 3332 VERIFY(!HDR_L2_WRITING(hdr)); 3333 VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); 3334 3335#ifdef ZFS_DEBUG 3336 if (hdr->b_l1hdr.b_thawed != NULL) { 3337 kmem_free(hdr->b_l1hdr.b_thawed, 1); 3338 hdr->b_l1hdr.b_thawed = NULL; 3339 } 3340#endif 3341 3342 arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); 3343 } 3344 /* 3345 * The header has been reallocated so we need to re-insert it into any 3346 * lists it was on. 3347 */ 3348 (void) buf_hash_insert(nhdr, NULL); 3349 3350 ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); 3351 3352 mutex_enter(&dev->l2ad_mtx); 3353 3354 /* 3355 * We must place the realloc'ed header back into the list at 3356 * the same spot. Otherwise, if it's placed earlier in the list, 3357 * l2arc_write_buffers() could find it during the function's 3358 * write phase, and try to write it out to the l2arc. 3359 */ 3360 list_insert_after(&dev->l2ad_buflist, hdr, nhdr); 3361 list_remove(&dev->l2ad_buflist, hdr); 3362 3363 mutex_exit(&dev->l2ad_mtx); 3364 3365 /* 3366 * Since we're using the pointer address as the tag when 3367 * incrementing and decrementing the l2ad_alloc refcount, we 3368 * must remove the old pointer (that we're about to destroy) and 3369 * add the new pointer to the refcount. Otherwise we'd remove 3370 * the wrong pointer address when calling arc_hdr_destroy() later. 3371 */ 3372 3373 (void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); 3374 (void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); 3375 3376 buf_discard_identity(hdr); 3377 kmem_cache_free(old, hdr); 3378 3379 return (nhdr); 3380} 3381 3382/* 3383 * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. 3384 * The buf is returned thawed since we expect the consumer to modify it. 
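 *
 * A minimal usage sketch (the tag, source pointer and size below are
 * hypothetical; only the ARC calls themselves are taken from this file):
 *
 *	arc_buf_t *buf = arc_alloc_buf(spa, FTAG, ARC_BUFC_DATA, size);
 *	bcopy(src, buf->b_data, size);     fill in the new contents
 *	...                                hand the buf off, or
 *	arc_buf_destroy(buf, FTAG);        drop the reference when done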
3385 */ 3386arc_buf_t * 3387arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) 3388{ 3389 arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, 3390 ZIO_COMPRESS_OFF, type); 3391 ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); 3392 3393 arc_buf_t *buf = NULL; 3394 VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf)); 3395 arc_buf_thaw(buf); 3396 3397 return (buf); 3398} 3399 3400/* 3401 * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this 3402 * for bufs containing metadata. 3403 */ 3404arc_buf_t * 3405arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, 3406 enum zio_compress compression_type) 3407{ 3408 ASSERT3U(lsize, >, 0); 3409 ASSERT3U(lsize, >=, psize); 3410 ASSERT(compression_type > ZIO_COMPRESS_OFF); 3411 ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS); 3412 3413 arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, 3414 compression_type, ARC_BUFC_DATA); 3415 ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); 3416 3417 arc_buf_t *buf = NULL; 3418 VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf)); 3419 arc_buf_thaw(buf); 3420 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); 3421 3422 if (!arc_buf_is_shared(buf)) { 3423 /* 3424 * To ensure that the hdr has the correct data in it if we call 3425 * arc_decompress() on this buf before it's been written to 3426 * disk, it's easiest if we just set up sharing between the 3427 * buf and the hdr. 3428 */ 3429 ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd)); 3430 arc_hdr_free_pabd(hdr); 3431 arc_share_buf(hdr, buf); 3432 } 3433 3434 return (buf); 3435} 3436 3437static void 3438arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) 3439{ 3440 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 3441 l2arc_dev_t *dev = l2hdr->b_dev; 3442 uint64_t psize = arc_hdr_size(hdr); 3443 3444 ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); 3445 ASSERT(HDR_HAS_L2HDR(hdr)); 3446 3447 list_remove(&dev->l2ad_buflist, hdr); 3448 3449 ARCSTAT_INCR(arcstat_l2_psize, -psize); 3450 ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); 3451 3452 vdev_space_update(dev->l2ad_vdev, -psize, 0, 0); 3453 3454 (void) refcount_remove_many(&dev->l2ad_alloc, psize, hdr); 3455 arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); 3456} 3457 3458static void 3459arc_hdr_destroy(arc_buf_hdr_t *hdr) 3460{ 3461 if (HDR_HAS_L1HDR(hdr)) { 3462 ASSERT(hdr->b_l1hdr.b_buf == NULL || 3463 hdr->b_l1hdr.b_bufcnt > 0); 3464 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3465 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 3466 } 3467 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3468 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 3469 3470 if (!HDR_EMPTY(hdr)) 3471 buf_discard_identity(hdr); 3472 3473 if (HDR_HAS_L2HDR(hdr)) { 3474 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 3475 boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); 3476 3477 if (!buflist_held) 3478 mutex_enter(&dev->l2ad_mtx); 3479 3480 /* 3481 * Even though we checked this conditional above, we 3482 * need to check this again now that we have the 3483 * l2ad_mtx. This is because we could be racing with 3484 * another thread calling l2arc_evict() which might have 3485 * destroyed this header's L2 portion as we were waiting 3486 * to acquire the l2ad_mtx. If that happens, we don't 3487 * want to re-destroy the header's L2 portion. 
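 *
 * Stripped of the "buflist lock already held" and trim details, the shape
 * of this is the usual check/lock/re-check idiom (a condensed sketch, not
 * a literal copy of the code below):
 *
 *	if (HDR_HAS_L2HDR(hdr)) {
 *		mutex_enter(&dev->l2ad_mtx);
 *		if (HDR_HAS_L2HDR(hdr))
 *			arc_hdr_l2hdr_destroy(hdr);
 *		mutex_exit(&dev->l2ad_mtx);
 *	}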
3488 */ 3489 if (HDR_HAS_L2HDR(hdr)) { 3490 l2arc_trim(hdr); 3491 arc_hdr_l2hdr_destroy(hdr); 3492 } 3493 3494 if (!buflist_held) 3495 mutex_exit(&dev->l2ad_mtx); 3496 } 3497 3498 if (HDR_HAS_L1HDR(hdr)) { 3499 arc_cksum_free(hdr); 3500 3501 while (hdr->b_l1hdr.b_buf != NULL) 3502 arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); 3503 3504#ifdef ZFS_DEBUG 3505 if (hdr->b_l1hdr.b_thawed != NULL) { 3506 kmem_free(hdr->b_l1hdr.b_thawed, 1); 3507 hdr->b_l1hdr.b_thawed = NULL; 3508 } 3509#endif 3510 3511 if (hdr->b_l1hdr.b_pabd != NULL) { 3512 arc_hdr_free_pabd(hdr); 3513 } 3514 } 3515 3516 ASSERT3P(hdr->b_hash_next, ==, NULL); 3517 if (HDR_HAS_L1HDR(hdr)) { 3518 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 3519 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 3520 kmem_cache_free(hdr_full_cache, hdr); 3521 } else { 3522 kmem_cache_free(hdr_l2only_cache, hdr); 3523 } 3524} 3525 3526void 3527arc_buf_destroy(arc_buf_t *buf, void* tag) 3528{ 3529 arc_buf_hdr_t *hdr = buf->b_hdr; 3530 kmutex_t *hash_lock = HDR_LOCK(hdr); 3531 3532 if (hdr->b_l1hdr.b_state == arc_anon) { 3533 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); 3534 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3535 VERIFY0(remove_reference(hdr, NULL, tag)); 3536 arc_hdr_destroy(hdr); 3537 return; 3538 } 3539 3540 mutex_enter(hash_lock); 3541 ASSERT3P(hdr, ==, buf->b_hdr); 3542 ASSERT(hdr->b_l1hdr.b_bufcnt > 0); 3543 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 3544 ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); 3545 ASSERT3P(buf->b_data, !=, NULL); 3546 3547 (void) remove_reference(hdr, hash_lock, tag); 3548 arc_buf_destroy_impl(buf); 3549 mutex_exit(hash_lock); 3550} 3551 3552/* 3553 * Evict the arc_buf_hdr that is provided as a parameter. The resultant 3554 * state of the header is dependent on it's state prior to entering this 3555 * function. The following transitions are possible: 3556 * 3557 * - arc_mru -> arc_mru_ghost 3558 * - arc_mfu -> arc_mfu_ghost 3559 * - arc_mru_ghost -> arc_l2c_only 3560 * - arc_mru_ghost -> deleted 3561 * - arc_mfu_ghost -> arc_l2c_only 3562 * - arc_mfu_ghost -> deleted 3563 */ 3564static int64_t 3565arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 3566{ 3567 arc_state_t *evicted_state, *state; 3568 int64_t bytes_evicted = 0; 3569 int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? 3570 zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms; 3571 3572 ASSERT(MUTEX_HELD(hash_lock)); 3573 ASSERT(HDR_HAS_L1HDR(hdr)); 3574 3575 state = hdr->b_l1hdr.b_state; 3576 if (GHOST_STATE(state)) { 3577 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3578 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 3579 3580 /* 3581 * l2arc_write_buffers() relies on a header's L1 portion 3582 * (i.e. its b_pabd field) during it's write phase. 3583 * Thus, we cannot push a header onto the arc_l2c_only 3584 * state (removing it's L1 piece) until the header is 3585 * done being written to the l2arc. 3586 */ 3587 if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { 3588 ARCSTAT_BUMP(arcstat_evict_l2_skip); 3589 return (bytes_evicted); 3590 } 3591 3592 ARCSTAT_BUMP(arcstat_deleted); 3593 bytes_evicted += HDR_GET_LSIZE(hdr); 3594 3595 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); 3596 3597 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 3598 if (HDR_HAS_L2HDR(hdr)) { 3599 /* 3600 * This buffer is cached on the 2nd Level ARC; 3601 * don't destroy the header. 3602 */ 3603 arc_change_state(arc_l2c_only, hdr, hash_lock); 3604 /* 3605 * dropping from L1+L2 cached to L2-only, 3606 * realloc to remove the L1 header. 
3607 */ 3608 hdr = arc_hdr_realloc(hdr, hdr_full_cache, 3609 hdr_l2only_cache); 3610 } else { 3611 arc_change_state(arc_anon, hdr, hash_lock); 3612 arc_hdr_destroy(hdr); 3613 } 3614 return (bytes_evicted); 3615 } 3616 3617 ASSERT(state == arc_mru || state == arc_mfu); 3618 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 3619 3620 /* prefetch buffers have a minimum lifespan */ 3621 if (HDR_IO_IN_PROGRESS(hdr) || 3622 ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && 3623 ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) { 3624 ARCSTAT_BUMP(arcstat_evict_skip); 3625 return (bytes_evicted); 3626 } 3627 3628 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 3629 while (hdr->b_l1hdr.b_buf) { 3630 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 3631 if (!mutex_tryenter(&buf->b_evict_lock)) { 3632 ARCSTAT_BUMP(arcstat_mutex_miss); 3633 break; 3634 } 3635 if (buf->b_data != NULL) 3636 bytes_evicted += HDR_GET_LSIZE(hdr); 3637 mutex_exit(&buf->b_evict_lock); 3638 arc_buf_destroy_impl(buf); 3639 } 3640 3641 if (HDR_HAS_L2HDR(hdr)) { 3642 ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); 3643 } else { 3644 if (l2arc_write_eligible(hdr->b_spa, hdr)) { 3645 ARCSTAT_INCR(arcstat_evict_l2_eligible, 3646 HDR_GET_LSIZE(hdr)); 3647 } else { 3648 ARCSTAT_INCR(arcstat_evict_l2_ineligible, 3649 HDR_GET_LSIZE(hdr)); 3650 } 3651 } 3652 3653 if (hdr->b_l1hdr.b_bufcnt == 0) { 3654 arc_cksum_free(hdr); 3655 3656 bytes_evicted += arc_hdr_size(hdr); 3657 3658 /* 3659 * If this hdr is being evicted and has a compressed 3660 * buffer then we discard it here before we change states. 3661 * This ensures that the accounting is updated correctly 3662 * in arc_free_data_impl(). 3663 */ 3664 arc_hdr_free_pabd(hdr); 3665 3666 arc_change_state(evicted_state, hdr, hash_lock); 3667 ASSERT(HDR_IN_HASH_TABLE(hdr)); 3668 arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); 3669 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); 3670 } 3671 3672 return (bytes_evicted); 3673} 3674 3675static uint64_t 3676arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, 3677 uint64_t spa, int64_t bytes) 3678{ 3679 multilist_sublist_t *mls; 3680 uint64_t bytes_evicted = 0; 3681 arc_buf_hdr_t *hdr; 3682 kmutex_t *hash_lock; 3683 int evict_count = 0; 3684 3685 ASSERT3P(marker, !=, NULL); 3686 IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); 3687 3688 mls = multilist_sublist_lock(ml, idx); 3689 3690 for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; 3691 hdr = multilist_sublist_prev(mls, marker)) { 3692 if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || 3693 (evict_count >= zfs_arc_evict_batch_limit)) 3694 break; 3695 3696 /* 3697 * To keep our iteration location, move the marker 3698 * forward. Since we're not holding hdr's hash lock, we 3699 * must be very careful and not remove 'hdr' from the 3700 * sublist. Otherwise, other consumers might mistake the 3701 * 'hdr' as not being on a sublist when they call the 3702 * multilist_link_active() function (they all rely on 3703 * the hash lock protecting concurrent insertions and 3704 * removals). multilist_sublist_move_forward() was 3705 * specifically implemented to ensure this is the case 3706 * (only 'marker' will be removed and re-inserted). 3707 */ 3708 multilist_sublist_move_forward(mls, marker); 3709 3710 /* 3711 * The only case where the b_spa field should ever be 3712 * zero, is the marker headers inserted by 3713 * arc_evict_state(). It's possible for multiple threads 3714 * to be calling arc_evict_state() concurrently (e.g. 
3715 * dsl_pool_close() and zio_inject_fault()), so we must 3716 * skip any markers we see from these other threads. 3717 */ 3718 if (hdr->b_spa == 0) 3719 continue; 3720 3721 /* we're only interested in evicting buffers of a certain spa */ 3722 if (spa != 0 && hdr->b_spa != spa) { 3723 ARCSTAT_BUMP(arcstat_evict_skip); 3724 continue; 3725 } 3726 3727 hash_lock = HDR_LOCK(hdr); 3728 3729 /* 3730 * We aren't calling this function from any code path 3731 * that would already be holding a hash lock, so we're 3732 * asserting on this assumption to be defensive in case 3733 * this ever changes. Without this check, it would be 3734 * possible to incorrectly increment arcstat_mutex_miss 3735 * below (e.g. if the code changed such that we called 3736 * this function with a hash lock held). 3737 */ 3738 ASSERT(!MUTEX_HELD(hash_lock)); 3739 3740 if (mutex_tryenter(hash_lock)) { 3741 uint64_t evicted = arc_evict_hdr(hdr, hash_lock); 3742 mutex_exit(hash_lock); 3743 3744 bytes_evicted += evicted; 3745 3746 /* 3747 * If evicted is zero, arc_evict_hdr() must have 3748 * decided to skip this header, don't increment 3749 * evict_count in this case. 3750 */ 3751 if (evicted != 0) 3752 evict_count++; 3753 3754 /* 3755 * If arc_size isn't overflowing, signal any 3756 * threads that might happen to be waiting. 3757 * 3758 * For each header evicted, we wake up a single 3759 * thread. If we used cv_broadcast, we could 3760 * wake up "too many" threads causing arc_size 3761 * to significantly overflow arc_c; since 3762 * arc_get_data_impl() doesn't check for overflow 3763 * when it's woken up (it doesn't because it's 3764 * possible for the ARC to be overflowing while 3765 * full of un-evictable buffers, and the 3766 * function should proceed in this case). 3767 * 3768 * If threads are left sleeping, due to not 3769 * using cv_broadcast, they will be woken up 3770 * just before arc_reclaim_thread() sleeps. 3771 */ 3772 mutex_enter(&arc_reclaim_lock); 3773 if (!arc_is_overflowing()) 3774 cv_signal(&arc_reclaim_waiters_cv); 3775 mutex_exit(&arc_reclaim_lock); 3776 } else { 3777 ARCSTAT_BUMP(arcstat_mutex_miss); 3778 } 3779 } 3780 3781 multilist_sublist_unlock(mls); 3782 3783 return (bytes_evicted); 3784} 3785 3786/* 3787 * Evict buffers from the given arc state, until we've removed the 3788 * specified number of bytes. Move the removed buffers to the 3789 * appropriate evict state. 3790 * 3791 * This function makes a "best effort". It skips over any buffers 3792 * it can't get a hash_lock on, and so, may not catch all candidates. 3793 * It may also return without evicting as much space as requested. 3794 * 3795 * If bytes is specified using the special value ARC_EVICT_ALL, this 3796 * will evict all available (i.e. unlocked and evictable) buffers from 3797 * the given arc state; which is used by arc_flush(). 3798 */ 3799static uint64_t 3800arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, 3801 arc_buf_contents_t type) 3802{ 3803 uint64_t total_evicted = 0; 3804 multilist_t *ml = state->arcs_list[type]; 3805 int num_sublists; 3806 arc_buf_hdr_t **markers; 3807 3808 IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); 3809 3810 num_sublists = multilist_get_num_sublists(ml); 3811 3812 /* 3813 * If we've tried to evict from each sublist, made some 3814 * progress, but still have not hit the target number of bytes 3815 * to evict, we want to keep trying. The markers allow us to 3816 * pick up where we left off for each individual sublist, rather 3817 * than starting from the tail each time. 
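 *
 * Each marker is consumed by arc_evict_state_impl() above, which walks
 * its sublist backwards from the marker and nudges the marker along as it
 * goes; roughly:
 *
 *	for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
 *	    hdr = multilist_sublist_prev(mls, marker))
 *		multilist_sublist_move_forward(mls, marker);
 *
 * so a later pass over the same sublist resumes from wherever the marker
 * was left rather than from the tail.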
3818 */ 3819 markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); 3820 for (int i = 0; i < num_sublists; i++) { 3821 markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); 3822 3823 /* 3824 * A b_spa of 0 is used to indicate that this header is 3825 * a marker. This fact is used in arc_adjust_type() and 3826 * arc_evict_state_impl(). 3827 */ 3828 markers[i]->b_spa = 0; 3829 3830 multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 3831 multilist_sublist_insert_tail(mls, markers[i]); 3832 multilist_sublist_unlock(mls); 3833 } 3834 3835 /* 3836 * While we haven't hit our target number of bytes to evict, or 3837 * we're evicting all available buffers. 3838 */ 3839 while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { 3840 /* 3841 * Start eviction using a randomly selected sublist, 3842 * this is to try and evenly balance eviction across all 3843 * sublists. Always starting at the same sublist 3844 * (e.g. index 0) would cause evictions to favor certain 3845 * sublists over others. 3846 */ 3847 int sublist_idx = multilist_get_random_index(ml); 3848 uint64_t scan_evicted = 0; 3849 3850 for (int i = 0; i < num_sublists; i++) { 3851 uint64_t bytes_remaining; 3852 uint64_t bytes_evicted; 3853 3854 if (bytes == ARC_EVICT_ALL) 3855 bytes_remaining = ARC_EVICT_ALL; 3856 else if (total_evicted < bytes) 3857 bytes_remaining = bytes - total_evicted; 3858 else 3859 break; 3860 3861 bytes_evicted = arc_evict_state_impl(ml, sublist_idx, 3862 markers[sublist_idx], spa, bytes_remaining); 3863 3864 scan_evicted += bytes_evicted; 3865 total_evicted += bytes_evicted; 3866 3867 /* we've reached the end, wrap to the beginning */ 3868 if (++sublist_idx >= num_sublists) 3869 sublist_idx = 0; 3870 } 3871 3872 /* 3873 * If we didn't evict anything during this scan, we have 3874 * no reason to believe we'll evict more during another 3875 * scan, so break the loop. 3876 */ 3877 if (scan_evicted == 0) { 3878 /* This isn't possible, let's make that obvious */ 3879 ASSERT3S(bytes, !=, 0); 3880 3881 /* 3882 * When bytes is ARC_EVICT_ALL, the only way to 3883 * break the loop is when scan_evicted is zero. 3884 * In that case, we actually have evicted enough, 3885 * so we don't want to increment the kstat. 3886 */ 3887 if (bytes != ARC_EVICT_ALL) { 3888 ASSERT3S(total_evicted, <, bytes); 3889 ARCSTAT_BUMP(arcstat_evict_not_enough); 3890 } 3891 3892 break; 3893 } 3894 } 3895 3896 for (int i = 0; i < num_sublists; i++) { 3897 multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 3898 multilist_sublist_remove(mls, markers[i]); 3899 multilist_sublist_unlock(mls); 3900 3901 kmem_cache_free(hdr_full_cache, markers[i]); 3902 } 3903 kmem_free(markers, sizeof (*markers) * num_sublists); 3904 3905 return (total_evicted); 3906} 3907 3908/* 3909 * Flush all "evictable" data of the given type from the arc state 3910 * specified. This will not evict any "active" buffers (i.e. referenced). 3911 * 3912 * When 'retry' is set to B_FALSE, the function will make a single pass 3913 * over the state and evict any buffers that it can. Since it doesn't 3914 * continually retry the eviction, it might end up leaving some buffers 3915 * in the ARC due to lock misses. 3916 * 3917 * When 'retry' is set to B_TRUE, the function will continually retry the 3918 * eviction until *all* evictable buffers have been removed from the 3919 * state. As a result, if concurrent insertions into the state are 3920 * allowed (e.g. 
if the ARC isn't shutting down), this function might 3921 * wind up in an infinite loop, continually trying to evict buffers. 3922 */ 3923static uint64_t 3924arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, 3925 boolean_t retry) 3926{ 3927 uint64_t evicted = 0; 3928 3929 while (refcount_count(&state->arcs_esize[type]) != 0) { 3930 evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); 3931 3932 if (!retry) 3933 break; 3934 } 3935 3936 return (evicted); 3937} 3938 3939/* 3940 * Evict the specified number of bytes from the state specified, 3941 * restricting eviction to the spa and type given. This function 3942 * prevents us from trying to evict more from a state's list than 3943 * is "evictable", and to skip evicting altogether when passed a 3944 * negative value for "bytes". In contrast, arc_evict_state() will 3945 * evict everything it can, when passed a negative value for "bytes". 3946 */ 3947static uint64_t 3948arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, 3949 arc_buf_contents_t type) 3950{ 3951 int64_t delta; 3952 3953 if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) { 3954 delta = MIN(refcount_count(&state->arcs_esize[type]), bytes); 3955 return (arc_evict_state(state, spa, delta, type)); 3956 } 3957 3958 return (0); 3959} 3960 3961/* 3962 * Evict metadata buffers from the cache, such that arc_meta_used is 3963 * capped by the arc_meta_limit tunable. 3964 */ 3965static uint64_t 3966arc_adjust_meta(uint64_t meta_used) 3967{ 3968 uint64_t total_evicted = 0; 3969 int64_t target; 3970 3971 /* 3972 * If we're over the meta limit, we want to evict enough 3973 * metadata to get back under the meta limit. We don't want to 3974 * evict so much that we drop the MRU below arc_p, though. If 3975 * we're over the meta limit more than we're over arc_p, we 3976 * evict some from the MRU here, and some from the MFU below. 3977 */ 3978 target = MIN((int64_t)(meta_used - arc_meta_limit), 3979 (int64_t)(refcount_count(&arc_anon->arcs_size) + 3980 refcount_count(&arc_mru->arcs_size) - arc_p)); 3981 3982 total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 3983 3984 /* 3985 * Similar to the above, we want to evict enough bytes to get us 3986 * below the meta limit, but not so much as to drop us below the 3987 * space allotted to the MFU (which is defined as arc_c - arc_p). 3988 */ 3989 target = MIN((int64_t)(meta_used - arc_meta_limit), 3990 (int64_t)(refcount_count(&arc_mfu->arcs_size) - 3991 (arc_c - arc_p))); 3992 3993 total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 3994 3995 return (total_evicted); 3996} 3997 3998/* 3999 * Return the type of the oldest buffer in the given arc state 4000 * 4001 * This function will select a random sublist of type ARC_BUFC_DATA and 4002 * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist 4003 * is compared, and the type which contains the "older" buffer will be 4004 * returned. 
4005 */ 4006static arc_buf_contents_t 4007arc_adjust_type(arc_state_t *state) 4008{ 4009 multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA]; 4010 multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA]; 4011 int data_idx = multilist_get_random_index(data_ml); 4012 int meta_idx = multilist_get_random_index(meta_ml); 4013 multilist_sublist_t *data_mls; 4014 multilist_sublist_t *meta_mls; 4015 arc_buf_contents_t type; 4016 arc_buf_hdr_t *data_hdr; 4017 arc_buf_hdr_t *meta_hdr; 4018 4019 /* 4020 * We keep the sublist lock until we're finished, to prevent 4021 * the headers from being destroyed via arc_evict_state(). 4022 */ 4023 data_mls = multilist_sublist_lock(data_ml, data_idx); 4024 meta_mls = multilist_sublist_lock(meta_ml, meta_idx); 4025 4026 /* 4027 * These two loops are to ensure we skip any markers that 4028 * might be at the tail of the lists due to arc_evict_state(). 4029 */ 4030 4031 for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; 4032 data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { 4033 if (data_hdr->b_spa != 0) 4034 break; 4035 } 4036 4037 for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; 4038 meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { 4039 if (meta_hdr->b_spa != 0) 4040 break; 4041 } 4042 4043 if (data_hdr == NULL && meta_hdr == NULL) { 4044 type = ARC_BUFC_DATA; 4045 } else if (data_hdr == NULL) { 4046 ASSERT3P(meta_hdr, !=, NULL); 4047 type = ARC_BUFC_METADATA; 4048 } else if (meta_hdr == NULL) { 4049 ASSERT3P(data_hdr, !=, NULL); 4050 type = ARC_BUFC_DATA; 4051 } else { 4052 ASSERT3P(data_hdr, !=, NULL); 4053 ASSERT3P(meta_hdr, !=, NULL); 4054 4055 /* The headers can't be on the sublist without an L1 header */ 4056 ASSERT(HDR_HAS_L1HDR(data_hdr)); 4057 ASSERT(HDR_HAS_L1HDR(meta_hdr)); 4058 4059 if (data_hdr->b_l1hdr.b_arc_access < 4060 meta_hdr->b_l1hdr.b_arc_access) { 4061 type = ARC_BUFC_DATA; 4062 } else { 4063 type = ARC_BUFC_METADATA; 4064 } 4065 } 4066 4067 multilist_sublist_unlock(meta_mls); 4068 multilist_sublist_unlock(data_mls); 4069 4070 return (type); 4071} 4072 4073/* 4074 * Evict buffers from the cache, such that arc_size is capped by arc_c. 4075 */ 4076static uint64_t 4077arc_adjust(void) 4078{ 4079 uint64_t total_evicted = 0; 4080 uint64_t bytes; 4081 int64_t target; 4082 uint64_t asize = aggsum_value(&arc_size); 4083 uint64_t ameta = aggsum_value(&arc_meta_used); 4084 4085 /* 4086 * If we're over arc_meta_limit, we want to correct that before 4087 * potentially evicting data buffers below. 4088 */ 4089 total_evicted += arc_adjust_meta(ameta); 4090 4091 /* 4092 * Adjust MRU size 4093 * 4094 * If we're over the target cache size, we want to evict enough 4095 * from the list to get back to our target size. We don't want 4096 * to evict too much from the MRU, such that it drops below 4097 * arc_p. So, if we're over our target cache size more than 4098 * the MRU is over arc_p, we'll evict enough to get back to 4099 * arc_p here, and then evict more from the MFU below. 4100 */ 4101 target = MIN((int64_t)(asize - arc_c), 4102 (int64_t)(refcount_count(&arc_anon->arcs_size) + 4103 refcount_count(&arc_mru->arcs_size) + ameta - arc_p)); 4104 4105 /* 4106 * If we're below arc_meta_min, always prefer to evict data. 4107 * Otherwise, try to satisfy the requested number of bytes to 4108 * evict from the type which contains older buffers; in an 4109 * effort to keep newer buffers in the cache regardless of their 4110 * type. 
If we cannot satisfy the number of bytes from this 4111 * type, spill over into the next type. 4112 */ 4113 if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && 4114 ameta > arc_meta_min) { 4115 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 4116 total_evicted += bytes; 4117 4118 /* 4119 * If we couldn't evict our target number of bytes from 4120 * metadata, we try to get the rest from data. 4121 */ 4122 target -= bytes; 4123 4124 total_evicted += 4125 arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 4126 } else { 4127 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 4128 total_evicted += bytes; 4129 4130 /* 4131 * If we couldn't evict our target number of bytes from 4132 * data, we try to get the rest from metadata. 4133 */ 4134 target -= bytes; 4135 4136 total_evicted += 4137 arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 4138 } 4139 4140 /* 4141 * Re-sum ARC stats after the first round of evictions. 4142 */ 4143 asize = aggsum_value(&arc_size); 4144 ameta = aggsum_value(&arc_meta_used); 4145 4146 /* 4147 * Adjust MFU size 4148 * 4149 * Now that we've tried to evict enough from the MRU to get its 4150 * size back to arc_p, if we're still above the target cache 4151 * size, we evict the rest from the MFU. 4152 */ 4153 target = asize - arc_c; 4154 4155 if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA && 4156 ameta > arc_meta_min) { 4157 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 4158 total_evicted += bytes; 4159 4160 /* 4161 * If we couldn't evict our target number of bytes from 4162 * metadata, we try to get the rest from data. 4163 */ 4164 target -= bytes; 4165 4166 total_evicted += 4167 arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); 4168 } else { 4169 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); 4170 total_evicted += bytes; 4171 4172 /* 4173 * If we couldn't evict our target number of bytes from 4174 * data, we try to get the rest from data. 4175 */ 4176 target -= bytes; 4177 4178 total_evicted += 4179 arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 4180 } 4181 4182 /* 4183 * Adjust ghost lists 4184 * 4185 * In addition to the above, the ARC also defines target values 4186 * for the ghost lists. The sum of the mru list and mru ghost 4187 * list should never exceed the target size of the cache, and 4188 * the sum of the mru list, mfu list, mru ghost list, and mfu 4189 * ghost list should never exceed twice the target size of the 4190 * cache. The following logic enforces these limits on the ghost 4191 * caches, and evicts from them as needed. 
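 *
 * A worked example with made-up numbers: if arc_c is 1000 MB, the mru
 * list holds 600 MB and the mru ghost list holds 700 MB, then
 *
 *	target = 600 MB + 700 MB - 1000 MB = 300 MB
 *
 * and roughly 300 MB is trimmed from the mru ghost list (data first, then
 * metadata). The combined ghost target further below is computed the same
 * way, from mru ghost + mfu ghost - arc_c.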
4192 */ 4193 target = refcount_count(&arc_mru->arcs_size) + 4194 refcount_count(&arc_mru_ghost->arcs_size) - arc_c; 4195 4196 bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); 4197 total_evicted += bytes; 4198 4199 target -= bytes; 4200 4201 total_evicted += 4202 arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); 4203 4204 /* 4205 * We assume the sum of the mru list and mfu list is less than 4206 * or equal to arc_c (we enforced this above), which means we 4207 * can use the simpler of the two equations below: 4208 * 4209 * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c 4210 * mru ghost + mfu ghost <= arc_c 4211 */ 4212 target = refcount_count(&arc_mru_ghost->arcs_size) + 4213 refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; 4214 4215 bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); 4216 total_evicted += bytes; 4217 4218 target -= bytes; 4219 4220 total_evicted += 4221 arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); 4222 4223 return (total_evicted); 4224} 4225 4226void 4227arc_flush(spa_t *spa, boolean_t retry) 4228{ 4229 uint64_t guid = 0; 4230 4231 /* 4232 * If retry is B_TRUE, a spa must not be specified since we have 4233 * no good way to determine if all of a spa's buffers have been 4234 * evicted from an arc state. 4235 */ 4236 ASSERT(!retry || spa == 0); 4237 4238 if (spa != NULL) 4239 guid = spa_load_guid(spa); 4240 4241 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); 4242 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); 4243 4244 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); 4245 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); 4246 4247 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); 4248 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); 4249 4250 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); 4251 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); 4252} 4253 4254void 4255arc_shrink(int64_t to_free) 4256{ 4257 uint64_t asize = aggsum_value(&arc_size); 4258 if (arc_c > arc_c_min) { 4259 DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, 4260 arc_c_min, uint64_t, arc_p, uint64_t, to_free); 4261 if (arc_c > arc_c_min + to_free) 4262 atomic_add_64(&arc_c, -to_free); 4263 else 4264 arc_c = arc_c_min; 4265 4266 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 4267 if (asize < arc_c) 4268 arc_c = MAX(asize, arc_c_min); 4269 if (arc_p > arc_c) 4270 arc_p = (arc_c >> 1); 4271 4272 DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, 4273 arc_p); 4274 4275 ASSERT(arc_c >= arc_c_min); 4276 ASSERT((int64_t)arc_p >= 0); 4277 } 4278 4279 if (asize > arc_c) { 4280 DTRACE_PROBE2(arc__shrink_adjust, uint64_t, asize, 4281 uint64_t, arc_c); 4282 (void) arc_adjust(); 4283 } 4284} 4285 4286typedef enum free_memory_reason_t { 4287 FMR_UNKNOWN, 4288 FMR_NEEDFREE, 4289 FMR_LOTSFREE, 4290 FMR_SWAPFS_MINFREE, 4291 FMR_PAGES_PP_MAXIMUM, 4292 FMR_HEAP_ARENA, 4293 FMR_ZIO_ARENA, 4294 FMR_ZIO_FRAG, 4295} free_memory_reason_t; 4296 4297int64_t last_free_memory; 4298free_memory_reason_t last_free_reason; 4299 4300/* 4301 * Additional reserve of pages for pp_reserve. 4302 */ 4303int64_t arc_pages_pp_reserve = 64; 4304 4305/* 4306 * Additional reserve of pages for swapfs. 4307 */ 4308int64_t arc_swapfs_reserve = 64; 4309 4310/* 4311 * Return the amount of memory that can be consumed before reclaim will be 4312 * needed. 
Positive if there is sufficient free memory, negative indicates 4313 * the amount of memory that needs to be freed up. 4314 */ 4315static int64_t 4316arc_available_memory(void) 4317{ 4318 int64_t lowest = INT64_MAX; 4319 int64_t n; 4320 free_memory_reason_t r = FMR_UNKNOWN; 4321 4322#ifdef _KERNEL 4323#ifdef __FreeBSD__ 4324 /* 4325 * Cooperate with pagedaemon when it's time for it to scan 4326 * and reclaim some pages. 4327 */ 4328 n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); 4329 if (n < lowest) { 4330 lowest = n; 4331 r = FMR_LOTSFREE; 4332 } 4333 4334#else 4335 if (needfree > 0) { 4336 n = PAGESIZE * (-needfree); 4337 if (n < lowest) { 4338 lowest = n; 4339 r = FMR_NEEDFREE; 4340 } 4341 } 4342 4343 /* 4344 * check that we're out of range of the pageout scanner. It starts to 4345 * schedule paging if freemem is less than lotsfree and needfree. 4346 * lotsfree is the high-water mark for pageout, and needfree is the 4347 * number of needed free pages. We add extra pages here to make sure 4348 * the scanner doesn't start up while we're freeing memory. 4349 */ 4350 n = PAGESIZE * (freemem - lotsfree - needfree - desfree); 4351 if (n < lowest) { 4352 lowest = n; 4353 r = FMR_LOTSFREE; 4354 } 4355 4356 /* 4357 * check to make sure that swapfs has enough space so that anon 4358 * reservations can still succeed. anon_resvmem() checks that the 4359 * availrmem is greater than swapfs_minfree, and the number of reserved 4360 * swap pages. We also add a bit of extra here just to prevent 4361 * circumstances from getting really dire. 4362 */ 4363 n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - 4364 desfree - arc_swapfs_reserve); 4365 if (n < lowest) { 4366 lowest = n; 4367 r = FMR_SWAPFS_MINFREE; 4368 } 4369 4370 4371 /* 4372 * Check that we have enough availrmem that memory locking (e.g., via 4373 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum 4374 * stores the number of pages that cannot be locked; when availrmem 4375 * drops below pages_pp_maximum, page locking mechanisms such as 4376 * page_pp_lock() will fail.) 4377 */ 4378 n = PAGESIZE * (availrmem - pages_pp_maximum - 4379 arc_pages_pp_reserve); 4380 if (n < lowest) { 4381 lowest = n; 4382 r = FMR_PAGES_PP_MAXIMUM; 4383 } 4384 4385#endif /* __FreeBSD__ */ 4386#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 4387 /* 4388 * If we're on an i386 platform, it's possible that we'll exhaust the 4389 * kernel heap space before we ever run out of available physical 4390 * memory. Most checks of the size of the heap_area compare against 4391 * tune.t_minarmem, which is the minimum available real memory that we 4392 * can have in the system. However, this is generally fixed at 25 pages 4393 * which is so low that it's useless. In this comparison, we seek to 4394 * calculate the total heap-size, and reclaim if more than 3/4ths of the 4395 * heap is allocated. (Or, in the calculation, if less than 1/4th is 4396 * free) 4397 */ 4398 n = (int64_t)vmem_size(heap_arena, VMEM_FREE) - 4399 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2); 4400 if (n < lowest) { 4401 lowest = n; 4402 r = FMR_HEAP_ARENA; 4403 } 4404#define zio_arena NULL 4405#else 4406#define zio_arena heap_arena 4407#endif 4408 4409 /* 4410 * If zio data pages are being allocated out of a separate heap segment, 4411 * then enforce that the size of available vmem for this arena remains 4412 * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free. 
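 *
 * With that shift at 2, for example, an arena with 800MB allocated must
 * keep more than 200MB of free vmem (hypothetical numbers); if only 150MB
 * were free, the calculation below would come out to 150MB - 200MB =
 * -50MB, making FMR_ZIO_ARENA the reclaim reason if nothing else is
 * lower.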
4413 * 4414 * Note that reducing the arc_zio_arena_free_shift keeps more virtual 4415 * memory (in the zio_arena) free, which can avoid memory 4416 * fragmentation issues. 4417 */ 4418 if (zio_arena != NULL) { 4419 n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - 4420 (vmem_size(zio_arena, VMEM_ALLOC) >> 4421 arc_zio_arena_free_shift); 4422 if (n < lowest) { 4423 lowest = n; 4424 r = FMR_ZIO_ARENA; 4425 } 4426 } 4427 4428 /* 4429 * Above limits know nothing about real level of KVA fragmentation. 4430 * Start aggressive reclamation if too little sequential KVA left. 4431 */ 4432 if (lowest > 0) { 4433 n = (vmem_size(heap_arena, VMEM_MAXFREE) < SPA_MAXBLOCKSIZE) ? 4434 -((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) : 4435 INT64_MAX; 4436 if (n < lowest) { 4437 lowest = n; 4438 r = FMR_ZIO_FRAG; 4439 } 4440 } 4441 4442#else /* _KERNEL */ 4443 /* Every 100 calls, free a small amount */ 4444 if (spa_get_random(100) == 0) 4445 lowest = -1024; 4446#endif /* _KERNEL */ 4447 4448 last_free_memory = lowest; 4449 last_free_reason = r; 4450 DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); 4451 return (lowest); 4452} 4453 4454 4455/* 4456 * Determine if the system is under memory pressure and is asking 4457 * to reclaim memory. A return value of B_TRUE indicates that the system 4458 * is under memory pressure and that the arc should adjust accordingly. 4459 */ 4460static boolean_t 4461arc_reclaim_needed(void) 4462{ 4463 return (arc_available_memory() < 0); 4464} 4465 4466extern kmem_cache_t *zio_buf_cache[]; 4467extern kmem_cache_t *zio_data_buf_cache[]; 4468extern kmem_cache_t *range_seg_cache; 4469extern kmem_cache_t *abd_chunk_cache; 4470 4471static __noinline void 4472arc_kmem_reap_now(void) 4473{ 4474 size_t i; 4475 kmem_cache_t *prev_cache = NULL; 4476 kmem_cache_t *prev_data_cache = NULL; 4477 4478 DTRACE_PROBE(arc__kmem_reap_start); 4479#ifdef _KERNEL 4480 if (aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) { 4481 /* 4482 * We are exceeding our meta-data cache limit. 4483 * Purge some DNLC entries to release holds on meta-data. 4484 */ 4485 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 4486 } 4487#if defined(__i386) 4488 /* 4489 * Reclaim unused memory from all kmem caches. 4490 */ 4491 kmem_reap(); 4492#endif 4493#endif 4494 4495 /* 4496 * If a kmem reap is already active, don't schedule more. We must 4497 * check for this because kmem_cache_reap_soon() won't actually 4498 * block on the cache being reaped (this is to prevent callers from 4499 * becoming implicitly blocked by a system-wide kmem reap -- which, 4500 * on a system with many, many full magazines, can take minutes). 4501 */ 4502 if (kmem_cache_reap_active()) 4503 return; 4504 4505 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 4506 if (zio_buf_cache[i] != prev_cache) { 4507 prev_cache = zio_buf_cache[i]; 4508 kmem_cache_reap_soon(zio_buf_cache[i]); 4509 } 4510 if (zio_data_buf_cache[i] != prev_data_cache) { 4511 prev_data_cache = zio_data_buf_cache[i]; 4512 kmem_cache_reap_soon(zio_data_buf_cache[i]); 4513 } 4514 } 4515 kmem_cache_reap_soon(abd_chunk_cache); 4516 kmem_cache_reap_soon(buf_cache); 4517 kmem_cache_reap_soon(hdr_full_cache); 4518 kmem_cache_reap_soon(hdr_l2only_cache); 4519 kmem_cache_reap_soon(range_seg_cache); 4520 4521#ifdef illumos 4522 if (zio_arena != NULL) { 4523 /* 4524 * Ask the vmem arena to reclaim unused memory from its 4525 * quantum caches. 
4526 */ 4527 vmem_qcache_reap(zio_arena); 4528 } 4529#endif 4530 DTRACE_PROBE(arc__kmem_reap_end); 4531} 4532 4533/* 4534 * Threads can block in arc_get_data_impl() waiting for this thread to evict 4535 * enough data and signal them to proceed. When this happens, the threads in 4536 * arc_get_data_impl() are sleeping while holding the hash lock for their 4537 * particular arc header. Thus, we must be careful to never sleep on a 4538 * hash lock in this thread. This is to prevent the following deadlock: 4539 * 4540 * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L", 4541 * waiting for the reclaim thread to signal it. 4542 * 4543 * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, 4544 * fails, and goes to sleep forever. 4545 * 4546 * This possible deadlock is avoided by always acquiring a hash lock 4547 * using mutex_tryenter() from arc_reclaim_thread(). 4548 */ 4549/* ARGSUSED */ 4550static void 4551arc_reclaim_thread(void *unused __unused) 4552{ 4553 hrtime_t growtime = 0; 4554 hrtime_t kmem_reap_time = 0; 4555 callb_cpr_t cpr; 4556 4557 CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); 4558 4559 mutex_enter(&arc_reclaim_lock); 4560 while (!arc_reclaim_thread_exit) { 4561 uint64_t evicted = 0; 4562 4563 /* 4564 * This is necessary in order for the mdb ::arc dcmd to 4565 * show up to date information. Since the ::arc command 4566 * does not call the kstat's update function, without 4567 * this call, the command may show stale stats for the 4568 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even 4569 * with this change, the data might be up to 1 second 4570 * out of date; but that should suffice. The arc_state_t 4571 * structures can be queried directly if more accurate 4572 * information is needed. 4573 */ 4574 if (arc_ksp != NULL) 4575 arc_ksp->ks_update(arc_ksp, KSTAT_READ); 4576 4577 mutex_exit(&arc_reclaim_lock); 4578 4579 /* 4580 * We call arc_adjust() before (possibly) calling 4581 * arc_kmem_reap_now(), so that we can wake up 4582 * arc_get_data_impl() sooner. 4583 */ 4584 evicted = arc_adjust(); 4585 4586 int64_t free_memory = arc_available_memory(); 4587 if (free_memory < 0) { 4588 hrtime_t curtime = gethrtime(); 4589 arc_no_grow = B_TRUE; 4590 arc_warm = B_TRUE; 4591 4592 /* 4593 * Wait at least zfs_grow_retry (default 60) seconds 4594 * before considering growing. 4595 */ 4596 growtime = curtime + SEC2NSEC(arc_grow_retry); 4597 4598 /* 4599 * Wait at least arc_kmem_cache_reap_retry_ms 4600 * between arc_kmem_reap_now() calls. Without 4601 * this check it is possible to end up in a 4602 * situation where we spend lots of time 4603 * reaping caches, while we're near arc_c_min. 4604 */ 4605 if (curtime >= kmem_reap_time) { 4606 arc_kmem_reap_now(); 4607 kmem_reap_time = gethrtime() + 4608 MSEC2NSEC(arc_kmem_cache_reap_retry_ms); 4609 } 4610 4611 /* 4612 * If we are still low on memory, shrink the ARC 4613 * so that we have arc_shrink_min free space. 
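 *
 * As a worked example (hypothetical numbers): with arc_c = 4GB and
 * arc_shrink_shift = 7, (arc_c >> arc_shrink_shift) is 32MB; if
 * arc_available_memory() now reports a 16MB deficit (-16MB), to_free
 * below becomes 32MB - (-16MB) = 48MB and arc_shrink(48MB) is called.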
4614 */ 4615 free_memory = arc_available_memory(); 4616 4617 int64_t to_free = 4618 (arc_c >> arc_shrink_shift) - free_memory; 4619 if (to_free > 0) { 4620#ifdef _KERNEL 4621#ifdef illumos 4622 to_free = MAX(to_free, ptob(needfree)); 4623#endif 4624#endif 4625 arc_shrink(to_free); 4626 } 4627 } else if (free_memory < arc_c >> arc_no_grow_shift) { 4628 arc_no_grow = B_TRUE; 4629 } else if (gethrtime() >= growtime) { 4630 arc_no_grow = B_FALSE; 4631 } 4632 4633 mutex_enter(&arc_reclaim_lock); 4634 4635 /* 4636 * If evicted is zero, we couldn't evict anything via 4637 * arc_adjust(). This could be due to hash lock 4638 * collisions, but more likely due to the majority of 4639 * arc buffers being unevictable. Therefore, even if 4640 * arc_size is above arc_c, another pass is unlikely to 4641 * be helpful and could potentially cause us to enter an 4642 * infinite loop. 4643 */ 4644 if (aggsum_compare(&arc_size, arc_c) <= 0|| evicted == 0) { 4645 /* 4646 * We're either no longer overflowing, or we 4647 * can't evict anything more, so we should wake 4648 * up any threads before we go to sleep. 4649 */ 4650 cv_broadcast(&arc_reclaim_waiters_cv); 4651 4652 /* 4653 * Block until signaled, or after one second (we 4654 * might need to perform arc_kmem_reap_now() 4655 * even if we aren't being signalled) 4656 */ 4657 CALLB_CPR_SAFE_BEGIN(&cpr); 4658 (void) cv_timedwait_hires(&arc_reclaim_thread_cv, 4659 &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); 4660 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); 4661 } 4662 } 4663 4664 arc_reclaim_thread_exit = B_FALSE; 4665 cv_broadcast(&arc_reclaim_thread_cv); 4666 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ 4667 thread_exit(); 4668} 4669 4670static u_int arc_dnlc_evicts_arg; 4671extern struct vfsops zfs_vfsops; 4672 4673static void 4674arc_dnlc_evicts_thread(void *dummy __unused) 4675{ 4676 callb_cpr_t cpr; 4677 u_int percent; 4678 4679 CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG); 4680 4681 mutex_enter(&arc_dnlc_evicts_lock); 4682 while (!arc_dnlc_evicts_thread_exit) { 4683 CALLB_CPR_SAFE_BEGIN(&cpr); 4684 (void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); 4685 CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock); 4686 if (arc_dnlc_evicts_arg != 0) { 4687 percent = arc_dnlc_evicts_arg; 4688 mutex_exit(&arc_dnlc_evicts_lock); 4689#ifdef _KERNEL 4690 vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops); 4691#endif 4692 mutex_enter(&arc_dnlc_evicts_lock); 4693 /* 4694 * Clear our token only after vnlru_free() 4695 * pass is done, to avoid false queueing of 4696 * the requests. 4697 */ 4698 arc_dnlc_evicts_arg = 0; 4699 } 4700 } 4701 arc_dnlc_evicts_thread_exit = FALSE; 4702 cv_broadcast(&arc_dnlc_evicts_cv); 4703 CALLB_CPR_EXIT(&cpr); 4704 thread_exit(); 4705} 4706 4707void 4708dnlc_reduce_cache(void *arg) 4709{ 4710 u_int percent; 4711 4712 percent = (u_int)(uintptr_t)arg; 4713 mutex_enter(&arc_dnlc_evicts_lock); 4714 if (arc_dnlc_evicts_arg == 0) { 4715 arc_dnlc_evicts_arg = percent; 4716 cv_broadcast(&arc_dnlc_evicts_cv); 4717 } 4718 mutex_exit(&arc_dnlc_evicts_lock); 4719} 4720 4721/* 4722 * Adapt arc info given the number of bytes we are trying to add and 4723 * the state that we are comming from. This function is only called 4724 * when we are adding new content to the cache. 
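 *
 * A small worked example (made-up sizes): on a hit in the mru ghost list
 * with bytes = 128KB, the mru ghost list at 100MB and the mfu ghost list
 * at 300MB, the multiplier is 300 / 100 = 3, so arc_p grows by
 * 3 * 128KB = 384KB (clamped so it never exceeds arc_c - arc_p_min); a
 * hit in the mfu ghost list shrinks arc_p symmetrically, never dropping
 * it below arc_p_min.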
4725 */ 4726static void 4727arc_adapt(int bytes, arc_state_t *state) 4728{ 4729 int mult; 4730 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 4731 int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size); 4732 int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size); 4733 4734 if (state == arc_l2c_only) 4735 return; 4736 4737 ASSERT(bytes > 0); 4738 /* 4739 * Adapt the target size of the MRU list: 4740 * - if we just hit in the MRU ghost list, then increase 4741 * the target size of the MRU list. 4742 * - if we just hit in the MFU ghost list, then increase 4743 * the target size of the MFU list by decreasing the 4744 * target size of the MRU list. 4745 */ 4746 if (state == arc_mru_ghost) { 4747 mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); 4748 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 4749 4750 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 4751 } else if (state == arc_mfu_ghost) { 4752 uint64_t delta; 4753 4754 mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size); 4755 mult = MIN(mult, 10); 4756 4757 delta = MIN(bytes * mult, arc_p); 4758 arc_p = MAX(arc_p_min, arc_p - delta); 4759 } 4760 ASSERT((int64_t)arc_p >= 0); 4761 4762 if (arc_reclaim_needed()) { 4763 cv_signal(&arc_reclaim_thread_cv); 4764 return; 4765 } 4766 4767 if (arc_no_grow) 4768 return; 4769 4770 if (arc_c >= arc_c_max) 4771 return; 4772 4773 /* 4774 * If we're within (2 * maxblocksize) bytes of the target 4775 * cache size, increment the target cache size 4776 */ 4777 if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) > 4778 0) { 4779 DTRACE_PROBE1(arc__inc_adapt, int, bytes); 4780 atomic_add_64(&arc_c, (int64_t)bytes); 4781 if (arc_c > arc_c_max) 4782 arc_c = arc_c_max; 4783 else if (state == arc_anon) 4784 atomic_add_64(&arc_p, (int64_t)bytes); 4785 if (arc_p > arc_c) 4786 arc_p = arc_c; 4787 } 4788 ASSERT((int64_t)arc_p >= 0); 4789} 4790 4791/* 4792 * Check if arc_size has grown past our upper threshold, determined by 4793 * zfs_arc_overflow_shift. 4794 */ 4795static boolean_t 4796arc_is_overflowing(void) 4797{ 4798 /* Always allow at least one block of overflow */ 4799 uint64_t overflow = MAX(SPA_MAXBLOCKSIZE, 4800 arc_c >> zfs_arc_overflow_shift); 4801 4802 /* 4803 * We just compare the lower bound here for performance reasons. Our 4804 * primary goals are to make sure that the arc never grows without 4805 * bound, and that it can reach its maximum size. This check 4806 * accomplishes both goals. The maximum amount we could run over by is 4807 * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block 4808 * in the ARC. In practice, that's in the tens of MB, which is low 4809 * enough to be safe. 
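 *
 * For instance, with arc_c = 1GB and zfs_arc_overflow_shift = 8
 * (hypothetical values), arc_c >> 8 is 4MB, so overflow becomes
 * MAX(SPA_MAXBLOCKSIZE, 4MB) and the ARC is considered to be overflowing
 * once the lower bound of arc_size exceeds arc_c by more than that
 * amount.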
4810 */ 4811 return (aggsum_lower_bound(&arc_size) >= arc_c + overflow); 4812} 4813 4814static abd_t * 4815arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 4816{ 4817 arc_buf_contents_t type = arc_buf_type(hdr); 4818 4819 arc_get_data_impl(hdr, size, tag); 4820 if (type == ARC_BUFC_METADATA) { 4821 return (abd_alloc(size, B_TRUE)); 4822 } else { 4823 ASSERT(type == ARC_BUFC_DATA); 4824 return (abd_alloc(size, B_FALSE)); 4825 } 4826} 4827 4828static void * 4829arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 4830{ 4831 arc_buf_contents_t type = arc_buf_type(hdr); 4832 4833 arc_get_data_impl(hdr, size, tag); 4834 if (type == ARC_BUFC_METADATA) { 4835 return (zio_buf_alloc(size)); 4836 } else { 4837 ASSERT(type == ARC_BUFC_DATA); 4838 return (zio_data_buf_alloc(size)); 4839 } 4840} 4841 4842/* 4843 * Allocate a block and return it to the caller. If we are hitting the 4844 * hard limit for the cache size, we must sleep, waiting for the eviction 4845 * thread to catch up. If we're past the target size but below the hard 4846 * limit, we'll only signal the reclaim thread and continue on. 4847 */ 4848static void 4849arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 4850{ 4851 arc_state_t *state = hdr->b_l1hdr.b_state; 4852 arc_buf_contents_t type = arc_buf_type(hdr); 4853 4854 arc_adapt(size, state); 4855 4856 /* 4857 * If arc_size is currently overflowing, and has grown past our 4858 * upper limit, we must be adding data faster than the evict 4859 * thread can evict. Thus, to ensure we don't compound the 4860 * problem by adding more data and forcing arc_size to grow even 4861 * further past it's target size, we halt and wait for the 4862 * eviction thread to catch up. 4863 * 4864 * It's also possible that the reclaim thread is unable to evict 4865 * enough buffers to get arc_size below the overflow limit (e.g. 4866 * due to buffers being un-evictable, or hash lock collisions). 4867 * In this case, we want to proceed regardless if we're 4868 * overflowing; thus we don't use a while loop here. 4869 */ 4870 if (arc_is_overflowing()) { 4871 mutex_enter(&arc_reclaim_lock); 4872 4873 /* 4874 * Now that we've acquired the lock, we may no longer be 4875 * over the overflow limit, lets check. 4876 * 4877 * We're ignoring the case of spurious wake ups. If that 4878 * were to happen, it'd let this thread consume an ARC 4879 * buffer before it should have (i.e. before we're under 4880 * the overflow limit and were signalled by the reclaim 4881 * thread). As long as that is a rare occurrence, it 4882 * shouldn't cause any harm. 4883 */ 4884 if (arc_is_overflowing()) { 4885 cv_signal(&arc_reclaim_thread_cv); 4886 cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); 4887 } 4888 4889 mutex_exit(&arc_reclaim_lock); 4890 } 4891 4892 VERIFY3U(hdr->b_type, ==, type); 4893 if (type == ARC_BUFC_METADATA) { 4894 arc_space_consume(size, ARC_SPACE_META); 4895 } else { 4896 arc_space_consume(size, ARC_SPACE_DATA); 4897 } 4898 4899 /* 4900 * Update the state size. Note that ghost states have a 4901 * "ghost size" and so don't need to be updated. 4902 */ 4903 if (!GHOST_STATE(state)) { 4904 4905 (void) refcount_add_many(&state->arcs_size, size, tag); 4906 4907 /* 4908 * If this is reached via arc_read, the link is 4909 * protected by the hash lock. If reached via 4910 * arc_buf_alloc, the header should not be accessed by 4911 * any other thread. 
And, if reached via arc_read_done, 4912 * the hash lock will protect it if it's found in the 4913 * hash table; otherwise no other thread should be 4914 * trying to [add|remove]_reference it. 4915 */ 4916 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 4917 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4918 (void) refcount_add_many(&state->arcs_esize[type], 4919 size, tag); 4920 } 4921 4922 /* 4923 * If we are growing the cache, and we are adding anonymous 4924 * data, and we have outgrown arc_p, update arc_p 4925 */ 4926 if (aggsum_compare(&arc_size, arc_c) < 0 && 4927 hdr->b_l1hdr.b_state == arc_anon && 4928 (refcount_count(&arc_anon->arcs_size) + 4929 refcount_count(&arc_mru->arcs_size) > arc_p)) 4930 arc_p = MIN(arc_c, arc_p + size); 4931 } 4932 ARCSTAT_BUMP(arcstat_allocated); 4933} 4934 4935static void 4936arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag) 4937{ 4938 arc_free_data_impl(hdr, size, tag); 4939 abd_free(abd); 4940} 4941 4942static void 4943arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) 4944{ 4945 arc_buf_contents_t type = arc_buf_type(hdr); 4946 4947 arc_free_data_impl(hdr, size, tag); 4948 if (type == ARC_BUFC_METADATA) { 4949 zio_buf_free(buf, size); 4950 } else { 4951 ASSERT(type == ARC_BUFC_DATA); 4952 zio_data_buf_free(buf, size); 4953 } 4954} 4955 4956/* 4957 * Free the arc data buffer. 4958 */ 4959static void 4960arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 4961{ 4962 arc_state_t *state = hdr->b_l1hdr.b_state; 4963 arc_buf_contents_t type = arc_buf_type(hdr); 4964 4965 /* protected by hash lock, if in the hash table */ 4966 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 4967 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4968 ASSERT(state != arc_anon && state != arc_l2c_only); 4969 4970 (void) refcount_remove_many(&state->arcs_esize[type], 4971 size, tag); 4972 } 4973 (void) refcount_remove_many(&state->arcs_size, size, tag); 4974 4975 VERIFY3U(hdr->b_type, ==, type); 4976 if (type == ARC_BUFC_METADATA) { 4977 arc_space_return(size, ARC_SPACE_META); 4978 } else { 4979 ASSERT(type == ARC_BUFC_DATA); 4980 arc_space_return(size, ARC_SPACE_DATA); 4981 } 4982} 4983 4984/* 4985 * This routine is called whenever a buffer is accessed. 4986 * NOTE: the hash lock is dropped in this function. 4987 */ 4988static void 4989arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 4990{ 4991 clock_t now; 4992 4993 ASSERT(MUTEX_HELD(hash_lock)); 4994 ASSERT(HDR_HAS_L1HDR(hdr)); 4995 4996 if (hdr->b_l1hdr.b_state == arc_anon) { 4997 /* 4998 * This buffer is not in the cache, and does not 4999 * appear in our "ghost" list. Add the new buffer 5000 * to the MRU state. 5001 */ 5002 5003 ASSERT0(hdr->b_l1hdr.b_arc_access); 5004 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5005 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 5006 arc_change_state(arc_mru, hdr, hash_lock); 5007 5008 } else if (hdr->b_l1hdr.b_state == arc_mru) { 5009 now = ddi_get_lbolt(); 5010 5011 /* 5012 * If this buffer is here because of a prefetch, then either: 5013 * - clear the flag if this is a "referencing" read 5014 * (any subsequent access will bump this into the MFU state). 5015 * or 5016 * - move the buffer to the head of the list if this is 5017 * another prefetch (to make it less likely to be evicted). 
5018 */ 5019 if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { 5020 if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 5021 /* link protected by hash lock */ 5022 ASSERT(multilist_link_active( 5023 &hdr->b_l1hdr.b_arc_node)); 5024 } else { 5025 arc_hdr_clear_flags(hdr, 5026 ARC_FLAG_PREFETCH | 5027 ARC_FLAG_PRESCIENT_PREFETCH); 5028 ARCSTAT_BUMP(arcstat_mru_hits); 5029 } 5030 hdr->b_l1hdr.b_arc_access = now; 5031 return; 5032 } 5033 5034 /* 5035 * This buffer has been "accessed" only once so far, 5036 * but it is still in the cache. Move it to the MFU 5037 * state. 5038 */ 5039 if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { 5040 /* 5041 * More than 125ms have passed since we 5042 * instantiated this buffer. Move it to the 5043 * most frequently used state. 5044 */ 5045 hdr->b_l1hdr.b_arc_access = now; 5046 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 5047 arc_change_state(arc_mfu, hdr, hash_lock); 5048 } 5049 ARCSTAT_BUMP(arcstat_mru_hits); 5050 } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { 5051 arc_state_t *new_state; 5052 /* 5053 * This buffer has been "accessed" recently, but 5054 * was evicted from the cache. Move it to the 5055 * MFU state. 5056 */ 5057 5058 if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { 5059 new_state = arc_mru; 5060 if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { 5061 arc_hdr_clear_flags(hdr, 5062 ARC_FLAG_PREFETCH | 5063 ARC_FLAG_PRESCIENT_PREFETCH); 5064 } 5065 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 5066 } else { 5067 new_state = arc_mfu; 5068 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 5069 } 5070 5071 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5072 arc_change_state(new_state, hdr, hash_lock); 5073 5074 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 5075 } else if (hdr->b_l1hdr.b_state == arc_mfu) { 5076 /* 5077 * This buffer has been accessed more than once and is 5078 * still in the cache. Keep it in the MFU state. 5079 * 5080 * NOTE: an add_reference() that occurred when we did 5081 * the arc_read() will have kicked this off the list. 5082 * If it was a prefetch, we will explicitly move it to 5083 * the head of the list now. 5084 */ 5085 5086 ARCSTAT_BUMP(arcstat_mfu_hits); 5087 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5088 } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { 5089 arc_state_t *new_state = arc_mfu; 5090 /* 5091 * This buffer has been accessed more than once but has 5092 * been evicted from the cache. Move it back to the 5093 * MFU state. 5094 */ 5095 5096 if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { 5097 /* 5098 * This is a prefetch access... 5099 * move this block back to the MRU state. 5100 */ 5101 new_state = arc_mru; 5102 } 5103 5104 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5105 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 5106 arc_change_state(new_state, hdr, hash_lock); 5107 5108 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 5109 } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { 5110 /* 5111 * This buffer is on the 2nd Level ARC. 5112 */ 5113 5114 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5115 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 5116 arc_change_state(arc_mfu, hdr, hash_lock); 5117 } else { 5118 ASSERT(!"invalid arc state"); 5119 } 5120} 5121 5122/* 5123 * This routine is called by dbuf_hold() to update the arc_access() state 5124 * which otherwise would be skipped for entries in the dbuf cache. 
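 *
 * The expected call site is a dbuf-cache hit; in sketch form (the
 * dbuf-side naming is paraphrased here, the real call site lives in
 * dbuf.c):
 *
 *	arc_buf_access(db->db_buf);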
5125 */ 5126void 5127arc_buf_access(arc_buf_t *buf) 5128{ 5129 mutex_enter(&buf->b_evict_lock); 5130 arc_buf_hdr_t *hdr = buf->b_hdr; 5131 5132 /* 5133 * Avoid taking the hash_lock when possible as an optimization. 5134 * The header must be checked again under the hash_lock in order 5135 * to handle the case where it is concurrently being released. 5136 */ 5137 if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { 5138 mutex_exit(&buf->b_evict_lock); 5139 ARCSTAT_BUMP(arcstat_access_skip); 5140 return; 5141 } 5142 5143 kmutex_t *hash_lock = HDR_LOCK(hdr); 5144 mutex_enter(hash_lock); 5145 5146 if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { 5147 mutex_exit(hash_lock); 5148 mutex_exit(&buf->b_evict_lock); 5149 ARCSTAT_BUMP(arcstat_access_skip); 5150 return; 5151 } 5152 5153 mutex_exit(&buf->b_evict_lock); 5154 5155 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 5156 hdr->b_l1hdr.b_state == arc_mfu); 5157 5158 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 5159 arc_access(hdr, hash_lock); 5160 mutex_exit(hash_lock); 5161 5162 ARCSTAT_BUMP(arcstat_hits); 5163 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 5164 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); 5165} 5166 5167/* a generic arc_read_done_func_t which you can use */ 5168/* ARGSUSED */ 5169void 5170arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, 5171 arc_buf_t *buf, void *arg) 5172{ 5173 if (buf == NULL) 5174 return; 5175 5176 bcopy(buf->b_data, arg, arc_buf_size(buf)); 5177 arc_buf_destroy(buf, arg); 5178} 5179 5180/* a generic arc_read_done_func_t */ 5181/* ARGSUSED */ 5182void 5183arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, 5184 arc_buf_t *buf, void *arg) 5185{ 5186 arc_buf_t **bufp = arg; 5187 if (buf == NULL) { 5188 ASSERT(zio == NULL || zio->io_error != 0); 5189 *bufp = NULL; 5190 } else { 5191 ASSERT(zio == NULL || zio->io_error == 0); 5192 *bufp = buf; 5193 ASSERT(buf->b_data != NULL); 5194 } 5195} 5196 5197static void 5198arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) 5199{ 5200 if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { 5201 ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); 5202 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); 5203 } else { 5204 if (HDR_COMPRESSION_ENABLED(hdr)) { 5205 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, 5206 BP_GET_COMPRESS(bp)); 5207 } 5208 ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); 5209 ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); 5210 } 5211} 5212 5213static void 5214arc_read_done(zio_t *zio) 5215{ 5216 arc_buf_hdr_t *hdr = zio->io_private; 5217 kmutex_t *hash_lock = NULL; 5218 arc_callback_t *callback_list; 5219 arc_callback_t *acb; 5220 boolean_t freeable = B_FALSE; 5221 boolean_t no_zio_error = (zio->io_error == 0); 5222 5223 /* 5224 * The hdr was inserted into hash-table and removed from lists 5225 * prior to starting I/O. We should find this header, since 5226 * it's in the hash table, and it should be legit since it's 5227 * not possible to evict it during the I/O. The only possible 5228 * reason for it not to be found is if we were freed during the 5229 * read. 
5230 */ 5231 if (HDR_IN_HASH_TABLE(hdr)) { 5232 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); 5233 ASSERT3U(hdr->b_dva.dva_word[0], ==, 5234 BP_IDENTITY(zio->io_bp)->dva_word[0]); 5235 ASSERT3U(hdr->b_dva.dva_word[1], ==, 5236 BP_IDENTITY(zio->io_bp)->dva_word[1]); 5237 5238 arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, 5239 &hash_lock); 5240 5241 ASSERT((found == hdr && 5242 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 5243 (found == hdr && HDR_L2_READING(hdr))); 5244 ASSERT3P(hash_lock, !=, NULL); 5245 } 5246 5247 if (no_zio_error) { 5248 /* byteswap if necessary */ 5249 if (BP_SHOULD_BYTESWAP(zio->io_bp)) { 5250 if (BP_GET_LEVEL(zio->io_bp) > 0) { 5251 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; 5252 } else { 5253 hdr->b_l1hdr.b_byteswap = 5254 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 5255 } 5256 } else { 5257 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; 5258 } 5259 } 5260 5261 arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); 5262 if (l2arc_noprefetch && HDR_PREFETCH(hdr)) 5263 arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); 5264 5265 callback_list = hdr->b_l1hdr.b_acb; 5266 ASSERT3P(callback_list, !=, NULL); 5267 5268 if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) { 5269 /* 5270 * Only call arc_access on anonymous buffers. This is because 5271 * if we've issued an I/O for an evicted buffer, we've already 5272 * called arc_access (to prevent any simultaneous readers from 5273 * getting confused). 5274 */ 5275 arc_access(hdr, hash_lock); 5276 } 5277 5278 /* 5279 * If a read request has a callback (i.e. acb_done is not NULL), then we 5280 * make a buf containing the data according to the parameters which were 5281 * passed in. The implementation of arc_buf_alloc_impl() ensures that we 5282 * aren't needlessly decompressing the data multiple times. 5283 */ 5284 int callback_cnt = 0; 5285 for (acb = callback_list; acb != NULL; acb = acb->acb_next) { 5286 if (!acb->acb_done) 5287 continue; 5288 5289 callback_cnt++; 5290 5291 if (no_zio_error) { 5292 int error = arc_buf_alloc_impl(hdr, acb->acb_private, 5293 acb->acb_compressed, zio->io_error == 0, 5294 &acb->acb_buf); 5295 if (error != 0) { 5296 /* 5297 * Decompression failed. Set io_error 5298 * so that when we call acb_done (below), 5299 * we will indicate that the read failed. 5300 * Note that in the unusual case where one 5301 * callback is compressed and another 5302 * uncompressed, we will mark all of them 5303 * as failed, even though the uncompressed 5304 * one can't actually fail. In this case, 5305 * the hdr will not be anonymous, because 5306 * if there are multiple callbacks, it's 5307 * because multiple threads found the same 5308 * arc buf in the hash table. 5309 */ 5310 zio->io_error = error; 5311 } 5312 } 5313 } 5314 /* 5315 * If there are multiple callbacks, we must have the hash lock, 5316 * because the only way for multiple threads to find this hdr is 5317 * in the hash table. This ensures that if there are multiple 5318 * callbacks, the hdr is not anonymous. If it were anonymous, 5319 * we couldn't use arc_buf_destroy() in the error case below. 
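 *
 * (An anonymous hdr is reachable only by the thread that issued the
 * read, since it is no longer in the hash table, so at most one
 * callback can be attached to it; that is exactly what the assert
 * below verifies.)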
5320 */ 5321 ASSERT(callback_cnt < 2 || hash_lock != NULL); 5322 5323 hdr->b_l1hdr.b_acb = NULL; 5324 arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 5325 if (callback_cnt == 0) { 5326 ASSERT(HDR_PREFETCH(hdr)); 5327 ASSERT0(hdr->b_l1hdr.b_bufcnt); 5328 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 5329 } 5330 5331 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || 5332 callback_list != NULL); 5333 5334 if (no_zio_error) { 5335 arc_hdr_verify(hdr, zio->io_bp); 5336 } else { 5337 arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); 5338 if (hdr->b_l1hdr.b_state != arc_anon) 5339 arc_change_state(arc_anon, hdr, hash_lock); 5340 if (HDR_IN_HASH_TABLE(hdr)) 5341 buf_hash_remove(hdr); 5342 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 5343 } 5344 5345 /* 5346 * Broadcast before we drop the hash_lock to avoid the possibility 5347 * that the hdr (and hence the cv) might be freed before we get to 5348 * the cv_broadcast(). 5349 */ 5350 cv_broadcast(&hdr->b_l1hdr.b_cv); 5351 5352 if (hash_lock != NULL) { 5353 mutex_exit(hash_lock); 5354 } else { 5355 /* 5356 * This block was freed while we waited for the read to 5357 * complete. It has been removed from the hash table and 5358 * moved to the anonymous state (so that it won't show up 5359 * in the cache). 5360 */ 5361 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 5362 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 5363 } 5364 5365 /* execute each callback and free its structure */ 5366 while ((acb = callback_list) != NULL) { 5367 if (acb->acb_done != NULL) { 5368 if (zio->io_error != 0 && acb->acb_buf != NULL) { 5369 /* 5370 * If arc_buf_alloc_impl() fails during 5371 * decompression, the buf will still be 5372 * allocated, and needs to be freed here. 5373 */ 5374 arc_buf_destroy(acb->acb_buf, acb->acb_private); 5375 acb->acb_buf = NULL; 5376 } 5377 acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, 5378 acb->acb_buf, acb->acb_private); 5379 } 5380 5381 if (acb->acb_zio_dummy != NULL) { 5382 acb->acb_zio_dummy->io_error = zio->io_error; 5383 zio_nowait(acb->acb_zio_dummy); 5384 } 5385 5386 callback_list = acb->acb_next; 5387 kmem_free(acb, sizeof (arc_callback_t)); 5388 } 5389 5390 if (freeable) 5391 arc_hdr_destroy(hdr); 5392} 5393 5394/* 5395 * "Read" the block at the specified DVA (in bp) via the 5396 * cache. If the block is found in the cache, invoke the provided 5397 * callback immediately and return. Note that the `zio' parameter 5398 * in the callback will be NULL in this case, since no IO was 5399 * required. If the block is not in the cache pass the read request 5400 * on to the spa with a substitute callback function, so that the 5401 * requested block will be added to the cache. 5402 * 5403 * If a read request arrives for a block that has a read in-progress, 5404 * either wait for the in-progress read to complete (and return the 5405 * results); or, if this is a read with a "done" func, add a record 5406 * to the read to invoke the "done" func when the read completes, 5407 * and return; or just return. 5408 * 5409 * arc_read_done() will invoke all the requested "done" functions 5410 * for readers of this block. 
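 *
 * For illustration only -- a hypothetical caller, not code from this
 * file -- the common synchronous pattern pairs arc_getbuf_func()
 * with ARC_FLAG_WAIT:
 *
 *	arc_flags_t aflags = ARC_FLAG_WAIT;
 *	arc_buf_t *abuf = NULL;
 *	int err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 *	if (err == 0 && abuf != NULL) {
 *		... consume abuf->b_data ...
 *		arc_buf_destroy(abuf, &abuf);
 *	}
 *
 * Here spa, bp and zb stand in for whatever the caller already has;
 * on success arc_getbuf_func() stores the buffer in abuf and the
 * caller owns it until arc_buf_destroy().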
5411 */ 5412int 5413arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, 5414 void *private, zio_priority_t priority, int zio_flags, 5415 arc_flags_t *arc_flags, const zbookmark_phys_t *zb) 5416{ 5417 arc_buf_hdr_t *hdr = NULL; 5418 kmutex_t *hash_lock = NULL; 5419 zio_t *rzio; 5420 uint64_t guid = spa_load_guid(spa); 5421 boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0; 5422 int rc = 0; 5423 5424 ASSERT(!BP_IS_EMBEDDED(bp) || 5425 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 5426 5427top: 5428 if (!BP_IS_EMBEDDED(bp)) { 5429 /* 5430 * Embedded BP's have no DVA and require no I/O to "read". 5431 * Create an anonymous arc buf to back it. 5432 */ 5433 hdr = buf_hash_find(guid, bp, &hash_lock); 5434 } 5435 5436 if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) { 5437 arc_buf_t *buf = NULL; 5438 *arc_flags |= ARC_FLAG_CACHED; 5439 5440 if (HDR_IO_IN_PROGRESS(hdr)) { 5441 zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; 5442 5443 ASSERT3P(head_zio, !=, NULL); 5444 if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && 5445 priority == ZIO_PRIORITY_SYNC_READ) { 5446 /* 5447 * This is a sync read that needs to wait for 5448 * an in-flight async read. Request that the 5449 * zio have its priority upgraded. 5450 */ 5451 zio_change_priority(head_zio, priority); 5452 DTRACE_PROBE1(arc__async__upgrade__sync, 5453 arc_buf_hdr_t *, hdr); 5454 ARCSTAT_BUMP(arcstat_async_upgrade_sync); 5455 } 5456 if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 5457 arc_hdr_clear_flags(hdr, 5458 ARC_FLAG_PREDICTIVE_PREFETCH); 5459 } 5460 5461 if (*arc_flags & ARC_FLAG_WAIT) { 5462 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); 5463 mutex_exit(hash_lock); 5464 goto top; 5465 } 5466 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 5467 5468 if (done) { 5469 arc_callback_t *acb = NULL; 5470 5471 acb = kmem_zalloc(sizeof (arc_callback_t), 5472 KM_SLEEP); 5473 acb->acb_done = done; 5474 acb->acb_private = private; 5475 acb->acb_compressed = compressed_read; 5476 if (pio != NULL) 5477 acb->acb_zio_dummy = zio_null(pio, 5478 spa, NULL, NULL, NULL, zio_flags); 5479 5480 ASSERT3P(acb->acb_done, !=, NULL); 5481 acb->acb_zio_head = head_zio; 5482 acb->acb_next = hdr->b_l1hdr.b_acb; 5483 hdr->b_l1hdr.b_acb = acb; 5484 mutex_exit(hash_lock); 5485 return (0); 5486 } 5487 mutex_exit(hash_lock); 5488 return (0); 5489 } 5490 5491 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 5492 hdr->b_l1hdr.b_state == arc_mfu); 5493 5494 if (done) { 5495 if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 5496 /* 5497 * This is a demand read which does not have to 5498 * wait for i/o because we did a predictive 5499 * prefetch i/o for it, which has completed. 5500 */ 5501 DTRACE_PROBE1( 5502 arc__demand__hit__predictive__prefetch, 5503 arc_buf_hdr_t *, hdr); 5504 ARCSTAT_BUMP( 5505 arcstat_demand_hit_predictive_prefetch); 5506 arc_hdr_clear_flags(hdr, 5507 ARC_FLAG_PREDICTIVE_PREFETCH); 5508 } 5509 5510 if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { 5511 ARCSTAT_BUMP( 5512 arcstat_demand_hit_prescient_prefetch); 5513 arc_hdr_clear_flags(hdr, 5514 ARC_FLAG_PRESCIENT_PREFETCH); 5515 } 5516 5517 ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); 5518 /* Get a buf with the desired data in it. 
*/ 5519 rc = arc_buf_alloc_impl(hdr, private, 5520 compressed_read, B_TRUE, &buf); 5521 if (rc != 0) { 5522 arc_buf_destroy(buf, private); 5523 buf = NULL; 5524 } 5525 ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || 5526 rc == 0 || rc != ENOENT); 5527 } else if (*arc_flags & ARC_FLAG_PREFETCH && 5528 refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 5529 arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); 5530 } 5531 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 5532 arc_access(hdr, hash_lock); 5533 if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) 5534 arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); 5535 if (*arc_flags & ARC_FLAG_L2CACHE) 5536 arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); 5537 mutex_exit(hash_lock); 5538 ARCSTAT_BUMP(arcstat_hits); 5539 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 5540 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 5541 data, metadata, hits); 5542 5543 if (done) 5544 done(NULL, zb, bp, buf, private); 5545 } else { 5546 uint64_t lsize = BP_GET_LSIZE(bp); 5547 uint64_t psize = BP_GET_PSIZE(bp); 5548 arc_callback_t *acb; 5549 vdev_t *vd = NULL; 5550 uint64_t addr = 0; 5551 boolean_t devw = B_FALSE; 5552 uint64_t size; 5553 5554 if (hdr == NULL) { 5555 /* this block is not in the cache */ 5556 arc_buf_hdr_t *exists = NULL; 5557 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 5558 hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, 5559 BP_GET_COMPRESS(bp), type); 5560 5561 if (!BP_IS_EMBEDDED(bp)) { 5562 hdr->b_dva = *BP_IDENTITY(bp); 5563 hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 5564 exists = buf_hash_insert(hdr, &hash_lock); 5565 } 5566 if (exists != NULL) { 5567 /* somebody beat us to the hash insert */ 5568 mutex_exit(hash_lock); 5569 buf_discard_identity(hdr); 5570 arc_hdr_destroy(hdr); 5571 goto top; /* restart the IO request */ 5572 } 5573 } else { 5574 /* 5575 * This block is in the ghost cache. If it was L2-only 5576 * (and thus didn't have an L1 hdr), we realloc the 5577 * header to add an L1 hdr. 5578 */ 5579 if (!HDR_HAS_L1HDR(hdr)) { 5580 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, 5581 hdr_full_cache); 5582 } 5583 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 5584 ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); 5585 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 5586 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 5587 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 5588 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); 5589 5590 /* 5591 * This is a delicate dance that we play here. 5592 * This hdr is in the ghost list so we access it 5593 * to move it out of the ghost list before we 5594 * initiate the read. If it's a prefetch then 5595 * it won't have a callback so we'll remove the 5596 * reference that arc_buf_alloc_impl() created. We 5597 * do this after we've called arc_access() to 5598 * avoid hitting an assert in remove_reference(). 5599 */ 5600 arc_access(hdr, hash_lock); 5601 arc_hdr_alloc_pabd(hdr); 5602 } 5603 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 5604 size = arc_hdr_size(hdr); 5605 5606 /* 5607 * If compression is enabled on the hdr, then will do 5608 * RAW I/O and will store the compressed data in the hdr's 5609 * data block. Otherwise, the hdr's data block will contain 5610 * the uncompressed data. 
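 *
 * (In the compressed case decompression is deferred: the read zio
 * carries ZIO_FLAG_RAW so the pipeline leaves the payload untouched,
 * and arc_buf_alloc_impl() later decompresses b_pabd into an
 * arc_buf_t when a consumer asks for uncompressed data.)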
5611 */ 5612 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { 5613 zio_flags |= ZIO_FLAG_RAW; 5614 } 5615 5616 if (*arc_flags & ARC_FLAG_PREFETCH) 5617 arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); 5618 if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) 5619 arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); 5620 5621 if (*arc_flags & ARC_FLAG_L2CACHE) 5622 arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); 5623 if (BP_GET_LEVEL(bp) > 0) 5624 arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); 5625 if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) 5626 arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); 5627 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); 5628 5629 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 5630 acb->acb_done = done; 5631 acb->acb_private = private; 5632 acb->acb_compressed = compressed_read; 5633 5634 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 5635 hdr->b_l1hdr.b_acb = acb; 5636 arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 5637 5638 if (HDR_HAS_L2HDR(hdr) && 5639 (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { 5640 devw = hdr->b_l2hdr.b_dev->l2ad_writing; 5641 addr = hdr->b_l2hdr.b_daddr; 5642 /* 5643 * Lock out L2ARC device removal. 5644 */ 5645 if (vdev_is_dead(vd) || 5646 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 5647 vd = NULL; 5648 } 5649 5650 /* 5651 * We count both async reads and scrub IOs as asynchronous so 5652 * that both can be upgraded in the event of a cache hit while 5653 * the read IO is still in-flight. 5654 */ 5655 if (priority == ZIO_PRIORITY_ASYNC_READ || 5656 priority == ZIO_PRIORITY_SCRUB) 5657 arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); 5658 else 5659 arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); 5660 5661 /* 5662 * At this point, we have a level 1 cache miss. Try again in 5663 * L2ARC if possible. 5664 */ 5665 ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); 5666 5667 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 5668 uint64_t, lsize, zbookmark_phys_t *, zb); 5669 ARCSTAT_BUMP(arcstat_misses); 5670 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 5671 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 5672 data, metadata, misses); 5673#ifdef _KERNEL 5674#ifdef RACCT 5675 if (racct_enable) { 5676 PROC_LOCK(curproc); 5677 racct_add_force(curproc, RACCT_READBPS, size); 5678 racct_add_force(curproc, RACCT_READIOPS, 1); 5679 PROC_UNLOCK(curproc); 5680 } 5681#endif /* RACCT */ 5682 curthread->td_ru.ru_inblock++; 5683#endif 5684 5685 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 5686 /* 5687 * Read from the L2ARC if the following are true: 5688 * 1. The L2ARC vdev was previously cached. 5689 * 2. This buffer still has L2ARC metadata. 5690 * 3. This buffer isn't currently writing to the L2ARC. 5691 * 4. The L2ARC entry wasn't evicted, which may 5692 * also have invalidated the vdev. 5693 * 5. This isn't prefetch and l2arc_noprefetch is set. 
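 *    (Condition 5 is worded awkwardly: the check below is
 *    !(l2arc_noprefetch && HDR_PREFETCH(hdr)), i.e. the L2ARC is
 *    bypassed only when this is a prefetch while l2arc_noprefetch
 *    is set.)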
5694 */ 5695 if (HDR_HAS_L2HDR(hdr) && 5696 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 5697 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 5698 l2arc_read_callback_t *cb; 5699 abd_t *abd; 5700 uint64_t asize; 5701 5702 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 5703 ARCSTAT_BUMP(arcstat_l2_hits); 5704 5705 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 5706 KM_SLEEP); 5707 cb->l2rcb_hdr = hdr; 5708 cb->l2rcb_bp = *bp; 5709 cb->l2rcb_zb = *zb; 5710 cb->l2rcb_flags = zio_flags; 5711 5712 asize = vdev_psize_to_asize(vd, size); 5713 if (asize != size) { 5714 abd = abd_alloc_for_io(asize, 5715 HDR_ISTYPE_METADATA(hdr)); 5716 cb->l2rcb_abd = abd; 5717 } else { 5718 abd = hdr->b_l1hdr.b_pabd; 5719 } 5720 5721 ASSERT(addr >= VDEV_LABEL_START_SIZE && 5722 addr + asize <= vd->vdev_psize - 5723 VDEV_LABEL_END_SIZE); 5724 5725 /* 5726 * l2arc read. The SCL_L2ARC lock will be 5727 * released by l2arc_read_done(). 5728 * Issue a null zio if the underlying buffer 5729 * was squashed to zero size by compression. 5730 */ 5731 ASSERT3U(HDR_GET_COMPRESS(hdr), !=, 5732 ZIO_COMPRESS_EMPTY); 5733 rzio = zio_read_phys(pio, vd, addr, 5734 asize, abd, 5735 ZIO_CHECKSUM_OFF, 5736 l2arc_read_done, cb, priority, 5737 zio_flags | ZIO_FLAG_DONT_CACHE | 5738 ZIO_FLAG_CANFAIL | 5739 ZIO_FLAG_DONT_PROPAGATE | 5740 ZIO_FLAG_DONT_RETRY, B_FALSE); 5741 acb->acb_zio_head = rzio; 5742 5743 if (hash_lock != NULL) 5744 mutex_exit(hash_lock); 5745 5746 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 5747 zio_t *, rzio); 5748 ARCSTAT_INCR(arcstat_l2_read_bytes, size); 5749 5750 if (*arc_flags & ARC_FLAG_NOWAIT) { 5751 zio_nowait(rzio); 5752 return (0); 5753 } 5754 5755 ASSERT(*arc_flags & ARC_FLAG_WAIT); 5756 if (zio_wait(rzio) == 0) 5757 return (0); 5758 5759 /* l2arc read error; goto zio_read() */ 5760 if (hash_lock != NULL) 5761 mutex_enter(hash_lock); 5762 } else { 5763 DTRACE_PROBE1(l2arc__miss, 5764 arc_buf_hdr_t *, hdr); 5765 ARCSTAT_BUMP(arcstat_l2_misses); 5766 if (HDR_L2_WRITING(hdr)) 5767 ARCSTAT_BUMP(arcstat_l2_rw_clash); 5768 spa_config_exit(spa, SCL_L2ARC, vd); 5769 } 5770 } else { 5771 if (vd != NULL) 5772 spa_config_exit(spa, SCL_L2ARC, vd); 5773 if (l2arc_ndev != 0) { 5774 DTRACE_PROBE1(l2arc__miss, 5775 arc_buf_hdr_t *, hdr); 5776 ARCSTAT_BUMP(arcstat_l2_misses); 5777 } 5778 } 5779 5780 rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size, 5781 arc_read_done, hdr, priority, zio_flags, zb); 5782 acb->acb_zio_head = rzio; 5783 5784 if (hash_lock != NULL) 5785 mutex_exit(hash_lock); 5786 5787 if (*arc_flags & ARC_FLAG_WAIT) 5788 return (zio_wait(rzio)); 5789 5790 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 5791 zio_nowait(rzio); 5792 } 5793 return (0); 5794} 5795 5796/* 5797 * Notify the arc that a block was freed, and thus will never be used again. 5798 */ 5799void 5800arc_freed(spa_t *spa, const blkptr_t *bp) 5801{ 5802 arc_buf_hdr_t *hdr; 5803 kmutex_t *hash_lock; 5804 uint64_t guid = spa_load_guid(spa); 5805 5806 ASSERT(!BP_IS_EMBEDDED(bp)); 5807 5808 hdr = buf_hash_find(guid, bp, &hash_lock); 5809 if (hdr == NULL) 5810 return; 5811 5812 /* 5813 * We might be trying to free a block that is still doing I/O 5814 * (i.e. prefetch) or has a reference (i.e. a dedup-ed, 5815 * dmu_sync-ed block). If this block is being prefetched, then it 5816 * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr 5817 * until the I/O completes. A block may also have a reference if it is 5818 * part of a dedup-ed, dmu_synced write. 
The dmu_sync() function would 5819 * have written the new block to its final resting place on disk but 5820 * without the dedup flag set. This would have left the hdr in the MRU 5821 * state and discoverable. When the txg finally syncs it detects that 5822 * the block was overridden in open context and issues an override I/O. 5823 * Since this is a dedup block, the override I/O will determine if the 5824 * block is already in the DDT. If so, then it will replace the io_bp 5825 * with the bp from the DDT and allow the I/O to finish. When the I/O 5826 * reaches the done callback, dbuf_write_override_done, it will 5827 * check to see if the io_bp and io_bp_override are identical. 5828 * If they are not, then it indicates that the bp was replaced with 5829 * the bp in the DDT and the override bp is freed. This allows 5830 * us to arrive here with a reference on a block that is being 5831 * freed. So if we have an I/O in progress, or a reference to 5832 * this hdr, then we don't destroy the hdr. 5833 */ 5834 if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && 5835 refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { 5836 arc_change_state(arc_anon, hdr, hash_lock); 5837 arc_hdr_destroy(hdr); 5838 mutex_exit(hash_lock); 5839 } else { 5840 mutex_exit(hash_lock); 5841 } 5842 5843} 5844 5845/* 5846 * Release this buffer from the cache, making it an anonymous buffer. This 5847 * must be done after a read and prior to modifying the buffer contents. 5848 * If the buffer has more than one reference, we must make 5849 * a new hdr for the buffer. 5850 */ 5851void 5852arc_release(arc_buf_t *buf, void *tag) 5853{ 5854 arc_buf_hdr_t *hdr = buf->b_hdr; 5855 5856 /* 5857 * It would be nice to assert that if it's DMU metadata (level > 5858 * 0 || it's the dnode file), then it must be syncing context. 5859 * But we don't know that information at this level. 5860 */ 5861 5862 mutex_enter(&buf->b_evict_lock); 5863 5864 ASSERT(HDR_HAS_L1HDR(hdr)); 5865 5866 /* 5867 * We don't grab the hash lock prior to this check, because if 5868 * the buffer's header is in the arc_anon state, it won't be 5869 * linked into the hash table. 5870 */ 5871 if (hdr->b_l1hdr.b_state == arc_anon) { 5872 mutex_exit(&buf->b_evict_lock); 5873 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 5874 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 5875 ASSERT(!HDR_HAS_L2HDR(hdr)); 5876 ASSERT(HDR_EMPTY(hdr)); 5877 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); 5878 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); 5879 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 5880 5881 hdr->b_l1hdr.b_arc_access = 0; 5882 5883 /* 5884 * If the buf is being overridden then it may already 5885 * have a hdr that is not empty. 5886 */ 5887 buf_discard_identity(hdr); 5888 arc_buf_thaw(buf); 5889 5890 return; 5891 } 5892 5893 kmutex_t *hash_lock = HDR_LOCK(hdr); 5894 mutex_enter(hash_lock); 5895 5896 /* 5897 * This assignment is only valid as long as the hash_lock is 5898 * held, we must be careful not to reference state or the 5899 * b_state field after dropping the lock. 
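 *
 * (Roadmap for the code below: while the hash lock pins the header,
 * any L2ARC copy is discarded first; then, if other buffers still
 * share this header, the released buffer is migrated onto a freshly
 * allocated anonymous header, otherwise the existing header itself
 * is moved to the anonymous state.)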
5900 */ 5901 arc_state_t *state = hdr->b_l1hdr.b_state; 5902 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 5903 ASSERT3P(state, !=, arc_anon); 5904 5905 /* this buffer is not on any list */ 5906 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); 5907 5908 if (HDR_HAS_L2HDR(hdr)) { 5909 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); 5910 5911 /* 5912 * We have to recheck this conditional again now that 5913 * we're holding the l2ad_mtx to prevent a race with 5914 * another thread which might be concurrently calling 5915 * l2arc_evict(). In that case, l2arc_evict() might have 5916 * destroyed the header's L2 portion as we were waiting 5917 * to acquire the l2ad_mtx. 5918 */ 5919 if (HDR_HAS_L2HDR(hdr)) { 5920 l2arc_trim(hdr); 5921 arc_hdr_l2hdr_destroy(hdr); 5922 } 5923 5924 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); 5925 } 5926 5927 /* 5928 * Do we have more than one buf? 5929 */ 5930 if (hdr->b_l1hdr.b_bufcnt > 1) { 5931 arc_buf_hdr_t *nhdr; 5932 uint64_t spa = hdr->b_spa; 5933 uint64_t psize = HDR_GET_PSIZE(hdr); 5934 uint64_t lsize = HDR_GET_LSIZE(hdr); 5935 enum zio_compress compress = HDR_GET_COMPRESS(hdr); 5936 arc_buf_contents_t type = arc_buf_type(hdr); 5937 VERIFY3U(hdr->b_type, ==, type); 5938 5939 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); 5940 (void) remove_reference(hdr, hash_lock, tag); 5941 5942 if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { 5943 ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); 5944 ASSERT(ARC_BUF_LAST(buf)); 5945 } 5946 5947 /* 5948 * Pull the data off of this hdr and attach it to 5949 * a new anonymous hdr. Also find the last buffer 5950 * in the hdr's buffer list. 5951 */ 5952 arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); 5953 ASSERT3P(lastbuf, !=, NULL); 5954 5955 /* 5956 * If the current arc_buf_t and the hdr are sharing their data 5957 * buffer, then we must stop sharing that block. 5958 */ 5959 if (arc_buf_is_shared(buf)) { 5960 VERIFY(!arc_buf_is_shared(lastbuf)); 5961 5962 /* 5963 * First, sever the block sharing relationship between 5964 * buf and the arc_buf_hdr_t. 5965 */ 5966 arc_unshare_buf(hdr, buf); 5967 5968 /* 5969 * Now we need to recreate the hdr's b_pabd. Since we 5970 * have lastbuf handy, we try to share with it, but if 5971 * we can't then we allocate a new b_pabd and copy the 5972 * data from buf into it. 5973 */ 5974 if (arc_can_share(hdr, lastbuf)) { 5975 arc_share_buf(hdr, lastbuf); 5976 } else { 5977 arc_hdr_alloc_pabd(hdr); 5978 abd_copy_from_buf(hdr->b_l1hdr.b_pabd, 5979 buf->b_data, psize); 5980 } 5981 VERIFY3P(lastbuf->b_data, !=, NULL); 5982 } else if (HDR_SHARED_DATA(hdr)) { 5983 /* 5984 * Uncompressed shared buffers are always at the end 5985 * of the list. Compressed buffers don't have the 5986 * same requirements. This makes it hard to 5987 * simply assert that the lastbuf is shared so 5988 * we rely on the hdr's compression flags to determine 5989 * if we have a compressed, shared buffer. 
5990 */ 5991 ASSERT(arc_buf_is_shared(lastbuf) || 5992 HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); 5993 ASSERT(!ARC_BUF_SHARED(buf)); 5994 } 5995 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 5996 ASSERT3P(state, !=, arc_l2c_only); 5997 5998 (void) refcount_remove_many(&state->arcs_size, 5999 arc_buf_size(buf), buf); 6000 6001 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { 6002 ASSERT3P(state, !=, arc_l2c_only); 6003 (void) refcount_remove_many(&state->arcs_esize[type], 6004 arc_buf_size(buf), buf); 6005 } 6006 6007 hdr->b_l1hdr.b_bufcnt -= 1; 6008 arc_cksum_verify(buf); 6009#ifdef illumos 6010 arc_buf_unwatch(buf); 6011#endif 6012 6013 mutex_exit(hash_lock); 6014 6015 /* 6016 * Allocate a new hdr. The new hdr will contain a b_pabd 6017 * buffer which will be freed in arc_write(). 6018 */ 6019 nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); 6020 ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); 6021 ASSERT0(nhdr->b_l1hdr.b_bufcnt); 6022 ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt)); 6023 VERIFY3U(nhdr->b_type, ==, type); 6024 ASSERT(!HDR_SHARED_DATA(nhdr)); 6025 6026 nhdr->b_l1hdr.b_buf = buf; 6027 nhdr->b_l1hdr.b_bufcnt = 1; 6028 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); 6029 buf->b_hdr = nhdr; 6030 6031 mutex_exit(&buf->b_evict_lock); 6032 (void) refcount_add_many(&arc_anon->arcs_size, 6033 arc_buf_size(buf), buf); 6034 } else { 6035 mutex_exit(&buf->b_evict_lock); 6036 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); 6037 /* protected by hash lock, or hdr is on arc_anon */ 6038 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 6039 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 6040 arc_change_state(arc_anon, hdr, hash_lock); 6041 hdr->b_l1hdr.b_arc_access = 0; 6042 mutex_exit(hash_lock); 6043 6044 buf_discard_identity(hdr); 6045 arc_buf_thaw(buf); 6046 } 6047} 6048 6049int 6050arc_released(arc_buf_t *buf) 6051{ 6052 int released; 6053 6054 mutex_enter(&buf->b_evict_lock); 6055 released = (buf->b_data != NULL && 6056 buf->b_hdr->b_l1hdr.b_state == arc_anon); 6057 mutex_exit(&buf->b_evict_lock); 6058 return (released); 6059} 6060 6061#ifdef ZFS_DEBUG 6062int 6063arc_referenced(arc_buf_t *buf) 6064{ 6065 int referenced; 6066 6067 mutex_enter(&buf->b_evict_lock); 6068 referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); 6069 mutex_exit(&buf->b_evict_lock); 6070 return (referenced); 6071} 6072#endif 6073 6074static void 6075arc_write_ready(zio_t *zio) 6076{ 6077 arc_write_callback_t *callback = zio->io_private; 6078 arc_buf_t *buf = callback->awcb_buf; 6079 arc_buf_hdr_t *hdr = buf->b_hdr; 6080 uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); 6081 6082 ASSERT(HDR_HAS_L1HDR(hdr)); 6083 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); 6084 ASSERT(hdr->b_l1hdr.b_bufcnt > 0); 6085 6086 /* 6087 * If we're reexecuting this zio because the pool suspended, then 6088 * cleanup any state that was previously set the first time the 6089 * callback was invoked. 
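 *
 * (A zio is re-executed when a suspended pool resumes, so this ready
 * callback can run more than once for the same logical write; the
 * checksum, watch state and any b_pabd created by the earlier
 * invocation are torn down below before being rebuilt.)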
6090 */ 6091 if (zio->io_flags & ZIO_FLAG_REEXECUTED) { 6092 arc_cksum_free(hdr); 6093#ifdef illumos 6094 arc_buf_unwatch(buf); 6095#endif 6096 if (hdr->b_l1hdr.b_pabd != NULL) { 6097 if (arc_buf_is_shared(buf)) { 6098 arc_unshare_buf(hdr, buf); 6099 } else { 6100 arc_hdr_free_pabd(hdr); 6101 } 6102 } 6103 } 6104 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 6105 ASSERT(!HDR_SHARED_DATA(hdr)); 6106 ASSERT(!arc_buf_is_shared(buf)); 6107 6108 callback->awcb_ready(zio, buf, callback->awcb_private); 6109 6110 if (HDR_IO_IN_PROGRESS(hdr)) 6111 ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); 6112 6113 arc_cksum_compute(buf); 6114 arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 6115 6116 enum zio_compress compress; 6117 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 6118 compress = ZIO_COMPRESS_OFF; 6119 } else { 6120 ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp)); 6121 compress = BP_GET_COMPRESS(zio->io_bp); 6122 } 6123 HDR_SET_PSIZE(hdr, psize); 6124 arc_hdr_set_compress(hdr, compress); 6125 6126 6127 /* 6128 * Fill the hdr with data. If the hdr is compressed, the data we want 6129 * is available from the zio, otherwise we can take it from the buf. 6130 * 6131 * We might be able to share the buf's data with the hdr here. However, 6132 * doing so would cause the ARC to be full of linear ABDs if we write a 6133 * lot of shareable data. As a compromise, we check whether scattered 6134 * ABDs are allowed, and assume that if they are then the user wants 6135 * the ARC to be primarily filled with them regardless of the data being 6136 * written. Therefore, if they're allowed then we allocate one and copy 6137 * the data into it; otherwise, we share the data directly if we can. 6138 */ 6139 if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { 6140 arc_hdr_alloc_pabd(hdr); 6141 6142 /* 6143 * Ideally, we would always copy the io_abd into b_pabd, but the 6144 * user may have disabled compressed ARC, thus we must check the 6145 * hdr's compression setting rather than the io_bp's. 6146 */ 6147 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { 6148 ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, 6149 ZIO_COMPRESS_OFF); 6150 ASSERT3U(psize, >, 0); 6151 6152 abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); 6153 } else { 6154 ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); 6155 6156 abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, 6157 arc_buf_size(buf)); 6158 } 6159 } else { 6160 ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); 6161 ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); 6162 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); 6163 6164 arc_share_buf(hdr, buf); 6165 } 6166 6167 arc_hdr_verify(hdr, zio->io_bp); 6168} 6169 6170static void 6171arc_write_children_ready(zio_t *zio) 6172{ 6173 arc_write_callback_t *callback = zio->io_private; 6174 arc_buf_t *buf = callback->awcb_buf; 6175 6176 callback->awcb_children_ready(zio, buf, callback->awcb_private); 6177} 6178 6179/* 6180 * The SPA calls this callback for each physical write that happens on behalf 6181 * of a logical write. See the comment in dbuf_write_physdone() for details. 
6182 */ 6183static void 6184arc_write_physdone(zio_t *zio) 6185{ 6186 arc_write_callback_t *cb = zio->io_private; 6187 if (cb->awcb_physdone != NULL) 6188 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 6189} 6190 6191static void 6192arc_write_done(zio_t *zio) 6193{ 6194 arc_write_callback_t *callback = zio->io_private; 6195 arc_buf_t *buf = callback->awcb_buf; 6196 arc_buf_hdr_t *hdr = buf->b_hdr; 6197 6198 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 6199 6200 if (zio->io_error == 0) { 6201 arc_hdr_verify(hdr, zio->io_bp); 6202 6203 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 6204 buf_discard_identity(hdr); 6205 } else { 6206 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 6207 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 6208 } 6209 } else { 6210 ASSERT(HDR_EMPTY(hdr)); 6211 } 6212 6213 /* 6214 * If the block to be written was all-zero or compressed enough to be 6215 * embedded in the BP, no write was performed so there will be no 6216 * dva/birth/checksum. The buffer must therefore remain anonymous 6217 * (and uncached). 6218 */ 6219 if (!HDR_EMPTY(hdr)) { 6220 arc_buf_hdr_t *exists; 6221 kmutex_t *hash_lock; 6222 6223 ASSERT3U(zio->io_error, ==, 0); 6224 6225 arc_cksum_verify(buf); 6226 6227 exists = buf_hash_insert(hdr, &hash_lock); 6228 if (exists != NULL) { 6229 /* 6230 * This can only happen if we overwrite for 6231 * sync-to-convergence, because we remove 6232 * buffers from the hash table when we arc_free(). 6233 */ 6234 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 6235 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 6236 panic("bad overwrite, hdr=%p exists=%p", 6237 (void *)hdr, (void *)exists); 6238 ASSERT(refcount_is_zero( 6239 &exists->b_l1hdr.b_refcnt)); 6240 arc_change_state(arc_anon, exists, hash_lock); 6241 mutex_exit(hash_lock); 6242 arc_hdr_destroy(exists); 6243 exists = buf_hash_insert(hdr, &hash_lock); 6244 ASSERT3P(exists, ==, NULL); 6245 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 6246 /* nopwrite */ 6247 ASSERT(zio->io_prop.zp_nopwrite); 6248 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 6249 panic("bad nopwrite, hdr=%p exists=%p", 6250 (void *)hdr, (void *)exists); 6251 } else { 6252 /* Dedup */ 6253 ASSERT(hdr->b_l1hdr.b_bufcnt == 1); 6254 ASSERT(hdr->b_l1hdr.b_state == arc_anon); 6255 ASSERT(BP_GET_DEDUP(zio->io_bp)); 6256 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 6257 } 6258 } 6259 arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 6260 /* if it's not anon, we are doing a scrub */ 6261 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) 6262 arc_access(hdr, hash_lock); 6263 mutex_exit(hash_lock); 6264 } else { 6265 arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 6266 } 6267 6268 ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 6269 callback->awcb_done(zio, buf, callback->awcb_private); 6270 6271 abd_put(zio->io_abd); 6272 kmem_free(callback, sizeof (arc_write_callback_t)); 6273} 6274 6275zio_t * 6276arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, 6277 boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, 6278 arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, 6279 arc_write_done_func_t *done, void *private, zio_priority_t priority, 6280 int zio_flags, const zbookmark_phys_t *zb) 6281{ 6282 arc_buf_hdr_t *hdr = buf->b_hdr; 6283 arc_write_callback_t *callback; 6284 zio_t *zio; 6285 zio_prop_t localprop = *zp; 6286 6287 ASSERT3P(ready, !=, NULL); 6288 ASSERT3P(done, !=, NULL); 6289 ASSERT(!HDR_IO_ERROR(hdr)); 6290 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 6291 ASSERT3P(hdr->b_l1hdr.b_acb, 
==, NULL); 6292 ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); 6293 if (l2arc) 6294 arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); 6295 if (ARC_BUF_COMPRESSED(buf)) { 6296 /* 6297 * We're writing a pre-compressed buffer. Make the 6298 * compression algorithm requested by the zio_prop_t match 6299 * the pre-compressed buffer's compression algorithm. 6300 */ 6301 localprop.zp_compress = HDR_GET_COMPRESS(hdr); 6302 6303 ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); 6304 zio_flags |= ZIO_FLAG_RAW; 6305 } 6306 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 6307 callback->awcb_ready = ready; 6308 callback->awcb_children_ready = children_ready; 6309 callback->awcb_physdone = physdone; 6310 callback->awcb_done = done; 6311 callback->awcb_private = private; 6312 callback->awcb_buf = buf; 6313 6314 /* 6315 * The hdr's b_pabd is now stale, free it now. A new data block 6316 * will be allocated when the zio pipeline calls arc_write_ready(). 6317 */ 6318 if (hdr->b_l1hdr.b_pabd != NULL) { 6319 /* 6320 * If the buf is currently sharing the data block with 6321 * the hdr then we need to break that relationship here. 6322 * The hdr will remain with a NULL data pointer and the 6323 * buf will take sole ownership of the block. 6324 */ 6325 if (arc_buf_is_shared(buf)) { 6326 arc_unshare_buf(hdr, buf); 6327 } else { 6328 arc_hdr_free_pabd(hdr); 6329 } 6330 VERIFY3P(buf->b_data, !=, NULL); 6331 arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); 6332 } 6333 ASSERT(!arc_buf_is_shared(buf)); 6334 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 6335 6336 zio = zio_write(pio, spa, txg, bp, 6337 abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), 6338 HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, 6339 (children_ready != NULL) ? arc_write_children_ready : NULL, 6340 arc_write_physdone, arc_write_done, callback, 6341 priority, zio_flags, zb); 6342 6343 return (zio); 6344} 6345 6346static int 6347arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) 6348{ 6349#ifdef _KERNEL 6350 uint64_t available_memory = ptob(freemem); 6351 6352#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 6353 available_memory = 6354 MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); 6355#endif 6356 6357 if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) 6358 return (0); 6359 6360 if (txg > spa->spa_lowmem_last_txg) { 6361 spa->spa_lowmem_last_txg = txg; 6362 spa->spa_lowmem_page_load = 0; 6363 } 6364 /* 6365 * If we are in pageout, we know that memory is already tight, 6366 * the arc is already going to be evicting, so we just want to 6367 * continue to let page writes occur as quickly as possible. 
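 *
 * Illustrative numbers only, not tunables from this file: with
 * roughly 1 GB of available memory (and a smaller ptob(minfree)),
 * the cap below works out to about 256 MB of accumulated page load;
 * each reservation adds reserve/8 to spa_lowmem_page_load (the
 * reserve is worst-case inflated, hence the deflation) and once the
 * cap is exceeded ERESTART asks the caller to back off and retry.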
6368 */ 6369 if (curproc == pageproc) { 6370 if (spa->spa_lowmem_page_load > 6371 MAX(ptob(minfree), available_memory) / 4) 6372 return (SET_ERROR(ERESTART)); 6373 /* Note: reserve is inflated, so we deflate */ 6374 atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8); 6375 return (0); 6376 } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) { 6377 /* memory is low, delay before restarting */ 6378 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 6379 return (SET_ERROR(EAGAIN)); 6380 } 6381 spa->spa_lowmem_page_load = 0; 6382#endif /* _KERNEL */ 6383 return (0); 6384} 6385 6386void 6387arc_tempreserve_clear(uint64_t reserve) 6388{ 6389 atomic_add_64(&arc_tempreserve, -reserve); 6390 ASSERT((int64_t)arc_tempreserve >= 0); 6391} 6392 6393int 6394arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) 6395{ 6396 int error; 6397 uint64_t anon_size; 6398 6399 if (reserve > arc_c/4 && !arc_no_grow) { 6400 arc_c = MIN(arc_c_max, reserve * 4); 6401 DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); 6402 } 6403 if (reserve > arc_c) 6404 return (SET_ERROR(ENOMEM)); 6405 6406 /* 6407 * Don't count loaned bufs as in flight dirty data to prevent long 6408 * network delays from blocking transactions that are ready to be 6409 * assigned to a txg. 6410 */ 6411 6412 /* assert that it has not wrapped around */ 6413 ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); 6414 6415 anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) - 6416 arc_loaned_bytes), 0); 6417 6418 /* 6419 * Writes will, almost always, require additional memory allocations 6420 * in order to compress/encrypt/etc the data. We therefore need to 6421 * make sure that there is sufficient available memory for this. 6422 */ 6423 error = arc_memory_throttle(spa, reserve, txg); 6424 if (error != 0) 6425 return (error); 6426 6427 /* 6428 * Throttle writes when the amount of dirty data in the cache 6429 * gets too large. We try to keep the cache less than half full 6430 * of dirty blocks so that our sync times don't grow too large. 6431 * 6432 * In the case of one pool being built on another pool, we want 6433 * to make sure we don't end up throttling the lower (backing) 6434 * pool when the upper pool is the majority contributor to dirty 6435 * data. To insure we make forward progress during throttling, we 6436 * also check the current pool's net dirty data and only throttle 6437 * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty 6438 * data in the cache. 6439 * 6440 * Note: if two requests come in concurrently, we might let them 6441 * both succeed, when one of them should fail. Not a huge deal. 
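 *
 * A worked example with made-up numbers (the percentages stand in
 * for the tunables used below): with arc_c = 4 GB, a dirty limit of
 * 50% and an anon limit of 25%, a write is throttled only if total
 * dirty data exceeds 2 GB, anonymous data alone exceeds 1 GB, and
 * this pool contributes more than its configured share of that
 * anonymous dirty data.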
6442 */ 6443 uint64_t total_dirty = reserve + arc_tempreserve + anon_size; 6444 uint64_t spa_dirty_anon = spa_dirty_data(spa); 6445 6446 if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 && 6447 anon_size > arc_c * zfs_arc_anon_limit_percent / 100 && 6448 spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { 6449 uint64_t meta_esize = 6450 refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); 6451 uint64_t data_esize = 6452 refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); 6453 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 6454 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 6455 arc_tempreserve >> 10, meta_esize >> 10, 6456 data_esize >> 10, reserve >> 10, arc_c >> 10); 6457 return (SET_ERROR(ERESTART)); 6458 } 6459 atomic_add_64(&arc_tempreserve, reserve); 6460 return (0); 6461} 6462 6463static void 6464arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, 6465 kstat_named_t *evict_data, kstat_named_t *evict_metadata) 6466{ 6467 size->value.ui64 = refcount_count(&state->arcs_size); 6468 evict_data->value.ui64 = 6469 refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); 6470 evict_metadata->value.ui64 = 6471 refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); 6472} 6473 6474static int 6475arc_kstat_update(kstat_t *ksp, int rw) 6476{ 6477 arc_stats_t *as = ksp->ks_data; 6478 6479 if (rw == KSTAT_WRITE) { 6480 return (EACCES); 6481 } else { 6482 arc_kstat_update_state(arc_anon, 6483 &as->arcstat_anon_size, 6484 &as->arcstat_anon_evictable_data, 6485 &as->arcstat_anon_evictable_metadata); 6486 arc_kstat_update_state(arc_mru, 6487 &as->arcstat_mru_size, 6488 &as->arcstat_mru_evictable_data, 6489 &as->arcstat_mru_evictable_metadata); 6490 arc_kstat_update_state(arc_mru_ghost, 6491 &as->arcstat_mru_ghost_size, 6492 &as->arcstat_mru_ghost_evictable_data, 6493 &as->arcstat_mru_ghost_evictable_metadata); 6494 arc_kstat_update_state(arc_mfu, 6495 &as->arcstat_mfu_size, 6496 &as->arcstat_mfu_evictable_data, 6497 &as->arcstat_mfu_evictable_metadata); 6498 arc_kstat_update_state(arc_mfu_ghost, 6499 &as->arcstat_mfu_ghost_size, 6500 &as->arcstat_mfu_ghost_evictable_data, 6501 &as->arcstat_mfu_ghost_evictable_metadata); 6502 6503 ARCSTAT(arcstat_size) = aggsum_value(&arc_size); 6504 ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used); 6505 ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size); 6506 ARCSTAT(arcstat_metadata_size) = 6507 aggsum_value(&astat_metadata_size); 6508 ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size); 6509 ARCSTAT(arcstat_other_size) = aggsum_value(&astat_other_size); 6510 ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size); 6511 } 6512 6513 return (0); 6514} 6515 6516/* 6517 * This function *must* return indices evenly distributed between all 6518 * sublists of the multilist. This is needed due to how the ARC eviction 6519 * code is laid out; arc_evict_state() assumes ARC buffers are evenly 6520 * distributed between all sublists and uses this assumption when 6521 * deciding which sublist to evict from and how much to evict from it. 6522 */ 6523unsigned int 6524arc_state_multilist_index_func(multilist_t *ml, void *obj) 6525{ 6526 arc_buf_hdr_t *hdr = obj; 6527 6528 /* 6529 * We rely on b_dva to generate evenly distributed index 6530 * numbers using buf_hash below. So, as an added precaution, 6531 * let's make sure we never add empty buffers to the arc lists. 
6532 */ 6533 ASSERT(!HDR_EMPTY(hdr)); 6534 6535 /* 6536 * The assumption here, is the hash value for a given 6537 * arc_buf_hdr_t will remain constant throughout it's lifetime 6538 * (i.e. it's b_spa, b_dva, and b_birth fields don't change). 6539 * Thus, we don't need to store the header's sublist index 6540 * on insertion, as this index can be recalculated on removal. 6541 * 6542 * Also, the low order bits of the hash value are thought to be 6543 * distributed evenly. Otherwise, in the case that the multilist 6544 * has a power of two number of sublists, each sublists' usage 6545 * would not be evenly distributed. 6546 */ 6547 return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % 6548 multilist_get_num_sublists(ml)); 6549} 6550 6551#ifdef _KERNEL 6552static eventhandler_tag arc_event_lowmem = NULL; 6553 6554static void 6555arc_lowmem(void *arg __unused, int howto __unused) 6556{ 6557 6558 mutex_enter(&arc_reclaim_lock); 6559 DTRACE_PROBE1(arc__needfree, int64_t, ((int64_t)freemem - zfs_arc_free_target) * PAGESIZE); 6560 cv_signal(&arc_reclaim_thread_cv); 6561 6562 /* 6563 * It is unsafe to block here in arbitrary threads, because we can come 6564 * here from ARC itself and may hold ARC locks and thus risk a deadlock 6565 * with ARC reclaim thread. 6566 */ 6567 if (curproc == pageproc) 6568 (void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); 6569 mutex_exit(&arc_reclaim_lock); 6570} 6571#endif 6572 6573static void 6574arc_state_init(void) 6575{ 6576 arc_anon = &ARC_anon; 6577 arc_mru = &ARC_mru; 6578 arc_mru_ghost = &ARC_mru_ghost; 6579 arc_mfu = &ARC_mfu; 6580 arc_mfu_ghost = &ARC_mfu_ghost; 6581 arc_l2c_only = &ARC_l2c_only; 6582 6583 arc_mru->arcs_list[ARC_BUFC_METADATA] = 6584 multilist_create(sizeof (arc_buf_hdr_t), 6585 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6586 arc_state_multilist_index_func); 6587 arc_mru->arcs_list[ARC_BUFC_DATA] = 6588 multilist_create(sizeof (arc_buf_hdr_t), 6589 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6590 arc_state_multilist_index_func); 6591 arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] = 6592 multilist_create(sizeof (arc_buf_hdr_t), 6593 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6594 arc_state_multilist_index_func); 6595 arc_mru_ghost->arcs_list[ARC_BUFC_DATA] = 6596 multilist_create(sizeof (arc_buf_hdr_t), 6597 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6598 arc_state_multilist_index_func); 6599 arc_mfu->arcs_list[ARC_BUFC_METADATA] = 6600 multilist_create(sizeof (arc_buf_hdr_t), 6601 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6602 arc_state_multilist_index_func); 6603 arc_mfu->arcs_list[ARC_BUFC_DATA] = 6604 multilist_create(sizeof (arc_buf_hdr_t), 6605 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6606 arc_state_multilist_index_func); 6607 arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] = 6608 multilist_create(sizeof (arc_buf_hdr_t), 6609 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6610 arc_state_multilist_index_func); 6611 arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] = 6612 multilist_create(sizeof (arc_buf_hdr_t), 6613 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6614 arc_state_multilist_index_func); 6615 arc_l2c_only->arcs_list[ARC_BUFC_METADATA] = 6616 multilist_create(sizeof (arc_buf_hdr_t), 6617 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6618 arc_state_multilist_index_func); 6619 arc_l2c_only->arcs_list[ARC_BUFC_DATA] = 6620 multilist_create(sizeof (arc_buf_hdr_t), 6621 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6622 arc_state_multilist_index_func); 6623 6624 refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); 6625 
refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); 6626 refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); 6627 refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); 6628 refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); 6629 refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); 6630 refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); 6631 refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); 6632 refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); 6633 refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); 6634 refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); 6635 refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); 6636 6637 refcount_create(&arc_anon->arcs_size); 6638 refcount_create(&arc_mru->arcs_size); 6639 refcount_create(&arc_mru_ghost->arcs_size); 6640 refcount_create(&arc_mfu->arcs_size); 6641 refcount_create(&arc_mfu_ghost->arcs_size); 6642 refcount_create(&arc_l2c_only->arcs_size); 6643 6644 aggsum_init(&arc_meta_used, 0); 6645 aggsum_init(&arc_size, 0); 6646 aggsum_init(&astat_data_size, 0); 6647 aggsum_init(&astat_metadata_size, 0); 6648 aggsum_init(&astat_hdr_size, 0); 6649 aggsum_init(&astat_other_size, 0); 6650 aggsum_init(&astat_l2_hdr_size, 0); 6651} 6652 6653static void 6654arc_state_fini(void) 6655{ 6656 refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); 6657 refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); 6658 refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); 6659 refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); 6660 refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); 6661 refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); 6662 refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); 6663 refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); 6664 refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); 6665 refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); 6666 refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); 6667 refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); 6668 6669 refcount_destroy(&arc_anon->arcs_size); 6670 refcount_destroy(&arc_mru->arcs_size); 6671 refcount_destroy(&arc_mru_ghost->arcs_size); 6672 refcount_destroy(&arc_mfu->arcs_size); 6673 refcount_destroy(&arc_mfu_ghost->arcs_size); 6674 refcount_destroy(&arc_l2c_only->arcs_size); 6675 6676 multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]); 6677 multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 6678 multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]); 6679 multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 6680 multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]); 6681 multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 6682 multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]); 6683 multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 6684} 6685 6686uint64_t 6687arc_max_bytes(void) 6688{ 6689 return (arc_c_max); 6690} 6691 6692void 6693arc_init(void) 6694{ 6695 int i, prefetch_tunable_set = 0; 6696 6697 /* 6698 * allmem is "all memory that we could possibly use". 
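 *
 * To make the sizing below concrete, a purely illustrative example
 * (hypothetical numbers, not defaults): if allmem works out to
 * 16 GB, arc_c_min starts at 16 GB / 32 = 512 MB and arc_c_max at
 * MAX(16 GB - 1 GB, 5/8 * 16 GB) = 15 GB, before the zfs_arc_max /
 * zfs_arc_min tunables are applied.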
6699 */ 6700#ifdef illumos 6701#ifdef _KERNEL 6702 uint64_t allmem = ptob(physmem - swapfs_minfree); 6703#else 6704 uint64_t allmem = (physmem * PAGESIZE) / 2; 6705#endif 6706#else 6707 uint64_t allmem = kmem_size(); 6708#endif 6709 6710 6711 mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); 6712 cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); 6713 cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); 6714 6715 mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL); 6716 cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL); 6717 6718 /* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */ 6719 arc_c_min = MAX(allmem / 32, arc_abs_min); 6720 /* set max to 5/8 of all memory, or all but 1GB, whichever is more */ 6721 if (allmem >= 1 << 30) 6722 arc_c_max = allmem - (1 << 30); 6723 else 6724 arc_c_max = arc_c_min; 6725 arc_c_max = MAX(allmem * 5 / 8, arc_c_max); 6726 6727 /* 6728 * In userland, there's only the memory pressure that we artificially 6729 * create (see arc_available_memory()). Don't let arc_c get too 6730 * small, because it can cause transactions to be larger than 6731 * arc_c, causing arc_tempreserve_space() to fail. 6732 */ 6733#ifndef _KERNEL 6734 arc_c_min = arc_c_max / 2; 6735#endif 6736 6737#ifdef _KERNEL 6738 /* 6739 * Allow the tunables to override our calculations if they are 6740 * reasonable. 6741 */ 6742 if (zfs_arc_max > arc_abs_min && zfs_arc_max < allmem) { 6743 arc_c_max = zfs_arc_max; 6744 arc_c_min = MIN(arc_c_min, arc_c_max); 6745 } 6746 if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max) 6747 arc_c_min = zfs_arc_min; 6748#endif 6749 6750 arc_c = arc_c_max; 6751 arc_p = (arc_c >> 1); 6752 6753 /* limit meta-data to 1/4 of the arc capacity */ 6754 arc_meta_limit = arc_c_max / 4; 6755 6756#ifdef _KERNEL 6757 /* 6758 * Metadata is stored in the kernel's heap. Don't let us 6759 * use more than half the heap for the ARC. 6760 */ 6761 arc_meta_limit = MIN(arc_meta_limit, 6762 vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2); 6763#endif 6764 6765 /* Allow the tunable to override if it is reasonable */ 6766 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 6767 arc_meta_limit = zfs_arc_meta_limit; 6768 6769 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 6770 arc_c_min = arc_meta_limit / 2; 6771 6772 if (zfs_arc_meta_min > 0) { 6773 arc_meta_min = zfs_arc_meta_min; 6774 } else { 6775 arc_meta_min = arc_c_min / 2; 6776 } 6777 6778 if (zfs_arc_grow_retry > 0) 6779 arc_grow_retry = zfs_arc_grow_retry; 6780 6781 if (zfs_arc_shrink_shift > 0) 6782 arc_shrink_shift = zfs_arc_shrink_shift; 6783 6784 if (zfs_arc_no_grow_shift > 0) 6785 arc_no_grow_shift = zfs_arc_no_grow_shift; 6786 /* 6787 * Ensure that arc_no_grow_shift is less than arc_shrink_shift. 
6788 */ 6789 if (arc_no_grow_shift >= arc_shrink_shift) 6790 arc_no_grow_shift = arc_shrink_shift - 1; 6791 6792 if (zfs_arc_p_min_shift > 0) 6793 arc_p_min_shift = zfs_arc_p_min_shift; 6794 6795 /* if kmem_flags are set, lets try to use less memory */ 6796 if (kmem_debugging()) 6797 arc_c = arc_c / 2; 6798 if (arc_c < arc_c_min) 6799 arc_c = arc_c_min; 6800 6801 zfs_arc_min = arc_c_min; 6802 zfs_arc_max = arc_c_max; 6803 6804 arc_state_init(); 6805 buf_init(); 6806 6807 arc_reclaim_thread_exit = B_FALSE; 6808 arc_dnlc_evicts_thread_exit = FALSE; 6809 6810 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 6811 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 6812 6813 if (arc_ksp != NULL) { 6814 arc_ksp->ks_data = &arc_stats; 6815 arc_ksp->ks_update = arc_kstat_update; 6816 kstat_install(arc_ksp); 6817 } 6818 6819 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 6820 TS_RUN, minclsyspri); 6821 6822#ifdef _KERNEL 6823 arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, 6824 EVENTHANDLER_PRI_FIRST); 6825#endif 6826 6827 (void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0, 6828 TS_RUN, minclsyspri); 6829 6830 arc_dead = B_FALSE; 6831 arc_warm = B_FALSE; 6832 6833 /* 6834 * Calculate maximum amount of dirty data per pool. 6835 * 6836 * If it has been set by /etc/system, take that. 6837 * Otherwise, use a percentage of physical memory defined by 6838 * zfs_dirty_data_max_percent (default 10%) with a cap at 6839 * zfs_dirty_data_max_max (default 4GB). 6840 */ 6841 if (zfs_dirty_data_max == 0) { 6842 zfs_dirty_data_max = ptob(physmem) * 6843 zfs_dirty_data_max_percent / 100; 6844 zfs_dirty_data_max = MIN(zfs_dirty_data_max, 6845 zfs_dirty_data_max_max); 6846 } 6847 6848#ifdef _KERNEL 6849 if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 6850 prefetch_tunable_set = 1; 6851 6852#ifdef __i386__ 6853 if (prefetch_tunable_set == 0) { 6854 printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 6855 "-- to enable,\n"); 6856 printf(" add \"vfs.zfs.prefetch_disable=0\" " 6857 "to /boot/loader.conf.\n"); 6858 zfs_prefetch_disable = 1; 6859 } 6860#else 6861 if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 6862 prefetch_tunable_set == 0) { 6863 printf("ZFS NOTICE: Prefetch is disabled by default if less " 6864 "than 4GB of RAM is present;\n" 6865 " to enable, add \"vfs.zfs.prefetch_disable=0\" " 6866 "to /boot/loader.conf.\n"); 6867 zfs_prefetch_disable = 1; 6868 } 6869#endif 6870 /* Warn about ZFS memory and address space requirements. */ 6871 if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { 6872 printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " 6873 "expect unstable behavior.\n"); 6874 } 6875 if (allmem < 512 * (1 << 20)) { 6876 printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " 6877 "expect unstable behavior.\n"); 6878 printf(" Consider tuning vm.kmem_size and " 6879 "vm.kmem_size_max\n"); 6880 printf(" in /boot/loader.conf.\n"); 6881 } 6882#endif 6883} 6884 6885void 6886arc_fini(void) 6887{ 6888#ifdef _KERNEL 6889 if (arc_event_lowmem != NULL) 6890 EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); 6891#endif 6892 6893 mutex_enter(&arc_reclaim_lock); 6894 arc_reclaim_thread_exit = B_TRUE; 6895 /* 6896 * The reclaim thread will set arc_reclaim_thread_exit back to 6897 * B_FALSE when it is finished exiting; we're waiting for that. 
6898 */ 6899 while (arc_reclaim_thread_exit) { 6900 cv_signal(&arc_reclaim_thread_cv); 6901 cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock); 6902 } 6903 mutex_exit(&arc_reclaim_lock); 6904 6905 /* Use B_TRUE to ensure *all* buffers are evicted */ 6906 arc_flush(NULL, B_TRUE); 6907 6908 mutex_enter(&arc_dnlc_evicts_lock); 6909 arc_dnlc_evicts_thread_exit = TRUE; 6910 /* 6911 * The user evicts thread will set arc_user_evicts_thread_exit 6912 * to FALSE when it is finished exiting; we're waiting for that. 6913 */ 6914 while (arc_dnlc_evicts_thread_exit) { 6915 cv_signal(&arc_dnlc_evicts_cv); 6916 cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); 6917 } 6918 mutex_exit(&arc_dnlc_evicts_lock); 6919 6920 arc_dead = B_TRUE; 6921 6922 if (arc_ksp != NULL) { 6923 kstat_delete(arc_ksp); 6924 arc_ksp = NULL; 6925 } 6926 6927 mutex_destroy(&arc_reclaim_lock); 6928 cv_destroy(&arc_reclaim_thread_cv); 6929 cv_destroy(&arc_reclaim_waiters_cv); 6930 6931 mutex_destroy(&arc_dnlc_evicts_lock); 6932 cv_destroy(&arc_dnlc_evicts_cv); 6933 6934 arc_state_fini(); 6935 buf_fini(); 6936 6937 ASSERT0(arc_loaned_bytes); 6938} 6939 6940/* 6941 * Level 2 ARC 6942 * 6943 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 6944 * It uses dedicated storage devices to hold cached data, which are populated 6945 * using large infrequent writes. The main role of this cache is to boost 6946 * the performance of random read workloads. The intended L2ARC devices 6947 * include short-stroked disks, solid state disks, and other media with 6948 * substantially faster read latency than disk. 6949 * 6950 * +-----------------------+ 6951 * | ARC | 6952 * +-----------------------+ 6953 * | ^ ^ 6954 * | | | 6955 * l2arc_feed_thread() arc_read() 6956 * | | | 6957 * | l2arc read | 6958 * V | | 6959 * +---------------+ | 6960 * | L2ARC | | 6961 * +---------------+ | 6962 * | ^ | 6963 * l2arc_write() | | 6964 * | | | 6965 * V | | 6966 * +-------+ +-------+ 6967 * | vdev | | vdev | 6968 * | cache | | cache | 6969 * +-------+ +-------+ 6970 * +=========+ .-----. 6971 * : L2ARC : |-_____-| 6972 * : devices : | Disks | 6973 * +=========+ `-_____-' 6974 * 6975 * Read requests are satisfied from the following sources, in order: 6976 * 6977 * 1) ARC 6978 * 2) vdev cache of L2ARC devices 6979 * 3) L2ARC devices 6980 * 4) vdev cache of disks 6981 * 5) disks 6982 * 6983 * Some L2ARC device types exhibit extremely slow write performance. 6984 * To accommodate for this there are some significant differences between 6985 * the L2ARC and traditional cache design: 6986 * 6987 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 6988 * the ARC behave as usual, freeing buffers and placing headers on ghost 6989 * lists. The ARC does not send buffers to the L2ARC during eviction as 6990 * this would add inflated write latencies for all ARC memory pressure. 6991 * 6992 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 6993 * It does this by periodically scanning buffers from the eviction-end of 6994 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 6995 * not already there. It scans until a headroom of buffers is satisfied, 6996 * which itself is a buffer for ARC eviction. 
If a compressible buffer is 6997 * found during scanning and selected for writing to an L2ARC device, we 6998 * temporarily boost scanning headroom during the next scan cycle to make 6999 * sure we adapt to compression effects (which might significantly reduce 7000 * the data volume we write to L2ARC). The thread that does this is 7001 * l2arc_feed_thread(), illustrated below; example sizes are included to 7002 * provide a better sense of ratio than this diagram: 7003 * 7004 * head --> tail 7005 * +---------------------+----------+ 7006 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 7007 * +---------------------+----------+ | o L2ARC eligible 7008 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 7009 * +---------------------+----------+ | 7010 * 15.9 Gbytes ^ 32 Mbytes | 7011 * headroom | 7012 * l2arc_feed_thread() 7013 * | 7014 * l2arc write hand <--[oooo]--' 7015 * | 8 Mbyte 7016 * | write max 7017 * V 7018 * +==============================+ 7019 * L2ARC dev |####|#|###|###| |####| ... | 7020 * +==============================+ 7021 * 32 Gbytes 7022 * 7023 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 7024 * evicted, then the L2ARC has cached a buffer much sooner than it probably 7025 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 7026 * safe to say that this is an uncommon case, since buffers at the end of 7027 * the ARC lists have moved there due to inactivity. 7028 * 7029 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 7030 * then the L2ARC simply misses copying some buffers. This serves as a 7031 * pressure valve to prevent heavy read workloads from both stalling the ARC 7032 * with waits and clogging the L2ARC with writes. This also helps prevent 7033 * the potential for the L2ARC to churn if it attempts to cache content too 7034 * quickly, such as during backups of the entire pool. 7035 * 7036 * 5. After system boot and before the ARC has filled main memory, there are 7037 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 7038 * lists can remain mostly static. Instead of searching from tail of these 7039 * lists as pictured, the l2arc_feed_thread() will search from the list heads 7040 * for eligible buffers, greatly increasing its chance of finding them. 7041 * 7042 * The L2ARC device write speed is also boosted during this time so that 7043 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 7044 * there are no L2ARC reads, and no fear of degrading read performance 7045 * through increased writes. 7046 * 7047 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 7048 * the vdev queue can aggregate them into larger and fewer writes. Each 7049 * device is written to in a rotor fashion, sweeping writes through 7050 * available space then repeating. 7051 * 7052 * 7. The L2ARC does not store dirty content. It never needs to flush 7053 * write buffers back to disk based storage. 7054 * 7055 * 8. If an ARC buffer is written (and dirtied) which also exists in the 7056 * L2ARC, the now stale L2ARC buffer is immediately dropped. 
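 *
 * As a rough worked example of the headroom from point 2 (a sketch that
 * assumes the usual defaults: an 8 Mbyte write max, l2arc_headroom of 2
 * and l2arc_headroom_boost of 200): a feed pass is willing to scan about
 * 8 * 2 = 16 Mbytes of buffers from the end of each ARC list it walks,
 * and when the compressed ARC is enabled that window is boosted to
 * 16 * 200 / 100 = 32 Mbytes (see l2arc_write_buffers()), so that
 * compression does not starve the device of write data.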
7057 * 7058 * The performance of the L2ARC can be tweaked by a number of tunables, which 7059 * may be necessary for different workloads: 7060 * 7061 * l2arc_write_max max write bytes per interval 7062 * l2arc_write_boost extra write bytes during device warmup 7063 * l2arc_noprefetch skip caching prefetched buffers 7064 * l2arc_headroom number of max device writes to precache 7065 * l2arc_headroom_boost when we find compressed buffers during ARC 7066 * scanning, we multiply headroom by this 7067 * percentage factor for the next scan cycle, 7068 * since more compressed buffers are likely to 7069 * be present 7070 * l2arc_feed_secs seconds between L2ARC writing 7071 * 7072 * Tunables may be removed or added as future performance improvements are 7073 * integrated, and also may become zpool properties. 7074 * 7075 * There are three key functions that control how the L2ARC warms up: 7076 * 7077 * l2arc_write_eligible() check if a buffer is eligible to cache 7078 * l2arc_write_size() calculate how much to write 7079 * l2arc_write_interval() calculate sleep delay between writes 7080 * 7081 * These three functions determine what to write, how much, and how quickly 7082 * to send writes. 7083 */ 7084 7085static boolean_t 7086l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) 7087{ 7088 /* 7089 * A buffer is *not* eligible for the L2ARC if it: 7090 * 1. belongs to a different spa. 7091 * 2. is already cached on the L2ARC. 7092 * 3. has an I/O in progress (it may be an incomplete read). 7093 * 4. is flagged not eligible (zfs property). 7094 */ 7095 if (hdr->b_spa != spa_guid) { 7096 ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); 7097 return (B_FALSE); 7098 } 7099 if (HDR_HAS_L2HDR(hdr)) { 7100 ARCSTAT_BUMP(arcstat_l2_write_in_l2); 7101 return (B_FALSE); 7102 } 7103 if (HDR_IO_IN_PROGRESS(hdr)) { 7104 ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); 7105 return (B_FALSE); 7106 } 7107 if (!HDR_L2CACHE(hdr)) { 7108 ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); 7109 return (B_FALSE); 7110 } 7111 7112 return (B_TRUE); 7113} 7114 7115static uint64_t 7116l2arc_write_size(void) 7117{ 7118 uint64_t size; 7119 7120 /* 7121 * Make sure our globals have meaningful values in case the user 7122 * altered them. 7123 */ 7124 size = l2arc_write_max; 7125 if (size == 0) { 7126 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 7127 "be greater than zero, resetting it to the default (%d)", 7128 L2ARC_WRITE_SIZE); 7129 size = l2arc_write_max = L2ARC_WRITE_SIZE; 7130 } 7131 7132 if (arc_warm == B_FALSE) 7133 size += l2arc_write_boost; 7134 7135 return (size); 7136 7137} 7138 7139static clock_t 7140l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 7141{ 7142 clock_t interval, next, now; 7143 7144 /* 7145 * If the ARC lists are busy, increase our write rate; if the 7146 * lists are stale, idle back. This is achieved by checking 7147 * how much we previously wrote - if it was more than half of 7148 * what we wanted, schedule the next write much sooner. 7149 */ 7150 if (l2arc_feed_again && wrote > (wanted / 2)) 7151 interval = (hz * l2arc_feed_min_ms) / 1000; 7152 else 7153 interval = hz * l2arc_feed_secs; 7154 7155 now = ddi_get_lbolt(); 7156 next = MAX(now, MIN(now + interval, began + interval)); 7157 7158 return (next); 7159} 7160 7161/* 7162 * Cycle through L2ARC devices. This is how L2ARC load balances. 7163 * If a device is returned, this also returns holding the spa config lock. 
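 *
 * A minimal sketch of the caller-side pattern (it simply mirrors the use
 * in l2arc_feed_thread() below): the spa config lock taken here must be
 * dropped by the caller once the write pass is done:
 *
 *	l2arc_dev_t *dev = l2arc_dev_get_next();
 *	if (dev != NULL) {
 *		uint64_t size = l2arc_write_size();
 *		l2arc_evict(dev, size, B_FALSE);
 *		(void) l2arc_write_buffers(dev->l2ad_spa, dev, size);
 *		spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev);
 *	}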
7164 */ 7165static l2arc_dev_t * 7166l2arc_dev_get_next(void) 7167{ 7168 l2arc_dev_t *first, *next = NULL; 7169 7170 /* 7171 * Lock out the removal of spas (spa_namespace_lock), then removal 7172 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 7173 * both locks will be dropped and a spa config lock held instead. 7174 */ 7175 mutex_enter(&spa_namespace_lock); 7176 mutex_enter(&l2arc_dev_mtx); 7177 7178 /* if there are no vdevs, there is nothing to do */ 7179 if (l2arc_ndev == 0) 7180 goto out; 7181 7182 first = NULL; 7183 next = l2arc_dev_last; 7184 do { 7185 /* loop around the list looking for a non-faulted vdev */ 7186 if (next == NULL) { 7187 next = list_head(l2arc_dev_list); 7188 } else { 7189 next = list_next(l2arc_dev_list, next); 7190 if (next == NULL) 7191 next = list_head(l2arc_dev_list); 7192 } 7193 7194 /* if we have come back to the start, bail out */ 7195 if (first == NULL) 7196 first = next; 7197 else if (next == first) 7198 break; 7199 7200 } while (vdev_is_dead(next->l2ad_vdev)); 7201 7202 /* if we were unable to find any usable vdevs, return NULL */ 7203 if (vdev_is_dead(next->l2ad_vdev)) 7204 next = NULL; 7205 7206 l2arc_dev_last = next; 7207 7208out: 7209 mutex_exit(&l2arc_dev_mtx); 7210 7211 /* 7212 * Grab the config lock to prevent the 'next' device from being 7213 * removed while we are writing to it. 7214 */ 7215 if (next != NULL) 7216 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 7217 mutex_exit(&spa_namespace_lock); 7218 7219 return (next); 7220} 7221 7222/* 7223 * Free buffers that were tagged for destruction. 7224 */ 7225static void 7226l2arc_do_free_on_write() 7227{ 7228 list_t *buflist; 7229 l2arc_data_free_t *df, *df_prev; 7230 7231 mutex_enter(&l2arc_free_on_write_mtx); 7232 buflist = l2arc_free_on_write; 7233 7234 for (df = list_tail(buflist); df; df = df_prev) { 7235 df_prev = list_prev(buflist, df); 7236 ASSERT3P(df->l2df_abd, !=, NULL); 7237 abd_free(df->l2df_abd); 7238 list_remove(buflist, df); 7239 kmem_free(df, sizeof (l2arc_data_free_t)); 7240 } 7241 7242 mutex_exit(&l2arc_free_on_write_mtx); 7243} 7244 7245/* 7246 * A write to a cache device has completed. Update all headers to allow 7247 * reads from these buffers to begin. 7248 */ 7249static void 7250l2arc_write_done(zio_t *zio) 7251{ 7252 l2arc_write_callback_t *cb; 7253 l2arc_dev_t *dev; 7254 list_t *buflist; 7255 arc_buf_hdr_t *head, *hdr, *hdr_prev; 7256 kmutex_t *hash_lock; 7257 int64_t bytes_dropped = 0; 7258 7259 cb = zio->io_private; 7260 ASSERT3P(cb, !=, NULL); 7261 dev = cb->l2wcb_dev; 7262 ASSERT3P(dev, !=, NULL); 7263 head = cb->l2wcb_head; 7264 ASSERT3P(head, !=, NULL); 7265 buflist = &dev->l2ad_buflist; 7266 ASSERT3P(buflist, !=, NULL); 7267 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 7268 l2arc_write_callback_t *, cb); 7269 7270 if (zio->io_error != 0) 7271 ARCSTAT_BUMP(arcstat_l2_writes_error); 7272 7273 /* 7274 * All writes completed, or an error was hit. 7275 */ 7276top: 7277 mutex_enter(&dev->l2ad_mtx); 7278 for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { 7279 hdr_prev = list_prev(buflist, hdr); 7280 7281 hash_lock = HDR_LOCK(hdr); 7282 7283 /* 7284 * We cannot use mutex_enter or else we can deadlock 7285 * with l2arc_write_buffers (due to swapping the order 7286 * the hash lock and l2ad_mtx are taken). 7287 */ 7288 if (!mutex_tryenter(hash_lock)) { 7289 /* 7290 * Missed the hash lock. We must retry so we 7291 * don't leave the ARC_FLAG_L2_WRITING bit set. 
7292 */ 7293 ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); 7294 7295 /* 7296 * We don't want to rescan the headers we've 7297 * already marked as having been written out, so 7298 * we reinsert the head node so we can pick up 7299 * where we left off. 7300 */ 7301 list_remove(buflist, head); 7302 list_insert_after(buflist, hdr, head); 7303 7304 mutex_exit(&dev->l2ad_mtx); 7305 7306 /* 7307 * We wait for the hash lock to become available 7308 * to try and prevent busy waiting, and increase 7309 * the chance we'll be able to acquire the lock 7310 * the next time around. 7311 */ 7312 mutex_enter(hash_lock); 7313 mutex_exit(hash_lock); 7314 goto top; 7315 } 7316 7317 /* 7318 * We could not have been moved into the arc_l2c_only 7319 * state while in-flight due to our ARC_FLAG_L2_WRITING 7320 * bit being set. Let's just ensure that's being enforced. 7321 */ 7322 ASSERT(HDR_HAS_L1HDR(hdr)); 7323 7324 if (zio->io_error != 0) { 7325 /* 7326 * Error - drop L2ARC entry. 7327 */ 7328 list_remove(buflist, hdr); 7329 l2arc_trim(hdr); 7330 arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); 7331 7332 ARCSTAT_INCR(arcstat_l2_psize, -arc_hdr_size(hdr)); 7333 ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); 7334 7335 bytes_dropped += arc_hdr_size(hdr); 7336 (void) refcount_remove_many(&dev->l2ad_alloc, 7337 arc_hdr_size(hdr), hdr); 7338 } 7339 7340 /* 7341 * Allow ARC to begin reads and ghost list evictions to 7342 * this L2ARC entry. 7343 */ 7344 arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); 7345 7346 mutex_exit(hash_lock); 7347 } 7348 7349 atomic_inc_64(&l2arc_writes_done); 7350 list_remove(buflist, head); 7351 ASSERT(!HDR_HAS_L1HDR(head)); 7352 kmem_cache_free(hdr_l2only_cache, head); 7353 mutex_exit(&dev->l2ad_mtx); 7354 7355 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); 7356 7357 l2arc_do_free_on_write(); 7358 7359 kmem_free(cb, sizeof (l2arc_write_callback_t)); 7360} 7361 7362/* 7363 * A read to a cache device completed. Validate buffer contents before 7364 * handing over to the regular ARC routines. 7365 */ 7366static void 7367l2arc_read_done(zio_t *zio) 7368{ 7369 l2arc_read_callback_t *cb; 7370 arc_buf_hdr_t *hdr; 7371 kmutex_t *hash_lock; 7372 boolean_t valid_cksum; 7373 7374 ASSERT3P(zio->io_vd, !=, NULL); 7375 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 7376 7377 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 7378 7379 cb = zio->io_private; 7380 ASSERT3P(cb, !=, NULL); 7381 hdr = cb->l2rcb_hdr; 7382 ASSERT3P(hdr, !=, NULL); 7383 7384 hash_lock = HDR_LOCK(hdr); 7385 mutex_enter(hash_lock); 7386 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 7387 7388 /* 7389 * If the data was read into a temporary buffer, 7390 * move it and free the buffer. 7391 */ 7392 if (cb->l2rcb_abd != NULL) { 7393 ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); 7394 if (zio->io_error == 0) { 7395 abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd, 7396 arc_hdr_size(hdr)); 7397 } 7398 7399 /* 7400 * The following must be done regardless of whether 7401 * there was an error: 7402 * - free the temporary buffer 7403 * - point zio to the real ARC buffer 7404 * - set zio size accordingly 7405 * These are required because zio is either re-used for 7406 * an I/O of the block in the case of the error 7407 * or the zio is passed to arc_read_done() and it 7408 * needs real data. 
7409 */ 7410 abd_free(cb->l2rcb_abd); 7411 zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); 7412 zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; 7413 } 7414 7415 ASSERT3P(zio->io_abd, !=, NULL); 7416 7417 /* 7418 * Check this survived the L2ARC journey. 7419 */ 7420 ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd); 7421 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 7422 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 7423 7424 valid_cksum = arc_cksum_is_equal(hdr, zio); 7425 if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 7426 mutex_exit(hash_lock); 7427 zio->io_private = hdr; 7428 arc_read_done(zio); 7429 } else { 7430 mutex_exit(hash_lock); 7431 /* 7432 * Buffer didn't survive caching. Increment stats and 7433 * reissue to the original storage device. 7434 */ 7435 if (zio->io_error != 0) { 7436 ARCSTAT_BUMP(arcstat_l2_io_error); 7437 } else { 7438 zio->io_error = SET_ERROR(EIO); 7439 } 7440 if (!valid_cksum) 7441 ARCSTAT_BUMP(arcstat_l2_cksum_bad); 7442 7443 /* 7444 * If there's no waiter, issue an async i/o to the primary 7445 * storage now. If there *is* a waiter, the caller must 7446 * issue the i/o in a context where it's OK to block. 7447 */ 7448 if (zio->io_waiter == NULL) { 7449 zio_t *pio = zio_unique_parent(zio); 7450 7451 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 7452 7453 zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, 7454 hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done, 7455 hdr, zio->io_priority, cb->l2rcb_flags, 7456 &cb->l2rcb_zb)); 7457 } 7458 } 7459 7460 kmem_free(cb, sizeof (l2arc_read_callback_t)); 7461} 7462 7463/* 7464 * This is the list priority from which the L2ARC will search for pages to 7465 * cache. This is used within loops (0..3) to cycle through lists in the 7466 * desired order. This order can have a significant effect on cache 7467 * performance. 7468 * 7469 * Currently the metadata lists are hit first, MFU then MRU, followed by 7470 * the data lists. This function returns a locked list, and also returns 7471 * the lock pointer. 7472 */ 7473static multilist_sublist_t * 7474l2arc_sublist_lock(int list_num) 7475{ 7476 multilist_t *ml = NULL; 7477 unsigned int idx; 7478 7479 ASSERT(list_num >= 0 && list_num <= 3); 7480 7481 switch (list_num) { 7482 case 0: 7483 ml = arc_mfu->arcs_list[ARC_BUFC_METADATA]; 7484 break; 7485 case 1: 7486 ml = arc_mru->arcs_list[ARC_BUFC_METADATA]; 7487 break; 7488 case 2: 7489 ml = arc_mfu->arcs_list[ARC_BUFC_DATA]; 7490 break; 7491 case 3: 7492 ml = arc_mru->arcs_list[ARC_BUFC_DATA]; 7493 break; 7494 } 7495 7496 /* 7497 * Return a randomly-selected sublist. This is acceptable 7498 * because the caller feeds only a little bit of data for each 7499 * call (8MB). Subsequent calls will result in different 7500 * sublists being selected. 7501 */ 7502 idx = multilist_get_random_index(ml); 7503 return (multilist_sublist_lock(ml, idx)); 7504} 7505 7506/* 7507 * Evict buffers from the device write hand to the distance specified in 7508 * bytes. This distance may span populated buffers, it may span nothing. 7509 * This is clearing a region on the L2ARC device ready for writing. 7510 * If the 'all' boolean is set, every buffer is evicted. 
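 *
 * For example (a sketch, assuming an 8 Mbyte write target): a call with a
 * distance of 8 Mbytes clears [l2ad_hand, l2ad_hand + 8 Mbytes) ahead of
 * the write hand, but once the hand comes within 2 * distance (16 Mbytes)
 * of l2ad_end the eviction runs all the way to l2ad_end, since the hand
 * is about to wrap back to l2ad_start.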
7511 */ 7512static void 7513l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 7514{ 7515 list_t *buflist; 7516 arc_buf_hdr_t *hdr, *hdr_prev; 7517 kmutex_t *hash_lock; 7518 uint64_t taddr; 7519 7520 buflist = &dev->l2ad_buflist; 7521 7522 if (!all && dev->l2ad_first) { 7523 /* 7524 * This is the first sweep through the device. There is 7525 * nothing to evict. 7526 */ 7527 return; 7528 } 7529 7530 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 7531 /* 7532 * When nearing the end of the device, evict to the end 7533 * before the device write hand jumps to the start. 7534 */ 7535 taddr = dev->l2ad_end; 7536 } else { 7537 taddr = dev->l2ad_hand + distance; 7538 } 7539 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 7540 uint64_t, taddr, boolean_t, all); 7541 7542top: 7543 mutex_enter(&dev->l2ad_mtx); 7544 for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { 7545 hdr_prev = list_prev(buflist, hdr); 7546 7547 hash_lock = HDR_LOCK(hdr); 7548 7549 /* 7550 * We cannot use mutex_enter or else we can deadlock 7551 * with l2arc_write_buffers (due to swapping the order 7552 * the hash lock and l2ad_mtx are taken). 7553 */ 7554 if (!mutex_tryenter(hash_lock)) { 7555 /* 7556 * Missed the hash lock. Retry. 7557 */ 7558 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 7559 mutex_exit(&dev->l2ad_mtx); 7560 mutex_enter(hash_lock); 7561 mutex_exit(hash_lock); 7562 goto top; 7563 } 7564 7565 /* 7566 * A header can't be on this list if it doesn't have L2 header. 7567 */ 7568 ASSERT(HDR_HAS_L2HDR(hdr)); 7569 7570 /* Ensure this header has finished being written. */ 7571 ASSERT(!HDR_L2_WRITING(hdr)); 7572 ASSERT(!HDR_L2_WRITE_HEAD(hdr)); 7573 7574 if (!all && (hdr->b_l2hdr.b_daddr >= taddr || 7575 hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { 7576 /* 7577 * We've evicted to the target address, 7578 * or the end of the device. 7579 */ 7580 mutex_exit(hash_lock); 7581 break; 7582 } 7583 7584 if (!HDR_HAS_L1HDR(hdr)) { 7585 ASSERT(!HDR_L2_READING(hdr)); 7586 /* 7587 * This doesn't exist in the ARC. Destroy. 7588 * arc_hdr_destroy() will call list_remove() 7589 * and decrement arcstat_l2_lsize. 7590 */ 7591 arc_change_state(arc_anon, hdr, hash_lock); 7592 arc_hdr_destroy(hdr); 7593 } else { 7594 ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); 7595 ARCSTAT_BUMP(arcstat_l2_evict_l1cached); 7596 /* 7597 * Invalidate issued or about to be issued 7598 * reads, since we may be about to write 7599 * over this location. 7600 */ 7601 if (HDR_L2_READING(hdr)) { 7602 ARCSTAT_BUMP(arcstat_l2_evict_reading); 7603 arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); 7604 } 7605 7606 arc_hdr_l2hdr_destroy(hdr); 7607 } 7608 mutex_exit(hash_lock); 7609 } 7610 mutex_exit(&dev->l2ad_mtx); 7611} 7612 7613/* 7614 * Find and write ARC buffers to the L2ARC device. 7615 * 7616 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid 7617 * for reading until they have completed writing. 7618 * The headroom_boost is an in-out parameter used to maintain headroom boost 7619 * state between calls to this function. 7620 * 7621 * Returns the number of bytes actually written (which may be smaller than 7622 * the delta by which the device hand has changed due to alignment). 
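 *
 * As an illustration of the size accounting (a sketch, assuming a cache
 * device with 4K sectors): a buffer with a physical size (psize) of 5 KB
 * is rounded up by vdev_psize_to_asize() to an allocated size (asize) of
 * 8 KB, so write_psize grows by 5 KB while write_asize and the device
 * write hand advance by 8 KB, and the 3 KB of padding is zeroed before
 * the block is issued.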
7623 */ 7624static uint64_t 7625l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) 7626{ 7627 arc_buf_hdr_t *hdr, *hdr_prev, *head; 7628 uint64_t write_asize, write_psize, write_lsize, headroom; 7629 boolean_t full; 7630 l2arc_write_callback_t *cb; 7631 zio_t *pio, *wzio; 7632 uint64_t guid = spa_load_guid(spa); 7633 int try; 7634 7635 ASSERT3P(dev->l2ad_vdev, !=, NULL); 7636 7637 pio = NULL; 7638 write_lsize = write_asize = write_psize = 0; 7639 full = B_FALSE; 7640 head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); 7641 arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); 7642 7643 ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); 7644 /* 7645 * Copy buffers for L2ARC writing. 7646 */ 7647 for (try = 0; try <= 3; try++) { 7648 multilist_sublist_t *mls = l2arc_sublist_lock(try); 7649 uint64_t passed_sz = 0; 7650 7651 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); 7652 7653 /* 7654 * L2ARC fast warmup. 7655 * 7656 * Until the ARC is warm and starts to evict, read from the 7657 * head of the ARC lists rather than the tail. 7658 */ 7659 if (arc_warm == B_FALSE) 7660 hdr = multilist_sublist_head(mls); 7661 else 7662 hdr = multilist_sublist_tail(mls); 7663 if (hdr == NULL) 7664 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); 7665 7666 headroom = target_sz * l2arc_headroom; 7667 if (zfs_compressed_arc_enabled) 7668 headroom = (headroom * l2arc_headroom_boost) / 100; 7669 7670 for (; hdr; hdr = hdr_prev) { 7671 kmutex_t *hash_lock; 7672 7673 if (arc_warm == B_FALSE) 7674 hdr_prev = multilist_sublist_next(mls, hdr); 7675 else 7676 hdr_prev = multilist_sublist_prev(mls, hdr); 7677 ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, 7678 HDR_GET_LSIZE(hdr)); 7679 7680 hash_lock = HDR_LOCK(hdr); 7681 if (!mutex_tryenter(hash_lock)) { 7682 ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); 7683 /* 7684 * Skip this buffer rather than waiting. 7685 */ 7686 continue; 7687 } 7688 7689 passed_sz += HDR_GET_LSIZE(hdr); 7690 if (passed_sz > headroom) { 7691 /* 7692 * Searched too far. 7693 */ 7694 mutex_exit(hash_lock); 7695 ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); 7696 break; 7697 } 7698 7699 if (!l2arc_write_eligible(guid, hdr)) { 7700 mutex_exit(hash_lock); 7701 continue; 7702 } 7703 7704 /* 7705 * We rely on the L1 portion of the header below, so 7706 * it's invalid for this header to have been evicted out 7707 * of the ghost cache, prior to being written out. The 7708 * ARC_FLAG_L2_WRITING bit ensures this won't happen. 7709 */ 7710 ASSERT(HDR_HAS_L1HDR(hdr)); 7711 7712 ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); 7713 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 7714 ASSERT3U(arc_hdr_size(hdr), >, 0); 7715 uint64_t psize = arc_hdr_size(hdr); 7716 uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, 7717 psize); 7718 7719 if ((write_asize + asize) > target_sz) { 7720 full = B_TRUE; 7721 mutex_exit(hash_lock); 7722 ARCSTAT_BUMP(arcstat_l2_write_full); 7723 break; 7724 } 7725 7726 if (pio == NULL) { 7727 /* 7728 * Insert a dummy header on the buflist so 7729 * l2arc_write_done() can find where the 7730 * write buffers begin without searching. 
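 * The marker is consumed by l2arc_write_done(), which walks
 * list_prev(buflist, head) back from this dummy header, visiting
 * exactly the headers queued in front of it, and then frees the
 * dummy with kmem_cache_free().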
7731 */ 7732 mutex_enter(&dev->l2ad_mtx); 7733 list_insert_head(&dev->l2ad_buflist, head); 7734 mutex_exit(&dev->l2ad_mtx); 7735 7736 cb = kmem_alloc( 7737 sizeof (l2arc_write_callback_t), KM_SLEEP); 7738 cb->l2wcb_dev = dev; 7739 cb->l2wcb_head = head; 7740 pio = zio_root(spa, l2arc_write_done, cb, 7741 ZIO_FLAG_CANFAIL); 7742 ARCSTAT_BUMP(arcstat_l2_write_pios); 7743 } 7744 7745 hdr->b_l2hdr.b_dev = dev; 7746 hdr->b_l2hdr.b_daddr = dev->l2ad_hand; 7747 arc_hdr_set_flags(hdr, 7748 ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR); 7749 7750 mutex_enter(&dev->l2ad_mtx); 7751 list_insert_head(&dev->l2ad_buflist, hdr); 7752 mutex_exit(&dev->l2ad_mtx); 7753 7754 (void) refcount_add_many(&dev->l2ad_alloc, psize, hdr); 7755 7756 /* 7757 * Normally the L2ARC can use the hdr's data, but if 7758 * we're sharing data between the hdr and one of its 7759 * bufs, L2ARC needs its own copy of the data so that 7760 * the ZIO below can't race with the buf consumer. 7761 * Another case where we need to create a copy of the 7762 * data is when the buffer size is not device-aligned 7763 * and we need to pad the block to make it such. 7764 * That also keeps the clock hand suitably aligned. 7765 * 7766 * To ensure that the copy will be available for the 7767 * lifetime of the ZIO and be cleaned up afterwards, we 7768 * add it to the l2arc_free_on_write queue. 7769 */ 7770 abd_t *to_write; 7771 if (!HDR_SHARED_DATA(hdr) && psize == asize) { 7772 to_write = hdr->b_l1hdr.b_pabd; 7773 } else { 7774 to_write = abd_alloc_for_io(asize, 7775 HDR_ISTYPE_METADATA(hdr)); 7776 abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize); 7777 if (asize != psize) { 7778 abd_zero_off(to_write, psize, 7779 asize - psize); 7780 } 7781 l2arc_free_abd_on_write(to_write, asize, 7782 arc_buf_type(hdr)); 7783 } 7784 wzio = zio_write_phys(pio, dev->l2ad_vdev, 7785 hdr->b_l2hdr.b_daddr, asize, to_write, 7786 ZIO_CHECKSUM_OFF, NULL, hdr, 7787 ZIO_PRIORITY_ASYNC_WRITE, 7788 ZIO_FLAG_CANFAIL, B_FALSE); 7789 7790 write_lsize += HDR_GET_LSIZE(hdr); 7791 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 7792 zio_t *, wzio); 7793 7794 write_psize += psize; 7795 write_asize += asize; 7796 dev->l2ad_hand += asize; 7797 7798 mutex_exit(hash_lock); 7799 7800 (void) zio_nowait(wzio); 7801 } 7802 7803 multilist_sublist_unlock(mls); 7804 7805 if (full == B_TRUE) 7806 break; 7807 } 7808 7809 /* No buffers selected for writing? */ 7810 if (pio == NULL) { 7811 ASSERT0(write_lsize); 7812 ASSERT(!HDR_HAS_L1HDR(head)); 7813 kmem_cache_free(hdr_l2only_cache, head); 7814 return (0); 7815 } 7816 7817 ASSERT3U(write_psize, <=, target_sz); 7818 ARCSTAT_BUMP(arcstat_l2_writes_sent); 7819 ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); 7820 ARCSTAT_INCR(arcstat_l2_lsize, write_lsize); 7821 ARCSTAT_INCR(arcstat_l2_psize, write_psize); 7822 vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0); 7823 7824 /* 7825 * Bump device hand to the device start if it is approaching the end. 7826 * l2arc_evict() will already have evicted ahead for this case. 7827 */ 7828 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 7829 dev->l2ad_hand = dev->l2ad_start; 7830 dev->l2ad_first = B_FALSE; 7831 } 7832 7833 dev->l2ad_writing = B_TRUE; 7834 (void) zio_wait(pio); 7835 dev->l2ad_writing = B_FALSE; 7836 7837 return (write_asize); 7838} 7839 7840/* 7841 * This thread feeds the L2ARC at regular intervals. This is the beating 7842 * heart of the L2ARC. 
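 *
 * A rough sketch of the cadence (assuming the usual defaults of
 * l2arc_feed_secs = 1 and l2arc_feed_min_ms = 200): the thread normally
 * wakes about once per second, but when the previous pass wrote more
 * than half of its target and l2arc_feed_again is left enabled,
 * l2arc_write_interval() schedules the next pass only ~200ms out so a
 * busy ARC is drained faster; a read-only pool instead stretches the
 * sleep to 5 * l2arc_feed_secs.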
7843 */ 7844/* ARGSUSED */ 7845static void 7846l2arc_feed_thread(void *unused __unused) 7847{ 7848 callb_cpr_t cpr; 7849 l2arc_dev_t *dev; 7850 spa_t *spa; 7851 uint64_t size, wrote; 7852 clock_t begin, next = ddi_get_lbolt(); 7853 7854 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 7855 7856 mutex_enter(&l2arc_feed_thr_lock); 7857 7858 while (l2arc_thread_exit == 0) { 7859 CALLB_CPR_SAFE_BEGIN(&cpr); 7860 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 7861 next - ddi_get_lbolt()); 7862 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 7863 next = ddi_get_lbolt() + hz; 7864 7865 /* 7866 * Quick check for L2ARC devices. 7867 */ 7868 mutex_enter(&l2arc_dev_mtx); 7869 if (l2arc_ndev == 0) { 7870 mutex_exit(&l2arc_dev_mtx); 7871 continue; 7872 } 7873 mutex_exit(&l2arc_dev_mtx); 7874 begin = ddi_get_lbolt(); 7875 7876 /* 7877 * This selects the next l2arc device to write to, and in 7878 * doing so the next spa to feed from: dev->l2ad_spa. This 7879 * will return NULL if there are now no l2arc devices or if 7880 * they are all faulted. 7881 * 7882 * If a device is returned, its spa's config lock is also 7883 * held to prevent device removal. l2arc_dev_get_next() 7884 * will grab and release l2arc_dev_mtx. 7885 */ 7886 if ((dev = l2arc_dev_get_next()) == NULL) 7887 continue; 7888 7889 spa = dev->l2ad_spa; 7890 ASSERT3P(spa, !=, NULL); 7891 7892 /* 7893 * If the pool is read-only then force the feed thread to 7894 * sleep a little longer. 7895 */ 7896 if (!spa_writeable(spa)) { 7897 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; 7898 spa_config_exit(spa, SCL_L2ARC, dev); 7899 continue; 7900 } 7901 7902 /* 7903 * Avoid contributing to memory pressure. 7904 */ 7905 if (arc_reclaim_needed()) { 7906 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 7907 spa_config_exit(spa, SCL_L2ARC, dev); 7908 continue; 7909 } 7910 7911 ARCSTAT_BUMP(arcstat_l2_feeds); 7912 7913 size = l2arc_write_size(); 7914 7915 /* 7916 * Evict L2ARC buffers that will be overwritten. 7917 */ 7918 l2arc_evict(dev, size, B_FALSE); 7919 7920 /* 7921 * Write ARC buffers. 7922 */ 7923 wrote = l2arc_write_buffers(spa, dev, size); 7924 7925 /* 7926 * Calculate interval between writes. 7927 */ 7928 next = l2arc_write_interval(begin, size, wrote); 7929 spa_config_exit(spa, SCL_L2ARC, dev); 7930 } 7931 7932 l2arc_thread_exit = 0; 7933 cv_broadcast(&l2arc_feed_thr_cv); 7934 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ 7935 thread_exit(); 7936} 7937 7938boolean_t 7939l2arc_vdev_present(vdev_t *vd) 7940{ 7941 l2arc_dev_t *dev; 7942 7943 mutex_enter(&l2arc_dev_mtx); 7944 for (dev = list_head(l2arc_dev_list); dev != NULL; 7945 dev = list_next(l2arc_dev_list, dev)) { 7946 if (dev->l2ad_vdev == vd) 7947 break; 7948 } 7949 mutex_exit(&l2arc_dev_mtx); 7950 7951 return (dev != NULL); 7952} 7953 7954/* 7955 * Add a vdev for use by the L2ARC. By this point the spa has already 7956 * validated the vdev and opened it. 7957 */ 7958void 7959l2arc_add_vdev(spa_t *spa, vdev_t *vd) 7960{ 7961 l2arc_dev_t *adddev; 7962 7963 ASSERT(!l2arc_vdev_present(vd)); 7964 7965 vdev_ashift_optimize(vd); 7966 7967 /* 7968 * Create a new l2arc device entry. 
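 * For example (a sketch): on a 32 Gbyte cache vdev, l2ad_start is placed
 * just past the front vdev labels (VDEV_LABEL_START_SIZE), l2ad_end at
 * l2ad_start + vdev_get_min_asize(vd), and the write hand begins its
 * first sweep at l2ad_start with l2ad_first set, so nothing is evicted
 * until the hand wraps for the first time.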
7969 */ 7970 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 7971 adddev->l2ad_spa = spa; 7972 adddev->l2ad_vdev = vd; 7973 adddev->l2ad_start = VDEV_LABEL_START_SIZE; 7974 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); 7975 adddev->l2ad_hand = adddev->l2ad_start; 7976 adddev->l2ad_first = B_TRUE; 7977 adddev->l2ad_writing = B_FALSE; 7978 7979 mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); 7980 /* 7981 * This is a list of all ARC buffers that are still valid on the 7982 * device. 7983 */ 7984 list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 7985 offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); 7986 7987 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); 7988 refcount_create(&adddev->l2ad_alloc); 7989 7990 /* 7991 * Add device to global list 7992 */ 7993 mutex_enter(&l2arc_dev_mtx); 7994 list_insert_head(l2arc_dev_list, adddev); 7995 atomic_inc_64(&l2arc_ndev); 7996 mutex_exit(&l2arc_dev_mtx); 7997} 7998 7999/* 8000 * Remove a vdev from the L2ARC. 8001 */ 8002void 8003l2arc_remove_vdev(vdev_t *vd) 8004{ 8005 l2arc_dev_t *dev, *nextdev, *remdev = NULL; 8006 8007 /* 8008 * Find the device by vdev 8009 */ 8010 mutex_enter(&l2arc_dev_mtx); 8011 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 8012 nextdev = list_next(l2arc_dev_list, dev); 8013 if (vd == dev->l2ad_vdev) { 8014 remdev = dev; 8015 break; 8016 } 8017 } 8018 ASSERT3P(remdev, !=, NULL); 8019 8020 /* 8021 * Remove device from global list 8022 */ 8023 list_remove(l2arc_dev_list, remdev); 8024 l2arc_dev_last = NULL; /* may have been invalidated */ 8025 atomic_dec_64(&l2arc_ndev); 8026 mutex_exit(&l2arc_dev_mtx); 8027 8028 /* 8029 * Clear all buflists and ARC references. L2ARC device flush. 8030 */ 8031 l2arc_evict(remdev, 0, B_TRUE); 8032 list_destroy(&remdev->l2ad_buflist); 8033 mutex_destroy(&remdev->l2ad_mtx); 8034 refcount_destroy(&remdev->l2ad_alloc); 8035 kmem_free(remdev, sizeof (l2arc_dev_t)); 8036} 8037 8038void 8039l2arc_init(void) 8040{ 8041 l2arc_thread_exit = 0; 8042 l2arc_ndev = 0; 8043 l2arc_writes_sent = 0; 8044 l2arc_writes_done = 0; 8045 8046 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 8047 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 8048 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 8049 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 8050 8051 l2arc_dev_list = &L2ARC_dev_list; 8052 l2arc_free_on_write = &L2ARC_free_on_write; 8053 list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 8054 offsetof(l2arc_dev_t, l2ad_node)); 8055 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 8056 offsetof(l2arc_data_free_t, l2df_list_node)); 8057} 8058 8059void 8060l2arc_fini(void) 8061{ 8062 /* 8063 * This is called from dmu_fini(), which is called from spa_fini(); 8064 * Because of this, we can assume that all l2arc devices have 8065 * already been removed when the pools themselves were removed. 
 */

	l2arc_do_free_on_write();

	mutex_destroy(&l2arc_feed_thr_lock);
	cv_destroy(&l2arc_feed_thr_cv);
	mutex_destroy(&l2arc_dev_mtx);
	mutex_destroy(&l2arc_free_on_write_mtx);

	list_destroy(l2arc_dev_list);
	list_destroy(l2arc_free_on_write);
}

void
l2arc_start(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);
}

void
l2arc_stop(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	mutex_enter(&l2arc_feed_thr_lock);
	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
	l2arc_thread_exit = 1;
	while (l2arc_thread_exit != 0)
		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
	mutex_exit(&l2arc_feed_thr_lock);
}