arc.c revision 346684
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2018, Joyent, Inc.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 *    Pages in its cache cannot be "locked" into memory.  This makes
 *    the eviction algorithm simple: evict the last page in the list.
 *    This also makes the performance characteristics easy to reason
 *    about.  Our cache is not so simple.  At any given moment, some
 *    subset of the blocks in the cache are un-evictable because we
 *    have handed out a reference to them.  Blocks are only evictable
 *    when there are no external references active.  This makes
 *    eviction far more problematic:  we choose to evict the evictable
 *    blocks that are the "lowest" in the list.
 *
 *    There are times when it is not possible to evict the requested
 *    space.  In these circumstances we are unable to adjust the cache
 *    size.  To prevent the cache growing unbounded at these times we
 *    implement a "cache throttle" that slows the flow of new data
 *    into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 *    Pages are evicted when the cache is full and there is a cache
 *    miss.  Our model has a variable sized cache.  It grows with
 *    high use, but also tries to react to memory pressure from the
 *    operating system: decreasing its size when system memory is
 *    tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size.  All
 *    elements of the cache are therefore exactly the same size.  So
 *    when adjusting the cache size following a cache miss, it's simply
 *    a matter of choosing a single page to evict.  In our model, we
 *    have variable sized cache blocks (ranging from 512 bytes to
 *    128K bytes).  We therefore choose a set of blocks to evict to make
 *    space for a cache miss that approximates as closely as possible
 *    the space used by the new block.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal ARC algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * ARC list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each ARC state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an ARC list lock you
 * must use: mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2ad_mtx on each vdev for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */

/*
 * ARC operation:
 *
 * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
 * This structure can point either to a block that is still in the cache or to
 * one that is only accessible in an L2 ARC device, or it can provide
 * information about a block that was recently evicted.  If a block is
 * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
 * information to retrieve it from the L2ARC device.  This information is
 * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t.  A block
 * that is in this state cannot access the data directly.
 *
 * Blocks that are actively being referenced or have not been evicted
 * are cached in the L1ARC.  The L1ARC (l1arc_buf_hdr_t) is a structure within
 * the arc_buf_hdr_t that will point to the data block in memory.  A block can
 * only be read by a consumer if it has an l1arc_buf_hdr_t.  The L1ARC
 * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
 * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
 *
 * The L1ARC's data pointer may or may not be uncompressed.  The ARC has the
 * ability to store the physical data (b_pabd) associated with the DVA of the
 * arc_buf_hdr_t.  Since the b_pabd is a copy of the on-disk physical block,
 * it will match its on-disk compression characteristics.  This behavior can be
 * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE.  When the
 * compressed ARC functionality is disabled, the b_pabd will point to an
 * uncompressed version of the on-disk data.
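 *
 * As an illustrative sketch of that tunable's effect (not code from this
 * file), the contents of b_pabd can be thought of as:
 *
 *	if (zfs_compressed_arc_enabled)
 *		b_pabd holds the on-disk bytes (compressed if the block is);
 *	else
 *		b_pabd holds an uncompressed copy of the block;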
 *
 * Data in the L1ARC is not accessed by consumers of the ARC directly.  Each
 * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
 * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
 * consumer.  The ARC will provide references to this data and will keep it
 * cached until it is no longer in use.  The ARC caches only the L1ARC's
 * physical data block and will evict any arc_buf_t that is no longer
 * referenced.  The amount of memory consumed by the arc_buf_ts' data buffers
 * can be seen via the "overhead_size" kstat.
 *
 * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
 * compressed form.  The typical case is that consumers will want uncompressed
 * data, and when that happens a new data buffer is allocated where the data is
 * decompressed for them to use.  Currently the only consumer who wants
 * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
 * exists on disk.  When this happens, the arc_buf_t's data buffer is shared
 * with the arc_buf_hdr_t.
 *
 * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's.
 * The first one is owned by a compressed send consumer (and therefore
 * references the same compressed data buffer as the arc_buf_hdr_t) and the
 * second could be used by any other consumer (and has its own uncompressed
 * copy of the data buffer).
 *
 *   arc_buf_hdr_t
 *   +-----------+
 *   | fields    |
 *   | common to |
 *   | L1- and   |
 *   | L2ARC     |
 *   +-----------+
 *   | l2arc_buf_hdr_t
 *   |           |
 *   +-----------+
 *   | l1arc_buf_hdr_t
 *   |           |              arc_buf_t
 *   | b_buf     +------------>+-----------+      arc_buf_t
 *   | b_pabd    +-+           |b_next     +---->+-----------+
 *   +-----------+ |           |-----------|     |b_next     +-->NULL
 *                 |           |b_comp = T |     +-----------+
 *                 |           |b_data     +-+   |b_comp = F |
 *                 |           +-----------+ |   |b_data     +-+
 *                 +->+------+               |   +-----------+ |
 *        compressed  |      |               |                 |
 *           data     |      |<--------------+                 | uncompressed
 *                    +------+  compressed,                    |     data
 *                                shared                       +-->+------+
 *                                 data                            |      |
 *                                                                 |      |
 *                                                                 +------+
 *
 * When a consumer reads a block, the ARC must first look to see if the
 * arc_buf_hdr_t is cached.  If the hdr is cached then the ARC allocates a new
 * arc_buf_t and either copies uncompressed data into a new data buffer from an
 * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
 * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
 * hdr is compressed and the desired compression characteristics of the
 * arc_buf_t consumer.  If the arc_buf_t ends up sharing data with the
 * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
 * the last buffer in the hdr's b_buf list, however a shared compressed buf can
 * be anywhere in the hdr's list.
 *
 * The diagram below shows an example of an uncompressed ARC hdr that is
 * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
 * the last element in the buf list):
 *
 *                arc_buf_hdr_t
 *                +-----------+
 *                |           |
 *                |           |
 *                |           |
 *                +-----------+
 * l2arc_buf_hdr_t|           |
 *                |           |
 *                +-----------+
 * l1arc_buf_hdr_t|           |
 *                |           |    arc_buf_t (shared)
 *                |    b_buf  +------------>+---------+  arc_buf_t
 *                |           |             |b_next   +---->+---------+
 *                |  b_pabd   +-+           |---------|     |b_next   +-->NULL
 *                +-----------+ |           |         |     +---------+
 *                              |           |b_data   +-+   |         |
 *                              |           +---------+ |   |b_data   +-+
 *                              +->+------+             |   +---------+ |
 *                                 |      |             |               |
 *                   uncompressed  |      |             |               |
 *                         data    +------+             |               |
 *                                    ^                  +->+------+    |
 *                                    |       uncompressed  |      |    |
 *                                    |           data      |      |    |
 *                                    |                      +------+   |
 *                                    +----------------------------------+
 *
 * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
 * since the physical block is about to be rewritten.  The new data contents
 * will be contained in the arc_buf_t.  As the I/O pipeline performs the write,
 * it may compress the data before writing it to disk.  The ARC will be called
 * with the transformed data and will bcopy the transformed on-disk block into
 * a newly allocated b_pabd.  Writes are always done into buffers which have
 * either been loaned (and hence are new and don't have other readers) or
 * buffers which have been released (and hence have their own hdr, if there
 * were originally other readers of the buf's original hdr).  This ensures that
 * the ARC only needs to update a single buf and its hdr after a write occurs.
 *
 * When the L2ARC is in use, it will also take advantage of the b_pabd.  The
 * L2ARC will always write the contents of b_pabd to the L2ARC.  This means
 * that when compressed ARC is enabled that the L2ARC blocks are identical
 * to the on-disk block in the main data pool.  This provides a significant
 * advantage since the ARC can leverage the bp's checksum when reading from the
 * L2ARC to determine if the contents are valid.  However, if the compressed
 * ARC is disabled, then the L2ARC's block must be transformed to look
 * like the physical block in the main data pool before comparing the
 * checksum and determining its validity.
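 *
 * A rough sketch of the read-path buffer-allocation decision described
 * above (illustrative only, not the actual implementation):
 *
 *	if (consumer wants compressed data && hdr data is compressed)
 *		share hdr->b_l1hdr.b_pabd with the new arc_buf_t;
 *	else if (an uncompressed arc_buf_t already exists on hdr->b_buf)
 *		copy its b_data into the newly allocated buffer;
 *	else
 *		decompress b_pabd into the newly allocated buffer;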
253 */ 254 255#include <sys/spa.h> 256#include <sys/zio.h> 257#include <sys/spa_impl.h> 258#include <sys/zio_compress.h> 259#include <sys/zio_checksum.h> 260#include <sys/zfs_context.h> 261#include <sys/arc.h> 262#include <sys/refcount.h> 263#include <sys/vdev.h> 264#include <sys/vdev_impl.h> 265#include <sys/dsl_pool.h> 266#include <sys/zio_checksum.h> 267#include <sys/multilist.h> 268#include <sys/abd.h> 269#ifdef _KERNEL 270#include <sys/dnlc.h> 271#include <sys/racct.h> 272#endif 273#include <sys/callb.h> 274#include <sys/kstat.h> 275#include <sys/trim_map.h> 276#include <zfs_fletcher.h> 277#include <sys/sdt.h> 278#include <sys/aggsum.h> 279#include <sys/cityhash.h> 280 281#include <machine/vmparam.h> 282 283#ifdef illumos 284#ifndef _KERNEL 285/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ 286boolean_t arc_watch = B_FALSE; 287int arc_procfd; 288#endif 289#endif /* illumos */ 290 291static kmutex_t arc_reclaim_lock; 292static kcondvar_t arc_reclaim_thread_cv; 293static boolean_t arc_reclaim_thread_exit; 294static kcondvar_t arc_reclaim_waiters_cv; 295 296static kmutex_t arc_dnlc_evicts_lock; 297static kcondvar_t arc_dnlc_evicts_cv; 298static boolean_t arc_dnlc_evicts_thread_exit; 299 300uint_t arc_reduce_dnlc_percent = 3; 301 302/* 303 * The number of headers to evict in arc_evict_state_impl() before 304 * dropping the sublist lock and evicting from another sublist. A lower 305 * value means we're more likely to evict the "correct" header (i.e. the 306 * oldest header in the arc state), but comes with higher overhead 307 * (i.e. more invocations of arc_evict_state_impl()). 308 */ 309int zfs_arc_evict_batch_limit = 10; 310 311/* number of seconds before growing cache again */ 312static int arc_grow_retry = 60; 313 314/* number of milliseconds before attempting a kmem-cache-reap */ 315static int arc_kmem_cache_reap_retry_ms = 0; 316 317/* shift of arc_c for calculating overflow limit in arc_get_data_impl */ 318int zfs_arc_overflow_shift = 8; 319 320/* shift of arc_c for calculating both min and max arc_p */ 321static int arc_p_min_shift = 4; 322 323/* log2(fraction of arc to reclaim) */ 324static int arc_shrink_shift = 7; 325 326/* 327 * log2(fraction of ARC which must be free to allow growing). 328 * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, 329 * when reading a new block into the ARC, we will evict an equal-sized block 330 * from the ARC. 331 * 332 * This must be less than arc_shrink_shift, so that when we shrink the ARC, 333 * we will still not allow it to grow. 334 */ 335int arc_no_grow_shift = 5; 336 337 338/* 339 * minimum lifespan of a prefetch block in clock ticks 340 * (initialized in arc_init()) 341 */ 342static int zfs_arc_min_prefetch_ms = 1; 343static int zfs_arc_min_prescient_prefetch_ms = 6; 344 345/* 346 * If this percent of memory is free, don't throttle. 347 */ 348int arc_lotsfree_percent = 10; 349 350static int arc_dead; 351extern boolean_t zfs_prefetch_disable; 352 353/* 354 * The arc has filled available memory and has now warmed up. 355 */ 356static boolean_t arc_warm; 357 358/* 359 * log2 fraction of the zio arena to keep free. 360 */ 361int arc_zio_arena_free_shift = 2; 362 363/* 364 * These tunables are for performance analysis. 
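 *
 * On FreeBSD these are normally set as loader tunables, e.g. in
 * /boot/loader.conf (the values below are purely illustrative):
 *
 *	vfs.zfs.arc_max="8589934592"
 *	vfs.zfs.arc_min="1073741824"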
365 */ 366uint64_t zfs_arc_max; 367uint64_t zfs_arc_min; 368uint64_t zfs_arc_meta_limit = 0; 369uint64_t zfs_arc_meta_min = 0; 370int zfs_arc_grow_retry = 0; 371int zfs_arc_shrink_shift = 0; 372int zfs_arc_no_grow_shift = 0; 373int zfs_arc_p_min_shift = 0; 374uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ 375u_int zfs_arc_free_target = 0; 376 377/* Absolute min for arc min / max is 16MB. */ 378static uint64_t arc_abs_min = 16 << 20; 379 380/* 381 * ARC dirty data constraints for arc_tempreserve_space() throttle 382 */ 383uint_t zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */ 384uint_t zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */ 385uint_t zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */ 386 387boolean_t zfs_compressed_arc_enabled = B_TRUE; 388 389static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS); 390static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS); 391static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS); 392static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS); 393static int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS); 394 395#if defined(__FreeBSD__) && defined(_KERNEL) 396static void 397arc_free_target_init(void *unused __unused) 398{ 399 400 zfs_arc_free_target = vm_pageout_wakeup_thresh; 401} 402SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, 403 arc_free_target_init, NULL); 404 405TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); 406TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min); 407TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift); 408TUNABLE_INT("vfs.zfs.arc_grow_retry", &zfs_arc_grow_retry); 409TUNABLE_INT("vfs.zfs.arc_no_grow_shift", &zfs_arc_no_grow_shift); 410SYSCTL_DECL(_vfs_zfs); 411SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 | CTLFLAG_RWTUN, 412 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size"); 413SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 | CTLFLAG_RWTUN, 414 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size"); 415SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_U32 | CTLFLAG_RWTUN, 416 0, sizeof(uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U", 417 "log2(fraction of ARC which must be free to allow growing)"); 418SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN, 419 &zfs_arc_average_blocksize, 0, 420 "ARC average blocksize"); 421SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW, 422 &arc_shrink_shift, 0, 423 "log2(fraction of arc to reclaim)"); 424SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_grow_retry, CTLFLAG_RW, 425 &arc_grow_retry, 0, 426 "Wait in seconds before considering growing ARC"); 427SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN, 428 &zfs_compressed_arc_enabled, 0, 429 "Enable compressed ARC"); 430SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_kmem_cache_reap_retry_ms, CTLFLAG_RWTUN, 431 &arc_kmem_cache_reap_retry_ms, 0, 432 "Interval between ARC kmem_cache reapings"); 433 434/* 435 * We don't have a tunable for arc_free_target due to the dependency on 436 * pagedaemon initialisation. 
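 *
 * It can still be inspected and changed at runtime via the sysctl defined
 * below, e.g. (value purely illustrative):
 *
 *	# sysctl vfs.zfs.arc_free_target=56324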
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
    sysctl_vfs_zfs_arc_free_target, "IU",
    "Desired number of free pages below which ARC triggers reclaim");

static int
sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
{
	u_int val;
	int err;

	val = zfs_arc_free_target;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < minfree)
		return (EINVAL);
	if (val > vm_cnt.v_page_count)
		return (EINVAL);

	zfs_arc_free_target = val;

	return (0);
}

/*
 * This must be declared here, before the corresponding kstat macro is
 * defined; that macro uses the same name and would otherwise confuse the
 * compiler.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_vfs_zfs_arc_meta_limit, "QU",
    "ARC metadata limit");
#endif

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states.  These are
 * the only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */

typedef struct arc_state {
	/*
	 * list of evictable buffers
	 */
	multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
	/*
	 * total amount of evictable data in this state
	 */
	refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
	/*
	 * total amount of data in this state; this includes: evictable,
	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
518 */ 519 refcount_t arcs_size; 520} arc_state_t; 521 522/* The 6 states: */ 523static arc_state_t ARC_anon; 524static arc_state_t ARC_mru; 525static arc_state_t ARC_mru_ghost; 526static arc_state_t ARC_mfu; 527static arc_state_t ARC_mfu_ghost; 528static arc_state_t ARC_l2c_only; 529 530typedef struct arc_stats { 531 kstat_named_t arcstat_hits; 532 kstat_named_t arcstat_misses; 533 kstat_named_t arcstat_demand_data_hits; 534 kstat_named_t arcstat_demand_data_misses; 535 kstat_named_t arcstat_demand_metadata_hits; 536 kstat_named_t arcstat_demand_metadata_misses; 537 kstat_named_t arcstat_prefetch_data_hits; 538 kstat_named_t arcstat_prefetch_data_misses; 539 kstat_named_t arcstat_prefetch_metadata_hits; 540 kstat_named_t arcstat_prefetch_metadata_misses; 541 kstat_named_t arcstat_mru_hits; 542 kstat_named_t arcstat_mru_ghost_hits; 543 kstat_named_t arcstat_mfu_hits; 544 kstat_named_t arcstat_mfu_ghost_hits; 545 kstat_named_t arcstat_allocated; 546 kstat_named_t arcstat_deleted; 547 /* 548 * Number of buffers that could not be evicted because the hash lock 549 * was held by another thread. The lock may not necessarily be held 550 * by something using the same buffer, since hash locks are shared 551 * by multiple buffers. 552 */ 553 kstat_named_t arcstat_mutex_miss; 554 /* 555 * Number of buffers skipped when updating the access state due to the 556 * header having already been released after acquiring the hash lock. 557 */ 558 kstat_named_t arcstat_access_skip; 559 /* 560 * Number of buffers skipped because they have I/O in progress, are 561 * indirect prefetch buffers that have not lived long enough, or are 562 * not from the spa we're trying to evict from. 563 */ 564 kstat_named_t arcstat_evict_skip; 565 /* 566 * Number of times arc_evict_state() was unable to evict enough 567 * buffers to reach it's target amount. 568 */ 569 kstat_named_t arcstat_evict_not_enough; 570 kstat_named_t arcstat_evict_l2_cached; 571 kstat_named_t arcstat_evict_l2_eligible; 572 kstat_named_t arcstat_evict_l2_ineligible; 573 kstat_named_t arcstat_evict_l2_skip; 574 kstat_named_t arcstat_hash_elements; 575 kstat_named_t arcstat_hash_elements_max; 576 kstat_named_t arcstat_hash_collisions; 577 kstat_named_t arcstat_hash_chains; 578 kstat_named_t arcstat_hash_chain_max; 579 kstat_named_t arcstat_p; 580 kstat_named_t arcstat_c; 581 kstat_named_t arcstat_c_min; 582 kstat_named_t arcstat_c_max; 583 /* Not updated directly; only synced in arc_kstat_update. */ 584 kstat_named_t arcstat_size; 585 /* 586 * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. 587 * Note that the compressed bytes may match the uncompressed bytes 588 * if the block is either not compressed or compressed arc is disabled. 589 */ 590 kstat_named_t arcstat_compressed_size; 591 /* 592 * Uncompressed size of the data stored in b_pabd. If compressed 593 * arc is disabled then this value will be identical to the stat 594 * above. 595 */ 596 kstat_named_t arcstat_uncompressed_size; 597 /* 598 * Number of bytes stored in all the arc_buf_t's. This is classified 599 * as "overhead" since this data is typically short-lived and will 600 * be evicted from the arc when it becomes unreferenced unless the 601 * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level 602 * values have been set (see comment in dbuf.c for more information). 
603 */ 604 kstat_named_t arcstat_overhead_size; 605 /* 606 * Number of bytes consumed by internal ARC structures necessary 607 * for tracking purposes; these structures are not actually 608 * backed by ARC buffers. This includes arc_buf_hdr_t structures 609 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only 610 * caches), and arc_buf_t structures (allocated via arc_buf_t 611 * cache). 612 * Not updated directly; only synced in arc_kstat_update. 613 */ 614 kstat_named_t arcstat_hdr_size; 615 /* 616 * Number of bytes consumed by ARC buffers of type equal to 617 * ARC_BUFC_DATA. This is generally consumed by buffers backing 618 * on disk user data (e.g. plain file contents). 619 * Not updated directly; only synced in arc_kstat_update. 620 */ 621 kstat_named_t arcstat_data_size; 622 /* 623 * Number of bytes consumed by ARC buffers of type equal to 624 * ARC_BUFC_METADATA. This is generally consumed by buffers 625 * backing on disk data that is used for internal ZFS 626 * structures (e.g. ZAP, dnode, indirect blocks, etc). 627 * Not updated directly; only synced in arc_kstat_update. 628 */ 629 kstat_named_t arcstat_metadata_size; 630 /* 631 * Number of bytes consumed by various buffers and structures 632 * not actually backed with ARC buffers. This includes bonus 633 * buffers (allocated directly via zio_buf_* functions), 634 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t 635 * cache), and dnode_t structures (allocated via dnode_t cache). 636 * Not updated directly; only synced in arc_kstat_update. 637 */ 638 kstat_named_t arcstat_other_size; 639 /* 640 * Total number of bytes consumed by ARC buffers residing in the 641 * arc_anon state. This includes *all* buffers in the arc_anon 642 * state; e.g. data, metadata, evictable, and unevictable buffers 643 * are all included in this value. 644 * Not updated directly; only synced in arc_kstat_update. 645 */ 646 kstat_named_t arcstat_anon_size; 647 /* 648 * Number of bytes consumed by ARC buffers that meet the 649 * following criteria: backing buffers of type ARC_BUFC_DATA, 650 * residing in the arc_anon state, and are eligible for eviction 651 * (e.g. have no outstanding holds on the buffer). 652 * Not updated directly; only synced in arc_kstat_update. 653 */ 654 kstat_named_t arcstat_anon_evictable_data; 655 /* 656 * Number of bytes consumed by ARC buffers that meet the 657 * following criteria: backing buffers of type ARC_BUFC_METADATA, 658 * residing in the arc_anon state, and are eligible for eviction 659 * (e.g. have no outstanding holds on the buffer). 660 * Not updated directly; only synced in arc_kstat_update. 661 */ 662 kstat_named_t arcstat_anon_evictable_metadata; 663 /* 664 * Total number of bytes consumed by ARC buffers residing in the 665 * arc_mru state. This includes *all* buffers in the arc_mru 666 * state; e.g. data, metadata, evictable, and unevictable buffers 667 * are all included in this value. 668 * Not updated directly; only synced in arc_kstat_update. 669 */ 670 kstat_named_t arcstat_mru_size; 671 /* 672 * Number of bytes consumed by ARC buffers that meet the 673 * following criteria: backing buffers of type ARC_BUFC_DATA, 674 * residing in the arc_mru state, and are eligible for eviction 675 * (e.g. have no outstanding holds on the buffer). 676 * Not updated directly; only synced in arc_kstat_update. 
677 */ 678 kstat_named_t arcstat_mru_evictable_data; 679 /* 680 * Number of bytes consumed by ARC buffers that meet the 681 * following criteria: backing buffers of type ARC_BUFC_METADATA, 682 * residing in the arc_mru state, and are eligible for eviction 683 * (e.g. have no outstanding holds on the buffer). 684 * Not updated directly; only synced in arc_kstat_update. 685 */ 686 kstat_named_t arcstat_mru_evictable_metadata; 687 /* 688 * Total number of bytes that *would have been* consumed by ARC 689 * buffers in the arc_mru_ghost state. The key thing to note 690 * here, is the fact that this size doesn't actually indicate 691 * RAM consumption. The ghost lists only consist of headers and 692 * don't actually have ARC buffers linked off of these headers. 693 * Thus, *if* the headers had associated ARC buffers, these 694 * buffers *would have* consumed this number of bytes. 695 * Not updated directly; only synced in arc_kstat_update. 696 */ 697 kstat_named_t arcstat_mru_ghost_size; 698 /* 699 * Number of bytes that *would have been* consumed by ARC 700 * buffers that are eligible for eviction, of type 701 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. 702 * Not updated directly; only synced in arc_kstat_update. 703 */ 704 kstat_named_t arcstat_mru_ghost_evictable_data; 705 /* 706 * Number of bytes that *would have been* consumed by ARC 707 * buffers that are eligible for eviction, of type 708 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 709 * Not updated directly; only synced in arc_kstat_update. 710 */ 711 kstat_named_t arcstat_mru_ghost_evictable_metadata; 712 /* 713 * Total number of bytes consumed by ARC buffers residing in the 714 * arc_mfu state. This includes *all* buffers in the arc_mfu 715 * state; e.g. data, metadata, evictable, and unevictable buffers 716 * are all included in this value. 717 * Not updated directly; only synced in arc_kstat_update. 718 */ 719 kstat_named_t arcstat_mfu_size; 720 /* 721 * Number of bytes consumed by ARC buffers that are eligible for 722 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu 723 * state. 724 * Not updated directly; only synced in arc_kstat_update. 725 */ 726 kstat_named_t arcstat_mfu_evictable_data; 727 /* 728 * Number of bytes consumed by ARC buffers that are eligible for 729 * eviction, of type ARC_BUFC_METADATA, and reside in the 730 * arc_mfu state. 731 * Not updated directly; only synced in arc_kstat_update. 732 */ 733 kstat_named_t arcstat_mfu_evictable_metadata; 734 /* 735 * Total number of bytes that *would have been* consumed by ARC 736 * buffers in the arc_mfu_ghost state. See the comment above 737 * arcstat_mru_ghost_size for more details. 738 * Not updated directly; only synced in arc_kstat_update. 739 */ 740 kstat_named_t arcstat_mfu_ghost_size; 741 /* 742 * Number of bytes that *would have been* consumed by ARC 743 * buffers that are eligible for eviction, of type 744 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. 745 * Not updated directly; only synced in arc_kstat_update. 746 */ 747 kstat_named_t arcstat_mfu_ghost_evictable_data; 748 /* 749 * Number of bytes that *would have been* consumed by ARC 750 * buffers that are eligible for eviction, of type 751 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 752 * Not updated directly; only synced in arc_kstat_update. 
753 */ 754 kstat_named_t arcstat_mfu_ghost_evictable_metadata; 755 kstat_named_t arcstat_l2_hits; 756 kstat_named_t arcstat_l2_misses; 757 kstat_named_t arcstat_l2_feeds; 758 kstat_named_t arcstat_l2_rw_clash; 759 kstat_named_t arcstat_l2_read_bytes; 760 kstat_named_t arcstat_l2_write_bytes; 761 kstat_named_t arcstat_l2_writes_sent; 762 kstat_named_t arcstat_l2_writes_done; 763 kstat_named_t arcstat_l2_writes_error; 764 kstat_named_t arcstat_l2_writes_lock_retry; 765 kstat_named_t arcstat_l2_evict_lock_retry; 766 kstat_named_t arcstat_l2_evict_reading; 767 kstat_named_t arcstat_l2_evict_l1cached; 768 kstat_named_t arcstat_l2_free_on_write; 769 kstat_named_t arcstat_l2_abort_lowmem; 770 kstat_named_t arcstat_l2_cksum_bad; 771 kstat_named_t arcstat_l2_io_error; 772 kstat_named_t arcstat_l2_lsize; 773 kstat_named_t arcstat_l2_psize; 774 /* Not updated directly; only synced in arc_kstat_update. */ 775 kstat_named_t arcstat_l2_hdr_size; 776 kstat_named_t arcstat_l2_write_trylock_fail; 777 kstat_named_t arcstat_l2_write_passed_headroom; 778 kstat_named_t arcstat_l2_write_spa_mismatch; 779 kstat_named_t arcstat_l2_write_in_l2; 780 kstat_named_t arcstat_l2_write_hdr_io_in_progress; 781 kstat_named_t arcstat_l2_write_not_cacheable; 782 kstat_named_t arcstat_l2_write_full; 783 kstat_named_t arcstat_l2_write_buffer_iter; 784 kstat_named_t arcstat_l2_write_pios; 785 kstat_named_t arcstat_l2_write_buffer_bytes_scanned; 786 kstat_named_t arcstat_l2_write_buffer_list_iter; 787 kstat_named_t arcstat_l2_write_buffer_list_null_iter; 788 kstat_named_t arcstat_memory_throttle_count; 789 /* Not updated directly; only synced in arc_kstat_update. */ 790 kstat_named_t arcstat_meta_used; 791 kstat_named_t arcstat_meta_limit; 792 kstat_named_t arcstat_meta_max; 793 kstat_named_t arcstat_meta_min; 794 kstat_named_t arcstat_async_upgrade_sync; 795 kstat_named_t arcstat_demand_hit_predictive_prefetch; 796 kstat_named_t arcstat_demand_hit_prescient_prefetch; 797} arc_stats_t; 798 799static arc_stats_t arc_stats = { 800 { "hits", KSTAT_DATA_UINT64 }, 801 { "misses", KSTAT_DATA_UINT64 }, 802 { "demand_data_hits", KSTAT_DATA_UINT64 }, 803 { "demand_data_misses", KSTAT_DATA_UINT64 }, 804 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 805 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 806 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 807 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 808 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 809 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 810 { "mru_hits", KSTAT_DATA_UINT64 }, 811 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 812 { "mfu_hits", KSTAT_DATA_UINT64 }, 813 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 814 { "allocated", KSTAT_DATA_UINT64 }, 815 { "deleted", KSTAT_DATA_UINT64 }, 816 { "mutex_miss", KSTAT_DATA_UINT64 }, 817 { "access_skip", KSTAT_DATA_UINT64 }, 818 { "evict_skip", KSTAT_DATA_UINT64 }, 819 { "evict_not_enough", KSTAT_DATA_UINT64 }, 820 { "evict_l2_cached", KSTAT_DATA_UINT64 }, 821 { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 822 { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 823 { "evict_l2_skip", KSTAT_DATA_UINT64 }, 824 { "hash_elements", KSTAT_DATA_UINT64 }, 825 { "hash_elements_max", KSTAT_DATA_UINT64 }, 826 { "hash_collisions", KSTAT_DATA_UINT64 }, 827 { "hash_chains", KSTAT_DATA_UINT64 }, 828 { "hash_chain_max", KSTAT_DATA_UINT64 }, 829 { "p", KSTAT_DATA_UINT64 }, 830 { "c", KSTAT_DATA_UINT64 }, 831 { "c_min", KSTAT_DATA_UINT64 }, 832 { "c_max", KSTAT_DATA_UINT64 }, 833 { "size", KSTAT_DATA_UINT64 }, 834 { "compressed_size", KSTAT_DATA_UINT64 }, 835 { 
"uncompressed_size", KSTAT_DATA_UINT64 }, 836 { "overhead_size", KSTAT_DATA_UINT64 }, 837 { "hdr_size", KSTAT_DATA_UINT64 }, 838 { "data_size", KSTAT_DATA_UINT64 }, 839 { "metadata_size", KSTAT_DATA_UINT64 }, 840 { "other_size", KSTAT_DATA_UINT64 }, 841 { "anon_size", KSTAT_DATA_UINT64 }, 842 { "anon_evictable_data", KSTAT_DATA_UINT64 }, 843 { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, 844 { "mru_size", KSTAT_DATA_UINT64 }, 845 { "mru_evictable_data", KSTAT_DATA_UINT64 }, 846 { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, 847 { "mru_ghost_size", KSTAT_DATA_UINT64 }, 848 { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, 849 { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 850 { "mfu_size", KSTAT_DATA_UINT64 }, 851 { "mfu_evictable_data", KSTAT_DATA_UINT64 }, 852 { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, 853 { "mfu_ghost_size", KSTAT_DATA_UINT64 }, 854 { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, 855 { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 856 { "l2_hits", KSTAT_DATA_UINT64 }, 857 { "l2_misses", KSTAT_DATA_UINT64 }, 858 { "l2_feeds", KSTAT_DATA_UINT64 }, 859 { "l2_rw_clash", KSTAT_DATA_UINT64 }, 860 { "l2_read_bytes", KSTAT_DATA_UINT64 }, 861 { "l2_write_bytes", KSTAT_DATA_UINT64 }, 862 { "l2_writes_sent", KSTAT_DATA_UINT64 }, 863 { "l2_writes_done", KSTAT_DATA_UINT64 }, 864 { "l2_writes_error", KSTAT_DATA_UINT64 }, 865 { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, 866 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 867 { "l2_evict_reading", KSTAT_DATA_UINT64 }, 868 { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, 869 { "l2_free_on_write", KSTAT_DATA_UINT64 }, 870 { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 871 { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 872 { "l2_io_error", KSTAT_DATA_UINT64 }, 873 { "l2_size", KSTAT_DATA_UINT64 }, 874 { "l2_asize", KSTAT_DATA_UINT64 }, 875 { "l2_hdr_size", KSTAT_DATA_UINT64 }, 876 { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, 877 { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, 878 { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, 879 { "l2_write_in_l2", KSTAT_DATA_UINT64 }, 880 { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, 881 { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, 882 { "l2_write_full", KSTAT_DATA_UINT64 }, 883 { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, 884 { "l2_write_pios", KSTAT_DATA_UINT64 }, 885 { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, 886 { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, 887 { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, 888 { "memory_throttle_count", KSTAT_DATA_UINT64 }, 889 { "arc_meta_used", KSTAT_DATA_UINT64 }, 890 { "arc_meta_limit", KSTAT_DATA_UINT64 }, 891 { "arc_meta_max", KSTAT_DATA_UINT64 }, 892 { "arc_meta_min", KSTAT_DATA_UINT64 }, 893 { "async_upgrade_sync", KSTAT_DATA_UINT64 }, 894 { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, 895 { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, 896}; 897 898#define ARCSTAT(stat) (arc_stats.stat.value.ui64) 899 900#define ARCSTAT_INCR(stat, val) \ 901 atomic_add_64(&arc_stats.stat.value.ui64, (val)) 902 903#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 904#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 905 906#define ARCSTAT_MAX(stat, val) { \ 907 uint64_t m; \ 908 while ((val) > (m = arc_stats.stat.value.ui64) && \ 909 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 910 continue; \ 911} 912 913#define ARCSTAT_MAXSTAT(stat) \ 914 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 915 916/* 917 * We define a macro to allow ARC hits/misses to be easily broken down by 918 * 
two separate conditions, giving a total of four different subtypes for 919 * each of hits and misses (so eight statistics total). 920 */ 921#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 922 if (cond1) { \ 923 if (cond2) { \ 924 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 925 } else { \ 926 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 927 } \ 928 } else { \ 929 if (cond2) { \ 930 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 931 } else { \ 932 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 933 } \ 934 } 935 936kstat_t *arc_ksp; 937static arc_state_t *arc_anon; 938static arc_state_t *arc_mru; 939static arc_state_t *arc_mru_ghost; 940static arc_state_t *arc_mfu; 941static arc_state_t *arc_mfu_ghost; 942static arc_state_t *arc_l2c_only; 943 944/* 945 * There are several ARC variables that are critical to export as kstats -- 946 * but we don't want to have to grovel around in the kstat whenever we wish to 947 * manipulate them. For these variables, we therefore define them to be in 948 * terms of the statistic variable. This assures that we are not introducing 949 * the possibility of inconsistency by having shadow copies of the variables, 950 * while still allowing the code to be readable. 951 */ 952#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 953#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 954#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 955#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 956#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ 957#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ 958#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ 959 960/* compressed size of entire arc */ 961#define arc_compressed_size ARCSTAT(arcstat_compressed_size) 962/* uncompressed size of entire arc */ 963#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) 964/* number of bytes in the arc from arc_buf_t's */ 965#define arc_overhead_size ARCSTAT(arcstat_overhead_size) 966 967/* 968 * There are also some ARC variables that we want to export, but that are 969 * updated so often that having the canonical representation be the statistic 970 * variable causes a performance bottleneck. We want to use aggsum_t's for these 971 * instead, but still be able to export the kstat in the same way as before. 972 * The solution is to always use the aggsum version, except in the kstat update 973 * callback. 
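 *
 * For example (an illustrative sketch of the pattern, not a quote of the
 * code that follows), a path that grows the cache does something like:
 *
 *	aggsum_add(&arc_size, (int64_t)size);
 *
 * while the kstat update callback folds the aggsum back into the kstat:
 *
 *	ARCSTAT(arcstat_size) = aggsum_value(&arc_size);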
974 */ 975aggsum_t arc_size; 976aggsum_t arc_meta_used; 977aggsum_t astat_data_size; 978aggsum_t astat_metadata_size; 979aggsum_t astat_hdr_size; 980aggsum_t astat_other_size; 981aggsum_t astat_l2_hdr_size; 982 983static int arc_no_grow; /* Don't try to grow cache size */ 984static uint64_t arc_tempreserve; 985static uint64_t arc_loaned_bytes; 986 987typedef struct arc_callback arc_callback_t; 988 989struct arc_callback { 990 void *acb_private; 991 arc_read_done_func_t *acb_done; 992 arc_buf_t *acb_buf; 993 boolean_t acb_compressed; 994 zio_t *acb_zio_dummy; 995 zio_t *acb_zio_head; 996 arc_callback_t *acb_next; 997}; 998 999typedef struct arc_write_callback arc_write_callback_t; 1000 1001struct arc_write_callback { 1002 void *awcb_private; 1003 arc_write_done_func_t *awcb_ready; 1004 arc_write_done_func_t *awcb_children_ready; 1005 arc_write_done_func_t *awcb_physdone; 1006 arc_write_done_func_t *awcb_done; 1007 arc_buf_t *awcb_buf; 1008}; 1009 1010/* 1011 * ARC buffers are separated into multiple structs as a memory saving measure: 1012 * - Common fields struct, always defined, and embedded within it: 1013 * - L2-only fields, always allocated but undefined when not in L2ARC 1014 * - L1-only fields, only allocated when in L1ARC 1015 * 1016 * Buffer in L1 Buffer only in L2 1017 * +------------------------+ +------------------------+ 1018 * | arc_buf_hdr_t | | arc_buf_hdr_t | 1019 * | | | | 1020 * | | | | 1021 * | | | | 1022 * +------------------------+ +------------------------+ 1023 * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | 1024 * | (undefined if L1-only) | | | 1025 * +------------------------+ +------------------------+ 1026 * | l1arc_buf_hdr_t | 1027 * | | 1028 * | | 1029 * | | 1030 * | | 1031 * +------------------------+ 1032 * 1033 * Because it's possible for the L2ARC to become extremely large, we can wind 1034 * up eating a lot of memory in L2ARC buffer headers, so the size of a header 1035 * is minimized by only allocating the fields necessary for an L1-cached buffer 1036 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and 1037 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple 1038 * words in pointers. arc_hdr_realloc() is used to switch a header between 1039 * these two allocation states. 1040 */ 1041typedef struct l1arc_buf_hdr { 1042 kmutex_t b_freeze_lock; 1043 zio_cksum_t *b_freeze_cksum; 1044#ifdef ZFS_DEBUG 1045 /* 1046 * Used for debugging with kmem_flags - by allocating and freeing 1047 * b_thawed when the buffer is thawed, we get a record of the stack 1048 * trace that thawed it. 
1049 */ 1050 void *b_thawed; 1051#endif 1052 1053 arc_buf_t *b_buf; 1054 uint32_t b_bufcnt; 1055 /* for waiting on writes to complete */ 1056 kcondvar_t b_cv; 1057 uint8_t b_byteswap; 1058 1059 /* protected by arc state mutex */ 1060 arc_state_t *b_state; 1061 multilist_node_t b_arc_node; 1062 1063 /* updated atomically */ 1064 clock_t b_arc_access; 1065 1066 /* self protecting */ 1067 refcount_t b_refcnt; 1068 1069 arc_callback_t *b_acb; 1070 abd_t *b_pabd; 1071} l1arc_buf_hdr_t; 1072 1073typedef struct l2arc_dev l2arc_dev_t; 1074 1075typedef struct l2arc_buf_hdr { 1076 /* protected by arc_buf_hdr mutex */ 1077 l2arc_dev_t *b_dev; /* L2ARC device */ 1078 uint64_t b_daddr; /* disk address, offset byte */ 1079 1080 list_node_t b_l2node; 1081} l2arc_buf_hdr_t; 1082 1083struct arc_buf_hdr { 1084 /* protected by hash lock */ 1085 dva_t b_dva; 1086 uint64_t b_birth; 1087 1088 arc_buf_contents_t b_type; 1089 arc_buf_hdr_t *b_hash_next; 1090 arc_flags_t b_flags; 1091 1092 /* 1093 * This field stores the size of the data buffer after 1094 * compression, and is set in the arc's zio completion handlers. 1095 * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). 1096 * 1097 * While the block pointers can store up to 32MB in their psize 1098 * field, we can only store up to 32MB minus 512B. This is due 1099 * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. 1100 * a field of zeros represents 512B in the bp). We can't use a 1101 * bias of 1 since we need to reserve a psize of zero, here, to 1102 * represent holes and embedded blocks. 1103 * 1104 * This isn't a problem in practice, since the maximum size of a 1105 * buffer is limited to 16MB, so we never need to store 32MB in 1106 * this field. Even in the upstream illumos code base, the 1107 * maximum size of a buffer is limited to 16MB. 1108 */ 1109 uint16_t b_psize; 1110 1111 /* 1112 * This field stores the size of the data buffer before 1113 * compression, and cannot change once set. It is in units 1114 * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) 1115 */ 1116 uint16_t b_lsize; /* immutable */ 1117 uint64_t b_spa; /* immutable */ 1118 1119 /* L2ARC fields. Undefined when not in L2ARC. */ 1120 l2arc_buf_hdr_t b_l2hdr; 1121 /* L1ARC fields. 
Undefined when in l2arc_only state */ 1122 l1arc_buf_hdr_t b_l1hdr; 1123}; 1124 1125#if defined(__FreeBSD__) && defined(_KERNEL) 1126static int 1127sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) 1128{ 1129 uint64_t val; 1130 int err; 1131 1132 val = arc_meta_limit; 1133 err = sysctl_handle_64(oidp, &val, 0, req); 1134 if (err != 0 || req->newptr == NULL) 1135 return (err); 1136 1137 if (val <= 0 || val > arc_c_max) 1138 return (EINVAL); 1139 1140 arc_meta_limit = val; 1141 return (0); 1142} 1143 1144static int 1145sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) 1146{ 1147 uint32_t val; 1148 int err; 1149 1150 val = arc_no_grow_shift; 1151 err = sysctl_handle_32(oidp, &val, 0, req); 1152 if (err != 0 || req->newptr == NULL) 1153 return (err); 1154 1155 if (val >= arc_shrink_shift) 1156 return (EINVAL); 1157 1158 arc_no_grow_shift = val; 1159 return (0); 1160} 1161 1162static int 1163sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS) 1164{ 1165 uint64_t val; 1166 int err; 1167 1168 val = zfs_arc_max; 1169 err = sysctl_handle_64(oidp, &val, 0, req); 1170 if (err != 0 || req->newptr == NULL) 1171 return (err); 1172 1173 if (zfs_arc_max == 0) { 1174 /* Loader tunable so blindly set */ 1175 zfs_arc_max = val; 1176 return (0); 1177 } 1178 1179 if (val < arc_abs_min || val > kmem_size()) 1180 return (EINVAL); 1181 if (val < arc_c_min) 1182 return (EINVAL); 1183 if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit) 1184 return (EINVAL); 1185 1186 arc_c_max = val; 1187 1188 arc_c = arc_c_max; 1189 arc_p = (arc_c >> 1); 1190 1191 if (zfs_arc_meta_limit == 0) { 1192 /* limit meta-data to 1/4 of the arc capacity */ 1193 arc_meta_limit = arc_c_max / 4; 1194 } 1195 1196 /* if kmem_flags are set, lets try to use less memory */ 1197 if (kmem_debugging()) 1198 arc_c = arc_c / 2; 1199 1200 zfs_arc_max = arc_c; 1201 1202 return (0); 1203} 1204 1205static int 1206sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS) 1207{ 1208 uint64_t val; 1209 int err; 1210 1211 val = zfs_arc_min; 1212 err = sysctl_handle_64(oidp, &val, 0, req); 1213 if (err != 0 || req->newptr == NULL) 1214 return (err); 1215 1216 if (zfs_arc_min == 0) { 1217 /* Loader tunable so blindly set */ 1218 zfs_arc_min = val; 1219 return (0); 1220 } 1221 1222 if (val < arc_abs_min || val > arc_c_max) 1223 return (EINVAL); 1224 1225 arc_c_min = val; 1226 1227 if (zfs_arc_meta_min == 0) 1228 arc_meta_min = arc_c_min / 2; 1229 1230 if (arc_c < arc_c_min) 1231 arc_c = arc_c_min; 1232 1233 zfs_arc_min = arc_c_min; 1234 1235 return (0); 1236} 1237#endif 1238 1239#define GHOST_STATE(state) \ 1240 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 1241 (state) == arc_l2c_only) 1242 1243#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) 1244#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) 1245#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) 1246#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) 1247#define HDR_PRESCIENT_PREFETCH(hdr) \ 1248 ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) 1249#define HDR_COMPRESSION_ENABLED(hdr) \ 1250 ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) 1251 1252#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) 1253#define HDR_L2_READING(hdr) \ 1254 (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ 1255 ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) 1256#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) 1257#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) 1258#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & 
ARC_FLAG_L2_WRITE_HEAD) 1259#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) 1260 1261#define HDR_ISTYPE_METADATA(hdr) \ 1262 ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) 1263#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) 1264 1265#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) 1266#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) 1267 1268/* For storing compression mode in b_flags */ 1269#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) 1270 1271#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ 1272 HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) 1273#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ 1274 HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); 1275 1276#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) 1277#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) 1278#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) 1279 1280/* 1281 * Other sizes 1282 */ 1283 1284#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 1285#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) 1286 1287/* 1288 * Hash table routines 1289 */ 1290 1291#define HT_LOCK_PAD CACHE_LINE_SIZE 1292 1293struct ht_lock { 1294 kmutex_t ht_lock; 1295#ifdef _KERNEL 1296 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 1297#endif 1298}; 1299 1300#define BUF_LOCKS 256 1301typedef struct buf_hash_table { 1302 uint64_t ht_mask; 1303 arc_buf_hdr_t **ht_table; 1304 struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); 1305} buf_hash_table_t; 1306 1307static buf_hash_table_t buf_hash_table; 1308 1309#define BUF_HASH_INDEX(spa, dva, birth) \ 1310 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 1311#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 1312#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 1313#define HDR_LOCK(hdr) \ 1314 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) 1315 1316uint64_t zfs_crc64_table[256]; 1317 1318/* 1319 * Level 2 ARC 1320 */ 1321 1322#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 1323#define L2ARC_HEADROOM 2 /* num of writes */ 1324/* 1325 * If we discover during ARC scan any buffers to be compressed, we boost 1326 * our headroom for the next scanning cycle by this percentage multiple. 
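 *
 * As a rough worked example using the defaults defined here: a scan
 * headroom of L2ARC_HEADROOM (2) times the 8MB write size gives 16MB of
 * headroom; a 200% boost raises that to 32MB for the next cycle.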
1327 */ 1328#define L2ARC_HEADROOM_BOOST 200 1329#define L2ARC_FEED_SECS 1 /* caching interval secs */ 1330#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ 1331 1332#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 1333#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 1334 1335/* L2ARC Performance Tunables */ 1336uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 1337uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 1338uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 1339uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; 1340uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 1341uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ 1342boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 1343boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ 1344boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ 1345 1346SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RWTUN, 1347 &l2arc_write_max, 0, "max write size"); 1348SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RWTUN, 1349 &l2arc_write_boost, 0, "extra write during warmup"); 1350SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RWTUN, 1351 &l2arc_headroom, 0, "number of dev writes"); 1352SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RWTUN, 1353 &l2arc_feed_secs, 0, "interval seconds"); 1354SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RWTUN, 1355 &l2arc_feed_min_ms, 0, "min interval milliseconds"); 1356 1357SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RWTUN, 1358 &l2arc_noprefetch, 0, "don't cache prefetch bufs"); 1359SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RWTUN, 1360 &l2arc_feed_again, 0, "turbo warmup"); 1361SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RWTUN, 1362 &l2arc_norw, 0, "no reads during writes"); 1363 1364SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, 1365 &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); 1366SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, 1367 &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, 1368 "size of anonymous state"); 1369SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, 1370 &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, 1371 "size of anonymous state"); 1372 1373SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, 1374 &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); 1375SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, 1376 &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, 1377 "size of metadata in mru state"); 1378SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, 1379 &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, 1380 "size of data in mru state"); 1381 1382SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, 1383 &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); 1384SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, 1385 &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, 1386 "size of metadata in mru ghost state"); 1387SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, 1388 &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, 1389 "size of data in mru ghost state"); 1390 1391SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, 1392 &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); 1393SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, 1394 
&ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, 1395 "size of metadata in mfu state"); 1396SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, 1397 &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, 1398 "size of data in mfu state"); 1399 1400SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, 1401 &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); 1402SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, 1403 &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, 1404 "size of metadata in mfu ghost state"); 1405SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, 1406 &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, 1407 "size of data in mfu ghost state"); 1408 1409SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, 1410 &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state"); 1411 1412SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prefetch_ms, CTLFLAG_RW, 1413 &zfs_arc_min_prefetch_ms, 0, "Min life of prefetch block in ms"); 1414SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prescient_prefetch_ms, CTLFLAG_RW, 1415 &zfs_arc_min_prescient_prefetch_ms, 0, "Min life of prescient prefetched block in ms"); 1416 1417/* 1418 * L2ARC Internals 1419 */ 1420struct l2arc_dev { 1421 vdev_t *l2ad_vdev; /* vdev */ 1422 spa_t *l2ad_spa; /* spa */ 1423 uint64_t l2ad_hand; /* next write location */ 1424 uint64_t l2ad_start; /* first addr on device */ 1425 uint64_t l2ad_end; /* last addr on device */ 1426 boolean_t l2ad_first; /* first sweep through */ 1427 boolean_t l2ad_writing; /* currently writing */ 1428 kmutex_t l2ad_mtx; /* lock for buffer list */ 1429 list_t l2ad_buflist; /* buffer list */ 1430 list_node_t l2ad_node; /* device list node */ 1431 refcount_t l2ad_alloc; /* allocated bytes */ 1432}; 1433 1434static list_t L2ARC_dev_list; /* device list */ 1435static list_t *l2arc_dev_list; /* device list pointer */ 1436static kmutex_t l2arc_dev_mtx; /* device list mutex */ 1437static l2arc_dev_t *l2arc_dev_last; /* last device used */ 1438static list_t L2ARC_free_on_write; /* free after write buf list */ 1439static list_t *l2arc_free_on_write; /* free after write list ptr */ 1440static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 1441static uint64_t l2arc_ndev; /* number of devices */ 1442 1443typedef struct l2arc_read_callback { 1444 arc_buf_hdr_t *l2rcb_hdr; /* read header */ 1445 blkptr_t l2rcb_bp; /* original blkptr */ 1446 zbookmark_phys_t l2rcb_zb; /* original bookmark */ 1447 int l2rcb_flags; /* original flags */ 1448 abd_t *l2rcb_abd; /* temporary buffer */ 1449} l2arc_read_callback_t; 1450 1451typedef struct l2arc_write_callback { 1452 l2arc_dev_t *l2wcb_dev; /* device info */ 1453 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 1454} l2arc_write_callback_t; 1455 1456typedef struct l2arc_data_free { 1457 /* protected by l2arc_free_on_write_mtx */ 1458 abd_t *l2df_abd; 1459 size_t l2df_size; 1460 arc_buf_contents_t l2df_type; 1461 list_node_t l2df_list_node; 1462} l2arc_data_free_t; 1463 1464static kmutex_t l2arc_feed_thr_lock; 1465static kcondvar_t l2arc_feed_thr_cv; 1466static uint8_t l2arc_thread_exit; 1467 1468static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *); 1469static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); 1470static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *); 1471static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); 1472static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); 1473static void 
arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); 1474static void arc_hdr_free_pabd(arc_buf_hdr_t *); 1475static void arc_hdr_alloc_pabd(arc_buf_hdr_t *); 1476static void arc_access(arc_buf_hdr_t *, kmutex_t *); 1477static boolean_t arc_is_overflowing(); 1478static void arc_buf_watch(arc_buf_t *); 1479 1480static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); 1481static uint32_t arc_bufc_to_flags(arc_buf_contents_t); 1482static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); 1483static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); 1484 1485static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); 1486static void l2arc_read_done(zio_t *); 1487 1488static void 1489l2arc_trim(const arc_buf_hdr_t *hdr) 1490{ 1491 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 1492 1493 ASSERT(HDR_HAS_L2HDR(hdr)); 1494 ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); 1495 1496 if (HDR_GET_PSIZE(hdr) != 0) { 1497 trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, 1498 HDR_GET_PSIZE(hdr), 0); 1499 } 1500} 1501 1502/* 1503 * We use Cityhash for this. It's fast, and has good hash properties without 1504 * requiring any large static buffers. 1505 */ 1506static uint64_t 1507buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 1508{ 1509 return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth)); 1510} 1511 1512#define HDR_EMPTY(hdr) \ 1513 ((hdr)->b_dva.dva_word[0] == 0 && \ 1514 (hdr)->b_dva.dva_word[1] == 0) 1515 1516#define HDR_EQUAL(spa, dva, birth, hdr) \ 1517 ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 1518 ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 1519 ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) 1520 1521static void 1522buf_discard_identity(arc_buf_hdr_t *hdr) 1523{ 1524 hdr->b_dva.dva_word[0] = 0; 1525 hdr->b_dva.dva_word[1] = 0; 1526 hdr->b_birth = 0; 1527} 1528 1529static arc_buf_hdr_t * 1530buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) 1531{ 1532 const dva_t *dva = BP_IDENTITY(bp); 1533 uint64_t birth = BP_PHYSICAL_BIRTH(bp); 1534 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 1535 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1536 arc_buf_hdr_t *hdr; 1537 1538 mutex_enter(hash_lock); 1539 for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; 1540 hdr = hdr->b_hash_next) { 1541 if (HDR_EQUAL(spa, dva, birth, hdr)) { 1542 *lockp = hash_lock; 1543 return (hdr); 1544 } 1545 } 1546 mutex_exit(hash_lock); 1547 *lockp = NULL; 1548 return (NULL); 1549} 1550 1551/* 1552 * Insert an entry into the hash table. If there is already an element 1553 * equal to elem in the hash table, then the already existing element 1554 * will be returned and the new element will not be inserted. 1555 * Otherwise returns NULL. 1556 * If lockp == NULL, the caller is assumed to already hold the hash lock. 
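 *
 * A minimal usage sketch (illustrative only; the local names 'exists' and
 * 'hash_lock' are placeholders, not part of the interface):
 *
 *     kmutex_t *hash_lock;
 *     arc_buf_hdr_t *exists = buf_hash_insert(hdr, &hash_lock);
 *     if (exists != NULL)
 *             ... an equal header won the race; use it and drop ours ...
 *     else
 *             ... hdr is now discoverable via buf_hash_find() ...
 *     mutex_exit(hash_lock);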
1557 */ 1558static arc_buf_hdr_t * 1559buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) 1560{ 1561 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1562 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1563 arc_buf_hdr_t *fhdr; 1564 uint32_t i; 1565 1566 ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); 1567 ASSERT(hdr->b_birth != 0); 1568 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1569 1570 if (lockp != NULL) { 1571 *lockp = hash_lock; 1572 mutex_enter(hash_lock); 1573 } else { 1574 ASSERT(MUTEX_HELD(hash_lock)); 1575 } 1576 1577 for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; 1578 fhdr = fhdr->b_hash_next, i++) { 1579 if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) 1580 return (fhdr); 1581 } 1582 1583 hdr->b_hash_next = buf_hash_table.ht_table[idx]; 1584 buf_hash_table.ht_table[idx] = hdr; 1585 arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); 1586 1587 /* collect some hash table performance data */ 1588 if (i > 0) { 1589 ARCSTAT_BUMP(arcstat_hash_collisions); 1590 if (i == 1) 1591 ARCSTAT_BUMP(arcstat_hash_chains); 1592 1593 ARCSTAT_MAX(arcstat_hash_chain_max, i); 1594 } 1595 1596 ARCSTAT_BUMP(arcstat_hash_elements); 1597 ARCSTAT_MAXSTAT(arcstat_hash_elements); 1598 1599 return (NULL); 1600} 1601 1602static void 1603buf_hash_remove(arc_buf_hdr_t *hdr) 1604{ 1605 arc_buf_hdr_t *fhdr, **hdrp; 1606 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1607 1608 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 1609 ASSERT(HDR_IN_HASH_TABLE(hdr)); 1610 1611 hdrp = &buf_hash_table.ht_table[idx]; 1612 while ((fhdr = *hdrp) != hdr) { 1613 ASSERT3P(fhdr, !=, NULL); 1614 hdrp = &fhdr->b_hash_next; 1615 } 1616 *hdrp = hdr->b_hash_next; 1617 hdr->b_hash_next = NULL; 1618 arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); 1619 1620 /* collect some hash table performance data */ 1621 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 1622 1623 if (buf_hash_table.ht_table[idx] && 1624 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 1625 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 1626} 1627 1628/* 1629 * Global data structures and functions for the buf kmem cache. 1630 */ 1631static kmem_cache_t *hdr_full_cache; 1632static kmem_cache_t *hdr_l2only_cache; 1633static kmem_cache_t *buf_cache; 1634 1635static void 1636buf_fini(void) 1637{ 1638 int i; 1639 1640 kmem_free(buf_hash_table.ht_table, 1641 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 1642 for (i = 0; i < BUF_LOCKS; i++) 1643 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 1644 kmem_cache_destroy(hdr_full_cache); 1645 kmem_cache_destroy(hdr_l2only_cache); 1646 kmem_cache_destroy(buf_cache); 1647} 1648 1649/* 1650 * Constructor callback - called when the cache is empty 1651 * and a new buf is requested. 
1652 */ 1653/* ARGSUSED */ 1654static int 1655hdr_full_cons(void *vbuf, void *unused, int kmflag) 1656{ 1657 arc_buf_hdr_t *hdr = vbuf; 1658 1659 bzero(hdr, HDR_FULL_SIZE); 1660 cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); 1661 refcount_create(&hdr->b_l1hdr.b_refcnt); 1662 mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 1663 multilist_link_init(&hdr->b_l1hdr.b_arc_node); 1664 arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1665 1666 return (0); 1667} 1668 1669/* ARGSUSED */ 1670static int 1671hdr_l2only_cons(void *vbuf, void *unused, int kmflag) 1672{ 1673 arc_buf_hdr_t *hdr = vbuf; 1674 1675 bzero(hdr, HDR_L2ONLY_SIZE); 1676 arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1677 1678 return (0); 1679} 1680 1681/* ARGSUSED */ 1682static int 1683buf_cons(void *vbuf, void *unused, int kmflag) 1684{ 1685 arc_buf_t *buf = vbuf; 1686 1687 bzero(buf, sizeof (arc_buf_t)); 1688 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 1689 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1690 1691 return (0); 1692} 1693 1694/* 1695 * Destructor callback - called when a cached buf is 1696 * no longer required. 1697 */ 1698/* ARGSUSED */ 1699static void 1700hdr_full_dest(void *vbuf, void *unused) 1701{ 1702 arc_buf_hdr_t *hdr = vbuf; 1703 1704 ASSERT(HDR_EMPTY(hdr)); 1705 cv_destroy(&hdr->b_l1hdr.b_cv); 1706 refcount_destroy(&hdr->b_l1hdr.b_refcnt); 1707 mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); 1708 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 1709 arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1710} 1711 1712/* ARGSUSED */ 1713static void 1714hdr_l2only_dest(void *vbuf, void *unused) 1715{ 1716 arc_buf_hdr_t *hdr = vbuf; 1717 1718 ASSERT(HDR_EMPTY(hdr)); 1719 arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1720} 1721 1722/* ARGSUSED */ 1723static void 1724buf_dest(void *vbuf, void *unused) 1725{ 1726 arc_buf_t *buf = vbuf; 1727 1728 mutex_destroy(&buf->b_evict_lock); 1729 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1730} 1731 1732/* 1733 * Reclaim callback -- invoked when memory is low. 1734 */ 1735/* ARGSUSED */ 1736static void 1737hdr_recl(void *unused) 1738{ 1739 dprintf("hdr_recl called\n"); 1740 /* 1741 * umem calls the reclaim func when we destroy the buf cache, 1742 * which is after we do arc_fini(). 1743 */ 1744 if (!arc_dead) 1745 cv_signal(&arc_reclaim_thread_cv); 1746} 1747 1748static void 1749buf_init(void) 1750{ 1751 uint64_t *ct; 1752 uint64_t hsize = 1ULL << 12; 1753 int i, j; 1754 1755 /* 1756 * The hash table is big enough to fill all of physical memory 1757 * with an average block size of zfs_arc_average_blocksize (default 8K). 1758 * By default, the table will take up 1759 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 
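 *
 * Worked example (illustrative): with 16 GiB of physical memory and the
 * default 8K average block size, hsize doubles from 2^12 until
 * hsize * 8K >= 16 GiB, i.e. hsize = 2^21 buckets, so the table occupies
 * 2^21 * sizeof (void *) = 16 MiB -- matching the 1MB-per-GB estimate above.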
1760 */ 1761 while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) 1762 hsize <<= 1; 1763retry: 1764 buf_hash_table.ht_mask = hsize - 1; 1765 buf_hash_table.ht_table = 1766 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 1767 if (buf_hash_table.ht_table == NULL) { 1768 ASSERT(hsize > (1ULL << 8)); 1769 hsize >>= 1; 1770 goto retry; 1771 } 1772 1773 hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 1774 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); 1775 hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", 1776 HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, 1777 NULL, NULL, 0); 1778 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 1779 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1780 1781 for (i = 0; i < 256; i++) 1782 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 1783 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 1784 1785 for (i = 0; i < BUF_LOCKS; i++) { 1786 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1787 NULL, MUTEX_DEFAULT, NULL); 1788 } 1789} 1790 1791/* 1792 * This is the size that the buf occupies in memory. If the buf is compressed, 1793 * it will correspond to the compressed size. You should use this method of 1794 * getting the buf size unless you explicitly need the logical size. 1795 */ 1796int32_t 1797arc_buf_size(arc_buf_t *buf) 1798{ 1799 return (ARC_BUF_COMPRESSED(buf) ? 1800 HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); 1801} 1802 1803int32_t 1804arc_buf_lsize(arc_buf_t *buf) 1805{ 1806 return (HDR_GET_LSIZE(buf->b_hdr)); 1807} 1808 1809enum zio_compress 1810arc_get_compression(arc_buf_t *buf) 1811{ 1812 return (ARC_BUF_COMPRESSED(buf) ? 1813 HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); 1814} 1815 1816#define ARC_MINTIME (hz>>4) /* 62 ms */ 1817 1818static inline boolean_t 1819arc_buf_is_shared(arc_buf_t *buf) 1820{ 1821 boolean_t shared = (buf->b_data != NULL && 1822 buf->b_hdr->b_l1hdr.b_pabd != NULL && 1823 abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && 1824 buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); 1825 IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); 1826 IMPLY(shared, ARC_BUF_SHARED(buf)); 1827 IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); 1828 1829 /* 1830 * It would be nice to assert arc_can_share() too, but the "hdr isn't 1831 * already being shared" requirement prevents us from doing that. 1832 */ 1833 1834 return (shared); 1835} 1836 1837/* 1838 * Free the checksum associated with this header. If there is no checksum, this 1839 * is a no-op. 1840 */ 1841static inline void 1842arc_cksum_free(arc_buf_hdr_t *hdr) 1843{ 1844 ASSERT(HDR_HAS_L1HDR(hdr)); 1845 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 1846 if (hdr->b_l1hdr.b_freeze_cksum != NULL) { 1847 kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); 1848 hdr->b_l1hdr.b_freeze_cksum = NULL; 1849 } 1850 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1851} 1852 1853/* 1854 * Return true iff at least one of the bufs on hdr is not compressed. 1855 */ 1856static boolean_t 1857arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) 1858{ 1859 for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { 1860 if (!ARC_BUF_COMPRESSED(b)) { 1861 return (B_TRUE); 1862 } 1863 } 1864 return (B_FALSE); 1865} 1866 1867/* 1868 * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data 1869 * matches the checksum that is stored in the hdr. If there is no checksum, 1870 * or if the buf is compressed, this is a no-op. 
1871 */ 1872static void 1873arc_cksum_verify(arc_buf_t *buf) 1874{ 1875 arc_buf_hdr_t *hdr = buf->b_hdr; 1876 zio_cksum_t zc; 1877 1878 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1879 return; 1880 1881 if (ARC_BUF_COMPRESSED(buf)) { 1882 ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || 1883 arc_hdr_has_uncompressed_buf(hdr)); 1884 return; 1885 } 1886 1887 ASSERT(HDR_HAS_L1HDR(hdr)); 1888 1889 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 1890 if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { 1891 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1892 return; 1893 } 1894 1895 fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); 1896 if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) 1897 panic("buffer modified while frozen!"); 1898 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1899} 1900 1901static boolean_t 1902arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) 1903{ 1904 enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); 1905 boolean_t valid_cksum; 1906 1907 ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); 1908 VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); 1909 1910 /* 1911 * We rely on the blkptr's checksum to determine if the block 1912 * is valid or not. When compressed arc is enabled, the l2arc 1913 * writes the block to the l2arc just as it appears in the pool. 1914 * This allows us to use the blkptr's checksum to validate the 1915 * data that we just read off of the l2arc without having to store 1916 * a separate checksum in the arc_buf_hdr_t. However, if compressed 1917 * arc is disabled, then the data written to the l2arc is always 1918 * uncompressed and won't match the block as it exists in the main 1919 * pool. When this is the case, we must first compress it if it is 1920 * compressed on the main pool before we can validate the checksum. 1921 */ 1922 if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { 1923 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); 1924 uint64_t lsize = HDR_GET_LSIZE(hdr); 1925 uint64_t csize; 1926 1927 abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE); 1928 csize = zio_compress_data(compress, zio->io_abd, 1929 abd_to_buf(cdata), lsize); 1930 1931 ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); 1932 if (csize < HDR_GET_PSIZE(hdr)) { 1933 /* 1934 * Compressed blocks are always a multiple of the 1935 * smallest ashift in the pool. Ideally, we would 1936 * like to round up the csize to the next 1937 * spa_min_ashift but that value may have changed 1938 * since the block was last written. Instead, 1939 * we rely on the fact that the hdr's psize 1940 * was set to the psize of the block when it was 1941 * last written. We set the csize to that value 1942 * and zero out any part that should not contain 1943 * data. 1944 */ 1945 abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize); 1946 csize = HDR_GET_PSIZE(hdr); 1947 } 1948 zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL); 1949 } 1950 1951 /* 1952 * Block pointers always store the checksum for the logical data. 1953 * If the block pointer has the gang bit set, then the checksum 1954 * it represents is for the reconstituted data and not for an 1955 * individual gang member. The zio pipeline, however, must be able to 1956 * determine the checksum of each of the gang constituents so it 1957 * treats the checksum comparison differently than what we need 1958 * for l2arc blocks. This prevents us from using the 1959 * zio_checksum_error() interface directly. 
Instead we must call the 1960 * zio_checksum_error_impl() so that we can ensure the checksum is 1961 * generated using the correct checksum algorithm and accounts for the 1962 * logical I/O size and not just a gang fragment. 1963 */ 1964 valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, 1965 BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, 1966 zio->io_offset, NULL) == 0); 1967 zio_pop_transforms(zio); 1968 return (valid_cksum); 1969} 1970 1971/* 1972 * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a 1973 * checksum and attaches it to the buf's hdr so that we can ensure that the buf 1974 * isn't modified later on. If buf is compressed or there is already a checksum 1975 * on the hdr, this is a no-op (we only checksum uncompressed bufs). 1976 */ 1977static void 1978arc_cksum_compute(arc_buf_t *buf) 1979{ 1980 arc_buf_hdr_t *hdr = buf->b_hdr; 1981 1982 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1983 return; 1984 1985 ASSERT(HDR_HAS_L1HDR(hdr)); 1986 1987 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1988 if (hdr->b_l1hdr.b_freeze_cksum != NULL) { 1989 ASSERT(arc_hdr_has_uncompressed_buf(hdr)); 1990 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1991 return; 1992 } else if (ARC_BUF_COMPRESSED(buf)) { 1993 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1994 return; 1995 } 1996 1997 ASSERT(!ARC_BUF_COMPRESSED(buf)); 1998 hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), 1999 KM_SLEEP); 2000 fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, 2001 hdr->b_l1hdr.b_freeze_cksum); 2002 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 2003#ifdef illumos 2004 arc_buf_watch(buf); 2005#endif 2006} 2007 2008#ifdef illumos 2009#ifndef _KERNEL 2010typedef struct procctl { 2011 long cmd; 2012 prwatch_t prwatch; 2013} procctl_t; 2014#endif 2015 2016/* ARGSUSED */ 2017static void 2018arc_buf_unwatch(arc_buf_t *buf) 2019{ 2020#ifndef _KERNEL 2021 if (arc_watch) { 2022 int result; 2023 procctl_t ctl; 2024 ctl.cmd = PCWATCH; 2025 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 2026 ctl.prwatch.pr_size = 0; 2027 ctl.prwatch.pr_wflags = 0; 2028 result = write(arc_procfd, &ctl, sizeof (ctl)); 2029 ASSERT3U(result, ==, sizeof (ctl)); 2030 } 2031#endif 2032} 2033 2034/* ARGSUSED */ 2035static void 2036arc_buf_watch(arc_buf_t *buf) 2037{ 2038#ifndef _KERNEL 2039 if (arc_watch) { 2040 int result; 2041 procctl_t ctl; 2042 ctl.cmd = PCWATCH; 2043 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 2044 ctl.prwatch.pr_size = arc_buf_size(buf); 2045 ctl.prwatch.pr_wflags = WA_WRITE; 2046 result = write(arc_procfd, &ctl, sizeof (ctl)); 2047 ASSERT3U(result, ==, sizeof (ctl)); 2048 } 2049#endif 2050} 2051#endif /* illumos */ 2052 2053static arc_buf_contents_t 2054arc_buf_type(arc_buf_hdr_t *hdr) 2055{ 2056 arc_buf_contents_t type; 2057 if (HDR_ISTYPE_METADATA(hdr)) { 2058 type = ARC_BUFC_METADATA; 2059 } else { 2060 type = ARC_BUFC_DATA; 2061 } 2062 VERIFY3U(hdr->b_type, ==, type); 2063 return (type); 2064} 2065 2066boolean_t 2067arc_is_metadata(arc_buf_t *buf) 2068{ 2069 return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); 2070} 2071 2072static uint32_t 2073arc_bufc_to_flags(arc_buf_contents_t type) 2074{ 2075 switch (type) { 2076 case ARC_BUFC_DATA: 2077 /* metadata field is 0 if buffer contains normal data */ 2078 return (0); 2079 case ARC_BUFC_METADATA: 2080 return (ARC_FLAG_BUFC_METADATA); 2081 default: 2082 break; 2083 } 2084 panic("undefined ARC buffer type!"); 2085 return ((uint32_t)-1); 2086} 2087 2088void 2089arc_buf_thaw(arc_buf_t *buf) 2090{ 2091 arc_buf_hdr_t *hdr = buf->b_hdr; 2092 
2093 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 2094 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2095 2096 arc_cksum_verify(buf); 2097 2098 /* 2099 * Compressed buffers do not manipulate the b_freeze_cksum or 2100 * allocate b_thawed. 2101 */ 2102 if (ARC_BUF_COMPRESSED(buf)) { 2103 ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || 2104 arc_hdr_has_uncompressed_buf(hdr)); 2105 return; 2106 } 2107 2108 ASSERT(HDR_HAS_L1HDR(hdr)); 2109 arc_cksum_free(hdr); 2110 2111 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 2112#ifdef ZFS_DEBUG 2113 if (zfs_flags & ZFS_DEBUG_MODIFY) { 2114 if (hdr->b_l1hdr.b_thawed != NULL) 2115 kmem_free(hdr->b_l1hdr.b_thawed, 1); 2116 hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); 2117 } 2118#endif 2119 2120 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 2121 2122#ifdef illumos 2123 arc_buf_unwatch(buf); 2124#endif 2125} 2126 2127void 2128arc_buf_freeze(arc_buf_t *buf) 2129{ 2130 arc_buf_hdr_t *hdr = buf->b_hdr; 2131 kmutex_t *hash_lock; 2132 2133 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 2134 return; 2135 2136 if (ARC_BUF_COMPRESSED(buf)) { 2137 ASSERT(hdr->b_l1hdr.b_freeze_cksum == NULL || 2138 arc_hdr_has_uncompressed_buf(hdr)); 2139 return; 2140 } 2141 2142 hash_lock = HDR_LOCK(hdr); 2143 mutex_enter(hash_lock); 2144 2145 ASSERT(HDR_HAS_L1HDR(hdr)); 2146 ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL || 2147 hdr->b_l1hdr.b_state == arc_anon); 2148 arc_cksum_compute(buf); 2149 mutex_exit(hash_lock); 2150} 2151 2152/* 2153 * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, 2154 * the following functions should be used to ensure that the flags are 2155 * updated in a thread-safe way. When manipulating the flags either 2156 * the hash_lock must be held or the hdr must be undiscoverable. This 2157 * ensures that we're not racing with any other threads when updating 2158 * the flags. 2159 */ 2160static inline void 2161arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) 2162{ 2163 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2164 hdr->b_flags |= flags; 2165} 2166 2167static inline void 2168arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) 2169{ 2170 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2171 hdr->b_flags &= ~flags; 2172} 2173 2174/* 2175 * Setting the compression bits in the arc_buf_hdr_t's b_flags is 2176 * done in a special way since we have to clear and set bits 2177 * at the same time. Consumers that wish to set the compression bits 2178 * must use this function to ensure that the flags are updated in 2179 * thread-safe manner. 2180 */ 2181static void 2182arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) 2183{ 2184 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2185 2186 /* 2187 * Holes and embedded blocks will always have a psize = 0 so 2188 * we ignore the compression of the blkptr and set the 2189 * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF. 2190 * Holes and embedded blocks remain anonymous so we don't 2191 * want to uncompress them. Mark them as uncompressed. 
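 *
 * For example (illustrative values): a 128K block that lz4-compressed to
 * 32K takes the else-branch below and is tagged ZIO_COMPRESS_LZ4 with
 * ARC_FLAG_COMPRESSED_ARC set, while a hole or embedded block (psize == 0),
 * or any block when zfs_compressed_arc_enabled is off, takes the first
 * branch and is marked ZIO_COMPRESS_OFF.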
2192 */ 2193 if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { 2194 arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); 2195 HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); 2196 ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); 2197 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); 2198 } else { 2199 arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); 2200 HDR_SET_COMPRESS(hdr, cmp); 2201 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); 2202 ASSERT(HDR_COMPRESSION_ENABLED(hdr)); 2203 } 2204} 2205 2206/* 2207 * Looks for another buf on the same hdr which has the data decompressed, copies 2208 * from it, and returns true. If no such buf exists, returns false. 2209 */ 2210static boolean_t 2211arc_buf_try_copy_decompressed_data(arc_buf_t *buf) 2212{ 2213 arc_buf_hdr_t *hdr = buf->b_hdr; 2214 boolean_t copied = B_FALSE; 2215 2216 ASSERT(HDR_HAS_L1HDR(hdr)); 2217 ASSERT3P(buf->b_data, !=, NULL); 2218 ASSERT(!ARC_BUF_COMPRESSED(buf)); 2219 2220 for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL; 2221 from = from->b_next) { 2222 /* can't use our own data buffer */ 2223 if (from == buf) { 2224 continue; 2225 } 2226 2227 if (!ARC_BUF_COMPRESSED(from)) { 2228 bcopy(from->b_data, buf->b_data, arc_buf_size(buf)); 2229 copied = B_TRUE; 2230 break; 2231 } 2232 } 2233 2234 /* 2235 * There were no decompressed bufs, so there should not be a 2236 * checksum on the hdr either. 2237 */ 2238 EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); 2239 2240 return (copied); 2241} 2242 2243/* 2244 * Given a buf that has a data buffer attached to it, this function will 2245 * efficiently fill the buf with data of the specified compression setting from 2246 * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr 2247 * are already sharing a data buf, no copy is performed. 2248 * 2249 * If the buf is marked as compressed but uncompressed data was requested, this 2250 * will allocate a new data buffer for the buf, remove that flag, and fill the 2251 * buf with uncompressed data. You can't request a compressed buf on a hdr with 2252 * uncompressed data, and (since we haven't added support for it yet) if you 2253 * want compressed data your buf must already be marked as compressed and have 2254 * the correct-sized data buffer. 2255 */ 2256static int 2257arc_buf_fill(arc_buf_t *buf, boolean_t compressed) 2258{ 2259 arc_buf_hdr_t *hdr = buf->b_hdr; 2260 boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); 2261 dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; 2262 2263 ASSERT3P(buf->b_data, !=, NULL); 2264 IMPLY(compressed, hdr_compressed); 2265 IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); 2266 2267 if (hdr_compressed == compressed) { 2268 if (!arc_buf_is_shared(buf)) { 2269 abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, 2270 arc_buf_size(buf)); 2271 } 2272 } else { 2273 ASSERT(hdr_compressed); 2274 ASSERT(!compressed); 2275 ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); 2276 2277 /* 2278 * If the buf is sharing its data with the hdr, unlink it and 2279 * allocate a new data buffer for the buf. 
2280 */ 2281 if (arc_buf_is_shared(buf)) { 2282 ASSERT(ARC_BUF_COMPRESSED(buf)); 2283 2284 /* We need to give the buf it's own b_data */ 2285 buf->b_flags &= ~ARC_BUF_FLAG_SHARED; 2286 buf->b_data = 2287 arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); 2288 arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); 2289 2290 /* Previously overhead was 0; just add new overhead */ 2291 ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); 2292 } else if (ARC_BUF_COMPRESSED(buf)) { 2293 /* We need to reallocate the buf's b_data */ 2294 arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), 2295 buf); 2296 buf->b_data = 2297 arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); 2298 2299 /* We increased the size of b_data; update overhead */ 2300 ARCSTAT_INCR(arcstat_overhead_size, 2301 HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); 2302 } 2303 2304 /* 2305 * Regardless of the buf's previous compression settings, it 2306 * should not be compressed at the end of this function. 2307 */ 2308 buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; 2309 2310 /* 2311 * Try copying the data from another buf which already has a 2312 * decompressed version. If that's not possible, it's time to 2313 * bite the bullet and decompress the data from the hdr. 2314 */ 2315 if (arc_buf_try_copy_decompressed_data(buf)) { 2316 /* Skip byteswapping and checksumming (already done) */ 2317 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL); 2318 return (0); 2319 } else { 2320 int error = zio_decompress_data(HDR_GET_COMPRESS(hdr), 2321 hdr->b_l1hdr.b_pabd, buf->b_data, 2322 HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); 2323 2324 /* 2325 * Absent hardware errors or software bugs, this should 2326 * be impossible, but log it anyway so we can debug it. 2327 */ 2328 if (error != 0) { 2329 zfs_dbgmsg( 2330 "hdr %p, compress %d, psize %d, lsize %d", 2331 hdr, HDR_GET_COMPRESS(hdr), 2332 HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); 2333 return (SET_ERROR(EIO)); 2334 } 2335 } 2336 } 2337 2338 /* Byteswap the buf's data if necessary */ 2339 if (bswap != DMU_BSWAP_NUMFUNCS) { 2340 ASSERT(!HDR_SHARED_DATA(hdr)); 2341 ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); 2342 dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); 2343 } 2344 2345 /* Compute the hdr's checksum if necessary */ 2346 arc_cksum_compute(buf); 2347 2348 return (0); 2349} 2350 2351int 2352arc_decompress(arc_buf_t *buf) 2353{ 2354 return (arc_buf_fill(buf, B_FALSE)); 2355} 2356 2357/* 2358 * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. 2359 */ 2360static uint64_t 2361arc_hdr_size(arc_buf_hdr_t *hdr) 2362{ 2363 uint64_t size; 2364 2365 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && 2366 HDR_GET_PSIZE(hdr) > 0) { 2367 size = HDR_GET_PSIZE(hdr); 2368 } else { 2369 ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); 2370 size = HDR_GET_LSIZE(hdr); 2371 } 2372 return (size); 2373} 2374 2375/* 2376 * Increment the amount of evictable space in the arc_state_t's refcount. 2377 * We account for the space used by the hdr and the arc buf individually 2378 * so that we can add and remove them from the refcount individually. 
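 *
 * Illustrative accounting example: in a non-ghost state, a hdr whose b_pabd
 * holds a 4K compressed copy and which has two non-shared 16K uncompressed
 * bufs adds 4K + 16K + 16K = 36K of evictable space; a buf that shares the
 * hdr's b_pabd adds nothing extra. In a ghost state only HDR_GET_LSIZE(hdr)
 * is counted, since neither bufs nor b_pabd exist there.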
2379 */ 2380static void 2381arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) 2382{ 2383 arc_buf_contents_t type = arc_buf_type(hdr); 2384 2385 ASSERT(HDR_HAS_L1HDR(hdr)); 2386 2387 if (GHOST_STATE(state)) { 2388 ASSERT0(hdr->b_l1hdr.b_bufcnt); 2389 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2390 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2391 (void) refcount_add_many(&state->arcs_esize[type], 2392 HDR_GET_LSIZE(hdr), hdr); 2393 return; 2394 } 2395 2396 ASSERT(!GHOST_STATE(state)); 2397 if (hdr->b_l1hdr.b_pabd != NULL) { 2398 (void) refcount_add_many(&state->arcs_esize[type], 2399 arc_hdr_size(hdr), hdr); 2400 } 2401 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2402 buf = buf->b_next) { 2403 if (arc_buf_is_shared(buf)) 2404 continue; 2405 (void) refcount_add_many(&state->arcs_esize[type], 2406 arc_buf_size(buf), buf); 2407 } 2408} 2409 2410/* 2411 * Decrement the amount of evictable space in the arc_state_t's refcount. 2412 * We account for the space used by the hdr and the arc buf individually 2413 * so that we can add and remove them from the refcount individually. 2414 */ 2415static void 2416arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) 2417{ 2418 arc_buf_contents_t type = arc_buf_type(hdr); 2419 2420 ASSERT(HDR_HAS_L1HDR(hdr)); 2421 2422 if (GHOST_STATE(state)) { 2423 ASSERT0(hdr->b_l1hdr.b_bufcnt); 2424 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2425 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2426 (void) refcount_remove_many(&state->arcs_esize[type], 2427 HDR_GET_LSIZE(hdr), hdr); 2428 return; 2429 } 2430 2431 ASSERT(!GHOST_STATE(state)); 2432 if (hdr->b_l1hdr.b_pabd != NULL) { 2433 (void) refcount_remove_many(&state->arcs_esize[type], 2434 arc_hdr_size(hdr), hdr); 2435 } 2436 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2437 buf = buf->b_next) { 2438 if (arc_buf_is_shared(buf)) 2439 continue; 2440 (void) refcount_remove_many(&state->arcs_esize[type], 2441 arc_buf_size(buf), buf); 2442 } 2443} 2444 2445/* 2446 * Add a reference to this hdr indicating that someone is actively 2447 * referencing that memory. When the refcount transitions from 0 to 1, 2448 * we remove it from the respective arc_state_t list to indicate that 2449 * it is not evictable. 2450 */ 2451static void 2452add_reference(arc_buf_hdr_t *hdr, void *tag) 2453{ 2454 ASSERT(HDR_HAS_L1HDR(hdr)); 2455 if (!MUTEX_HELD(HDR_LOCK(hdr))) { 2456 ASSERT(hdr->b_l1hdr.b_state == arc_anon); 2457 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2458 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2459 } 2460 2461 arc_state_t *state = hdr->b_l1hdr.b_state; 2462 2463 if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && 2464 (state != arc_anon)) { 2465 /* We don't use the L2-only state list. */ 2466 if (state != arc_l2c_only) { 2467 multilist_remove(state->arcs_list[arc_buf_type(hdr)], 2468 hdr); 2469 arc_evictable_space_decrement(hdr, state); 2470 } 2471 /* remove the prefetch flag if we get a reference */ 2472 arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); 2473 } 2474} 2475 2476/* 2477 * Remove a reference from this hdr. When the reference transitions from 2478 * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's 2479 * list making it eligible for eviction. 
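 *
 * This pairs with add_reference() above: taking the first reference (e.g.
 * when arc_buf_alloc_impl() hands a buf to a consumer) pulls the hdr off
 * its state's eviction list, and dropping the last reference here puts it
 * back.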
2480 */ 2481static int 2482remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 2483{ 2484 int cnt; 2485 arc_state_t *state = hdr->b_l1hdr.b_state; 2486 2487 ASSERT(HDR_HAS_L1HDR(hdr)); 2488 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 2489 ASSERT(!GHOST_STATE(state)); 2490 2491 /* 2492 * arc_l2c_only counts as a ghost state so we don't need to explicitly 2493 * check to prevent usage of the arc_l2c_only list. 2494 */ 2495 if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && 2496 (state != arc_anon)) { 2497 multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr); 2498 ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); 2499 arc_evictable_space_increment(hdr, state); 2500 } 2501 return (cnt); 2502} 2503 2504/* 2505 * Move the supplied buffer to the indicated state. The hash lock 2506 * for the buffer must be held by the caller. 2507 */ 2508static void 2509arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, 2510 kmutex_t *hash_lock) 2511{ 2512 arc_state_t *old_state; 2513 int64_t refcnt; 2514 uint32_t bufcnt; 2515 boolean_t update_old, update_new; 2516 arc_buf_contents_t buftype = arc_buf_type(hdr); 2517 2518 /* 2519 * We almost always have an L1 hdr here, since we call arc_hdr_realloc() 2520 * in arc_read() when bringing a buffer out of the L2ARC. However, the 2521 * L1 hdr doesn't always exist when we change state to arc_anon before 2522 * destroying a header, in which case reallocating to add the L1 hdr is 2523 * pointless. 2524 */ 2525 if (HDR_HAS_L1HDR(hdr)) { 2526 old_state = hdr->b_l1hdr.b_state; 2527 refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); 2528 bufcnt = hdr->b_l1hdr.b_bufcnt; 2529 update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL); 2530 } else { 2531 old_state = arc_l2c_only; 2532 refcnt = 0; 2533 bufcnt = 0; 2534 update_old = B_FALSE; 2535 } 2536 update_new = update_old; 2537 2538 ASSERT(MUTEX_HELD(hash_lock)); 2539 ASSERT3P(new_state, !=, old_state); 2540 ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); 2541 ASSERT(old_state != arc_anon || bufcnt <= 1); 2542 2543 /* 2544 * If this buffer is evictable, transfer it from the 2545 * old state list to the new state list. 2546 */ 2547 if (refcnt == 0) { 2548 if (old_state != arc_anon && old_state != arc_l2c_only) { 2549 ASSERT(HDR_HAS_L1HDR(hdr)); 2550 multilist_remove(old_state->arcs_list[buftype], hdr); 2551 2552 if (GHOST_STATE(old_state)) { 2553 ASSERT0(bufcnt); 2554 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2555 update_old = B_TRUE; 2556 } 2557 arc_evictable_space_decrement(hdr, old_state); 2558 } 2559 if (new_state != arc_anon && new_state != arc_l2c_only) { 2560 2561 /* 2562 * An L1 header always exists here, since if we're 2563 * moving to some L1-cached state (i.e. not l2c_only or 2564 * anonymous), we realloc the header to add an L1hdr 2565 * beforehand. 2566 */ 2567 ASSERT(HDR_HAS_L1HDR(hdr)); 2568 multilist_insert(new_state->arcs_list[buftype], hdr); 2569 2570 if (GHOST_STATE(new_state)) { 2571 ASSERT0(bufcnt); 2572 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2573 update_new = B_TRUE; 2574 } 2575 arc_evictable_space_increment(hdr, new_state); 2576 } 2577 } 2578 2579 ASSERT(!HDR_EMPTY(hdr)); 2580 if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) 2581 buf_hash_remove(hdr); 2582 2583 /* adjust state sizes (ignore arc_l2c_only) */ 2584 2585 if (update_new && new_state != arc_l2c_only) { 2586 ASSERT(HDR_HAS_L1HDR(hdr)); 2587 if (GHOST_STATE(new_state)) { 2588 ASSERT0(bufcnt); 2589 2590 /* 2591 * When moving a header to a ghost state, we first 2592 * remove all arc buffers. 
Thus, we'll have a 2593 * bufcnt of zero, and no arc buffer to use for 2594 * the reference. As a result, we use the arc 2595 * header pointer for the reference. 2596 */ 2597 (void) refcount_add_many(&new_state->arcs_size, 2598 HDR_GET_LSIZE(hdr), hdr); 2599 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2600 } else { 2601 uint32_t buffers = 0; 2602 2603 /* 2604 * Each individual buffer holds a unique reference, 2605 * thus we must remove each of these references one 2606 * at a time. 2607 */ 2608 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2609 buf = buf->b_next) { 2610 ASSERT3U(bufcnt, !=, 0); 2611 buffers++; 2612 2613 /* 2614 * When the arc_buf_t is sharing the data 2615 * block with the hdr, the owner of the 2616 * reference belongs to the hdr. Only 2617 * add to the refcount if the arc_buf_t is 2618 * not shared. 2619 */ 2620 if (arc_buf_is_shared(buf)) 2621 continue; 2622 2623 (void) refcount_add_many(&new_state->arcs_size, 2624 arc_buf_size(buf), buf); 2625 } 2626 ASSERT3U(bufcnt, ==, buffers); 2627 2628 if (hdr->b_l1hdr.b_pabd != NULL) { 2629 (void) refcount_add_many(&new_state->arcs_size, 2630 arc_hdr_size(hdr), hdr); 2631 } else { 2632 ASSERT(GHOST_STATE(old_state)); 2633 } 2634 } 2635 } 2636 2637 if (update_old && old_state != arc_l2c_only) { 2638 ASSERT(HDR_HAS_L1HDR(hdr)); 2639 if (GHOST_STATE(old_state)) { 2640 ASSERT0(bufcnt); 2641 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2642 2643 /* 2644 * When moving a header off of a ghost state, 2645 * the header will not contain any arc buffers. 2646 * We use the arc header pointer for the reference 2647 * which is exactly what we did when we put the 2648 * header on the ghost state. 2649 */ 2650 2651 (void) refcount_remove_many(&old_state->arcs_size, 2652 HDR_GET_LSIZE(hdr), hdr); 2653 } else { 2654 uint32_t buffers = 0; 2655 2656 /* 2657 * Each individual buffer holds a unique reference, 2658 * thus we must remove each of these references one 2659 * at a time. 2660 */ 2661 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2662 buf = buf->b_next) { 2663 ASSERT3U(bufcnt, !=, 0); 2664 buffers++; 2665 2666 /* 2667 * When the arc_buf_t is sharing the data 2668 * block with the hdr, the owner of the 2669 * reference belongs to the hdr. Only 2670 * add to the refcount if the arc_buf_t is 2671 * not shared. 2672 */ 2673 if (arc_buf_is_shared(buf)) 2674 continue; 2675 2676 (void) refcount_remove_many( 2677 &old_state->arcs_size, arc_buf_size(buf), 2678 buf); 2679 } 2680 ASSERT3U(bufcnt, ==, buffers); 2681 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 2682 (void) refcount_remove_many( 2683 &old_state->arcs_size, arc_hdr_size(hdr), hdr); 2684 } 2685 } 2686 2687 if (HDR_HAS_L1HDR(hdr)) 2688 hdr->b_l1hdr.b_state = new_state; 2689 2690 /* 2691 * L2 headers should never be on the L2 state list since they don't 2692 * have L1 headers allocated. 
2693 */ 2694 ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && 2695 multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); 2696} 2697 2698void 2699arc_space_consume(uint64_t space, arc_space_type_t type) 2700{ 2701 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 2702 2703 switch (type) { 2704 case ARC_SPACE_DATA: 2705 aggsum_add(&astat_data_size, space); 2706 break; 2707 case ARC_SPACE_META: 2708 aggsum_add(&astat_metadata_size, space); 2709 break; 2710 case ARC_SPACE_OTHER: 2711 aggsum_add(&astat_other_size, space); 2712 break; 2713 case ARC_SPACE_HDRS: 2714 aggsum_add(&astat_hdr_size, space); 2715 break; 2716 case ARC_SPACE_L2HDRS: 2717 aggsum_add(&astat_l2_hdr_size, space); 2718 break; 2719 } 2720 2721 if (type != ARC_SPACE_DATA) 2722 aggsum_add(&arc_meta_used, space); 2723 2724 aggsum_add(&arc_size, space); 2725} 2726 2727void 2728arc_space_return(uint64_t space, arc_space_type_t type) 2729{ 2730 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 2731 2732 switch (type) { 2733 case ARC_SPACE_DATA: 2734 aggsum_add(&astat_data_size, -space); 2735 break; 2736 case ARC_SPACE_META: 2737 aggsum_add(&astat_metadata_size, -space); 2738 break; 2739 case ARC_SPACE_OTHER: 2740 aggsum_add(&astat_other_size, -space); 2741 break; 2742 case ARC_SPACE_HDRS: 2743 aggsum_add(&astat_hdr_size, -space); 2744 break; 2745 case ARC_SPACE_L2HDRS: 2746 aggsum_add(&astat_l2_hdr_size, -space); 2747 break; 2748 } 2749 2750 if (type != ARC_SPACE_DATA) { 2751 ASSERT(aggsum_compare(&arc_meta_used, space) >= 0); 2752 /* 2753 * We use the upper bound here rather than the precise value 2754 * because the arc_meta_max value doesn't need to be 2755 * precise. It's only consumed by humans via arcstats. 2756 */ 2757 if (arc_meta_max < aggsum_upper_bound(&arc_meta_used)) 2758 arc_meta_max = aggsum_upper_bound(&arc_meta_used); 2759 aggsum_add(&arc_meta_used, -space); 2760 } 2761 2762 ASSERT(aggsum_compare(&arc_size, space) >= 0); 2763 aggsum_add(&arc_size, -space); 2764} 2765 2766/* 2767 * Given a hdr and a buf, returns whether that buf can share its b_data buffer 2768 * with the hdr's b_pabd. 2769 */ 2770static boolean_t 2771arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) 2772{ 2773 /* 2774 * The criteria for sharing a hdr's data are: 2775 * 1. the hdr's compression matches the buf's compression 2776 * 2. the hdr doesn't need to be byteswapped 2777 * 3. the hdr isn't already being shared 2778 * 4. the buf is either compressed or it is the last buf in the hdr list 2779 * 2780 * Criterion #4 maintains the invariant that shared uncompressed 2781 * bufs must be the final buf in the hdr's b_buf list. Reading this, you 2782 * might ask, "if a compressed buf is allocated first, won't that be the 2783 * last thing in the list?", but in that case it's impossible to create 2784 * a shared uncompressed buf anyway (because the hdr must be compressed 2785 * to have the compressed buf). You might also think that #3 is 2786 * sufficient to make this guarantee, however it's possible 2787 * (specifically in the rare L2ARC write race mentioned in 2788 * arc_buf_alloc_impl()) there will be an existing uncompressed buf that 2789 * is sharable, but wasn't at the time of its allocation. Rather than 2790 * allow a new shared uncompressed buf to be created and then shuffle 2791 * the list around to make it the last element, this simply disallows 2792 * sharing if the new buf isn't the first to be added. 
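 *
 * Concrete illustration: on an lz4-compressed hdr that is not yet shared
 * (and needs no byteswap), a newly allocated compressed buf satisfies all
 * four criteria and may share b_pabd; an uncompressed buf on that same hdr
 * fails criterion #1, and once any buf is shared every later buf fails
 * criterion #3.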
2793 */ 2794 ASSERT3P(buf->b_hdr, ==, hdr); 2795 boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF; 2796 boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; 2797 return (buf_compressed == hdr_compressed && 2798 hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && 2799 !HDR_SHARED_DATA(hdr) && 2800 (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); 2801} 2802 2803/* 2804 * Allocate a buf for this hdr. If you care about the data that's in the hdr, 2805 * or if you want a compressed buffer, pass those flags in. Returns 0 if the 2806 * copy was made successfully, or an error code otherwise. 2807 */ 2808static int 2809arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, 2810 boolean_t fill, arc_buf_t **ret) 2811{ 2812 arc_buf_t *buf; 2813 2814 ASSERT(HDR_HAS_L1HDR(hdr)); 2815 ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); 2816 VERIFY(hdr->b_type == ARC_BUFC_DATA || 2817 hdr->b_type == ARC_BUFC_METADATA); 2818 ASSERT3P(ret, !=, NULL); 2819 ASSERT3P(*ret, ==, NULL); 2820 2821 buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2822 buf->b_hdr = hdr; 2823 buf->b_data = NULL; 2824 buf->b_next = hdr->b_l1hdr.b_buf; 2825 buf->b_flags = 0; 2826 2827 add_reference(hdr, tag); 2828 2829 /* 2830 * We're about to change the hdr's b_flags. We must either 2831 * hold the hash_lock or be undiscoverable. 2832 */ 2833 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2834 2835 /* 2836 * Only honor requests for compressed bufs if the hdr is actually 2837 * compressed. 2838 */ 2839 if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) 2840 buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; 2841 2842 /* 2843 * If the hdr's data can be shared then we share the data buffer and 2844 * set the appropriate bit in the hdr's b_flags to indicate the hdr is 2845 * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new 2846 * buffer to store the buf's data. 2847 * 2848 * There are two additional restrictions here because we're sharing 2849 * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be 2850 * actively involved in an L2ARC write, because if this buf is used by 2851 * an arc_write() then the hdr's data buffer will be released when the 2852 * write completes, even though the L2ARC write might still be using it. 2853 * Second, the hdr's ABD must be linear so that the buf's user doesn't 2854 * need to be ABD-aware. 2855 */ 2856 boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && 2857 abd_is_linear(hdr->b_l1hdr.b_pabd); 2858 2859 /* Set up b_data and sharing */ 2860 if (can_share) { 2861 buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); 2862 buf->b_flags |= ARC_BUF_FLAG_SHARED; 2863 arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); 2864 } else { 2865 buf->b_data = 2866 arc_get_data_buf(hdr, arc_buf_size(buf), buf); 2867 ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); 2868 } 2869 VERIFY3P(buf->b_data, !=, NULL); 2870 2871 hdr->b_l1hdr.b_buf = buf; 2872 hdr->b_l1hdr.b_bufcnt += 1; 2873 2874 /* 2875 * If the user wants the data from the hdr, we need to either copy or 2876 * decompress the data. 
2877 */ 2878 if (fill) { 2879 return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0)); 2880 } 2881 2882 return (0); 2883} 2884 2885static char *arc_onloan_tag = "onloan"; 2886 2887static inline void 2888arc_loaned_bytes_update(int64_t delta) 2889{ 2890 atomic_add_64(&arc_loaned_bytes, delta); 2891 2892 /* assert that it did not wrap around */ 2893 ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); 2894} 2895 2896/* 2897 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 2898 * flight data by arc_tempreserve_space() until they are "returned". Loaned 2899 * buffers must be returned to the arc before they can be used by the DMU or 2900 * freed. 2901 */ 2902arc_buf_t * 2903arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) 2904{ 2905 arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, 2906 is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); 2907 2908 arc_loaned_bytes_update(arc_buf_size(buf)); 2909 2910 return (buf); 2911} 2912 2913arc_buf_t * 2914arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, 2915 enum zio_compress compression_type) 2916{ 2917 arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, 2918 psize, lsize, compression_type); 2919 2920 arc_loaned_bytes_update(arc_buf_size(buf)); 2921 2922 return (buf); 2923} 2924 2925 2926/* 2927 * Return a loaned arc buffer to the arc. 2928 */ 2929void 2930arc_return_buf(arc_buf_t *buf, void *tag) 2931{ 2932 arc_buf_hdr_t *hdr = buf->b_hdr; 2933 2934 ASSERT3P(buf->b_data, !=, NULL); 2935 ASSERT(HDR_HAS_L1HDR(hdr)); 2936 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 2937 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2938 2939 arc_loaned_bytes_update(-arc_buf_size(buf)); 2940} 2941 2942/* Detach an arc_buf from a dbuf (tag) */ 2943void 2944arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 2945{ 2946 arc_buf_hdr_t *hdr = buf->b_hdr; 2947 2948 ASSERT3P(buf->b_data, !=, NULL); 2949 ASSERT(HDR_HAS_L1HDR(hdr)); 2950 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2951 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); 2952 2953 arc_loaned_bytes_update(arc_buf_size(buf)); 2954} 2955 2956static void 2957l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) 2958{ 2959 l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); 2960 2961 df->l2df_abd = abd; 2962 df->l2df_size = size; 2963 df->l2df_type = type; 2964 mutex_enter(&l2arc_free_on_write_mtx); 2965 list_insert_head(l2arc_free_on_write, df); 2966 mutex_exit(&l2arc_free_on_write_mtx); 2967} 2968 2969static void 2970arc_hdr_free_on_write(arc_buf_hdr_t *hdr) 2971{ 2972 arc_state_t *state = hdr->b_l1hdr.b_state; 2973 arc_buf_contents_t type = arc_buf_type(hdr); 2974 uint64_t size = arc_hdr_size(hdr); 2975 2976 /* protected by hash lock, if in the hash table */ 2977 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 2978 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2979 ASSERT(state != arc_anon && state != arc_l2c_only); 2980 2981 (void) refcount_remove_many(&state->arcs_esize[type], 2982 size, hdr); 2983 } 2984 (void) refcount_remove_many(&state->arcs_size, size, hdr); 2985 if (type == ARC_BUFC_METADATA) { 2986 arc_space_return(size, ARC_SPACE_META); 2987 } else { 2988 ASSERT(type == ARC_BUFC_DATA); 2989 arc_space_return(size, ARC_SPACE_DATA); 2990 } 2991 2992 l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); 2993} 2994 2995/* 2996 * Share the arc_buf_t's data with the hdr. 
Whenever we are sharing the 2997 * data buffer, we transfer the refcount ownership to the hdr and update 2998 * the appropriate kstats. 2999 */ 3000static void 3001arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) 3002{ 3003 arc_state_t *state = hdr->b_l1hdr.b_state; 3004 3005 ASSERT(arc_can_share(hdr, buf)); 3006 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 3007 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 3008 3009 /* 3010 * Start sharing the data buffer. We transfer the 3011 * refcount ownership to the hdr since it always owns 3012 * the refcount whenever an arc_buf_t is shared. 3013 */ 3014 refcount_transfer_ownership(&state->arcs_size, buf, hdr); 3015 hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); 3016 abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, 3017 HDR_ISTYPE_METADATA(hdr)); 3018 arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); 3019 buf->b_flags |= ARC_BUF_FLAG_SHARED; 3020 3021 /* 3022 * Since we've transferred ownership to the hdr we need 3023 * to increment its compressed and uncompressed kstats and 3024 * decrement the overhead size. 3025 */ 3026 ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); 3027 ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); 3028 ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); 3029} 3030 3031static void 3032arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) 3033{ 3034 arc_state_t *state = hdr->b_l1hdr.b_state; 3035 3036 ASSERT(arc_buf_is_shared(buf)); 3037 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 3038 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 3039 3040 /* 3041 * We are no longer sharing this buffer so we need 3042 * to transfer its ownership to the rightful owner. 3043 */ 3044 refcount_transfer_ownership(&state->arcs_size, hdr, buf); 3045 arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); 3046 abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); 3047 abd_put(hdr->b_l1hdr.b_pabd); 3048 hdr->b_l1hdr.b_pabd = NULL; 3049 buf->b_flags &= ~ARC_BUF_FLAG_SHARED; 3050 3051 /* 3052 * Since the buffer is no longer shared between 3053 * the arc buf and the hdr, count it as overhead. 3054 */ 3055 ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); 3056 ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); 3057 ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); 3058} 3059 3060/* 3061 * Remove an arc_buf_t from the hdr's buf list and return the last 3062 * arc_buf_t on the list. If no buffers remain on the list then return 3063 * NULL. 3064 */ 3065static arc_buf_t * 3066arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) 3067{ 3068 ASSERT(HDR_HAS_L1HDR(hdr)); 3069 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 3070 3071 arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; 3072 arc_buf_t *lastbuf = NULL; 3073 3074 /* 3075 * Remove the buf from the hdr list and locate the last 3076 * remaining buffer on the list. 3077 */ 3078 while (*bufp != NULL) { 3079 if (*bufp == buf) 3080 *bufp = buf->b_next; 3081 3082 /* 3083 * If we've removed a buffer in the middle of 3084 * the list then update the lastbuf and update 3085 * bufp. 
3086 */ 3087 if (*bufp != NULL) { 3088 lastbuf = *bufp; 3089 bufp = &(*bufp)->b_next; 3090 } 3091 } 3092 buf->b_next = NULL; 3093 ASSERT3P(lastbuf, !=, buf); 3094 IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); 3095 IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); 3096 IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); 3097 3098 return (lastbuf); 3099} 3100 3101/* 3102 * Free up buf->b_data and pull the arc_buf_t off of the the arc_buf_hdr_t's 3103 * list and free it. 3104 */ 3105static void 3106arc_buf_destroy_impl(arc_buf_t *buf) 3107{ 3108 arc_buf_hdr_t *hdr = buf->b_hdr; 3109 3110 /* 3111 * Free up the data associated with the buf but only if we're not 3112 * sharing this with the hdr. If we are sharing it with the hdr, the 3113 * hdr is responsible for doing the free. 3114 */ 3115 if (buf->b_data != NULL) { 3116 /* 3117 * We're about to change the hdr's b_flags. We must either 3118 * hold the hash_lock or be undiscoverable. 3119 */ 3120 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 3121 3122 arc_cksum_verify(buf); 3123#ifdef illumos 3124 arc_buf_unwatch(buf); 3125#endif 3126 3127 if (arc_buf_is_shared(buf)) { 3128 arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); 3129 } else { 3130 uint64_t size = arc_buf_size(buf); 3131 arc_free_data_buf(hdr, buf->b_data, size, buf); 3132 ARCSTAT_INCR(arcstat_overhead_size, -size); 3133 } 3134 buf->b_data = NULL; 3135 3136 ASSERT(hdr->b_l1hdr.b_bufcnt > 0); 3137 hdr->b_l1hdr.b_bufcnt -= 1; 3138 } 3139 3140 arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); 3141 3142 if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { 3143 /* 3144 * If the current arc_buf_t is sharing its data buffer with the 3145 * hdr, then reassign the hdr's b_pabd to share it with the new 3146 * buffer at the end of the list. The shared buffer is always 3147 * the last one on the hdr's buffer list. 3148 * 3149 * There is an equivalent case for compressed bufs, but since 3150 * they aren't guaranteed to be the last buf in the list and 3151 * that is an exceedingly rare case, we just allow that space be 3152 * wasted temporarily. 3153 */ 3154 if (lastbuf != NULL) { 3155 /* Only one buf can be shared at once */ 3156 VERIFY(!arc_buf_is_shared(lastbuf)); 3157 /* hdr is uncompressed so can't have compressed buf */ 3158 VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); 3159 3160 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 3161 arc_hdr_free_pabd(hdr); 3162 3163 /* 3164 * We must setup a new shared block between the 3165 * last buffer and the hdr. The data would have 3166 * been allocated by the arc buf so we need to transfer 3167 * ownership to the hdr since it's now being shared. 3168 */ 3169 arc_share_buf(hdr, lastbuf); 3170 } 3171 } else if (HDR_SHARED_DATA(hdr)) { 3172 /* 3173 * Uncompressed shared buffers are always at the end 3174 * of the list. Compressed buffers don't have the 3175 * same requirements. This makes it hard to 3176 * simply assert that the lastbuf is shared so 3177 * we rely on the hdr's compression flags to determine 3178 * if we have a compressed, shared buffer. 3179 */ 3180 ASSERT3P(lastbuf, !=, NULL); 3181 ASSERT(arc_buf_is_shared(lastbuf) || 3182 HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); 3183 } 3184 3185 /* 3186 * Free the checksum if we're removing the last uncompressed buf from 3187 * this hdr. 
3188 */ 3189 if (!arc_hdr_has_uncompressed_buf(hdr)) { 3190 arc_cksum_free(hdr); 3191 } 3192 3193 /* clean up the buf */ 3194 buf->b_hdr = NULL; 3195 kmem_cache_free(buf_cache, buf); 3196} 3197 3198static void 3199arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr) 3200{ 3201 ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); 3202 ASSERT(HDR_HAS_L1HDR(hdr)); 3203 ASSERT(!HDR_SHARED_DATA(hdr)); 3204 3205 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 3206 hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); 3207 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; 3208 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 3209 3210 ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); 3211 ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); 3212} 3213 3214static void 3215arc_hdr_free_pabd(arc_buf_hdr_t *hdr) 3216{ 3217 ASSERT(HDR_HAS_L1HDR(hdr)); 3218 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 3219 3220 /* 3221 * If the hdr is currently being written to the l2arc then 3222 * we defer freeing the data by adding it to the l2arc_free_on_write 3223 * list. The l2arc will free the data once it's finished 3224 * writing it to the l2arc device. 3225 */ 3226 if (HDR_L2_WRITING(hdr)) { 3227 arc_hdr_free_on_write(hdr); 3228 ARCSTAT_BUMP(arcstat_l2_free_on_write); 3229 } else { 3230 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, 3231 arc_hdr_size(hdr), hdr); 3232 } 3233 hdr->b_l1hdr.b_pabd = NULL; 3234 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; 3235 3236 ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); 3237 ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); 3238} 3239 3240static arc_buf_hdr_t * 3241arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, 3242 enum zio_compress compression_type, arc_buf_contents_t type) 3243{ 3244 arc_buf_hdr_t *hdr; 3245 3246 VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); 3247 3248 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 3249 ASSERT(HDR_EMPTY(hdr)); 3250 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); 3251 ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL); 3252 HDR_SET_PSIZE(hdr, psize); 3253 HDR_SET_LSIZE(hdr, lsize); 3254 hdr->b_spa = spa; 3255 hdr->b_type = type; 3256 hdr->b_flags = 0; 3257 arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); 3258 arc_hdr_set_compress(hdr, compression_type); 3259 3260 hdr->b_l1hdr.b_state = arc_anon; 3261 hdr->b_l1hdr.b_arc_access = 0; 3262 hdr->b_l1hdr.b_bufcnt = 0; 3263 hdr->b_l1hdr.b_buf = NULL; 3264 3265 /* 3266 * Allocate the hdr's buffer. This will contain either 3267 * the compressed or uncompressed data depending on the block 3268 * it references and compressed arc enablement. 3269 */ 3270 arc_hdr_alloc_pabd(hdr); 3271 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3272 3273 return (hdr); 3274} 3275 3276/* 3277 * Transition between the two allocation states for the arc_buf_hdr struct. 3278 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without 3279 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller 3280 * version is used when a cache buffer is only in the L2ARC in order to reduce 3281 * memory usage. 
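 *
 * Typical call shapes (illustrative; the eviction and read paths are the
 * real callers):
 *
 *     nhdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache);
 *             demotes a header to L2-only once its L1 state is gone
 *     nhdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache);
 *             re-adds L1 state when the block is read back into the ARC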
3282 */ 3283static arc_buf_hdr_t * 3284arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) 3285{ 3286 ASSERT(HDR_HAS_L2HDR(hdr)); 3287 3288 arc_buf_hdr_t *nhdr; 3289 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 3290 3291 ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || 3292 (old == hdr_l2only_cache && new == hdr_full_cache)); 3293 3294 nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); 3295 3296 ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); 3297 buf_hash_remove(hdr); 3298 3299 bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); 3300 3301 if (new == hdr_full_cache) { 3302 arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); 3303 /* 3304 * arc_access and arc_change_state need to be aware that a 3305 * header has just come out of L2ARC, so we set its state to 3306 * l2c_only even though it's about to change. 3307 */ 3308 nhdr->b_l1hdr.b_state = arc_l2c_only; 3309 3310 /* Verify previous threads set to NULL before freeing */ 3311 ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); 3312 } else { 3313 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 3314 ASSERT0(hdr->b_l1hdr.b_bufcnt); 3315 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); 3316 3317 /* 3318 * If we've reached here, We must have been called from 3319 * arc_evict_hdr(), as such we should have already been 3320 * removed from any ghost list we were previously on 3321 * (which protects us from racing with arc_evict_state), 3322 * thus no locking is needed during this check. 3323 */ 3324 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 3325 3326 /* 3327 * A buffer must not be moved into the arc_l2c_only 3328 * state if it's not finished being written out to the 3329 * l2arc device. Otherwise, the b_l1hdr.b_pabd field 3330 * might try to be accessed, even though it was removed. 3331 */ 3332 VERIFY(!HDR_L2_WRITING(hdr)); 3333 VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); 3334 3335#ifdef ZFS_DEBUG 3336 if (hdr->b_l1hdr.b_thawed != NULL) { 3337 kmem_free(hdr->b_l1hdr.b_thawed, 1); 3338 hdr->b_l1hdr.b_thawed = NULL; 3339 } 3340#endif 3341 3342 arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); 3343 } 3344 /* 3345 * The header has been reallocated so we need to re-insert it into any 3346 * lists it was on. 3347 */ 3348 (void) buf_hash_insert(nhdr, NULL); 3349 3350 ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); 3351 3352 mutex_enter(&dev->l2ad_mtx); 3353 3354 /* 3355 * We must place the realloc'ed header back into the list at 3356 * the same spot. Otherwise, if it's placed earlier in the list, 3357 * l2arc_write_buffers() could find it during the function's 3358 * write phase, and try to write it out to the l2arc. 3359 */ 3360 list_insert_after(&dev->l2ad_buflist, hdr, nhdr); 3361 list_remove(&dev->l2ad_buflist, hdr); 3362 3363 mutex_exit(&dev->l2ad_mtx); 3364 3365 /* 3366 * Since we're using the pointer address as the tag when 3367 * incrementing and decrementing the l2ad_alloc refcount, we 3368 * must remove the old pointer (that we're about to destroy) and 3369 * add the new pointer to the refcount. Otherwise we'd remove 3370 * the wrong pointer address when calling arc_hdr_destroy() later. 3371 */ 3372 3373 (void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); 3374 (void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); 3375 3376 buf_discard_identity(hdr); 3377 kmem_cache_free(old, hdr); 3378 3379 return (nhdr); 3380} 3381 3382/* 3383 * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. 3384 * The buf is returned thawed since we expect the consumer to modify it. 
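 *
 * A minimal usage sketch (hypothetical caller; spa, src, and size here are
 * placeholders, not taken from this file):
 *
 *	arc_buf_t *abuf = arc_alloc_buf(spa, FTAG, ARC_BUFC_DATA, size);
 *	bcopy(src, abuf->b_data, size);
 *	...
 *	arc_buf_destroy(abuf, FTAG);
 *
 * arc_buf_destroy() drops the reference taken here; for an anonymous header
 * holding its only buffer it tears down the header as well.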
3385 */ 3386arc_buf_t * 3387arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) 3388{ 3389 arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, 3390 ZIO_COMPRESS_OFF, type); 3391 ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); 3392 3393 arc_buf_t *buf = NULL; 3394 VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf)); 3395 arc_buf_thaw(buf); 3396 3397 return (buf); 3398} 3399 3400/* 3401 * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this 3402 * for bufs containing metadata. 3403 */ 3404arc_buf_t * 3405arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, 3406 enum zio_compress compression_type) 3407{ 3408 ASSERT3U(lsize, >, 0); 3409 ASSERT3U(lsize, >=, psize); 3410 ASSERT(compression_type > ZIO_COMPRESS_OFF); 3411 ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS); 3412 3413 arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, 3414 compression_type, ARC_BUFC_DATA); 3415 ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); 3416 3417 arc_buf_t *buf = NULL; 3418 VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf)); 3419 arc_buf_thaw(buf); 3420 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); 3421 3422 if (!arc_buf_is_shared(buf)) { 3423 /* 3424 * To ensure that the hdr has the correct data in it if we call 3425 * arc_decompress() on this buf before it's been written to 3426 * disk, it's easiest if we just set up sharing between the 3427 * buf and the hdr. 3428 */ 3429 ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd)); 3430 arc_hdr_free_pabd(hdr); 3431 arc_share_buf(hdr, buf); 3432 } 3433 3434 return (buf); 3435} 3436 3437static void 3438arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) 3439{ 3440 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 3441 l2arc_dev_t *dev = l2hdr->b_dev; 3442 uint64_t psize = arc_hdr_size(hdr); 3443 3444 ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); 3445 ASSERT(HDR_HAS_L2HDR(hdr)); 3446 3447 list_remove(&dev->l2ad_buflist, hdr); 3448 3449 ARCSTAT_INCR(arcstat_l2_psize, -psize); 3450 ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); 3451 3452 vdev_space_update(dev->l2ad_vdev, -psize, 0, 0); 3453 3454 (void) refcount_remove_many(&dev->l2ad_alloc, psize, hdr); 3455 arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); 3456} 3457 3458static void 3459arc_hdr_destroy(arc_buf_hdr_t *hdr) 3460{ 3461 if (HDR_HAS_L1HDR(hdr)) { 3462 ASSERT(hdr->b_l1hdr.b_buf == NULL || 3463 hdr->b_l1hdr.b_bufcnt > 0); 3464 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3465 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 3466 } 3467 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3468 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 3469 3470 if (!HDR_EMPTY(hdr)) 3471 buf_discard_identity(hdr); 3472 3473 if (HDR_HAS_L2HDR(hdr)) { 3474 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 3475 boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); 3476 3477 if (!buflist_held) 3478 mutex_enter(&dev->l2ad_mtx); 3479 3480 /* 3481 * Even though we checked this conditional above, we 3482 * need to check this again now that we have the 3483 * l2ad_mtx. This is because we could be racing with 3484 * another thread calling l2arc_evict() which might have 3485 * destroyed this header's L2 portion as we were waiting 3486 * to acquire the l2ad_mtx. If that happens, we don't 3487 * want to re-destroy the header's L2 portion. 
3488 */ 3489 if (HDR_HAS_L2HDR(hdr)) { 3490 l2arc_trim(hdr); 3491 arc_hdr_l2hdr_destroy(hdr); 3492 } 3493 3494 if (!buflist_held) 3495 mutex_exit(&dev->l2ad_mtx); 3496 } 3497 3498 if (HDR_HAS_L1HDR(hdr)) { 3499 arc_cksum_free(hdr); 3500 3501 while (hdr->b_l1hdr.b_buf != NULL) 3502 arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); 3503 3504#ifdef ZFS_DEBUG 3505 if (hdr->b_l1hdr.b_thawed != NULL) { 3506 kmem_free(hdr->b_l1hdr.b_thawed, 1); 3507 hdr->b_l1hdr.b_thawed = NULL; 3508 } 3509#endif 3510 3511 if (hdr->b_l1hdr.b_pabd != NULL) { 3512 arc_hdr_free_pabd(hdr); 3513 } 3514 } 3515 3516 ASSERT3P(hdr->b_hash_next, ==, NULL); 3517 if (HDR_HAS_L1HDR(hdr)) { 3518 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 3519 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 3520 kmem_cache_free(hdr_full_cache, hdr); 3521 } else { 3522 kmem_cache_free(hdr_l2only_cache, hdr); 3523 } 3524} 3525 3526void 3527arc_buf_destroy(arc_buf_t *buf, void* tag) 3528{ 3529 arc_buf_hdr_t *hdr = buf->b_hdr; 3530 kmutex_t *hash_lock = HDR_LOCK(hdr); 3531 3532 if (hdr->b_l1hdr.b_state == arc_anon) { 3533 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); 3534 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3535 VERIFY0(remove_reference(hdr, NULL, tag)); 3536 arc_hdr_destroy(hdr); 3537 return; 3538 } 3539 3540 mutex_enter(hash_lock); 3541 ASSERT3P(hdr, ==, buf->b_hdr); 3542 ASSERT(hdr->b_l1hdr.b_bufcnt > 0); 3543 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 3544 ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); 3545 ASSERT3P(buf->b_data, !=, NULL); 3546 3547 (void) remove_reference(hdr, hash_lock, tag); 3548 arc_buf_destroy_impl(buf); 3549 mutex_exit(hash_lock); 3550} 3551 3552/* 3553 * Evict the arc_buf_hdr that is provided as a parameter. The resultant 3554 * state of the header is dependent on its state prior to entering this 3555 * function. The following transitions are possible: 3556 * 3557 * - arc_mru -> arc_mru_ghost 3558 * - arc_mfu -> arc_mfu_ghost 3559 * - arc_mru_ghost -> arc_l2c_only 3560 * - arc_mru_ghost -> deleted 3561 * - arc_mfu_ghost -> arc_l2c_only 3562 * - arc_mfu_ghost -> deleted 3563 */ 3564static int64_t 3565arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 3566{ 3567 arc_state_t *evicted_state, *state; 3568 int64_t bytes_evicted = 0; 3569 int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? 3570 zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms; 3571 3572 ASSERT(MUTEX_HELD(hash_lock)); 3573 ASSERT(HDR_HAS_L1HDR(hdr)); 3574 3575 state = hdr->b_l1hdr.b_state; 3576 if (GHOST_STATE(state)) { 3577 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3578 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 3579 3580 /* 3581 * l2arc_write_buffers() relies on a header's L1 portion 3582 * (i.e. its b_pabd field) during its write phase. 3583 * Thus, we cannot push a header onto the arc_l2c_only 3584 * state (removing its L1 piece) until the header is 3585 * done being written to the l2arc. 3586 */ 3587 if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { 3588 ARCSTAT_BUMP(arcstat_evict_l2_skip); 3589 return (bytes_evicted); 3590 } 3591 3592 ARCSTAT_BUMP(arcstat_deleted); 3593 bytes_evicted += HDR_GET_LSIZE(hdr); 3594 3595 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); 3596 3597 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 3598 if (HDR_HAS_L2HDR(hdr)) { 3599 /* 3600 * This buffer is cached on the 2nd Level ARC; 3601 * don't destroy the header. 3602 */ 3603 arc_change_state(arc_l2c_only, hdr, hash_lock); 3604 /* 3605 * dropping from L1+L2 cached to L2-only, 3606 * realloc to remove the L1 header.
3607 */ 3608 hdr = arc_hdr_realloc(hdr, hdr_full_cache, 3609 hdr_l2only_cache); 3610 } else { 3611 arc_change_state(arc_anon, hdr, hash_lock); 3612 arc_hdr_destroy(hdr); 3613 } 3614 return (bytes_evicted); 3615 } 3616 3617 ASSERT(state == arc_mru || state == arc_mfu); 3618 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 3619 3620 /* prefetch buffers have a minimum lifespan */ 3621 if (HDR_IO_IN_PROGRESS(hdr) || 3622 ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && 3623 ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) { 3624 ARCSTAT_BUMP(arcstat_evict_skip); 3625 return (bytes_evicted); 3626 } 3627 3628 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 3629 while (hdr->b_l1hdr.b_buf) { 3630 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 3631 if (!mutex_tryenter(&buf->b_evict_lock)) { 3632 ARCSTAT_BUMP(arcstat_mutex_miss); 3633 break; 3634 } 3635 if (buf->b_data != NULL) 3636 bytes_evicted += HDR_GET_LSIZE(hdr); 3637 mutex_exit(&buf->b_evict_lock); 3638 arc_buf_destroy_impl(buf); 3639 } 3640 3641 if (HDR_HAS_L2HDR(hdr)) { 3642 ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); 3643 } else { 3644 if (l2arc_write_eligible(hdr->b_spa, hdr)) { 3645 ARCSTAT_INCR(arcstat_evict_l2_eligible, 3646 HDR_GET_LSIZE(hdr)); 3647 } else { 3648 ARCSTAT_INCR(arcstat_evict_l2_ineligible, 3649 HDR_GET_LSIZE(hdr)); 3650 } 3651 } 3652 3653 if (hdr->b_l1hdr.b_bufcnt == 0) { 3654 arc_cksum_free(hdr); 3655 3656 bytes_evicted += arc_hdr_size(hdr); 3657 3658 /* 3659 * If this hdr is being evicted and has a compressed 3660 * buffer then we discard it here before we change states. 3661 * This ensures that the accounting is updated correctly 3662 * in arc_free_data_impl(). 3663 */ 3664 arc_hdr_free_pabd(hdr); 3665 3666 arc_change_state(evicted_state, hdr, hash_lock); 3667 ASSERT(HDR_IN_HASH_TABLE(hdr)); 3668 arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); 3669 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); 3670 } 3671 3672 return (bytes_evicted); 3673} 3674 3675static uint64_t 3676arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, 3677 uint64_t spa, int64_t bytes) 3678{ 3679 multilist_sublist_t *mls; 3680 uint64_t bytes_evicted = 0; 3681 arc_buf_hdr_t *hdr; 3682 kmutex_t *hash_lock; 3683 int evict_count = 0; 3684 3685 ASSERT3P(marker, !=, NULL); 3686 IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); 3687 3688 mls = multilist_sublist_lock(ml, idx); 3689 3690 for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; 3691 hdr = multilist_sublist_prev(mls, marker)) { 3692 if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || 3693 (evict_count >= zfs_arc_evict_batch_limit)) 3694 break; 3695 3696 /* 3697 * To keep our iteration location, move the marker 3698 * forward. Since we're not holding hdr's hash lock, we 3699 * must be very careful and not remove 'hdr' from the 3700 * sublist. Otherwise, other consumers might mistake the 3701 * 'hdr' as not being on a sublist when they call the 3702 * multilist_link_active() function (they all rely on 3703 * the hash lock protecting concurrent insertions and 3704 * removals). multilist_sublist_move_forward() was 3705 * specifically implemented to ensure this is the case 3706 * (only 'marker' will be removed and re-inserted). 3707 */ 3708 multilist_sublist_move_forward(mls, marker); 3709 3710 /* 3711 * The only case where the b_spa field should ever be 3712 * zero, is the marker headers inserted by 3713 * arc_evict_state(). It's possible for multiple threads 3714 * to be calling arc_evict_state() concurrently (e.g. 
3715 * dsl_pool_close() and zio_inject_fault()), so we must 3716 * skip any markers we see from these other threads. 3717 */ 3718 if (hdr->b_spa == 0) 3719 continue; 3720 3721 /* we're only interested in evicting buffers of a certain spa */ 3722 if (spa != 0 && hdr->b_spa != spa) { 3723 ARCSTAT_BUMP(arcstat_evict_skip); 3724 continue; 3725 } 3726 3727 hash_lock = HDR_LOCK(hdr); 3728 3729 /* 3730 * We aren't calling this function from any code path 3731 * that would already be holding a hash lock, so we're 3732 * asserting on this assumption to be defensive in case 3733 * this ever changes. Without this check, it would be 3734 * possible to incorrectly increment arcstat_mutex_miss 3735 * below (e.g. if the code changed such that we called 3736 * this function with a hash lock held). 3737 */ 3738 ASSERT(!MUTEX_HELD(hash_lock)); 3739 3740 if (mutex_tryenter(hash_lock)) { 3741 uint64_t evicted = arc_evict_hdr(hdr, hash_lock); 3742 mutex_exit(hash_lock); 3743 3744 bytes_evicted += evicted; 3745 3746 /* 3747 * If evicted is zero, arc_evict_hdr() must have 3748 * decided to skip this header, don't increment 3749 * evict_count in this case. 3750 */ 3751 if (evicted != 0) 3752 evict_count++; 3753 3754 /* 3755 * If arc_size isn't overflowing, signal any 3756 * threads that might happen to be waiting. 3757 * 3758 * For each header evicted, we wake up a single 3759 * thread. If we used cv_broadcast, we could 3760 * wake up "too many" threads causing arc_size 3761 * to significantly overflow arc_c; since 3762 * arc_get_data_impl() doesn't check for overflow 3763 * when it's woken up (it doesn't because it's 3764 * possible for the ARC to be overflowing while 3765 * full of un-evictable buffers, and the 3766 * function should proceed in this case). 3767 * 3768 * If threads are left sleeping, due to not 3769 * using cv_broadcast, they will be woken up 3770 * just before arc_reclaim_thread() sleeps. 3771 */ 3772 mutex_enter(&arc_reclaim_lock); 3773 if (!arc_is_overflowing()) 3774 cv_signal(&arc_reclaim_waiters_cv); 3775 mutex_exit(&arc_reclaim_lock); 3776 } else { 3777 ARCSTAT_BUMP(arcstat_mutex_miss); 3778 } 3779 } 3780 3781 multilist_sublist_unlock(mls); 3782 3783 return (bytes_evicted); 3784} 3785 3786/* 3787 * Evict buffers from the given arc state, until we've removed the 3788 * specified number of bytes. Move the removed buffers to the 3789 * appropriate evict state. 3790 * 3791 * This function makes a "best effort". It skips over any buffers 3792 * it can't get a hash_lock on, and so, may not catch all candidates. 3793 * It may also return without evicting as much space as requested. 3794 * 3795 * If bytes is specified using the special value ARC_EVICT_ALL, this 3796 * will evict all available (i.e. unlocked and evictable) buffers from 3797 * the given arc state; which is used by arc_flush(). 3798 */ 3799static uint64_t 3800arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, 3801 arc_buf_contents_t type) 3802{ 3803 uint64_t total_evicted = 0; 3804 multilist_t *ml = state->arcs_list[type]; 3805 int num_sublists; 3806 arc_buf_hdr_t **markers; 3807 3808 IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); 3809 3810 num_sublists = multilist_get_num_sublists(ml); 3811 3812 /* 3813 * If we've tried to evict from each sublist, made some 3814 * progress, but still have not hit the target number of bytes 3815 * to evict, we want to keep trying. The markers allow us to 3816 * pick up where we left off for each individual sublist, rather 3817 * than starting from the tail each time. 
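 *
 * For example (illustrative numbers): with 8 sublists and a random
 * starting index of 5, one pass of the loop below visits sublists
 * 5, 6, 7, 0, ... 4, and each per-sublist scan resumes from that
 * sublist's marker rather than from its tail.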
3818 */ 3819 markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); 3820 for (int i = 0; i < num_sublists; i++) { 3821 markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); 3822 3823 /* 3824 * A b_spa of 0 is used to indicate that this header is 3825 * a marker. This fact is used in arc_adjust_type() and 3826 * arc_evict_state_impl(). 3827 */ 3828 markers[i]->b_spa = 0; 3829 3830 multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 3831 multilist_sublist_insert_tail(mls, markers[i]); 3832 multilist_sublist_unlock(mls); 3833 } 3834 3835 /* 3836 * While we haven't hit our target number of bytes to evict, or 3837 * we're evicting all available buffers. 3838 */ 3839 while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { 3840 /* 3841 * Start eviction using a randomly selected sublist, 3842 * this is to try and evenly balance eviction across all 3843 * sublists. Always starting at the same sublist 3844 * (e.g. index 0) would cause evictions to favor certain 3845 * sublists over others. 3846 */ 3847 int sublist_idx = multilist_get_random_index(ml); 3848 uint64_t scan_evicted = 0; 3849 3850 for (int i = 0; i < num_sublists; i++) { 3851 uint64_t bytes_remaining; 3852 uint64_t bytes_evicted; 3853 3854 if (bytes == ARC_EVICT_ALL) 3855 bytes_remaining = ARC_EVICT_ALL; 3856 else if (total_evicted < bytes) 3857 bytes_remaining = bytes - total_evicted; 3858 else 3859 break; 3860 3861 bytes_evicted = arc_evict_state_impl(ml, sublist_idx, 3862 markers[sublist_idx], spa, bytes_remaining); 3863 3864 scan_evicted += bytes_evicted; 3865 total_evicted += bytes_evicted; 3866 3867 /* we've reached the end, wrap to the beginning */ 3868 if (++sublist_idx >= num_sublists) 3869 sublist_idx = 0; 3870 } 3871 3872 /* 3873 * If we didn't evict anything during this scan, we have 3874 * no reason to believe we'll evict more during another 3875 * scan, so break the loop. 3876 */ 3877 if (scan_evicted == 0) { 3878 /* This isn't possible, let's make that obvious */ 3879 ASSERT3S(bytes, !=, 0); 3880 3881 /* 3882 * When bytes is ARC_EVICT_ALL, the only way to 3883 * break the loop is when scan_evicted is zero. 3884 * In that case, we actually have evicted enough, 3885 * so we don't want to increment the kstat. 3886 */ 3887 if (bytes != ARC_EVICT_ALL) { 3888 ASSERT3S(total_evicted, <, bytes); 3889 ARCSTAT_BUMP(arcstat_evict_not_enough); 3890 } 3891 3892 break; 3893 } 3894 } 3895 3896 for (int i = 0; i < num_sublists; i++) { 3897 multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 3898 multilist_sublist_remove(mls, markers[i]); 3899 multilist_sublist_unlock(mls); 3900 3901 kmem_cache_free(hdr_full_cache, markers[i]); 3902 } 3903 kmem_free(markers, sizeof (*markers) * num_sublists); 3904 3905 return (total_evicted); 3906} 3907 3908/* 3909 * Flush all "evictable" data of the given type from the arc state 3910 * specified. This will not evict any "active" buffers (i.e. referenced). 3911 * 3912 * When 'retry' is set to B_FALSE, the function will make a single pass 3913 * over the state and evict any buffers that it can. Since it doesn't 3914 * continually retry the eviction, it might end up leaving some buffers 3915 * in the ARC due to lock misses. 3916 * 3917 * When 'retry' is set to B_TRUE, the function will continually retry the 3918 * eviction until *all* evictable buffers have been removed from the 3919 * state. As a result, if concurrent insertions into the state are 3920 * allowed (e.g. 
if the ARC isn't shutting down), this function might 3921 * wind up in an infinite loop, continually trying to evict buffers. 3922 */ 3923static uint64_t 3924arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, 3925 boolean_t retry) 3926{ 3927 uint64_t evicted = 0; 3928 3929 while (refcount_count(&state->arcs_esize[type]) != 0) { 3930 evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); 3931 3932 if (!retry) 3933 break; 3934 } 3935 3936 return (evicted); 3937} 3938 3939/* 3940 * Evict the specified number of bytes from the state specified, 3941 * restricting eviction to the spa and type given. This function 3942 * prevents us from trying to evict more from a state's list than 3943 * is "evictable", and to skip evicting altogether when passed a 3944 * negative value for "bytes". In contrast, arc_evict_state() will 3945 * evict everything it can, when passed a negative value for "bytes". 3946 */ 3947static uint64_t 3948arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, 3949 arc_buf_contents_t type) 3950{ 3951 int64_t delta; 3952 3953 if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) { 3954 delta = MIN(refcount_count(&state->arcs_esize[type]), bytes); 3955 return (arc_evict_state(state, spa, delta, type)); 3956 } 3957 3958 return (0); 3959} 3960 3961/* 3962 * Evict metadata buffers from the cache, such that arc_meta_used is 3963 * capped by the arc_meta_limit tunable. 3964 */ 3965static uint64_t 3966arc_adjust_meta(uint64_t meta_used) 3967{ 3968 uint64_t total_evicted = 0; 3969 int64_t target; 3970 3971 /* 3972 * If we're over the meta limit, we want to evict enough 3973 * metadata to get back under the meta limit. We don't want to 3974 * evict so much that we drop the MRU below arc_p, though. If 3975 * we're over the meta limit more than we're over arc_p, we 3976 * evict some from the MRU here, and some from the MFU below. 3977 */ 3978 target = MIN((int64_t)(meta_used - arc_meta_limit), 3979 (int64_t)(refcount_count(&arc_anon->arcs_size) + 3980 refcount_count(&arc_mru->arcs_size) - arc_p)); 3981 3982 total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 3983 3984 /* 3985 * Similar to the above, we want to evict enough bytes to get us 3986 * below the meta limit, but not so much as to drop us below the 3987 * space allotted to the MFU (which is defined as arc_c - arc_p). 3988 */ 3989 target = MIN((int64_t)(meta_used - arc_meta_limit), 3990 (int64_t)(refcount_count(&arc_mfu->arcs_size) - 3991 (arc_c - arc_p))); 3992 3993 total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 3994 3995 return (total_evicted); 3996} 3997 3998/* 3999 * Return the type of the oldest buffer in the given arc state 4000 * 4001 * This function will select a random sublist of type ARC_BUFC_DATA and 4002 * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist 4003 * is compared, and the type which contains the "older" buffer will be 4004 * returned. 
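 *
 * For example (illustrative clock values): if the tail data header was last
 * accessed at b_arc_access = 1000 and the tail metadata header at 2000, the
 * data header is older, ARC_BUFC_DATA is returned, and data is evicted
 * first.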
4005 */ 4006static arc_buf_contents_t 4007arc_adjust_type(arc_state_t *state) 4008{ 4009 multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA]; 4010 multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA]; 4011 int data_idx = multilist_get_random_index(data_ml); 4012 int meta_idx = multilist_get_random_index(meta_ml); 4013 multilist_sublist_t *data_mls; 4014 multilist_sublist_t *meta_mls; 4015 arc_buf_contents_t type; 4016 arc_buf_hdr_t *data_hdr; 4017 arc_buf_hdr_t *meta_hdr; 4018 4019 /* 4020 * We keep the sublist lock until we're finished, to prevent 4021 * the headers from being destroyed via arc_evict_state(). 4022 */ 4023 data_mls = multilist_sublist_lock(data_ml, data_idx); 4024 meta_mls = multilist_sublist_lock(meta_ml, meta_idx); 4025 4026 /* 4027 * These two loops are to ensure we skip any markers that 4028 * might be at the tail of the lists due to arc_evict_state(). 4029 */ 4030 4031 for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; 4032 data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { 4033 if (data_hdr->b_spa != 0) 4034 break; 4035 } 4036 4037 for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; 4038 meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { 4039 if (meta_hdr->b_spa != 0) 4040 break; 4041 } 4042 4043 if (data_hdr == NULL && meta_hdr == NULL) { 4044 type = ARC_BUFC_DATA; 4045 } else if (data_hdr == NULL) { 4046 ASSERT3P(meta_hdr, !=, NULL); 4047 type = ARC_BUFC_METADATA; 4048 } else if (meta_hdr == NULL) { 4049 ASSERT3P(data_hdr, !=, NULL); 4050 type = ARC_BUFC_DATA; 4051 } else { 4052 ASSERT3P(data_hdr, !=, NULL); 4053 ASSERT3P(meta_hdr, !=, NULL); 4054 4055 /* The headers can't be on the sublist without an L1 header */ 4056 ASSERT(HDR_HAS_L1HDR(data_hdr)); 4057 ASSERT(HDR_HAS_L1HDR(meta_hdr)); 4058 4059 if (data_hdr->b_l1hdr.b_arc_access < 4060 meta_hdr->b_l1hdr.b_arc_access) { 4061 type = ARC_BUFC_DATA; 4062 } else { 4063 type = ARC_BUFC_METADATA; 4064 } 4065 } 4066 4067 multilist_sublist_unlock(meta_mls); 4068 multilist_sublist_unlock(data_mls); 4069 4070 return (type); 4071} 4072 4073/* 4074 * Evict buffers from the cache, such that arc_size is capped by arc_c. 4075 */ 4076static uint64_t 4077arc_adjust(void) 4078{ 4079 uint64_t total_evicted = 0; 4080 uint64_t bytes; 4081 int64_t target; 4082 uint64_t asize = aggsum_value(&arc_size); 4083 uint64_t ameta = aggsum_value(&arc_meta_used); 4084 4085 /* 4086 * If we're over arc_meta_limit, we want to correct that before 4087 * potentially evicting data buffers below. 4088 */ 4089 total_evicted += arc_adjust_meta(ameta); 4090 4091 /* 4092 * Adjust MRU size 4093 * 4094 * If we're over the target cache size, we want to evict enough 4095 * from the list to get back to our target size. We don't want 4096 * to evict too much from the MRU, such that it drops below 4097 * arc_p. So, if we're over our target cache size more than 4098 * the MRU is over arc_p, we'll evict enough to get back to 4099 * arc_p here, and then evict more from the MFU below. 4100 */ 4101 target = MIN((int64_t)(asize - arc_c), 4102 (int64_t)(refcount_count(&arc_anon->arcs_size) + 4103 refcount_count(&arc_mru->arcs_size) + ameta - arc_p)); 4104 4105 /* 4106 * If we're below arc_meta_min, always prefer to evict data. 4107 * Otherwise, try to satisfy the requested number of bytes to 4108 * evict from the type which contains older buffers; in an 4109 * effort to keep newer buffers in the cache regardless of their 4110 * type. 
If we cannot satisfy the number of bytes from this 4111 * type, spill over into the next type. 4112 */ 4113 if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && 4114 ameta > arc_meta_min) { 4115 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 4116 total_evicted += bytes; 4117 4118 /* 4119 * If we couldn't evict our target number of bytes from 4120 * metadata, we try to get the rest from data. 4121 */ 4122 target -= bytes; 4123 4124 total_evicted += 4125 arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 4126 } else { 4127 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 4128 total_evicted += bytes; 4129 4130 /* 4131 * If we couldn't evict our target number of bytes from 4132 * data, we try to get the rest from metadata. 4133 */ 4134 target -= bytes; 4135 4136 total_evicted += 4137 arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 4138 } 4139 4140 /* 4141 * Re-sum ARC stats after the first round of evictions. 4142 */ 4143 asize = aggsum_value(&arc_size); 4144 ameta = aggsum_value(&arc_meta_used); 4145 4146 /* 4147 * Adjust MFU size 4148 * 4149 * Now that we've tried to evict enough from the MRU to get its 4150 * size back to arc_p, if we're still above the target cache 4151 * size, we evict the rest from the MFU. 4152 */ 4153 target = asize - arc_c; 4154 4155 if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA && 4156 ameta > arc_meta_min) { 4157 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 4158 total_evicted += bytes; 4159 4160 /* 4161 * If we couldn't evict our target number of bytes from 4162 * metadata, we try to get the rest from data. 4163 */ 4164 target -= bytes; 4165 4166 total_evicted += 4167 arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); 4168 } else { 4169 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); 4170 total_evicted += bytes; 4171 4172 /* 4173 * If we couldn't evict our target number of bytes from 4174 * data, we try to get the rest from metadata. 4175 */ 4176 target -= bytes; 4177 4178 total_evicted += 4179 arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 4180 } 4181 4182 /* 4183 * Adjust ghost lists 4184 * 4185 * In addition to the above, the ARC also defines target values 4186 * for the ghost lists. The sum of the mru list and mru ghost 4187 * list should never exceed the target size of the cache, and 4188 * the sum of the mru list, mfu list, mru ghost list, and mfu 4189 * ghost list should never exceed twice the target size of the 4190 * cache. The following logic enforces these limits on the ghost 4191 * caches, and evicts from them as needed.
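 *
 * For example (illustrative sizes): with arc_c = 10 GB, a 6 GB mru list and
 * a 5 GB mru ghost list, the first target below works out to
 * 6 + 5 - 10 = 1 GB, which is trimmed from the mru ghost list.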
4192 */ 4193 target = refcount_count(&arc_mru->arcs_size) + 4194 refcount_count(&arc_mru_ghost->arcs_size) - arc_c; 4195 4196 bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); 4197 total_evicted += bytes; 4198 4199 target -= bytes; 4200 4201 total_evicted += 4202 arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); 4203 4204 /* 4205 * We assume the sum of the mru list and mfu list is less than 4206 * or equal to arc_c (we enforced this above), which means we 4207 * can use the simpler of the two equations below: 4208 * 4209 * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c 4210 * mru ghost + mfu ghost <= arc_c 4211 */ 4212 target = refcount_count(&arc_mru_ghost->arcs_size) + 4213 refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; 4214 4215 bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); 4216 total_evicted += bytes; 4217 4218 target -= bytes; 4219 4220 total_evicted += 4221 arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); 4222 4223 return (total_evicted); 4224} 4225 4226void 4227arc_flush(spa_t *spa, boolean_t retry) 4228{ 4229 uint64_t guid = 0; 4230 4231 /* 4232 * If retry is B_TRUE, a spa must not be specified since we have 4233 * no good way to determine if all of a spa's buffers have been 4234 * evicted from an arc state. 4235 */ 4236 ASSERT(!retry || spa == 0); 4237 4238 if (spa != NULL) 4239 guid = spa_load_guid(spa); 4240 4241 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); 4242 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); 4243 4244 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); 4245 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); 4246 4247 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); 4248 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); 4249 4250 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); 4251 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); 4252} 4253 4254uint64_t 4255arc_shrink(int64_t to_free) 4256{ 4257 uint64_t asize = aggsum_value(&arc_size); 4258 if (arc_c > arc_c_min) { 4259 DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, 4260 arc_c_min, uint64_t, arc_p, uint64_t, to_free); 4261 if (arc_c > arc_c_min + to_free) 4262 atomic_add_64(&arc_c, -to_free); 4263 else 4264 arc_c = arc_c_min; 4265 4266 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 4267 if (asize < arc_c) 4268 arc_c = MAX(asize, arc_c_min); 4269 if (arc_p > arc_c) 4270 arc_p = (arc_c >> 1); 4271 4272 DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, 4273 arc_p); 4274 4275 ASSERT(arc_c >= arc_c_min); 4276 ASSERT((int64_t)arc_p >= 0); 4277 } 4278 4279 if (asize > arc_c) { 4280 DTRACE_PROBE2(arc__shrink_adjust, uint64_t, asize, 4281 uint64_t, arc_c); 4282 return (arc_adjust()); 4283 } 4284 return (0); 4285} 4286 4287typedef enum free_memory_reason_t { 4288 FMR_UNKNOWN, 4289 FMR_NEEDFREE, 4290 FMR_LOTSFREE, 4291 FMR_SWAPFS_MINFREE, 4292 FMR_PAGES_PP_MAXIMUM, 4293 FMR_HEAP_ARENA, 4294 FMR_ZIO_ARENA, 4295 FMR_ZIO_FRAG, 4296} free_memory_reason_t; 4297 4298int64_t last_free_memory; 4299free_memory_reason_t last_free_reason; 4300 4301/* 4302 * Additional reserve of pages for pp_reserve. 4303 */ 4304int64_t arc_pages_pp_reserve = 64; 4305 4306/* 4307 * Additional reserve of pages for swapfs. 4308 */ 4309int64_t arc_swapfs_reserve = 64; 4310 4311/* 4312 * Return the amount of memory that can be consumed before reclaim will be 4313 * needed. 
Positive if there is sufficient free memory, negative indicates 4314 * the amount of memory that needs to be freed up. 4315 */ 4316static int64_t 4317arc_available_memory(void) 4318{ 4319 int64_t lowest = INT64_MAX; 4320 int64_t n; 4321 free_memory_reason_t r = FMR_UNKNOWN; 4322 4323#ifdef _KERNEL 4324#ifdef __FreeBSD__ 4325 /* 4326 * Cooperate with pagedaemon when it's time for it to scan 4327 * and reclaim some pages. 4328 */ 4329 n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); 4330 if (n < lowest) { 4331 lowest = n; 4332 r = FMR_LOTSFREE; 4333 } 4334 4335#else 4336 if (needfree > 0) { 4337 n = PAGESIZE * (-needfree); 4338 if (n < lowest) { 4339 lowest = n; 4340 r = FMR_NEEDFREE; 4341 } 4342 } 4343 4344 /* 4345 * check that we're out of range of the pageout scanner. It starts to 4346 * schedule paging if freemem is less than lotsfree and needfree. 4347 * lotsfree is the high-water mark for pageout, and needfree is the 4348 * number of needed free pages. We add extra pages here to make sure 4349 * the scanner doesn't start up while we're freeing memory. 4350 */ 4351 n = PAGESIZE * (freemem - lotsfree - needfree - desfree); 4352 if (n < lowest) { 4353 lowest = n; 4354 r = FMR_LOTSFREE; 4355 } 4356 4357 /* 4358 * check to make sure that swapfs has enough space so that anon 4359 * reservations can still succeed. anon_resvmem() checks that the 4360 * availrmem is greater than swapfs_minfree, and the number of reserved 4361 * swap pages. We also add a bit of extra here just to prevent 4362 * circumstances from getting really dire. 4363 */ 4364 n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - 4365 desfree - arc_swapfs_reserve); 4366 if (n < lowest) { 4367 lowest = n; 4368 r = FMR_SWAPFS_MINFREE; 4369 } 4370 4371 4372 /* 4373 * Check that we have enough availrmem that memory locking (e.g., via 4374 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum 4375 * stores the number of pages that cannot be locked; when availrmem 4376 * drops below pages_pp_maximum, page locking mechanisms such as 4377 * page_pp_lock() will fail.) 4378 */ 4379 n = PAGESIZE * (availrmem - pages_pp_maximum - 4380 arc_pages_pp_reserve); 4381 if (n < lowest) { 4382 lowest = n; 4383 r = FMR_PAGES_PP_MAXIMUM; 4384 } 4385 4386#endif /* __FreeBSD__ */ 4387#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 4388 /* 4389 * If we're on an i386 platform, it's possible that we'll exhaust the 4390 * kernel heap space before we ever run out of available physical 4391 * memory. Most checks of the size of the heap_area compare against 4392 * tune.t_minarmem, which is the minimum available real memory that we 4393 * can have in the system. However, this is generally fixed at 25 pages 4394 * which is so low that it's useless. In this comparison, we seek to 4395 * calculate the total heap-size, and reclaim if more than 3/4ths of the 4396 * heap is allocated. (Or, in the calculation, if less than 1/4th is 4397 * free) 4398 */ 4399 n = (int64_t)vmem_size(heap_arena, VMEM_FREE) - 4400 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2); 4401 if (n < lowest) { 4402 lowest = n; 4403 r = FMR_HEAP_ARENA; 4404 } 4405#define zio_arena NULL 4406#else 4407#define zio_arena heap_arena 4408#endif 4409 4410 /* 4411 * If zio data pages are being allocated out of a separate heap segment, 4412 * then enforce that the size of available vmem for this arena remains 4413 * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free. 
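 *
 * For example (illustrative sizes, assuming the default shift of 2): an
 * arena with 12 GB allocated and 2 GB free yields 2 GB - (12 GB >> 2) =
 * -1 GB below, which is reported as pressure under FMR_ZIO_ARENA.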
4414 * 4415 * Note that reducing the arc_zio_arena_free_shift keeps more virtual 4416 * memory (in the zio_arena) free, which can avoid memory 4417 * fragmentation issues. 4418 */ 4419 if (zio_arena != NULL) { 4420 n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - 4421 (vmem_size(zio_arena, VMEM_ALLOC) >> 4422 arc_zio_arena_free_shift); 4423 if (n < lowest) { 4424 lowest = n; 4425 r = FMR_ZIO_ARENA; 4426 } 4427 } 4428 4429 /* 4430 * Above limits know nothing about real level of KVA fragmentation. 4431 * Start aggressive reclamation if too little sequential KVA left. 4432 */ 4433 if (lowest > 0) { 4434 n = (vmem_size(heap_arena, VMEM_MAXFREE) < SPA_MAXBLOCKSIZE) ? 4435 -((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) : 4436 INT64_MAX; 4437 if (n < lowest) { 4438 lowest = n; 4439 r = FMR_ZIO_FRAG; 4440 } 4441 } 4442 4443#else /* _KERNEL */ 4444 /* Every 100 calls, free a small amount */ 4445 if (spa_get_random(100) == 0) 4446 lowest = -1024; 4447#endif /* _KERNEL */ 4448 4449 last_free_memory = lowest; 4450 last_free_reason = r; 4451 DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); 4452 return (lowest); 4453} 4454 4455 4456/* 4457 * Determine if the system is under memory pressure and is asking 4458 * to reclaim memory. A return value of B_TRUE indicates that the system 4459 * is under memory pressure and that the arc should adjust accordingly. 4460 */ 4461static boolean_t 4462arc_reclaim_needed(void) 4463{ 4464 return (arc_available_memory() < 0); 4465} 4466 4467extern kmem_cache_t *zio_buf_cache[]; 4468extern kmem_cache_t *zio_data_buf_cache[]; 4469extern kmem_cache_t *range_seg_cache; 4470extern kmem_cache_t *abd_chunk_cache; 4471 4472static __noinline void 4473arc_kmem_reap_now(void) 4474{ 4475 size_t i; 4476 kmem_cache_t *prev_cache = NULL; 4477 kmem_cache_t *prev_data_cache = NULL; 4478 4479 DTRACE_PROBE(arc__kmem_reap_start); 4480#ifdef _KERNEL 4481 if (aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) { 4482 /* 4483 * We are exceeding our meta-data cache limit. 4484 * Purge some DNLC entries to release holds on meta-data. 4485 */ 4486 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 4487 } 4488#if defined(__i386) 4489 /* 4490 * Reclaim unused memory from all kmem caches. 4491 */ 4492 kmem_reap(); 4493#endif 4494#endif 4495 4496 /* 4497 * If a kmem reap is already active, don't schedule more. We must 4498 * check for this because kmem_cache_reap_soon() won't actually 4499 * block on the cache being reaped (this is to prevent callers from 4500 * becoming implicitly blocked by a system-wide kmem reap -- which, 4501 * on a system with many, many full magazines, can take minutes). 4502 */ 4503 if (kmem_cache_reap_active()) 4504 return; 4505 4506 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 4507 if (zio_buf_cache[i] != prev_cache) { 4508 prev_cache = zio_buf_cache[i]; 4509 kmem_cache_reap_soon(zio_buf_cache[i]); 4510 } 4511 if (zio_data_buf_cache[i] != prev_data_cache) { 4512 prev_data_cache = zio_data_buf_cache[i]; 4513 kmem_cache_reap_soon(zio_data_buf_cache[i]); 4514 } 4515 } 4516 kmem_cache_reap_soon(abd_chunk_cache); 4517 kmem_cache_reap_soon(buf_cache); 4518 kmem_cache_reap_soon(hdr_full_cache); 4519 kmem_cache_reap_soon(hdr_l2only_cache); 4520 kmem_cache_reap_soon(range_seg_cache); 4521 4522#ifdef illumos 4523 if (zio_arena != NULL) { 4524 /* 4525 * Ask the vmem arena to reclaim unused memory from its 4526 * quantum caches. 
4527 */ 4528 vmem_qcache_reap(zio_arena); 4529 } 4530#endif 4531 DTRACE_PROBE(arc__kmem_reap_end); 4532} 4533 4534/* 4535 * Threads can block in arc_get_data_impl() waiting for this thread to evict 4536 * enough data and signal them to proceed. When this happens, the threads in 4537 * arc_get_data_impl() are sleeping while holding the hash lock for their 4538 * particular arc header. Thus, we must be careful to never sleep on a 4539 * hash lock in this thread. This is to prevent the following deadlock: 4540 * 4541 * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L", 4542 * waiting for the reclaim thread to signal it. 4543 * 4544 * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, 4545 * fails, and goes to sleep forever. 4546 * 4547 * This possible deadlock is avoided by always acquiring a hash lock 4548 * using mutex_tryenter() from arc_reclaim_thread(). 4549 */ 4550/* ARGSUSED */ 4551static void 4552arc_reclaim_thread(void *unused __unused) 4553{ 4554 hrtime_t growtime = 0; 4555 hrtime_t kmem_reap_time = 0; 4556 callb_cpr_t cpr; 4557 4558 CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); 4559 4560 mutex_enter(&arc_reclaim_lock); 4561 while (!arc_reclaim_thread_exit) { 4562 uint64_t evicted = 0; 4563 4564 /* 4565 * This is necessary in order for the mdb ::arc dcmd to 4566 * show up to date information. Since the ::arc command 4567 * does not call the kstat's update function, without 4568 * this call, the command may show stale stats for the 4569 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even 4570 * with this change, the data might be up to 1 second 4571 * out of date; but that should suffice. The arc_state_t 4572 * structures can be queried directly if more accurate 4573 * information is needed. 4574 */ 4575 if (arc_ksp != NULL) 4576 arc_ksp->ks_update(arc_ksp, KSTAT_READ); 4577 4578 mutex_exit(&arc_reclaim_lock); 4579 4580 /* 4581 * We call arc_adjust() before (possibly) calling 4582 * arc_kmem_reap_now(), so that we can wake up 4583 * arc_get_data_impl() sooner. 4584 */ 4585 evicted = arc_adjust(); 4586 4587 int64_t free_memory = arc_available_memory(); 4588 if (free_memory < 0) { 4589 hrtime_t curtime = gethrtime(); 4590 arc_no_grow = B_TRUE; 4591 arc_warm = B_TRUE; 4592 4593 /* 4594 * Wait at least zfs_grow_retry (default 60) seconds 4595 * before considering growing. 4596 */ 4597 growtime = curtime + SEC2NSEC(arc_grow_retry); 4598 4599 /* 4600 * Wait at least arc_kmem_cache_reap_retry_ms 4601 * between arc_kmem_reap_now() calls. Without 4602 * this check it is possible to end up in a 4603 * situation where we spend lots of time 4604 * reaping caches, while we're near arc_c_min. 4605 */ 4606 if (curtime >= kmem_reap_time) { 4607 arc_kmem_reap_now(); 4608 kmem_reap_time = gethrtime() + 4609 MSEC2NSEC(arc_kmem_cache_reap_retry_ms); 4610 } 4611 4612 /* 4613 * If we are still low on memory, shrink the ARC 4614 * so that we have arc_shrink_min free space. 
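 *
 * For example (illustrative values, assuming arc_shrink_shift = 7):
 * with arc_c = 8 GB and free_memory = -256 MB, to_free below is
 * (8 GB >> 7) - (-256 MB) = 64 MB + 256 MB = 320 MB.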
4615 */ 4616 free_memory = arc_available_memory(); 4617 4618 int64_t to_free = 4619 (arc_c >> arc_shrink_shift) - free_memory; 4620 if (to_free > 0) { 4621#ifdef _KERNEL 4622#ifdef illumos 4623 to_free = MAX(to_free, ptob(needfree)); 4624#endif 4625#endif 4626 evicted += arc_shrink(to_free); 4627 } 4628 } else if (free_memory < arc_c >> arc_no_grow_shift) { 4629 arc_no_grow = B_TRUE; 4630 } else if (gethrtime() >= growtime) { 4631 arc_no_grow = B_FALSE; 4632 } 4633 4634 mutex_enter(&arc_reclaim_lock); 4635 4636 /* 4637 * If evicted is zero, we couldn't evict anything via 4638 * arc_adjust(). This could be due to hash lock 4639 * collisions, but more likely due to the majority of 4640 * arc buffers being unevictable. Therefore, even if 4641 * arc_size is above arc_c, another pass is unlikely to 4642 * be helpful and could potentially cause us to enter an 4643 * infinite loop. 4644 */ 4645 if (aggsum_compare(&arc_size, arc_c) <= 0 || evicted == 0) { 4646 /* 4647 * We're either no longer overflowing, or we 4648 * can't evict anything more, so we should wake 4649 * up any threads before we go to sleep. 4650 */ 4651 cv_broadcast(&arc_reclaim_waiters_cv); 4652 4653 /* 4654 * Block until signaled, or after one second (we 4655 * might need to perform arc_kmem_reap_now() 4656 * even if we aren't being signalled) 4657 */ 4658 CALLB_CPR_SAFE_BEGIN(&cpr); 4659 (void) cv_timedwait_hires(&arc_reclaim_thread_cv, 4660 &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); 4661 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); 4662 } 4663 } 4664 4665 arc_reclaim_thread_exit = B_FALSE; 4666 cv_broadcast(&arc_reclaim_thread_cv); 4667 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ 4668 thread_exit(); 4669} 4670 4671static u_int arc_dnlc_evicts_arg; 4672extern struct vfsops zfs_vfsops; 4673 4674static void 4675arc_dnlc_evicts_thread(void *dummy __unused) 4676{ 4677 callb_cpr_t cpr; 4678 u_int percent; 4679 4680 CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG); 4681 4682 mutex_enter(&arc_dnlc_evicts_lock); 4683 while (!arc_dnlc_evicts_thread_exit) { 4684 CALLB_CPR_SAFE_BEGIN(&cpr); 4685 (void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); 4686 CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock); 4687 if (arc_dnlc_evicts_arg != 0) { 4688 percent = arc_dnlc_evicts_arg; 4689 mutex_exit(&arc_dnlc_evicts_lock); 4690#ifdef _KERNEL 4691 vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops); 4692#endif 4693 mutex_enter(&arc_dnlc_evicts_lock); 4694 /* 4695 * Clear our token only after the vnlru_free() 4696 * pass is done, to avoid false queueing of 4697 * the requests. 4698 */ 4699 arc_dnlc_evicts_arg = 0; 4700 } 4701 } 4702 arc_dnlc_evicts_thread_exit = FALSE; 4703 cv_broadcast(&arc_dnlc_evicts_cv); 4704 CALLB_CPR_EXIT(&cpr); 4705 thread_exit(); 4706} 4707 4708void 4709dnlc_reduce_cache(void *arg) 4710{ 4711 u_int percent; 4712 4713 percent = (u_int)(uintptr_t)arg; 4714 mutex_enter(&arc_dnlc_evicts_lock); 4715 if (arc_dnlc_evicts_arg == 0) { 4716 arc_dnlc_evicts_arg = percent; 4717 cv_broadcast(&arc_dnlc_evicts_cv); 4718 } 4719 mutex_exit(&arc_dnlc_evicts_lock); 4720} 4721 4722/* 4723 * Adapt arc info given the number of bytes we are trying to add and 4724 * the state that we are coming from. This function is only called 4725 * when we are adding new content to the cache.
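 *
 * For example (illustrative sizes): with a 1 GB MRU ghost list, a 4 GB MFU
 * ghost list and a 128 KB hit in the MRU ghost list, mult below is 4, so
 * arc_p grows by 512 KB (clamped at arc_c - arc_p_min); a hit in the MFU
 * ghost list shrinks arc_p symmetrically.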
4726 */ 4727static void 4728arc_adapt(int bytes, arc_state_t *state) 4729{ 4730 int mult; 4731 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 4732 int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size); 4733 int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size); 4734 4735 if (state == arc_l2c_only) 4736 return; 4737 4738 ASSERT(bytes > 0); 4739 /* 4740 * Adapt the target size of the MRU list: 4741 * - if we just hit in the MRU ghost list, then increase 4742 * the target size of the MRU list. 4743 * - if we just hit in the MFU ghost list, then increase 4744 * the target size of the MFU list by decreasing the 4745 * target size of the MRU list. 4746 */ 4747 if (state == arc_mru_ghost) { 4748 mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); 4749 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 4750 4751 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 4752 } else if (state == arc_mfu_ghost) { 4753 uint64_t delta; 4754 4755 mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size); 4756 mult = MIN(mult, 10); 4757 4758 delta = MIN(bytes * mult, arc_p); 4759 arc_p = MAX(arc_p_min, arc_p - delta); 4760 } 4761 ASSERT((int64_t)arc_p >= 0); 4762 4763 if (arc_reclaim_needed()) { 4764 cv_signal(&arc_reclaim_thread_cv); 4765 return; 4766 } 4767 4768 if (arc_no_grow) 4769 return; 4770 4771 if (arc_c >= arc_c_max) 4772 return; 4773 4774 /* 4775 * If we're within (2 * maxblocksize) bytes of the target 4776 * cache size, increment the target cache size 4777 */ 4778 if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) > 4779 0) { 4780 DTRACE_PROBE1(arc__inc_adapt, int, bytes); 4781 atomic_add_64(&arc_c, (int64_t)bytes); 4782 if (arc_c > arc_c_max) 4783 arc_c = arc_c_max; 4784 else if (state == arc_anon) 4785 atomic_add_64(&arc_p, (int64_t)bytes); 4786 if (arc_p > arc_c) 4787 arc_p = arc_c; 4788 } 4789 ASSERT((int64_t)arc_p >= 0); 4790} 4791 4792/* 4793 * Check if arc_size has grown past our upper threshold, determined by 4794 * zfs_arc_overflow_shift. 4795 */ 4796static boolean_t 4797arc_is_overflowing(void) 4798{ 4799 /* Always allow at least one block of overflow */ 4800 uint64_t overflow = MAX(SPA_MAXBLOCKSIZE, 4801 arc_c >> zfs_arc_overflow_shift); 4802 4803 /* 4804 * We just compare the lower bound here for performance reasons. Our 4805 * primary goals are to make sure that the arc never grows without 4806 * bound, and that it can reach its maximum size. This check 4807 * accomplishes both goals. The maximum amount we could run over by is 4808 * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block 4809 * in the ARC. In practice, that's in the tens of MB, which is low 4810 * enough to be safe. 
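 *
 * As a rough worked example (illustrative values only): with a borrow
 * multiplier of 16, 32 CPUs, and an average 32 KB block, that bound is
 * 2 * 16 * 32 * 32 KB = 32 MB.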
4811 */ 4812 return (aggsum_lower_bound(&arc_size) >= arc_c + overflow); 4813} 4814 4815static abd_t * 4816arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 4817{ 4818 arc_buf_contents_t type = arc_buf_type(hdr); 4819 4820 arc_get_data_impl(hdr, size, tag); 4821 if (type == ARC_BUFC_METADATA) { 4822 return (abd_alloc(size, B_TRUE)); 4823 } else { 4824 ASSERT(type == ARC_BUFC_DATA); 4825 return (abd_alloc(size, B_FALSE)); 4826 } 4827} 4828 4829static void * 4830arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 4831{ 4832 arc_buf_contents_t type = arc_buf_type(hdr); 4833 4834 arc_get_data_impl(hdr, size, tag); 4835 if (type == ARC_BUFC_METADATA) { 4836 return (zio_buf_alloc(size)); 4837 } else { 4838 ASSERT(type == ARC_BUFC_DATA); 4839 return (zio_data_buf_alloc(size)); 4840 } 4841} 4842 4843/* 4844 * Allocate a block and return it to the caller. If we are hitting the 4845 * hard limit for the cache size, we must sleep, waiting for the eviction 4846 * thread to catch up. If we're past the target size but below the hard 4847 * limit, we'll only signal the reclaim thread and continue on. 4848 */ 4849static void 4850arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 4851{ 4852 arc_state_t *state = hdr->b_l1hdr.b_state; 4853 arc_buf_contents_t type = arc_buf_type(hdr); 4854 4855 arc_adapt(size, state); 4856 4857 /* 4858 * If arc_size is currently overflowing, and has grown past our 4859 * upper limit, we must be adding data faster than the evict 4860 * thread can evict. Thus, to ensure we don't compound the 4861 * problem by adding more data and forcing arc_size to grow even 4862 * further past its target size, we halt and wait for the 4863 * eviction thread to catch up. 4864 * 4865 * It's also possible that the reclaim thread is unable to evict 4866 * enough buffers to get arc_size below the overflow limit (e.g. 4867 * due to buffers being un-evictable, or hash lock collisions). 4868 * In this case, we want to proceed regardless of whether we're 4869 * overflowing; thus we don't use a while loop here. 4870 */ 4871 if (arc_is_overflowing()) { 4872 mutex_enter(&arc_reclaim_lock); 4873 4874 /* 4875 * Now that we've acquired the lock, we may no longer be 4876 * over the overflow limit, so let's check. 4877 * 4878 * We're ignoring the case of spurious wake ups. If that 4879 * were to happen, it'd let this thread consume an ARC 4880 * buffer before it should have (i.e. before we're under 4881 * the overflow limit and were signalled by the reclaim 4882 * thread). As long as that is a rare occurrence, it 4883 * shouldn't cause any harm. 4884 */ 4885 if (arc_is_overflowing()) { 4886 cv_signal(&arc_reclaim_thread_cv); 4887 cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); 4888 } 4889 4890 mutex_exit(&arc_reclaim_lock); 4891 } 4892 4893 VERIFY3U(hdr->b_type, ==, type); 4894 if (type == ARC_BUFC_METADATA) { 4895 arc_space_consume(size, ARC_SPACE_META); 4896 } else { 4897 arc_space_consume(size, ARC_SPACE_DATA); 4898 } 4899 4900 /* 4901 * Update the state size. Note that ghost states have a 4902 * "ghost size" and so don't need to be updated. 4903 */ 4904 if (!GHOST_STATE(state)) { 4905 4906 (void) refcount_add_many(&state->arcs_size, size, tag); 4907 4908 /* 4909 * If this is reached via arc_read, the link is 4910 * protected by the hash lock. If reached via 4911 * arc_buf_alloc, the header should not be accessed by 4912 * any other thread.
And, if reached via arc_read_done, 4913 * the hash lock will protect it if it's found in the 4914 * hash table; otherwise no other thread should be 4915 * trying to [add|remove]_reference it. 4916 */ 4917 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 4918 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4919 (void) refcount_add_many(&state->arcs_esize[type], 4920 size, tag); 4921 } 4922 4923 /* 4924 * If we are growing the cache, and we are adding anonymous 4925 * data, and we have outgrown arc_p, update arc_p 4926 */ 4927 if (aggsum_compare(&arc_size, arc_c) < 0 && 4928 hdr->b_l1hdr.b_state == arc_anon && 4929 (refcount_count(&arc_anon->arcs_size) + 4930 refcount_count(&arc_mru->arcs_size) > arc_p)) 4931 arc_p = MIN(arc_c, arc_p + size); 4932 } 4933 ARCSTAT_BUMP(arcstat_allocated); 4934} 4935 4936static void 4937arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag) 4938{ 4939 arc_free_data_impl(hdr, size, tag); 4940 abd_free(abd); 4941} 4942 4943static void 4944arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) 4945{ 4946 arc_buf_contents_t type = arc_buf_type(hdr); 4947 4948 arc_free_data_impl(hdr, size, tag); 4949 if (type == ARC_BUFC_METADATA) { 4950 zio_buf_free(buf, size); 4951 } else { 4952 ASSERT(type == ARC_BUFC_DATA); 4953 zio_data_buf_free(buf, size); 4954 } 4955} 4956 4957/* 4958 * Free the arc data buffer. 4959 */ 4960static void 4961arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 4962{ 4963 arc_state_t *state = hdr->b_l1hdr.b_state; 4964 arc_buf_contents_t type = arc_buf_type(hdr); 4965 4966 /* protected by hash lock, if in the hash table */ 4967 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 4968 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4969 ASSERT(state != arc_anon && state != arc_l2c_only); 4970 4971 (void) refcount_remove_many(&state->arcs_esize[type], 4972 size, tag); 4973 } 4974 (void) refcount_remove_many(&state->arcs_size, size, tag); 4975 4976 VERIFY3U(hdr->b_type, ==, type); 4977 if (type == ARC_BUFC_METADATA) { 4978 arc_space_return(size, ARC_SPACE_META); 4979 } else { 4980 ASSERT(type == ARC_BUFC_DATA); 4981 arc_space_return(size, ARC_SPACE_DATA); 4982 } 4983} 4984 4985/* 4986 * This routine is called whenever a buffer is accessed. 4987 * NOTE: the hash lock is dropped in this function. 4988 */ 4989static void 4990arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 4991{ 4992 clock_t now; 4993 4994 ASSERT(MUTEX_HELD(hash_lock)); 4995 ASSERT(HDR_HAS_L1HDR(hdr)); 4996 4997 if (hdr->b_l1hdr.b_state == arc_anon) { 4998 /* 4999 * This buffer is not in the cache, and does not 5000 * appear in our "ghost" list. Add the new buffer 5001 * to the MRU state. 5002 */ 5003 5004 ASSERT0(hdr->b_l1hdr.b_arc_access); 5005 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5006 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 5007 arc_change_state(arc_mru, hdr, hash_lock); 5008 5009 } else if (hdr->b_l1hdr.b_state == arc_mru) { 5010 now = ddi_get_lbolt(); 5011 5012 /* 5013 * If this buffer is here because of a prefetch, then either: 5014 * - clear the flag if this is a "referencing" read 5015 * (any subsequent access will bump this into the MFU state). 5016 * or 5017 * - move the buffer to the head of the list if this is 5018 * another prefetch (to make it less likely to be evicted). 
5019 */ 5020 if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { 5021 if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 5022 /* link protected by hash lock */ 5023 ASSERT(multilist_link_active( 5024 &hdr->b_l1hdr.b_arc_node)); 5025 } else { 5026 arc_hdr_clear_flags(hdr, 5027 ARC_FLAG_PREFETCH | 5028 ARC_FLAG_PRESCIENT_PREFETCH); 5029 ARCSTAT_BUMP(arcstat_mru_hits); 5030 } 5031 hdr->b_l1hdr.b_arc_access = now; 5032 return; 5033 } 5034 5035 /* 5036 * This buffer has been "accessed" only once so far, 5037 * but it is still in the cache. Move it to the MFU 5038 * state. 5039 */ 5040 if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { 5041 /* 5042 * More than 125ms have passed since we 5043 * instantiated this buffer. Move it to the 5044 * most frequently used state. 5045 */ 5046 hdr->b_l1hdr.b_arc_access = now; 5047 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 5048 arc_change_state(arc_mfu, hdr, hash_lock); 5049 } 5050 ARCSTAT_BUMP(arcstat_mru_hits); 5051 } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { 5052 arc_state_t *new_state; 5053 /* 5054 * This buffer has been "accessed" recently, but 5055 * was evicted from the cache. Move it to the 5056 * MFU state. 5057 */ 5058 5059 if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { 5060 new_state = arc_mru; 5061 if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { 5062 arc_hdr_clear_flags(hdr, 5063 ARC_FLAG_PREFETCH | 5064 ARC_FLAG_PRESCIENT_PREFETCH); 5065 } 5066 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 5067 } else { 5068 new_state = arc_mfu; 5069 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 5070 } 5071 5072 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5073 arc_change_state(new_state, hdr, hash_lock); 5074 5075 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 5076 } else if (hdr->b_l1hdr.b_state == arc_mfu) { 5077 /* 5078 * This buffer has been accessed more than once and is 5079 * still in the cache. Keep it in the MFU state. 5080 * 5081 * NOTE: an add_reference() that occurred when we did 5082 * the arc_read() will have kicked this off the list. 5083 * If it was a prefetch, we will explicitly move it to 5084 * the head of the list now. 5085 */ 5086 5087 ARCSTAT_BUMP(arcstat_mfu_hits); 5088 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5089 } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { 5090 arc_state_t *new_state = arc_mfu; 5091 /* 5092 * This buffer has been accessed more than once but has 5093 * been evicted from the cache. Move it back to the 5094 * MFU state. 5095 */ 5096 5097 if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { 5098 /* 5099 * This is a prefetch access... 5100 * move this block back to the MRU state. 5101 */ 5102 new_state = arc_mru; 5103 } 5104 5105 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5106 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 5107 arc_change_state(new_state, hdr, hash_lock); 5108 5109 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 5110 } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { 5111 /* 5112 * This buffer is on the 2nd Level ARC. 5113 */ 5114 5115 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5116 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 5117 arc_change_state(arc_mfu, hdr, hash_lock); 5118 } else { 5119 ASSERT(!"invalid arc state"); 5120 } 5121} 5122 5123/* 5124 * This routine is called by dbuf_hold() to update the arc_access() state 5125 * which otherwise would be skipped for entries in the dbuf cache. 
5126 */ 5127void 5128arc_buf_access(arc_buf_t *buf) 5129{ 5130 mutex_enter(&buf->b_evict_lock); 5131 arc_buf_hdr_t *hdr = buf->b_hdr; 5132 5133 /* 5134 * Avoid taking the hash_lock when possible as an optimization. 5135 * The header must be checked again under the hash_lock in order 5136 * to handle the case where it is concurrently being released. 5137 */ 5138 if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { 5139 mutex_exit(&buf->b_evict_lock); 5140 ARCSTAT_BUMP(arcstat_access_skip); 5141 return; 5142 } 5143 5144 kmutex_t *hash_lock = HDR_LOCK(hdr); 5145 mutex_enter(hash_lock); 5146 5147 if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { 5148 mutex_exit(hash_lock); 5149 mutex_exit(&buf->b_evict_lock); 5150 ARCSTAT_BUMP(arcstat_access_skip); 5151 return; 5152 } 5153 5154 mutex_exit(&buf->b_evict_lock); 5155 5156 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 5157 hdr->b_l1hdr.b_state == arc_mfu); 5158 5159 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 5160 arc_access(hdr, hash_lock); 5161 mutex_exit(hash_lock); 5162 5163 ARCSTAT_BUMP(arcstat_hits); 5164 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 5165 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); 5166} 5167 5168/* a generic arc_read_done_func_t which you can use */ 5169/* ARGSUSED */ 5170void 5171arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, 5172 arc_buf_t *buf, void *arg) 5173{ 5174 if (buf == NULL) 5175 return; 5176 5177 bcopy(buf->b_data, arg, arc_buf_size(buf)); 5178 arc_buf_destroy(buf, arg); 5179} 5180 5181/* a generic arc_read_done_func_t */ 5182/* ARGSUSED */ 5183void 5184arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, 5185 arc_buf_t *buf, void *arg) 5186{ 5187 arc_buf_t **bufp = arg; 5188 if (buf == NULL) { 5189 ASSERT(zio == NULL || zio->io_error != 0); 5190 *bufp = NULL; 5191 } else { 5192 ASSERT(zio == NULL || zio->io_error == 0); 5193 *bufp = buf; 5194 ASSERT(buf->b_data != NULL); 5195 } 5196} 5197 5198static void 5199arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) 5200{ 5201 if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { 5202 ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); 5203 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); 5204 } else { 5205 if (HDR_COMPRESSION_ENABLED(hdr)) { 5206 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, 5207 BP_GET_COMPRESS(bp)); 5208 } 5209 ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); 5210 ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); 5211 } 5212} 5213 5214static void 5215arc_read_done(zio_t *zio) 5216{ 5217 arc_buf_hdr_t *hdr = zio->io_private; 5218 kmutex_t *hash_lock = NULL; 5219 arc_callback_t *callback_list; 5220 arc_callback_t *acb; 5221 boolean_t freeable = B_FALSE; 5222 boolean_t no_zio_error = (zio->io_error == 0); 5223 5224 /* 5225 * The hdr was inserted into hash-table and removed from lists 5226 * prior to starting I/O. We should find this header, since 5227 * it's in the hash table, and it should be legit since it's 5228 * not possible to evict it during the I/O. The only possible 5229 * reason for it not to be found is if we were freed during the 5230 * read. 
5231 */ 5232 if (HDR_IN_HASH_TABLE(hdr)) { 5233 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); 5234 ASSERT3U(hdr->b_dva.dva_word[0], ==, 5235 BP_IDENTITY(zio->io_bp)->dva_word[0]); 5236 ASSERT3U(hdr->b_dva.dva_word[1], ==, 5237 BP_IDENTITY(zio->io_bp)->dva_word[1]); 5238 5239 arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, 5240 &hash_lock); 5241 5242 ASSERT((found == hdr && 5243 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 5244 (found == hdr && HDR_L2_READING(hdr))); 5245 ASSERT3P(hash_lock, !=, NULL); 5246 } 5247 5248 if (no_zio_error) { 5249 /* byteswap if necessary */ 5250 if (BP_SHOULD_BYTESWAP(zio->io_bp)) { 5251 if (BP_GET_LEVEL(zio->io_bp) > 0) { 5252 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; 5253 } else { 5254 hdr->b_l1hdr.b_byteswap = 5255 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 5256 } 5257 } else { 5258 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; 5259 } 5260 } 5261 5262 arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); 5263 if (l2arc_noprefetch && HDR_PREFETCH(hdr)) 5264 arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); 5265 5266 callback_list = hdr->b_l1hdr.b_acb; 5267 ASSERT3P(callback_list, !=, NULL); 5268 5269 if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) { 5270 /* 5271 * Only call arc_access on anonymous buffers. This is because 5272 * if we've issued an I/O for an evicted buffer, we've already 5273 * called arc_access (to prevent any simultaneous readers from 5274 * getting confused). 5275 */ 5276 arc_access(hdr, hash_lock); 5277 } 5278 5279 /* 5280 * If a read request has a callback (i.e. acb_done is not NULL), then we 5281 * make a buf containing the data according to the parameters which were 5282 * passed in. The implementation of arc_buf_alloc_impl() ensures that we 5283 * aren't needlessly decompressing the data multiple times. 5284 */ 5285 int callback_cnt = 0; 5286 for (acb = callback_list; acb != NULL; acb = acb->acb_next) { 5287 if (!acb->acb_done) 5288 continue; 5289 5290 callback_cnt++; 5291 5292 if (no_zio_error) { 5293 int error = arc_buf_alloc_impl(hdr, acb->acb_private, 5294 acb->acb_compressed, zio->io_error == 0, 5295 &acb->acb_buf); 5296 if (error != 0) { 5297 /* 5298 * Decompression failed. Set io_error 5299 * so that when we call acb_done (below), 5300 * we will indicate that the read failed. 5301 * Note that in the unusual case where one 5302 * callback is compressed and another 5303 * uncompressed, we will mark all of them 5304 * as failed, even though the uncompressed 5305 * one can't actually fail. In this case, 5306 * the hdr will not be anonymous, because 5307 * if there are multiple callbacks, it's 5308 * because multiple threads found the same 5309 * arc buf in the hash table. 5310 */ 5311 zio->io_error = error; 5312 } 5313 } 5314 } 5315 /* 5316 * If there are multiple callbacks, we must have the hash lock, 5317 * because the only way for multiple threads to find this hdr is 5318 * in the hash table. This ensures that if there are multiple 5319 * callbacks, the hdr is not anonymous. If it were anonymous, 5320 * we couldn't use arc_buf_destroy() in the error case below. 
5321 */ 5322 ASSERT(callback_cnt < 2 || hash_lock != NULL); 5323 5324 hdr->b_l1hdr.b_acb = NULL; 5325 arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 5326 if (callback_cnt == 0) { 5327 ASSERT(HDR_PREFETCH(hdr)); 5328 ASSERT0(hdr->b_l1hdr.b_bufcnt); 5329 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 5330 } 5331 5332 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || 5333 callback_list != NULL); 5334 5335 if (no_zio_error) { 5336 arc_hdr_verify(hdr, zio->io_bp); 5337 } else { 5338 arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); 5339 if (hdr->b_l1hdr.b_state != arc_anon) 5340 arc_change_state(arc_anon, hdr, hash_lock); 5341 if (HDR_IN_HASH_TABLE(hdr)) 5342 buf_hash_remove(hdr); 5343 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 5344 } 5345 5346 /* 5347 * Broadcast before we drop the hash_lock to avoid the possibility 5348 * that the hdr (and hence the cv) might be freed before we get to 5349 * the cv_broadcast(). 5350 */ 5351 cv_broadcast(&hdr->b_l1hdr.b_cv); 5352 5353 if (hash_lock != NULL) { 5354 mutex_exit(hash_lock); 5355 } else { 5356 /* 5357 * This block was freed while we waited for the read to 5358 * complete. It has been removed from the hash table and 5359 * moved to the anonymous state (so that it won't show up 5360 * in the cache). 5361 */ 5362 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 5363 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 5364 } 5365 5366 /* execute each callback and free its structure */ 5367 while ((acb = callback_list) != NULL) { 5368 if (acb->acb_done != NULL) { 5369 if (zio->io_error != 0 && acb->acb_buf != NULL) { 5370 /* 5371 * If arc_buf_alloc_impl() fails during 5372 * decompression, the buf will still be 5373 * allocated, and needs to be freed here. 5374 */ 5375 arc_buf_destroy(acb->acb_buf, acb->acb_private); 5376 acb->acb_buf = NULL; 5377 } 5378 acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, 5379 acb->acb_buf, acb->acb_private); 5380 } 5381 5382 if (acb->acb_zio_dummy != NULL) { 5383 acb->acb_zio_dummy->io_error = zio->io_error; 5384 zio_nowait(acb->acb_zio_dummy); 5385 } 5386 5387 callback_list = acb->acb_next; 5388 kmem_free(acb, sizeof (arc_callback_t)); 5389 } 5390 5391 if (freeable) 5392 arc_hdr_destroy(hdr); 5393} 5394 5395/* 5396 * "Read" the block at the specified DVA (in bp) via the 5397 * cache. If the block is found in the cache, invoke the provided 5398 * callback immediately and return. Note that the `zio' parameter 5399 * in the callback will be NULL in this case, since no IO was 5400 * required. If the block is not in the cache pass the read request 5401 * on to the spa with a substitute callback function, so that the 5402 * requested block will be added to the cache. 5403 * 5404 * If a read request arrives for a block that has a read in-progress, 5405 * either wait for the in-progress read to complete (and return the 5406 * results); or, if this is a read with a "done" func, add a record 5407 * to the read to invoke the "done" func when the read completes, 5408 * and return; or just return. 5409 * 5410 * arc_read_done() will invoke all the requested "done" functions 5411 * for readers of this block. 
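 *
 * Illustrative synchronous-read sketch (a hypothetical caller; real callers
 * live in the DMU and differ in error handling -- "spa", "bp" and "zb" are
 * assumed to be in scope):
 *
 *	arc_flags_t aflags = ARC_FLAG_WAIT;
 *	arc_buf_t *abuf = NULL;
 *	int err;
 *
 *	err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 *	if (err == 0 && abuf != NULL) {
 *		consume abuf->b_data (arc_buf_size(abuf) bytes), then
 *		arc_buf_destroy(abuf, &abuf);
 *	}
 *
 * With ARC_FLAG_NOWAIT and a non-NULL "done" callback the call returns
 * immediately and the callback fires from arc_read_done() (or inline, with
 * a NULL zio, on a cache hit).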
5412 */ 5413int 5414arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, 5415 void *private, zio_priority_t priority, int zio_flags, 5416 arc_flags_t *arc_flags, const zbookmark_phys_t *zb) 5417{ 5418 arc_buf_hdr_t *hdr = NULL; 5419 kmutex_t *hash_lock = NULL; 5420 zio_t *rzio; 5421 uint64_t guid = spa_load_guid(spa); 5422 boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0; 5423 int rc = 0; 5424 5425 ASSERT(!BP_IS_EMBEDDED(bp) || 5426 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 5427 5428top: 5429 if (!BP_IS_EMBEDDED(bp)) { 5430 /* 5431 * Embedded BP's have no DVA and require no I/O to "read". 5432 * Create an anonymous arc buf to back it. 5433 */ 5434 hdr = buf_hash_find(guid, bp, &hash_lock); 5435 } 5436 5437 if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) { 5438 arc_buf_t *buf = NULL; 5439 *arc_flags |= ARC_FLAG_CACHED; 5440 5441 if (HDR_IO_IN_PROGRESS(hdr)) { 5442 zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; 5443 5444 ASSERT3P(head_zio, !=, NULL); 5445 if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && 5446 priority == ZIO_PRIORITY_SYNC_READ) { 5447 /* 5448 * This is a sync read that needs to wait for 5449 * an in-flight async read. Request that the 5450 * zio have its priority upgraded. 5451 */ 5452 zio_change_priority(head_zio, priority); 5453 DTRACE_PROBE1(arc__async__upgrade__sync, 5454 arc_buf_hdr_t *, hdr); 5455 ARCSTAT_BUMP(arcstat_async_upgrade_sync); 5456 } 5457 if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 5458 arc_hdr_clear_flags(hdr, 5459 ARC_FLAG_PREDICTIVE_PREFETCH); 5460 } 5461 5462 if (*arc_flags & ARC_FLAG_WAIT) { 5463 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); 5464 mutex_exit(hash_lock); 5465 goto top; 5466 } 5467 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 5468 5469 if (done) { 5470 arc_callback_t *acb = NULL; 5471 5472 acb = kmem_zalloc(sizeof (arc_callback_t), 5473 KM_SLEEP); 5474 acb->acb_done = done; 5475 acb->acb_private = private; 5476 acb->acb_compressed = compressed_read; 5477 if (pio != NULL) 5478 acb->acb_zio_dummy = zio_null(pio, 5479 spa, NULL, NULL, NULL, zio_flags); 5480 5481 ASSERT3P(acb->acb_done, !=, NULL); 5482 acb->acb_zio_head = head_zio; 5483 acb->acb_next = hdr->b_l1hdr.b_acb; 5484 hdr->b_l1hdr.b_acb = acb; 5485 mutex_exit(hash_lock); 5486 return (0); 5487 } 5488 mutex_exit(hash_lock); 5489 return (0); 5490 } 5491 5492 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 5493 hdr->b_l1hdr.b_state == arc_mfu); 5494 5495 if (done) { 5496 if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 5497 /* 5498 * This is a demand read which does not have to 5499 * wait for i/o because we did a predictive 5500 * prefetch i/o for it, which has completed. 5501 */ 5502 DTRACE_PROBE1( 5503 arc__demand__hit__predictive__prefetch, 5504 arc_buf_hdr_t *, hdr); 5505 ARCSTAT_BUMP( 5506 arcstat_demand_hit_predictive_prefetch); 5507 arc_hdr_clear_flags(hdr, 5508 ARC_FLAG_PREDICTIVE_PREFETCH); 5509 } 5510 5511 if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { 5512 ARCSTAT_BUMP( 5513 arcstat_demand_hit_prescient_prefetch); 5514 arc_hdr_clear_flags(hdr, 5515 ARC_FLAG_PRESCIENT_PREFETCH); 5516 } 5517 5518 ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); 5519 /* Get a buf with the desired data in it. 
*/ 5520 rc = arc_buf_alloc_impl(hdr, private, 5521 compressed_read, B_TRUE, &buf); 5522 if (rc != 0) { 5523 arc_buf_destroy(buf, private); 5524 buf = NULL; 5525 } 5526 ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || 5527 rc == 0 || rc != ENOENT); 5528 } else if (*arc_flags & ARC_FLAG_PREFETCH && 5529 refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 5530 arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); 5531 } 5532 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 5533 arc_access(hdr, hash_lock); 5534 if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) 5535 arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); 5536 if (*arc_flags & ARC_FLAG_L2CACHE) 5537 arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); 5538 mutex_exit(hash_lock); 5539 ARCSTAT_BUMP(arcstat_hits); 5540 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 5541 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 5542 data, metadata, hits); 5543 5544 if (done) 5545 done(NULL, zb, bp, buf, private); 5546 } else { 5547 uint64_t lsize = BP_GET_LSIZE(bp); 5548 uint64_t psize = BP_GET_PSIZE(bp); 5549 arc_callback_t *acb; 5550 vdev_t *vd = NULL; 5551 uint64_t addr = 0; 5552 boolean_t devw = B_FALSE; 5553 uint64_t size; 5554 5555 if (hdr == NULL) { 5556 /* this block is not in the cache */ 5557 arc_buf_hdr_t *exists = NULL; 5558 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 5559 hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, 5560 BP_GET_COMPRESS(bp), type); 5561 5562 if (!BP_IS_EMBEDDED(bp)) { 5563 hdr->b_dva = *BP_IDENTITY(bp); 5564 hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 5565 exists = buf_hash_insert(hdr, &hash_lock); 5566 } 5567 if (exists != NULL) { 5568 /* somebody beat us to the hash insert */ 5569 mutex_exit(hash_lock); 5570 buf_discard_identity(hdr); 5571 arc_hdr_destroy(hdr); 5572 goto top; /* restart the IO request */ 5573 } 5574 } else { 5575 /* 5576 * This block is in the ghost cache. If it was L2-only 5577 * (and thus didn't have an L1 hdr), we realloc the 5578 * header to add an L1 hdr. 5579 */ 5580 if (!HDR_HAS_L1HDR(hdr)) { 5581 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, 5582 hdr_full_cache); 5583 } 5584 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 5585 ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); 5586 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 5587 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 5588 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 5589 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); 5590 5591 /* 5592 * This is a delicate dance that we play here. 5593 * This hdr is in the ghost list so we access it 5594 * to move it out of the ghost list before we 5595 * initiate the read. If it's a prefetch then 5596 * it won't have a callback so we'll remove the 5597 * reference that arc_buf_alloc_impl() created. We 5598 * do this after we've called arc_access() to 5599 * avoid hitting an assert in remove_reference(). 5600 */ 5601 arc_access(hdr, hash_lock); 5602 arc_hdr_alloc_pabd(hdr); 5603 } 5604 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 5605 size = arc_hdr_size(hdr); 5606 5607 /* 5608 * If compression is enabled on the hdr, then will do 5609 * RAW I/O and will store the compressed data in the hdr's 5610 * data block. Otherwise, the hdr's data block will contain 5611 * the uncompressed data. 
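 *
 * For example (illustrative numbers): a 128K-lsize block that compressed to
 * 8K on disk is read as an 8K RAW zio directly into b_pabd, and is only
 * decompressed later, when arc_buf_alloc_impl() has to materialize an
 * uncompressed arc_buf_t for a consumer.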
5612 */ 5613 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { 5614 zio_flags |= ZIO_FLAG_RAW; 5615 } 5616 5617 if (*arc_flags & ARC_FLAG_PREFETCH) 5618 arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); 5619 if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) 5620 arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); 5621 5622 if (*arc_flags & ARC_FLAG_L2CACHE) 5623 arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); 5624 if (BP_GET_LEVEL(bp) > 0) 5625 arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); 5626 if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) 5627 arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); 5628 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); 5629 5630 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 5631 acb->acb_done = done; 5632 acb->acb_private = private; 5633 acb->acb_compressed = compressed_read; 5634 5635 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 5636 hdr->b_l1hdr.b_acb = acb; 5637 arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 5638 5639 if (HDR_HAS_L2HDR(hdr) && 5640 (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { 5641 devw = hdr->b_l2hdr.b_dev->l2ad_writing; 5642 addr = hdr->b_l2hdr.b_daddr; 5643 /* 5644 * Lock out L2ARC device removal. 5645 */ 5646 if (vdev_is_dead(vd) || 5647 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 5648 vd = NULL; 5649 } 5650 5651 /* 5652 * We count both async reads and scrub IOs as asynchronous so 5653 * that both can be upgraded in the event of a cache hit while 5654 * the read IO is still in-flight. 5655 */ 5656 if (priority == ZIO_PRIORITY_ASYNC_READ || 5657 priority == ZIO_PRIORITY_SCRUB) 5658 arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); 5659 else 5660 arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); 5661 5662 /* 5663 * At this point, we have a level 1 cache miss. Try again in 5664 * L2ARC if possible. 5665 */ 5666 ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); 5667 5668 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 5669 uint64_t, lsize, zbookmark_phys_t *, zb); 5670 ARCSTAT_BUMP(arcstat_misses); 5671 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 5672 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 5673 data, metadata, misses); 5674#ifdef _KERNEL 5675#ifdef RACCT 5676 if (racct_enable) { 5677 PROC_LOCK(curproc); 5678 racct_add_force(curproc, RACCT_READBPS, size); 5679 racct_add_force(curproc, RACCT_READIOPS, 1); 5680 PROC_UNLOCK(curproc); 5681 } 5682#endif /* RACCT */ 5683 curthread->td_ru.ru_inblock++; 5684#endif 5685 5686 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 5687 /* 5688 * Read from the L2ARC if the following are true: 5689 * 1. The L2ARC vdev was previously cached. 5690 * 2. This buffer still has L2ARC metadata. 5691 * 3. This buffer isn't currently writing to the L2ARC. 5692 * 4. The L2ARC entry wasn't evicted, which may 5693 * also have invalidated the vdev. 5694 * 5. This isn't a prefetch with l2arc_noprefetch set.
5695 */ 5696 if (HDR_HAS_L2HDR(hdr) && 5697 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 5698 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 5699 l2arc_read_callback_t *cb; 5700 abd_t *abd; 5701 uint64_t asize; 5702 5703 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 5704 ARCSTAT_BUMP(arcstat_l2_hits); 5705 5706 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 5707 KM_SLEEP); 5708 cb->l2rcb_hdr = hdr; 5709 cb->l2rcb_bp = *bp; 5710 cb->l2rcb_zb = *zb; 5711 cb->l2rcb_flags = zio_flags; 5712 5713 asize = vdev_psize_to_asize(vd, size); 5714 if (asize != size) { 5715 abd = abd_alloc_for_io(asize, 5716 HDR_ISTYPE_METADATA(hdr)); 5717 cb->l2rcb_abd = abd; 5718 } else { 5719 abd = hdr->b_l1hdr.b_pabd; 5720 } 5721 5722 ASSERT(addr >= VDEV_LABEL_START_SIZE && 5723 addr + asize <= vd->vdev_psize - 5724 VDEV_LABEL_END_SIZE); 5725 5726 /* 5727 * l2arc read. The SCL_L2ARC lock will be 5728 * released by l2arc_read_done(). 5729 * Issue a null zio if the underlying buffer 5730 * was squashed to zero size by compression. 5731 */ 5732 ASSERT3U(HDR_GET_COMPRESS(hdr), !=, 5733 ZIO_COMPRESS_EMPTY); 5734 rzio = zio_read_phys(pio, vd, addr, 5735 asize, abd, 5736 ZIO_CHECKSUM_OFF, 5737 l2arc_read_done, cb, priority, 5738 zio_flags | ZIO_FLAG_DONT_CACHE | 5739 ZIO_FLAG_CANFAIL | 5740 ZIO_FLAG_DONT_PROPAGATE | 5741 ZIO_FLAG_DONT_RETRY, B_FALSE); 5742 acb->acb_zio_head = rzio; 5743 5744 if (hash_lock != NULL) 5745 mutex_exit(hash_lock); 5746 5747 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 5748 zio_t *, rzio); 5749 ARCSTAT_INCR(arcstat_l2_read_bytes, size); 5750 5751 if (*arc_flags & ARC_FLAG_NOWAIT) { 5752 zio_nowait(rzio); 5753 return (0); 5754 } 5755 5756 ASSERT(*arc_flags & ARC_FLAG_WAIT); 5757 if (zio_wait(rzio) == 0) 5758 return (0); 5759 5760 /* l2arc read error; goto zio_read() */ 5761 if (hash_lock != NULL) 5762 mutex_enter(hash_lock); 5763 } else { 5764 DTRACE_PROBE1(l2arc__miss, 5765 arc_buf_hdr_t *, hdr); 5766 ARCSTAT_BUMP(arcstat_l2_misses); 5767 if (HDR_L2_WRITING(hdr)) 5768 ARCSTAT_BUMP(arcstat_l2_rw_clash); 5769 spa_config_exit(spa, SCL_L2ARC, vd); 5770 } 5771 } else { 5772 if (vd != NULL) 5773 spa_config_exit(spa, SCL_L2ARC, vd); 5774 if (l2arc_ndev != 0) { 5775 DTRACE_PROBE1(l2arc__miss, 5776 arc_buf_hdr_t *, hdr); 5777 ARCSTAT_BUMP(arcstat_l2_misses); 5778 } 5779 } 5780 5781 rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size, 5782 arc_read_done, hdr, priority, zio_flags, zb); 5783 acb->acb_zio_head = rzio; 5784 5785 if (hash_lock != NULL) 5786 mutex_exit(hash_lock); 5787 5788 if (*arc_flags & ARC_FLAG_WAIT) 5789 return (zio_wait(rzio)); 5790 5791 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 5792 zio_nowait(rzio); 5793 } 5794 return (0); 5795} 5796 5797/* 5798 * Notify the arc that a block was freed, and thus will never be used again. 5799 */ 5800void 5801arc_freed(spa_t *spa, const blkptr_t *bp) 5802{ 5803 arc_buf_hdr_t *hdr; 5804 kmutex_t *hash_lock; 5805 uint64_t guid = spa_load_guid(spa); 5806 5807 ASSERT(!BP_IS_EMBEDDED(bp)); 5808 5809 hdr = buf_hash_find(guid, bp, &hash_lock); 5810 if (hdr == NULL) 5811 return; 5812 5813 /* 5814 * We might be trying to free a block that is still doing I/O 5815 * (i.e. prefetch) or has a reference (i.e. a dedup-ed, 5816 * dmu_sync-ed block). If this block is being prefetched, then it 5817 * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr 5818 * until the I/O completes. A block may also have a reference if it is 5819 * part of a dedup-ed, dmu_synced write. 
The dmu_sync() function would 5820 * have written the new block to its final resting place on disk but 5821 * without the dedup flag set. This would have left the hdr in the MRU 5822 * state and discoverable. When the txg finally syncs it detects that 5823 * the block was overridden in open context and issues an override I/O. 5824 * Since this is a dedup block, the override I/O will determine if the 5825 * block is already in the DDT. If so, then it will replace the io_bp 5826 * with the bp from the DDT and allow the I/O to finish. When the I/O 5827 * reaches the done callback, dbuf_write_override_done, it will 5828 * check to see if the io_bp and io_bp_override are identical. 5829 * If they are not, then it indicates that the bp was replaced with 5830 * the bp in the DDT and the override bp is freed. This allows 5831 * us to arrive here with a reference on a block that is being 5832 * freed. So if we have an I/O in progress, or a reference to 5833 * this hdr, then we don't destroy the hdr. 5834 */ 5835 if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && 5836 refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { 5837 arc_change_state(arc_anon, hdr, hash_lock); 5838 arc_hdr_destroy(hdr); 5839 mutex_exit(hash_lock); 5840 } else { 5841 mutex_exit(hash_lock); 5842 } 5843 5844} 5845 5846/* 5847 * Release this buffer from the cache, making it an anonymous buffer. This 5848 * must be done after a read and prior to modifying the buffer contents. 5849 * If the buffer has more than one reference, we must make 5850 * a new hdr for the buffer. 5851 */ 5852void 5853arc_release(arc_buf_t *buf, void *tag) 5854{ 5855 arc_buf_hdr_t *hdr = buf->b_hdr; 5856 5857 /* 5858 * It would be nice to assert that if it's DMU metadata (level > 5859 * 0 || it's the dnode file), then it must be syncing context. 5860 * But we don't know that information at this level. 5861 */ 5862 5863 mutex_enter(&buf->b_evict_lock); 5864 5865 ASSERT(HDR_HAS_L1HDR(hdr)); 5866 5867 /* 5868 * We don't grab the hash lock prior to this check, because if 5869 * the buffer's header is in the arc_anon state, it won't be 5870 * linked into the hash table. 5871 */ 5872 if (hdr->b_l1hdr.b_state == arc_anon) { 5873 mutex_exit(&buf->b_evict_lock); 5874 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 5875 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 5876 ASSERT(!HDR_HAS_L2HDR(hdr)); 5877 ASSERT(HDR_EMPTY(hdr)); 5878 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); 5879 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); 5880 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 5881 5882 hdr->b_l1hdr.b_arc_access = 0; 5883 5884 /* 5885 * If the buf is being overridden then it may already 5886 * have a hdr that is not empty. 5887 */ 5888 buf_discard_identity(hdr); 5889 arc_buf_thaw(buf); 5890 5891 return; 5892 } 5893 5894 kmutex_t *hash_lock = HDR_LOCK(hdr); 5895 mutex_enter(hash_lock); 5896 5897 /* 5898 * This assignment is only valid as long as the hash_lock is 5899 * held, we must be careful not to reference state or the 5900 * b_state field after dropping the lock. 
5901 */ 5902 arc_state_t *state = hdr->b_l1hdr.b_state; 5903 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 5904 ASSERT3P(state, !=, arc_anon); 5905 5906 /* this buffer is not on any list */ 5907 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); 5908 5909 if (HDR_HAS_L2HDR(hdr)) { 5910 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); 5911 5912 /* 5913 * We have to recheck this conditional again now that 5914 * we're holding the l2ad_mtx to prevent a race with 5915 * another thread which might be concurrently calling 5916 * l2arc_evict(). In that case, l2arc_evict() might have 5917 * destroyed the header's L2 portion as we were waiting 5918 * to acquire the l2ad_mtx. 5919 */ 5920 if (HDR_HAS_L2HDR(hdr)) { 5921 l2arc_trim(hdr); 5922 arc_hdr_l2hdr_destroy(hdr); 5923 } 5924 5925 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); 5926 } 5927 5928 /* 5929 * Do we have more than one buf? 5930 */ 5931 if (hdr->b_l1hdr.b_bufcnt > 1) { 5932 arc_buf_hdr_t *nhdr; 5933 uint64_t spa = hdr->b_spa; 5934 uint64_t psize = HDR_GET_PSIZE(hdr); 5935 uint64_t lsize = HDR_GET_LSIZE(hdr); 5936 enum zio_compress compress = HDR_GET_COMPRESS(hdr); 5937 arc_buf_contents_t type = arc_buf_type(hdr); 5938 VERIFY3U(hdr->b_type, ==, type); 5939 5940 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); 5941 (void) remove_reference(hdr, hash_lock, tag); 5942 5943 if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { 5944 ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); 5945 ASSERT(ARC_BUF_LAST(buf)); 5946 } 5947 5948 /* 5949 * Pull the data off of this hdr and attach it to 5950 * a new anonymous hdr. Also find the last buffer 5951 * in the hdr's buffer list. 5952 */ 5953 arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); 5954 ASSERT3P(lastbuf, !=, NULL); 5955 5956 /* 5957 * If the current arc_buf_t and the hdr are sharing their data 5958 * buffer, then we must stop sharing that block. 5959 */ 5960 if (arc_buf_is_shared(buf)) { 5961 VERIFY(!arc_buf_is_shared(lastbuf)); 5962 5963 /* 5964 * First, sever the block sharing relationship between 5965 * buf and the arc_buf_hdr_t. 5966 */ 5967 arc_unshare_buf(hdr, buf); 5968 5969 /* 5970 * Now we need to recreate the hdr's b_pabd. Since we 5971 * have lastbuf handy, we try to share with it, but if 5972 * we can't then we allocate a new b_pabd and copy the 5973 * data from buf into it. 5974 */ 5975 if (arc_can_share(hdr, lastbuf)) { 5976 arc_share_buf(hdr, lastbuf); 5977 } else { 5978 arc_hdr_alloc_pabd(hdr); 5979 abd_copy_from_buf(hdr->b_l1hdr.b_pabd, 5980 buf->b_data, psize); 5981 } 5982 VERIFY3P(lastbuf->b_data, !=, NULL); 5983 } else if (HDR_SHARED_DATA(hdr)) { 5984 /* 5985 * Uncompressed shared buffers are always at the end 5986 * of the list. Compressed buffers don't have the 5987 * same requirements. This makes it hard to 5988 * simply assert that the lastbuf is shared so 5989 * we rely on the hdr's compression flags to determine 5990 * if we have a compressed, shared buffer. 
5991 */ 5992 ASSERT(arc_buf_is_shared(lastbuf) || 5993 HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); 5994 ASSERT(!ARC_BUF_SHARED(buf)); 5995 } 5996 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 5997 ASSERT3P(state, !=, arc_l2c_only); 5998 5999 (void) refcount_remove_many(&state->arcs_size, 6000 arc_buf_size(buf), buf); 6001 6002 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { 6003 ASSERT3P(state, !=, arc_l2c_only); 6004 (void) refcount_remove_many(&state->arcs_esize[type], 6005 arc_buf_size(buf), buf); 6006 } 6007 6008 hdr->b_l1hdr.b_bufcnt -= 1; 6009 arc_cksum_verify(buf); 6010#ifdef illumos 6011 arc_buf_unwatch(buf); 6012#endif 6013 6014 mutex_exit(hash_lock); 6015 6016 /* 6017 * Allocate a new hdr. The new hdr will contain a b_pabd 6018 * buffer which will be freed in arc_write(). 6019 */ 6020 nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); 6021 ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); 6022 ASSERT0(nhdr->b_l1hdr.b_bufcnt); 6023 ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt)); 6024 VERIFY3U(nhdr->b_type, ==, type); 6025 ASSERT(!HDR_SHARED_DATA(nhdr)); 6026 6027 nhdr->b_l1hdr.b_buf = buf; 6028 nhdr->b_l1hdr.b_bufcnt = 1; 6029 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); 6030 buf->b_hdr = nhdr; 6031 6032 mutex_exit(&buf->b_evict_lock); 6033 (void) refcount_add_many(&arc_anon->arcs_size, 6034 arc_buf_size(buf), buf); 6035 } else { 6036 mutex_exit(&buf->b_evict_lock); 6037 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); 6038 /* protected by hash lock, or hdr is on arc_anon */ 6039 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 6040 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 6041 arc_change_state(arc_anon, hdr, hash_lock); 6042 hdr->b_l1hdr.b_arc_access = 0; 6043 mutex_exit(hash_lock); 6044 6045 buf_discard_identity(hdr); 6046 arc_buf_thaw(buf); 6047 } 6048} 6049 6050int 6051arc_released(arc_buf_t *buf) 6052{ 6053 int released; 6054 6055 mutex_enter(&buf->b_evict_lock); 6056 released = (buf->b_data != NULL && 6057 buf->b_hdr->b_l1hdr.b_state == arc_anon); 6058 mutex_exit(&buf->b_evict_lock); 6059 return (released); 6060} 6061 6062#ifdef ZFS_DEBUG 6063int 6064arc_referenced(arc_buf_t *buf) 6065{ 6066 int referenced; 6067 6068 mutex_enter(&buf->b_evict_lock); 6069 referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); 6070 mutex_exit(&buf->b_evict_lock); 6071 return (referenced); 6072} 6073#endif 6074 6075static void 6076arc_write_ready(zio_t *zio) 6077{ 6078 arc_write_callback_t *callback = zio->io_private; 6079 arc_buf_t *buf = callback->awcb_buf; 6080 arc_buf_hdr_t *hdr = buf->b_hdr; 6081 uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); 6082 6083 ASSERT(HDR_HAS_L1HDR(hdr)); 6084 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); 6085 ASSERT(hdr->b_l1hdr.b_bufcnt > 0); 6086 6087 /* 6088 * If we're reexecuting this zio because the pool suspended, then 6089 * cleanup any state that was previously set the first time the 6090 * callback was invoked. 
6091 */ 6092 if (zio->io_flags & ZIO_FLAG_REEXECUTED) { 6093 arc_cksum_free(hdr); 6094#ifdef illumos 6095 arc_buf_unwatch(buf); 6096#endif 6097 if (hdr->b_l1hdr.b_pabd != NULL) { 6098 if (arc_buf_is_shared(buf)) { 6099 arc_unshare_buf(hdr, buf); 6100 } else { 6101 arc_hdr_free_pabd(hdr); 6102 } 6103 } 6104 } 6105 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 6106 ASSERT(!HDR_SHARED_DATA(hdr)); 6107 ASSERT(!arc_buf_is_shared(buf)); 6108 6109 callback->awcb_ready(zio, buf, callback->awcb_private); 6110 6111 if (HDR_IO_IN_PROGRESS(hdr)) 6112 ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); 6113 6114 arc_cksum_compute(buf); 6115 arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 6116 6117 enum zio_compress compress; 6118 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 6119 compress = ZIO_COMPRESS_OFF; 6120 } else { 6121 ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp)); 6122 compress = BP_GET_COMPRESS(zio->io_bp); 6123 } 6124 HDR_SET_PSIZE(hdr, psize); 6125 arc_hdr_set_compress(hdr, compress); 6126 6127 6128 /* 6129 * Fill the hdr with data. If the hdr is compressed, the data we want 6130 * is available from the zio, otherwise we can take it from the buf. 6131 * 6132 * We might be able to share the buf's data with the hdr here. However, 6133 * doing so would cause the ARC to be full of linear ABDs if we write a 6134 * lot of shareable data. As a compromise, we check whether scattered 6135 * ABDs are allowed, and assume that if they are then the user wants 6136 * the ARC to be primarily filled with them regardless of the data being 6137 * written. Therefore, if they're allowed then we allocate one and copy 6138 * the data into it; otherwise, we share the data directly if we can. 6139 */ 6140 if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { 6141 arc_hdr_alloc_pabd(hdr); 6142 6143 /* 6144 * Ideally, we would always copy the io_abd into b_pabd, but the 6145 * user may have disabled compressed ARC, thus we must check the 6146 * hdr's compression setting rather than the io_bp's. 6147 */ 6148 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { 6149 ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, 6150 ZIO_COMPRESS_OFF); 6151 ASSERT3U(psize, >, 0); 6152 6153 abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); 6154 } else { 6155 ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); 6156 6157 abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, 6158 arc_buf_size(buf)); 6159 } 6160 } else { 6161 ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); 6162 ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); 6163 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); 6164 6165 arc_share_buf(hdr, buf); 6166 } 6167 6168 arc_hdr_verify(hdr, zio->io_bp); 6169} 6170 6171static void 6172arc_write_children_ready(zio_t *zio) 6173{ 6174 arc_write_callback_t *callback = zio->io_private; 6175 arc_buf_t *buf = callback->awcb_buf; 6176 6177 callback->awcb_children_ready(zio, buf, callback->awcb_private); 6178} 6179 6180/* 6181 * The SPA calls this callback for each physical write that happens on behalf 6182 * of a logical write. See the comment in dbuf_write_physdone() for details. 
6183 */ 6184static void 6185arc_write_physdone(zio_t *zio) 6186{ 6187 arc_write_callback_t *cb = zio->io_private; 6188 if (cb->awcb_physdone != NULL) 6189 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 6190} 6191 6192static void 6193arc_write_done(zio_t *zio) 6194{ 6195 arc_write_callback_t *callback = zio->io_private; 6196 arc_buf_t *buf = callback->awcb_buf; 6197 arc_buf_hdr_t *hdr = buf->b_hdr; 6198 6199 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 6200 6201 if (zio->io_error == 0) { 6202 arc_hdr_verify(hdr, zio->io_bp); 6203 6204 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 6205 buf_discard_identity(hdr); 6206 } else { 6207 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 6208 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 6209 } 6210 } else { 6211 ASSERT(HDR_EMPTY(hdr)); 6212 } 6213 6214 /* 6215 * If the block to be written was all-zero or compressed enough to be 6216 * embedded in the BP, no write was performed so there will be no 6217 * dva/birth/checksum. The buffer must therefore remain anonymous 6218 * (and uncached). 6219 */ 6220 if (!HDR_EMPTY(hdr)) { 6221 arc_buf_hdr_t *exists; 6222 kmutex_t *hash_lock; 6223 6224 ASSERT3U(zio->io_error, ==, 0); 6225 6226 arc_cksum_verify(buf); 6227 6228 exists = buf_hash_insert(hdr, &hash_lock); 6229 if (exists != NULL) { 6230 /* 6231 * This can only happen if we overwrite for 6232 * sync-to-convergence, because we remove 6233 * buffers from the hash table when we arc_free(). 6234 */ 6235 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 6236 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 6237 panic("bad overwrite, hdr=%p exists=%p", 6238 (void *)hdr, (void *)exists); 6239 ASSERT(refcount_is_zero( 6240 &exists->b_l1hdr.b_refcnt)); 6241 arc_change_state(arc_anon, exists, hash_lock); 6242 mutex_exit(hash_lock); 6243 arc_hdr_destroy(exists); 6244 exists = buf_hash_insert(hdr, &hash_lock); 6245 ASSERT3P(exists, ==, NULL); 6246 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 6247 /* nopwrite */ 6248 ASSERT(zio->io_prop.zp_nopwrite); 6249 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 6250 panic("bad nopwrite, hdr=%p exists=%p", 6251 (void *)hdr, (void *)exists); 6252 } else { 6253 /* Dedup */ 6254 ASSERT(hdr->b_l1hdr.b_bufcnt == 1); 6255 ASSERT(hdr->b_l1hdr.b_state == arc_anon); 6256 ASSERT(BP_GET_DEDUP(zio->io_bp)); 6257 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 6258 } 6259 } 6260 arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 6261 /* if it's not anon, we are doing a scrub */ 6262 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) 6263 arc_access(hdr, hash_lock); 6264 mutex_exit(hash_lock); 6265 } else { 6266 arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 6267 } 6268 6269 ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 6270 callback->awcb_done(zio, buf, callback->awcb_private); 6271 6272 abd_put(zio->io_abd); 6273 kmem_free(callback, sizeof (arc_write_callback_t)); 6274} 6275 6276zio_t * 6277arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, 6278 boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, 6279 arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, 6280 arc_write_done_func_t *done, void *private, zio_priority_t priority, 6281 int zio_flags, const zbookmark_phys_t *zb) 6282{ 6283 arc_buf_hdr_t *hdr = buf->b_hdr; 6284 arc_write_callback_t *callback; 6285 zio_t *zio; 6286 zio_prop_t localprop = *zp; 6287 6288 ASSERT3P(ready, !=, NULL); 6289 ASSERT3P(done, !=, NULL); 6290 ASSERT(!HDR_IO_ERROR(hdr)); 6291 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 6292 ASSERT3P(hdr->b_l1hdr.b_acb, 
==, NULL); 6293 ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); 6294 if (l2arc) 6295 arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); 6296 if (ARC_BUF_COMPRESSED(buf)) { 6297 /* 6298 * We're writing a pre-compressed buffer. Make the 6299 * compression algorithm requested by the zio_prop_t match 6300 * the pre-compressed buffer's compression algorithm. 6301 */ 6302 localprop.zp_compress = HDR_GET_COMPRESS(hdr); 6303 6304 ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); 6305 zio_flags |= ZIO_FLAG_RAW; 6306 } 6307 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 6308 callback->awcb_ready = ready; 6309 callback->awcb_children_ready = children_ready; 6310 callback->awcb_physdone = physdone; 6311 callback->awcb_done = done; 6312 callback->awcb_private = private; 6313 callback->awcb_buf = buf; 6314 6315 /* 6316 * The hdr's b_pabd is now stale, free it now. A new data block 6317 * will be allocated when the zio pipeline calls arc_write_ready(). 6318 */ 6319 if (hdr->b_l1hdr.b_pabd != NULL) { 6320 /* 6321 * If the buf is currently sharing the data block with 6322 * the hdr then we need to break that relationship here. 6323 * The hdr will remain with a NULL data pointer and the 6324 * buf will take sole ownership of the block. 6325 */ 6326 if (arc_buf_is_shared(buf)) { 6327 arc_unshare_buf(hdr, buf); 6328 } else { 6329 arc_hdr_free_pabd(hdr); 6330 } 6331 VERIFY3P(buf->b_data, !=, NULL); 6332 arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); 6333 } 6334 ASSERT(!arc_buf_is_shared(buf)); 6335 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 6336 6337 zio = zio_write(pio, spa, txg, bp, 6338 abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), 6339 HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, 6340 (children_ready != NULL) ? arc_write_children_ready : NULL, 6341 arc_write_physdone, arc_write_done, callback, 6342 priority, zio_flags, zb); 6343 6344 return (zio); 6345} 6346 6347static int 6348arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) 6349{ 6350#ifdef _KERNEL 6351 uint64_t available_memory = ptob(freemem); 6352 6353#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 6354 available_memory = 6355 MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); 6356#endif 6357 6358 if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) 6359 return (0); 6360 6361 if (txg > spa->spa_lowmem_last_txg) { 6362 spa->spa_lowmem_last_txg = txg; 6363 spa->spa_lowmem_page_load = 0; 6364 } 6365 /* 6366 * If we are in pageout, we know that memory is already tight, 6367 * the arc is already going to be evicting, so we just want to 6368 * continue to let page writes occur as quickly as possible. 
6369 */ 6370 if (curproc == pageproc) { 6371 if (spa->spa_lowmem_page_load > 6372 MAX(ptob(minfree), available_memory) / 4) 6373 return (SET_ERROR(ERESTART)); 6374 /* Note: reserve is inflated, so we deflate */ 6375 atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8); 6376 return (0); 6377 } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) { 6378 /* memory is low, delay before restarting */ 6379 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 6380 return (SET_ERROR(EAGAIN)); 6381 } 6382 spa->spa_lowmem_page_load = 0; 6383#endif /* _KERNEL */ 6384 return (0); 6385} 6386 6387void 6388arc_tempreserve_clear(uint64_t reserve) 6389{ 6390 atomic_add_64(&arc_tempreserve, -reserve); 6391 ASSERT((int64_t)arc_tempreserve >= 0); 6392} 6393 6394int 6395arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) 6396{ 6397 int error; 6398 uint64_t anon_size; 6399 6400 if (reserve > arc_c/4 && !arc_no_grow) { 6401 arc_c = MIN(arc_c_max, reserve * 4); 6402 DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); 6403 } 6404 if (reserve > arc_c) 6405 return (SET_ERROR(ENOMEM)); 6406 6407 /* 6408 * Don't count loaned bufs as in flight dirty data to prevent long 6409 * network delays from blocking transactions that are ready to be 6410 * assigned to a txg. 6411 */ 6412 6413 /* assert that it has not wrapped around */ 6414 ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); 6415 6416 anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) - 6417 arc_loaned_bytes), 0); 6418 6419 /* 6420 * Writes will, almost always, require additional memory allocations 6421 * in order to compress/encrypt/etc the data. We therefore need to 6422 * make sure that there is sufficient available memory for this. 6423 */ 6424 error = arc_memory_throttle(spa, reserve, txg); 6425 if (error != 0) 6426 return (error); 6427 6428 /* 6429 * Throttle writes when the amount of dirty data in the cache 6430 * gets too large. We try to keep the cache less than half full 6431 * of dirty blocks so that our sync times don't grow too large. 6432 * 6433 * In the case of one pool being built on another pool, we want 6434 * to make sure we don't end up throttling the lower (backing) 6435 * pool when the upper pool is the majority contributor to dirty 6436 * data. To insure we make forward progress during throttling, we 6437 * also check the current pool's net dirty data and only throttle 6438 * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty 6439 * data in the cache. 6440 * 6441 * Note: if two requests come in concurrently, we might let them 6442 * both succeed, when one of them should fail. Not a huge deal. 
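 *
 * Worked example (illustrative, assuming the default tunables of
 * zfs_arc_dirty_limit_percent = 50, zfs_arc_anon_limit_percent = 25 and
 * zfs_arc_pool_dirty_percent = 20): with arc_c = 1 GiB, the reservation is
 * only rejected with ERESTART when total_dirty exceeds 512 MiB, anon_size
 * exceeds 256 MiB, and this pool's own dirty data exceeds 20% of anon_size.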
6443 */ 6444 uint64_t total_dirty = reserve + arc_tempreserve + anon_size; 6445 uint64_t spa_dirty_anon = spa_dirty_data(spa); 6446 6447 if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 && 6448 anon_size > arc_c * zfs_arc_anon_limit_percent / 100 && 6449 spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { 6450 uint64_t meta_esize = 6451 refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); 6452 uint64_t data_esize = 6453 refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); 6454 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 6455 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 6456 arc_tempreserve >> 10, meta_esize >> 10, 6457 data_esize >> 10, reserve >> 10, arc_c >> 10); 6458 return (SET_ERROR(ERESTART)); 6459 } 6460 atomic_add_64(&arc_tempreserve, reserve); 6461 return (0); 6462} 6463 6464static void 6465arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, 6466 kstat_named_t *evict_data, kstat_named_t *evict_metadata) 6467{ 6468 size->value.ui64 = refcount_count(&state->arcs_size); 6469 evict_data->value.ui64 = 6470 refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); 6471 evict_metadata->value.ui64 = 6472 refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); 6473} 6474 6475static int 6476arc_kstat_update(kstat_t *ksp, int rw) 6477{ 6478 arc_stats_t *as = ksp->ks_data; 6479 6480 if (rw == KSTAT_WRITE) { 6481 return (EACCES); 6482 } else { 6483 arc_kstat_update_state(arc_anon, 6484 &as->arcstat_anon_size, 6485 &as->arcstat_anon_evictable_data, 6486 &as->arcstat_anon_evictable_metadata); 6487 arc_kstat_update_state(arc_mru, 6488 &as->arcstat_mru_size, 6489 &as->arcstat_mru_evictable_data, 6490 &as->arcstat_mru_evictable_metadata); 6491 arc_kstat_update_state(arc_mru_ghost, 6492 &as->arcstat_mru_ghost_size, 6493 &as->arcstat_mru_ghost_evictable_data, 6494 &as->arcstat_mru_ghost_evictable_metadata); 6495 arc_kstat_update_state(arc_mfu, 6496 &as->arcstat_mfu_size, 6497 &as->arcstat_mfu_evictable_data, 6498 &as->arcstat_mfu_evictable_metadata); 6499 arc_kstat_update_state(arc_mfu_ghost, 6500 &as->arcstat_mfu_ghost_size, 6501 &as->arcstat_mfu_ghost_evictable_data, 6502 &as->arcstat_mfu_ghost_evictable_metadata); 6503 6504 ARCSTAT(arcstat_size) = aggsum_value(&arc_size); 6505 ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used); 6506 ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size); 6507 ARCSTAT(arcstat_metadata_size) = 6508 aggsum_value(&astat_metadata_size); 6509 ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size); 6510 ARCSTAT(arcstat_other_size) = aggsum_value(&astat_other_size); 6511 ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size); 6512 } 6513 6514 return (0); 6515} 6516 6517/* 6518 * This function *must* return indices evenly distributed between all 6519 * sublists of the multilist. This is needed due to how the ARC eviction 6520 * code is laid out; arc_evict_state() assumes ARC buffers are evenly 6521 * distributed between all sublists and uses this assumption when 6522 * deciding which sublist to evict from and how much to evict from it. 6523 */ 6524unsigned int 6525arc_state_multilist_index_func(multilist_t *ml, void *obj) 6526{ 6527 arc_buf_hdr_t *hdr = obj; 6528 6529 /* 6530 * We rely on b_dva to generate evenly distributed index 6531 * numbers using buf_hash below. So, as an added precaution, 6532 * let's make sure we never add empty buffers to the arc lists. 
6533 */ 6534 ASSERT(!HDR_EMPTY(hdr)); 6535 6536 /* 6537 * The assumption here, is the hash value for a given 6538 * arc_buf_hdr_t will remain constant throughout it's lifetime 6539 * (i.e. it's b_spa, b_dva, and b_birth fields don't change). 6540 * Thus, we don't need to store the header's sublist index 6541 * on insertion, as this index can be recalculated on removal. 6542 * 6543 * Also, the low order bits of the hash value are thought to be 6544 * distributed evenly. Otherwise, in the case that the multilist 6545 * has a power of two number of sublists, each sublists' usage 6546 * would not be evenly distributed. 6547 */ 6548 return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % 6549 multilist_get_num_sublists(ml)); 6550} 6551 6552#ifdef _KERNEL 6553static eventhandler_tag arc_event_lowmem = NULL; 6554 6555static void 6556arc_lowmem(void *arg __unused, int howto __unused) 6557{ 6558 6559 mutex_enter(&arc_reclaim_lock); 6560 DTRACE_PROBE1(arc__needfree, int64_t, ((int64_t)freemem - zfs_arc_free_target) * PAGESIZE); 6561 cv_signal(&arc_reclaim_thread_cv); 6562 6563 /* 6564 * It is unsafe to block here in arbitrary threads, because we can come 6565 * here from ARC itself and may hold ARC locks and thus risk a deadlock 6566 * with ARC reclaim thread. 6567 */ 6568 if (curproc == pageproc) 6569 (void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); 6570 mutex_exit(&arc_reclaim_lock); 6571} 6572#endif 6573 6574static void 6575arc_state_init(void) 6576{ 6577 arc_anon = &ARC_anon; 6578 arc_mru = &ARC_mru; 6579 arc_mru_ghost = &ARC_mru_ghost; 6580 arc_mfu = &ARC_mfu; 6581 arc_mfu_ghost = &ARC_mfu_ghost; 6582 arc_l2c_only = &ARC_l2c_only; 6583 6584 arc_mru->arcs_list[ARC_BUFC_METADATA] = 6585 multilist_create(sizeof (arc_buf_hdr_t), 6586 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6587 arc_state_multilist_index_func); 6588 arc_mru->arcs_list[ARC_BUFC_DATA] = 6589 multilist_create(sizeof (arc_buf_hdr_t), 6590 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6591 arc_state_multilist_index_func); 6592 arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] = 6593 multilist_create(sizeof (arc_buf_hdr_t), 6594 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6595 arc_state_multilist_index_func); 6596 arc_mru_ghost->arcs_list[ARC_BUFC_DATA] = 6597 multilist_create(sizeof (arc_buf_hdr_t), 6598 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6599 arc_state_multilist_index_func); 6600 arc_mfu->arcs_list[ARC_BUFC_METADATA] = 6601 multilist_create(sizeof (arc_buf_hdr_t), 6602 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6603 arc_state_multilist_index_func); 6604 arc_mfu->arcs_list[ARC_BUFC_DATA] = 6605 multilist_create(sizeof (arc_buf_hdr_t), 6606 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6607 arc_state_multilist_index_func); 6608 arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] = 6609 multilist_create(sizeof (arc_buf_hdr_t), 6610 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6611 arc_state_multilist_index_func); 6612 arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] = 6613 multilist_create(sizeof (arc_buf_hdr_t), 6614 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6615 arc_state_multilist_index_func); 6616 arc_l2c_only->arcs_list[ARC_BUFC_METADATA] = 6617 multilist_create(sizeof (arc_buf_hdr_t), 6618 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6619 arc_state_multilist_index_func); 6620 arc_l2c_only->arcs_list[ARC_BUFC_DATA] = 6621 multilist_create(sizeof (arc_buf_hdr_t), 6622 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 6623 arc_state_multilist_index_func); 6624 6625 refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); 6626 
refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); 6627 refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); 6628 refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); 6629 refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); 6630 refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); 6631 refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); 6632 refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); 6633 refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); 6634 refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); 6635 refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); 6636 refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); 6637 6638 refcount_create(&arc_anon->arcs_size); 6639 refcount_create(&arc_mru->arcs_size); 6640 refcount_create(&arc_mru_ghost->arcs_size); 6641 refcount_create(&arc_mfu->arcs_size); 6642 refcount_create(&arc_mfu_ghost->arcs_size); 6643 refcount_create(&arc_l2c_only->arcs_size); 6644 6645 aggsum_init(&arc_meta_used, 0); 6646 aggsum_init(&arc_size, 0); 6647 aggsum_init(&astat_data_size, 0); 6648 aggsum_init(&astat_metadata_size, 0); 6649 aggsum_init(&astat_hdr_size, 0); 6650 aggsum_init(&astat_other_size, 0); 6651 aggsum_init(&astat_l2_hdr_size, 0); 6652} 6653 6654static void 6655arc_state_fini(void) 6656{ 6657 refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); 6658 refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); 6659 refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); 6660 refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); 6661 refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); 6662 refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); 6663 refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); 6664 refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); 6665 refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); 6666 refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); 6667 refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); 6668 refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); 6669 6670 refcount_destroy(&arc_anon->arcs_size); 6671 refcount_destroy(&arc_mru->arcs_size); 6672 refcount_destroy(&arc_mru_ghost->arcs_size); 6673 refcount_destroy(&arc_mfu->arcs_size); 6674 refcount_destroy(&arc_mfu_ghost->arcs_size); 6675 refcount_destroy(&arc_l2c_only->arcs_size); 6676 6677 multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]); 6678 multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 6679 multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]); 6680 multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 6681 multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]); 6682 multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 6683 multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]); 6684 multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 6685} 6686 6687uint64_t 6688arc_max_bytes(void) 6689{ 6690 return (arc_c_max); 6691} 6692 6693void 6694arc_init(void) 6695{ 6696 int i, prefetch_tunable_set = 0; 6697 6698 /* 6699 * allmem is "all memory that we could possibly use". 
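 *
 * Worked example (illustrative): with allmem = 16 GiB, the defaults computed
 * below are arc_c_min = MAX(16 GiB / 32, arc_abs_min) = 512 MiB and
 * arc_c_max = MAX(16 GiB - 1 GiB, 16 GiB * 5 / 8) = 15 GiB, before any
 * zfs_arc_min / zfs_arc_max tunable overrides are applied.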
6700 */ 6701#ifdef illumos 6702#ifdef _KERNEL 6703 uint64_t allmem = ptob(physmem - swapfs_minfree); 6704#else 6705 uint64_t allmem = (physmem * PAGESIZE) / 2; 6706#endif 6707#else 6708 uint64_t allmem = kmem_size(); 6709#endif 6710 6711 6712 mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); 6713 cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); 6714 cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); 6715 6716 mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL); 6717 cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL); 6718 6719 /* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */ 6720 arc_c_min = MAX(allmem / 32, arc_abs_min); 6721 /* set max to 5/8 of all memory, or all but 1GB, whichever is more */ 6722 if (allmem >= 1 << 30) 6723 arc_c_max = allmem - (1 << 30); 6724 else 6725 arc_c_max = arc_c_min; 6726 arc_c_max = MAX(allmem * 5 / 8, arc_c_max); 6727 6728 /* 6729 * In userland, there's only the memory pressure that we artificially 6730 * create (see arc_available_memory()). Don't let arc_c get too 6731 * small, because it can cause transactions to be larger than 6732 * arc_c, causing arc_tempreserve_space() to fail. 6733 */ 6734#ifndef _KERNEL 6735 arc_c_min = arc_c_max / 2; 6736#endif 6737 6738#ifdef _KERNEL 6739 /* 6740 * Allow the tunables to override our calculations if they are 6741 * reasonable. 6742 */ 6743 if (zfs_arc_max > arc_abs_min && zfs_arc_max < allmem) { 6744 arc_c_max = zfs_arc_max; 6745 arc_c_min = MIN(arc_c_min, arc_c_max); 6746 } 6747 if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max) 6748 arc_c_min = zfs_arc_min; 6749#endif 6750 6751 arc_c = arc_c_max; 6752 arc_p = (arc_c >> 1); 6753 6754 /* limit meta-data to 1/4 of the arc capacity */ 6755 arc_meta_limit = arc_c_max / 4; 6756 6757#ifdef _KERNEL 6758 /* 6759 * Metadata is stored in the kernel's heap. Don't let us 6760 * use more than half the heap for the ARC. 6761 */ 6762 arc_meta_limit = MIN(arc_meta_limit, 6763 vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2); 6764#endif 6765 6766 /* Allow the tunable to override if it is reasonable */ 6767 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 6768 arc_meta_limit = zfs_arc_meta_limit; 6769 6770 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 6771 arc_c_min = arc_meta_limit / 2; 6772 6773 if (zfs_arc_meta_min > 0) { 6774 arc_meta_min = zfs_arc_meta_min; 6775 } else { 6776 arc_meta_min = arc_c_min / 2; 6777 } 6778 6779 if (zfs_arc_grow_retry > 0) 6780 arc_grow_retry = zfs_arc_grow_retry; 6781 6782 if (zfs_arc_shrink_shift > 0) 6783 arc_shrink_shift = zfs_arc_shrink_shift; 6784 6785 if (zfs_arc_no_grow_shift > 0) 6786 arc_no_grow_shift = zfs_arc_no_grow_shift; 6787 /* 6788 * Ensure that arc_no_grow_shift is less than arc_shrink_shift. 
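 *
 * For example (illustrative): if a tunable sets arc_no_grow_shift to 7 while
 * arc_shrink_shift is also 7, the assignment below lowers arc_no_grow_shift
 * to 6.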
6789 */ 6790 if (arc_no_grow_shift >= arc_shrink_shift) 6791 arc_no_grow_shift = arc_shrink_shift - 1; 6792 6793 if (zfs_arc_p_min_shift > 0) 6794 arc_p_min_shift = zfs_arc_p_min_shift; 6795 6796 /* if kmem_flags are set, lets try to use less memory */ 6797 if (kmem_debugging()) 6798 arc_c = arc_c / 2; 6799 if (arc_c < arc_c_min) 6800 arc_c = arc_c_min; 6801 6802 zfs_arc_min = arc_c_min; 6803 zfs_arc_max = arc_c_max; 6804 6805 arc_state_init(); 6806 buf_init(); 6807 6808 arc_reclaim_thread_exit = B_FALSE; 6809 arc_dnlc_evicts_thread_exit = FALSE; 6810 6811 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 6812 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 6813 6814 if (arc_ksp != NULL) { 6815 arc_ksp->ks_data = &arc_stats; 6816 arc_ksp->ks_update = arc_kstat_update; 6817 kstat_install(arc_ksp); 6818 } 6819 6820 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 6821 TS_RUN, minclsyspri); 6822 6823#ifdef _KERNEL 6824 arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, 6825 EVENTHANDLER_PRI_FIRST); 6826#endif 6827 6828 (void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0, 6829 TS_RUN, minclsyspri); 6830 6831 arc_dead = B_FALSE; 6832 arc_warm = B_FALSE; 6833 6834 /* 6835 * Calculate maximum amount of dirty data per pool. 6836 * 6837 * If it has been set by /etc/system, take that. 6838 * Otherwise, use a percentage of physical memory defined by 6839 * zfs_dirty_data_max_percent (default 10%) with a cap at 6840 * zfs_dirty_data_max_max (default 4GB). 6841 */ 6842 if (zfs_dirty_data_max == 0) { 6843 zfs_dirty_data_max = ptob(physmem) * 6844 zfs_dirty_data_max_percent / 100; 6845 zfs_dirty_data_max = MIN(zfs_dirty_data_max, 6846 zfs_dirty_data_max_max); 6847 } 6848 6849#ifdef _KERNEL 6850 if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 6851 prefetch_tunable_set = 1; 6852 6853#ifdef __i386__ 6854 if (prefetch_tunable_set == 0) { 6855 printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 6856 "-- to enable,\n"); 6857 printf(" add \"vfs.zfs.prefetch_disable=0\" " 6858 "to /boot/loader.conf.\n"); 6859 zfs_prefetch_disable = 1; 6860 } 6861#else 6862 if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 6863 prefetch_tunable_set == 0) { 6864 printf("ZFS NOTICE: Prefetch is disabled by default if less " 6865 "than 4GB of RAM is present;\n" 6866 " to enable, add \"vfs.zfs.prefetch_disable=0\" " 6867 "to /boot/loader.conf.\n"); 6868 zfs_prefetch_disable = 1; 6869 } 6870#endif 6871 /* Warn about ZFS memory and address space requirements. */ 6872 if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { 6873 printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " 6874 "expect unstable behavior.\n"); 6875 } 6876 if (allmem < 512 * (1 << 20)) { 6877 printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " 6878 "expect unstable behavior.\n"); 6879 printf(" Consider tuning vm.kmem_size and " 6880 "vm.kmem_size_max\n"); 6881 printf(" in /boot/loader.conf.\n"); 6882 } 6883#endif 6884} 6885 6886void 6887arc_fini(void) 6888{ 6889#ifdef _KERNEL 6890 if (arc_event_lowmem != NULL) 6891 EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); 6892#endif 6893 6894 mutex_enter(&arc_reclaim_lock); 6895 arc_reclaim_thread_exit = B_TRUE; 6896 /* 6897 * The reclaim thread will set arc_reclaim_thread_exit back to 6898 * B_FALSE when it is finished exiting; we're waiting for that. 
6899 */ 6900 while (arc_reclaim_thread_exit) { 6901 cv_signal(&arc_reclaim_thread_cv); 6902 cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock); 6903 } 6904 mutex_exit(&arc_reclaim_lock); 6905 6906 /* Use B_TRUE to ensure *all* buffers are evicted */ 6907 arc_flush(NULL, B_TRUE); 6908 6909 mutex_enter(&arc_dnlc_evicts_lock); 6910 arc_dnlc_evicts_thread_exit = TRUE; 6911 /* 6912 * The user evicts thread will set arc_user_evicts_thread_exit 6913 * to FALSE when it is finished exiting; we're waiting for that. 6914 */ 6915 while (arc_dnlc_evicts_thread_exit) { 6916 cv_signal(&arc_dnlc_evicts_cv); 6917 cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); 6918 } 6919 mutex_exit(&arc_dnlc_evicts_lock); 6920 6921 arc_dead = B_TRUE; 6922 6923 if (arc_ksp != NULL) { 6924 kstat_delete(arc_ksp); 6925 arc_ksp = NULL; 6926 } 6927 6928 mutex_destroy(&arc_reclaim_lock); 6929 cv_destroy(&arc_reclaim_thread_cv); 6930 cv_destroy(&arc_reclaim_waiters_cv); 6931 6932 mutex_destroy(&arc_dnlc_evicts_lock); 6933 cv_destroy(&arc_dnlc_evicts_cv); 6934 6935 arc_state_fini(); 6936 buf_fini(); 6937 6938 ASSERT0(arc_loaned_bytes); 6939} 6940 6941/* 6942 * Level 2 ARC 6943 * 6944 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 6945 * It uses dedicated storage devices to hold cached data, which are populated 6946 * using large infrequent writes. The main role of this cache is to boost 6947 * the performance of random read workloads. The intended L2ARC devices 6948 * include short-stroked disks, solid state disks, and other media with 6949 * substantially faster read latency than disk. 6950 * 6951 * +-----------------------+ 6952 * | ARC | 6953 * +-----------------------+ 6954 * | ^ ^ 6955 * | | | 6956 * l2arc_feed_thread() arc_read() 6957 * | | | 6958 * | l2arc read | 6959 * V | | 6960 * +---------------+ | 6961 * | L2ARC | | 6962 * +---------------+ | 6963 * | ^ | 6964 * l2arc_write() | | 6965 * | | | 6966 * V | | 6967 * +-------+ +-------+ 6968 * | vdev | | vdev | 6969 * | cache | | cache | 6970 * +-------+ +-------+ 6971 * +=========+ .-----. 6972 * : L2ARC : |-_____-| 6973 * : devices : | Disks | 6974 * +=========+ `-_____-' 6975 * 6976 * Read requests are satisfied from the following sources, in order: 6977 * 6978 * 1) ARC 6979 * 2) vdev cache of L2ARC devices 6980 * 3) L2ARC devices 6981 * 4) vdev cache of disks 6982 * 5) disks 6983 * 6984 * Some L2ARC device types exhibit extremely slow write performance. 6985 * To accommodate for this there are some significant differences between 6986 * the L2ARC and traditional cache design: 6987 * 6988 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 6989 * the ARC behave as usual, freeing buffers and placing headers on ghost 6990 * lists. The ARC does not send buffers to the L2ARC during eviction as 6991 * this would add inflated write latencies for all ARC memory pressure. 6992 * 6993 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 6994 * It does this by periodically scanning buffers from the eviction-end of 6995 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 6996 * not already there. It scans until a headroom of buffers is satisfied, 6997 * which itself is a buffer for ARC eviction. 
If a compressible buffer is 6998 * found during scanning and selected for writing to an L2ARC device, we 6999 * temporarily boost scanning headroom during the next scan cycle to make 7000 * sure we adapt to compression effects (which might significantly reduce 7001 * the data volume we write to L2ARC). The thread that does this is 7002 * l2arc_feed_thread(), illustrated below; example sizes are included to 7003 * provide a better sense of ratio than this diagram: 7004 * 7005 * head --> tail 7006 * +---------------------+----------+ 7007 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 7008 * +---------------------+----------+ | o L2ARC eligible 7009 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 7010 * +---------------------+----------+ | 7011 * 15.9 Gbytes ^ 32 Mbytes | 7012 * headroom | 7013 * l2arc_feed_thread() 7014 * | 7015 * l2arc write hand <--[oooo]--' 7016 * | 8 Mbyte 7017 * | write max 7018 * V 7019 * +==============================+ 7020 * L2ARC dev |####|#|###|###| |####| ... | 7021 * +==============================+ 7022 * 32 Gbytes 7023 * 7024 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 7025 * evicted, then the L2ARC has cached a buffer much sooner than it probably 7026 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 7027 * safe to say that this is an uncommon case, since buffers at the end of 7028 * the ARC lists have moved there due to inactivity. 7029 * 7030 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 7031 * then the L2ARC simply misses copying some buffers. This serves as a 7032 * pressure valve to prevent heavy read workloads from both stalling the ARC 7033 * with waits and clogging the L2ARC with writes. This also helps prevent 7034 * the potential for the L2ARC to churn if it attempts to cache content too 7035 * quickly, such as during backups of the entire pool. 7036 * 7037 * 5. After system boot and before the ARC has filled main memory, there are 7038 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 7039 * lists can remain mostly static. Instead of searching from tail of these 7040 * lists as pictured, the l2arc_feed_thread() will search from the list heads 7041 * for eligible buffers, greatly increasing its chance of finding them. 7042 * 7043 * The L2ARC device write speed is also boosted during this time so that 7044 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 7045 * there are no L2ARC reads, and no fear of degrading read performance 7046 * through increased writes. 7047 * 7048 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 7049 * the vdev queue can aggregate them into larger and fewer writes. Each 7050 * device is written to in a rotor fashion, sweeping writes through 7051 * available space then repeating. 7052 * 7053 * 7. The L2ARC does not store dirty content. It never needs to flush 7054 * write buffers back to disk based storage. 7055 * 7056 * 8. If an ARC buffer is written (and dirtied) which also exists in the 7057 * L2ARC, the now stale L2ARC buffer is immediately dropped. 
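 *
 * To make the sizes in the diagram above concrete, assuming the
 * compiled-in defaults (l2arc_write_max of 8 Mbytes, l2arc_headroom = 2,
 * l2arc_headroom_boost = 200): each feed cycle writes up to 8 Mbytes
 * (more while the ARC is still warming up, when l2arc_write_boost is
 * added), and scans 8 * 2 = 16 Mbytes from the eviction end of each
 * list, boosted to 32 Mbytes when compressed ARC is enabled.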
7058 * 7059 * The performance of the L2ARC can be tweaked by a number of tunables, which 7060 * may be necessary for different workloads: 7061 * 7062 * l2arc_write_max max write bytes per interval 7063 * l2arc_write_boost extra write bytes during device warmup 7064 * l2arc_noprefetch skip caching prefetched buffers 7065 * l2arc_headroom number of max device writes to precache 7066 * l2arc_headroom_boost when we find compressed buffers during ARC 7067 * scanning, we multiply headroom by this 7068 * percentage factor for the next scan cycle, 7069 * since more compressed buffers are likely to 7070 * be present 7071 * l2arc_feed_secs seconds between L2ARC writing 7072 * 7073 * Tunables may be removed or added as future performance improvements are 7074 * integrated, and also may become zpool properties. 7075 * 7076 * There are three key functions that control how the L2ARC warms up: 7077 * 7078 * l2arc_write_eligible() check if a buffer is eligible to cache 7079 * l2arc_write_size() calculate how much to write 7080 * l2arc_write_interval() calculate sleep delay between writes 7081 * 7082 * These three functions determine what to write, how much, and how quickly 7083 * to send writes. 7084 */ 7085 7086static boolean_t 7087l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) 7088{ 7089 /* 7090 * A buffer is *not* eligible for the L2ARC if it: 7091 * 1. belongs to a different spa. 7092 * 2. is already cached on the L2ARC. 7093 * 3. has an I/O in progress (it may be an incomplete read). 7094 * 4. is flagged not eligible (zfs property). 7095 */ 7096 if (hdr->b_spa != spa_guid) { 7097 ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); 7098 return (B_FALSE); 7099 } 7100 if (HDR_HAS_L2HDR(hdr)) { 7101 ARCSTAT_BUMP(arcstat_l2_write_in_l2); 7102 return (B_FALSE); 7103 } 7104 if (HDR_IO_IN_PROGRESS(hdr)) { 7105 ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); 7106 return (B_FALSE); 7107 } 7108 if (!HDR_L2CACHE(hdr)) { 7109 ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); 7110 return (B_FALSE); 7111 } 7112 7113 return (B_TRUE); 7114} 7115 7116static uint64_t 7117l2arc_write_size(void) 7118{ 7119 uint64_t size; 7120 7121 /* 7122 * Make sure our globals have meaningful values in case the user 7123 * altered them. 7124 */ 7125 size = l2arc_write_max; 7126 if (size == 0) { 7127 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 7128 "be greater than zero, resetting it to the default (%d)", 7129 L2ARC_WRITE_SIZE); 7130 size = l2arc_write_max = L2ARC_WRITE_SIZE; 7131 } 7132 7133 if (arc_warm == B_FALSE) 7134 size += l2arc_write_boost; 7135 7136 return (size); 7137 7138} 7139 7140static clock_t 7141l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 7142{ 7143 clock_t interval, next, now; 7144 7145 /* 7146 * If the ARC lists are busy, increase our write rate; if the 7147 * lists are stale, idle back. This is achieved by checking 7148 * how much we previously wrote - if it was more than half of 7149 * what we wanted, schedule the next write much sooner. 7150 */ 7151 if (l2arc_feed_again && wrote > (wanted / 2)) 7152 interval = (hz * l2arc_feed_min_ms) / 1000; 7153 else 7154 interval = hz * l2arc_feed_secs; 7155 7156 now = ddi_get_lbolt(); 7157 next = MAX(now, MIN(now + interval, began + interval)); 7158 7159 return (next); 7160} 7161 7162/* 7163 * Cycle through L2ARC devices. This is how L2ARC load balances. 7164 * If a device is returned, this also returns holding the spa config lock. 
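 *
 * A minimal sketch of the expected calling pattern (this is how
 * l2arc_feed_thread() below uses it):
 *
 *	if ((dev = l2arc_dev_get_next()) != NULL) {
 *		... evict ahead of the write hand, write buffers ...
 *		spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev);
 *	}
 *
 * i.e. the caller keeps SCL_L2ARC held as reader on the device's spa
 * until it explicitly drops it.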
7165 */ 7166static l2arc_dev_t * 7167l2arc_dev_get_next(void) 7168{ 7169 l2arc_dev_t *first, *next = NULL; 7170 7171 /* 7172 * Lock out the removal of spas (spa_namespace_lock), then removal 7173 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 7174 * both locks will be dropped and a spa config lock held instead. 7175 */ 7176 mutex_enter(&spa_namespace_lock); 7177 mutex_enter(&l2arc_dev_mtx); 7178 7179 /* if there are no vdevs, there is nothing to do */ 7180 if (l2arc_ndev == 0) 7181 goto out; 7182 7183 first = NULL; 7184 next = l2arc_dev_last; 7185 do { 7186 /* loop around the list looking for a non-faulted vdev */ 7187 if (next == NULL) { 7188 next = list_head(l2arc_dev_list); 7189 } else { 7190 next = list_next(l2arc_dev_list, next); 7191 if (next == NULL) 7192 next = list_head(l2arc_dev_list); 7193 } 7194 7195 /* if we have come back to the start, bail out */ 7196 if (first == NULL) 7197 first = next; 7198 else if (next == first) 7199 break; 7200 7201 } while (vdev_is_dead(next->l2ad_vdev)); 7202 7203 /* if we were unable to find any usable vdevs, return NULL */ 7204 if (vdev_is_dead(next->l2ad_vdev)) 7205 next = NULL; 7206 7207 l2arc_dev_last = next; 7208 7209out: 7210 mutex_exit(&l2arc_dev_mtx); 7211 7212 /* 7213 * Grab the config lock to prevent the 'next' device from being 7214 * removed while we are writing to it. 7215 */ 7216 if (next != NULL) 7217 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 7218 mutex_exit(&spa_namespace_lock); 7219 7220 return (next); 7221} 7222 7223/* 7224 * Free buffers that were tagged for destruction. 7225 */ 7226static void 7227l2arc_do_free_on_write() 7228{ 7229 list_t *buflist; 7230 l2arc_data_free_t *df, *df_prev; 7231 7232 mutex_enter(&l2arc_free_on_write_mtx); 7233 buflist = l2arc_free_on_write; 7234 7235 for (df = list_tail(buflist); df; df = df_prev) { 7236 df_prev = list_prev(buflist, df); 7237 ASSERT3P(df->l2df_abd, !=, NULL); 7238 abd_free(df->l2df_abd); 7239 list_remove(buflist, df); 7240 kmem_free(df, sizeof (l2arc_data_free_t)); 7241 } 7242 7243 mutex_exit(&l2arc_free_on_write_mtx); 7244} 7245 7246/* 7247 * A write to a cache device has completed. Update all headers to allow 7248 * reads from these buffers to begin. 7249 */ 7250static void 7251l2arc_write_done(zio_t *zio) 7252{ 7253 l2arc_write_callback_t *cb; 7254 l2arc_dev_t *dev; 7255 list_t *buflist; 7256 arc_buf_hdr_t *head, *hdr, *hdr_prev; 7257 kmutex_t *hash_lock; 7258 int64_t bytes_dropped = 0; 7259 7260 cb = zio->io_private; 7261 ASSERT3P(cb, !=, NULL); 7262 dev = cb->l2wcb_dev; 7263 ASSERT3P(dev, !=, NULL); 7264 head = cb->l2wcb_head; 7265 ASSERT3P(head, !=, NULL); 7266 buflist = &dev->l2ad_buflist; 7267 ASSERT3P(buflist, !=, NULL); 7268 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 7269 l2arc_write_callback_t *, cb); 7270 7271 if (zio->io_error != 0) 7272 ARCSTAT_BUMP(arcstat_l2_writes_error); 7273 7274 /* 7275 * All writes completed, or an error was hit. 7276 */ 7277top: 7278 mutex_enter(&dev->l2ad_mtx); 7279 for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { 7280 hdr_prev = list_prev(buflist, hdr); 7281 7282 hash_lock = HDR_LOCK(hdr); 7283 7284 /* 7285 * We cannot use mutex_enter or else we can deadlock 7286 * with l2arc_write_buffers (due to swapping the order 7287 * the hash lock and l2ad_mtx are taken). 7288 */ 7289 if (!mutex_tryenter(hash_lock)) { 7290 /* 7291 * Missed the hash lock. We must retry so we 7292 * don't leave the ARC_FLAG_L2_WRITING bit set. 
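 * (ARC_FLAG_L2_WRITING is what keeps these buffers from being read
 * from the cache device while the write is in flight; it is cleared
 * further down in this function once each header has been processed,
 * so simply skipping a header we failed to lock is not an option.)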
7293 */ 7294 ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); 7295 7296 /* 7297 * We don't want to rescan the headers we've 7298 * already marked as having been written out, so 7299 * we reinsert the head node so we can pick up 7300 * where we left off. 7301 */ 7302 list_remove(buflist, head); 7303 list_insert_after(buflist, hdr, head); 7304 7305 mutex_exit(&dev->l2ad_mtx); 7306 7307 /* 7308 * We wait for the hash lock to become available 7309 * to try and prevent busy waiting, and increase 7310 * the chance we'll be able to acquire the lock 7311 * the next time around. 7312 */ 7313 mutex_enter(hash_lock); 7314 mutex_exit(hash_lock); 7315 goto top; 7316 } 7317 7318 /* 7319 * We could not have been moved into the arc_l2c_only 7320 * state while in-flight due to our ARC_FLAG_L2_WRITING 7321 * bit being set. Let's just ensure that's being enforced. 7322 */ 7323 ASSERT(HDR_HAS_L1HDR(hdr)); 7324 7325 if (zio->io_error != 0) { 7326 /* 7327 * Error - drop L2ARC entry. 7328 */ 7329 list_remove(buflist, hdr); 7330 l2arc_trim(hdr); 7331 arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); 7332 7333 ARCSTAT_INCR(arcstat_l2_psize, -arc_hdr_size(hdr)); 7334 ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); 7335 7336 bytes_dropped += arc_hdr_size(hdr); 7337 (void) refcount_remove_many(&dev->l2ad_alloc, 7338 arc_hdr_size(hdr), hdr); 7339 } 7340 7341 /* 7342 * Allow ARC to begin reads and ghost list evictions to 7343 * this L2ARC entry. 7344 */ 7345 arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); 7346 7347 mutex_exit(hash_lock); 7348 } 7349 7350 atomic_inc_64(&l2arc_writes_done); 7351 list_remove(buflist, head); 7352 ASSERT(!HDR_HAS_L1HDR(head)); 7353 kmem_cache_free(hdr_l2only_cache, head); 7354 mutex_exit(&dev->l2ad_mtx); 7355 7356 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); 7357 7358 l2arc_do_free_on_write(); 7359 7360 kmem_free(cb, sizeof (l2arc_write_callback_t)); 7361} 7362 7363/* 7364 * A read to a cache device completed. Validate buffer contents before 7365 * handing over to the regular ARC routines. 7366 */ 7367static void 7368l2arc_read_done(zio_t *zio) 7369{ 7370 l2arc_read_callback_t *cb; 7371 arc_buf_hdr_t *hdr; 7372 kmutex_t *hash_lock; 7373 boolean_t valid_cksum; 7374 7375 ASSERT3P(zio->io_vd, !=, NULL); 7376 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 7377 7378 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 7379 7380 cb = zio->io_private; 7381 ASSERT3P(cb, !=, NULL); 7382 hdr = cb->l2rcb_hdr; 7383 ASSERT3P(hdr, !=, NULL); 7384 7385 hash_lock = HDR_LOCK(hdr); 7386 mutex_enter(hash_lock); 7387 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 7388 7389 /* 7390 * If the data was read into a temporary buffer, 7391 * move it and free the buffer. 7392 */ 7393 if (cb->l2rcb_abd != NULL) { 7394 ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); 7395 if (zio->io_error == 0) { 7396 abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd, 7397 arc_hdr_size(hdr)); 7398 } 7399 7400 /* 7401 * The following must be done regardless of whether 7402 * there was an error: 7403 * - free the temporary buffer 7404 * - point zio to the real ARC buffer 7405 * - set zio size accordingly 7406 * These are required because zio is either re-used for 7407 * an I/O of the block in the case of the error 7408 * or the zio is passed to arc_read_done() and it 7409 * needs real data. 
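 * (The temporary buffer exists because the read may have been issued
 * for the padded, device-aligned size written out by
 * l2arc_write_buffers(), which can be larger than arc_hdr_size(hdr);
 * only the leading arc_hdr_size(hdr) bytes are copied back into
 * b_pabd.)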
7410 */ 7411 abd_free(cb->l2rcb_abd); 7412 zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); 7413 zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; 7414 } 7415 7416 ASSERT3P(zio->io_abd, !=, NULL); 7417 7418 /* 7419 * Check this survived the L2ARC journey. 7420 */ 7421 ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd); 7422 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 7423 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 7424 7425 valid_cksum = arc_cksum_is_equal(hdr, zio); 7426 if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 7427 mutex_exit(hash_lock); 7428 zio->io_private = hdr; 7429 arc_read_done(zio); 7430 } else { 7431 mutex_exit(hash_lock); 7432 /* 7433 * Buffer didn't survive caching. Increment stats and 7434 * reissue to the original storage device. 7435 */ 7436 if (zio->io_error != 0) { 7437 ARCSTAT_BUMP(arcstat_l2_io_error); 7438 } else { 7439 zio->io_error = SET_ERROR(EIO); 7440 } 7441 if (!valid_cksum) 7442 ARCSTAT_BUMP(arcstat_l2_cksum_bad); 7443 7444 /* 7445 * If there's no waiter, issue an async i/o to the primary 7446 * storage now. If there *is* a waiter, the caller must 7447 * issue the i/o in a context where it's OK to block. 7448 */ 7449 if (zio->io_waiter == NULL) { 7450 zio_t *pio = zio_unique_parent(zio); 7451 7452 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 7453 7454 zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, 7455 hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done, 7456 hdr, zio->io_priority, cb->l2rcb_flags, 7457 &cb->l2rcb_zb)); 7458 } 7459 } 7460 7461 kmem_free(cb, sizeof (l2arc_read_callback_t)); 7462} 7463 7464/* 7465 * This is the list priority from which the L2ARC will search for pages to 7466 * cache. This is used within loops (0..3) to cycle through lists in the 7467 * desired order. This order can have a significant effect on cache 7468 * performance. 7469 * 7470 * Currently the metadata lists are hit first, MFU then MRU, followed by 7471 * the data lists. This function returns a locked list, and also returns 7472 * the lock pointer. 7473 */ 7474static multilist_sublist_t * 7475l2arc_sublist_lock(int list_num) 7476{ 7477 multilist_t *ml = NULL; 7478 unsigned int idx; 7479 7480 ASSERT(list_num >= 0 && list_num <= 3); 7481 7482 switch (list_num) { 7483 case 0: 7484 ml = arc_mfu->arcs_list[ARC_BUFC_METADATA]; 7485 break; 7486 case 1: 7487 ml = arc_mru->arcs_list[ARC_BUFC_METADATA]; 7488 break; 7489 case 2: 7490 ml = arc_mfu->arcs_list[ARC_BUFC_DATA]; 7491 break; 7492 case 3: 7493 ml = arc_mru->arcs_list[ARC_BUFC_DATA]; 7494 break; 7495 } 7496 7497 /* 7498 * Return a randomly-selected sublist. This is acceptable 7499 * because the caller feeds only a little bit of data for each 7500 * call (8MB). Subsequent calls will result in different 7501 * sublists being selected. 7502 */ 7503 idx = multilist_get_random_index(ml); 7504 return (multilist_sublist_lock(ml, idx)); 7505} 7506 7507/* 7508 * Evict buffers from the device write hand to the distance specified in 7509 * bytes. This distance may span populated buffers, it may span nothing. 7510 * This is clearing a region on the L2ARC device ready for writing. 7511 * If the 'all' boolean is set, every buffer is evicted. 
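 *
 * The target address is normally l2ad_hand + distance; when the hand is
 * within 2 * distance of l2ad_end we evict all the way to l2ad_end
 * instead, since the write hand will wrap back to l2ad_start on the
 * next pass (see l2arc_write_buffers()).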
7512 */ 7513static void 7514l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 7515{ 7516 list_t *buflist; 7517 arc_buf_hdr_t *hdr, *hdr_prev; 7518 kmutex_t *hash_lock; 7519 uint64_t taddr; 7520 7521 buflist = &dev->l2ad_buflist; 7522 7523 if (!all && dev->l2ad_first) { 7524 /* 7525 * This is the first sweep through the device. There is 7526 * nothing to evict. 7527 */ 7528 return; 7529 } 7530 7531 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 7532 /* 7533 * When nearing the end of the device, evict to the end 7534 * before the device write hand jumps to the start. 7535 */ 7536 taddr = dev->l2ad_end; 7537 } else { 7538 taddr = dev->l2ad_hand + distance; 7539 } 7540 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 7541 uint64_t, taddr, boolean_t, all); 7542 7543top: 7544 mutex_enter(&dev->l2ad_mtx); 7545 for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { 7546 hdr_prev = list_prev(buflist, hdr); 7547 7548 hash_lock = HDR_LOCK(hdr); 7549 7550 /* 7551 * We cannot use mutex_enter or else we can deadlock 7552 * with l2arc_write_buffers (due to swapping the order 7553 * the hash lock and l2ad_mtx are taken). 7554 */ 7555 if (!mutex_tryenter(hash_lock)) { 7556 /* 7557 * Missed the hash lock. Retry. 7558 */ 7559 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 7560 mutex_exit(&dev->l2ad_mtx); 7561 mutex_enter(hash_lock); 7562 mutex_exit(hash_lock); 7563 goto top; 7564 } 7565 7566 /* 7567 * A header can't be on this list if it doesn't have an L2 header. 7568 */ 7569 ASSERT(HDR_HAS_L2HDR(hdr)); 7570 7571 /* Ensure this header has finished being written. */ 7572 ASSERT(!HDR_L2_WRITING(hdr)); 7573 ASSERT(!HDR_L2_WRITE_HEAD(hdr)); 7574 7575 if (!all && (hdr->b_l2hdr.b_daddr >= taddr || 7576 hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { 7577 /* 7578 * We've evicted to the target address, 7579 * or the end of the device. 7580 */ 7581 mutex_exit(hash_lock); 7582 break; 7583 } 7584 7585 if (!HDR_HAS_L1HDR(hdr)) { 7586 ASSERT(!HDR_L2_READING(hdr)); 7587 /* 7588 * This doesn't exist in the ARC. Destroy. 7589 * arc_hdr_destroy() will call list_remove() 7590 * and decrement arcstat_l2_lsize. 7591 */ 7592 arc_change_state(arc_anon, hdr, hash_lock); 7593 arc_hdr_destroy(hdr); 7594 } else { 7595 ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); 7596 ARCSTAT_BUMP(arcstat_l2_evict_l1cached); 7597 /* 7598 * Invalidate issued or about to be issued 7599 * reads, since we may be about to write 7600 * over this location. 7601 */ 7602 if (HDR_L2_READING(hdr)) { 7603 ARCSTAT_BUMP(arcstat_l2_evict_reading); 7604 arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); 7605 } 7606 7607 arc_hdr_l2hdr_destroy(hdr); 7608 } 7609 mutex_exit(hash_lock); 7610 } 7611 mutex_exit(&dev->l2ad_mtx); 7612} 7613 7614/* 7615 * Find and write ARC buffers to the L2ARC device. 7616 * 7617 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid 7618 * for reading until they have completed writing. 7619 * When compressed ARC is enabled, the scan headroom is boosted internally by 7620 * the l2arc_headroom_boost percentage (see the headroom calculation below). 7621 * 7622 * Returns the number of bytes actually written, which includes any padding 7623 * needed to round buffers up to the device's allocation size.
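 *
 * Three byte counts are tracked while copying: write_lsize sums logical
 * sizes (HDR_GET_LSIZE()), write_psize sums the sizes as held in the ARC
 * (arc_hdr_size(), possibly compressed), and write_asize sums the
 * device-allocated sizes (vdev_psize_to_asize()), which is what is
 * compared against target_sz.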
7624 */ 7625static uint64_t 7626l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) 7627{ 7628 arc_buf_hdr_t *hdr, *hdr_prev, *head; 7629 uint64_t write_asize, write_psize, write_lsize, headroom; 7630 boolean_t full; 7631 l2arc_write_callback_t *cb; 7632 zio_t *pio, *wzio; 7633 uint64_t guid = spa_load_guid(spa); 7634 int try; 7635 7636 ASSERT3P(dev->l2ad_vdev, !=, NULL); 7637 7638 pio = NULL; 7639 write_lsize = write_asize = write_psize = 0; 7640 full = B_FALSE; 7641 head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); 7642 arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); 7643 7644 ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); 7645 /* 7646 * Copy buffers for L2ARC writing. 7647 */ 7648 for (try = 0; try <= 3; try++) { 7649 multilist_sublist_t *mls = l2arc_sublist_lock(try); 7650 uint64_t passed_sz = 0; 7651 7652 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); 7653 7654 /* 7655 * L2ARC fast warmup. 7656 * 7657 * Until the ARC is warm and starts to evict, read from the 7658 * head of the ARC lists rather than the tail. 7659 */ 7660 if (arc_warm == B_FALSE) 7661 hdr = multilist_sublist_head(mls); 7662 else 7663 hdr = multilist_sublist_tail(mls); 7664 if (hdr == NULL) 7665 ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); 7666 7667 headroom = target_sz * l2arc_headroom; 7668 if (zfs_compressed_arc_enabled) 7669 headroom = (headroom * l2arc_headroom_boost) / 100; 7670 7671 for (; hdr; hdr = hdr_prev) { 7672 kmutex_t *hash_lock; 7673 7674 if (arc_warm == B_FALSE) 7675 hdr_prev = multilist_sublist_next(mls, hdr); 7676 else 7677 hdr_prev = multilist_sublist_prev(mls, hdr); 7678 ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, 7679 HDR_GET_LSIZE(hdr)); 7680 7681 hash_lock = HDR_LOCK(hdr); 7682 if (!mutex_tryenter(hash_lock)) { 7683 ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); 7684 /* 7685 * Skip this buffer rather than waiting. 7686 */ 7687 continue; 7688 } 7689 7690 passed_sz += HDR_GET_LSIZE(hdr); 7691 if (passed_sz > headroom) { 7692 /* 7693 * Searched too far. 7694 */ 7695 mutex_exit(hash_lock); 7696 ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); 7697 break; 7698 } 7699 7700 if (!l2arc_write_eligible(guid, hdr)) { 7701 mutex_exit(hash_lock); 7702 continue; 7703 } 7704 7705 /* 7706 * We rely on the L1 portion of the header below, so 7707 * it's invalid for this header to have been evicted out 7708 * of the ghost cache, prior to being written out. The 7709 * ARC_FLAG_L2_WRITING bit ensures this won't happen. 7710 */ 7711 ASSERT(HDR_HAS_L1HDR(hdr)); 7712 7713 ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); 7714 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 7715 ASSERT3U(arc_hdr_size(hdr), >, 0); 7716 uint64_t psize = arc_hdr_size(hdr); 7717 uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, 7718 psize); 7719 7720 if ((write_asize + asize) > target_sz) { 7721 full = B_TRUE; 7722 mutex_exit(hash_lock); 7723 ARCSTAT_BUMP(arcstat_l2_write_full); 7724 break; 7725 } 7726 7727 if (pio == NULL) { 7728 /* 7729 * Insert a dummy header on the buflist so 7730 * l2arc_write_done() can find where the 7731 * write buffers begin without searching. 
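 * (The dummy is the 'head' header allocated at the top of this
 * function with ARC_FLAG_L2_WRITE_HEAD set; l2arc_write_done() starts
 * its walk from it, covering just the headers written in this pass,
 * and then removes and frees it.)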
7732 */ 7733 mutex_enter(&dev->l2ad_mtx); 7734 list_insert_head(&dev->l2ad_buflist, head); 7735 mutex_exit(&dev->l2ad_mtx); 7736 7737 cb = kmem_alloc( 7738 sizeof (l2arc_write_callback_t), KM_SLEEP); 7739 cb->l2wcb_dev = dev; 7740 cb->l2wcb_head = head; 7741 pio = zio_root(spa, l2arc_write_done, cb, 7742 ZIO_FLAG_CANFAIL); 7743 ARCSTAT_BUMP(arcstat_l2_write_pios); 7744 } 7745 7746 hdr->b_l2hdr.b_dev = dev; 7747 hdr->b_l2hdr.b_daddr = dev->l2ad_hand; 7748 arc_hdr_set_flags(hdr, 7749 ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR); 7750 7751 mutex_enter(&dev->l2ad_mtx); 7752 list_insert_head(&dev->l2ad_buflist, hdr); 7753 mutex_exit(&dev->l2ad_mtx); 7754 7755 (void) refcount_add_many(&dev->l2ad_alloc, psize, hdr); 7756 7757 /* 7758 * Normally the L2ARC can use the hdr's data, but if 7759 * we're sharing data between the hdr and one of its 7760 * bufs, L2ARC needs its own copy of the data so that 7761 * the ZIO below can't race with the buf consumer. 7762 * Another case where we need to create a copy of the 7763 * data is when the buffer size is not device-aligned 7764 * and we need to pad the block to make it such. 7765 * That also keeps the clock hand suitably aligned. 7766 * 7767 * To ensure that the copy will be available for the 7768 * lifetime of the ZIO and be cleaned up afterwards, we 7769 * add it to the l2arc_free_on_write queue. 7770 */ 7771 abd_t *to_write; 7772 if (!HDR_SHARED_DATA(hdr) && psize == asize) { 7773 to_write = hdr->b_l1hdr.b_pabd; 7774 } else { 7775 to_write = abd_alloc_for_io(asize, 7776 HDR_ISTYPE_METADATA(hdr)); 7777 abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize); 7778 if (asize != psize) { 7779 abd_zero_off(to_write, psize, 7780 asize - psize); 7781 } 7782 l2arc_free_abd_on_write(to_write, asize, 7783 arc_buf_type(hdr)); 7784 } 7785 wzio = zio_write_phys(pio, dev->l2ad_vdev, 7786 hdr->b_l2hdr.b_daddr, asize, to_write, 7787 ZIO_CHECKSUM_OFF, NULL, hdr, 7788 ZIO_PRIORITY_ASYNC_WRITE, 7789 ZIO_FLAG_CANFAIL, B_FALSE); 7790 7791 write_lsize += HDR_GET_LSIZE(hdr); 7792 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 7793 zio_t *, wzio); 7794 7795 write_psize += psize; 7796 write_asize += asize; 7797 dev->l2ad_hand += asize; 7798 7799 mutex_exit(hash_lock); 7800 7801 (void) zio_nowait(wzio); 7802 } 7803 7804 multilist_sublist_unlock(mls); 7805 7806 if (full == B_TRUE) 7807 break; 7808 } 7809 7810 /* No buffers selected for writing? */ 7811 if (pio == NULL) { 7812 ASSERT0(write_lsize); 7813 ASSERT(!HDR_HAS_L1HDR(head)); 7814 kmem_cache_free(hdr_l2only_cache, head); 7815 return (0); 7816 } 7817 7818 ASSERT3U(write_psize, <=, target_sz); 7819 ARCSTAT_BUMP(arcstat_l2_writes_sent); 7820 ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); 7821 ARCSTAT_INCR(arcstat_l2_lsize, write_lsize); 7822 ARCSTAT_INCR(arcstat_l2_psize, write_psize); 7823 vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0); 7824 7825 /* 7826 * Bump device hand to the device start if it is approaching the end. 7827 * l2arc_evict() will already have evicted ahead for this case. 7828 */ 7829 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 7830 dev->l2ad_hand = dev->l2ad_start; 7831 dev->l2ad_first = B_FALSE; 7832 } 7833 7834 dev->l2ad_writing = B_TRUE; 7835 (void) zio_wait(pio); 7836 dev->l2ad_writing = B_FALSE; 7837 7838 return (write_asize); 7839} 7840 7841/* 7842 * This thread feeds the L2ARC at regular intervals. This is the beating 7843 * heart of the L2ARC. 
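 *
 * Each pass through the loop below boils down to the following sketch
 * (device, read-only and low-memory checks omitted):
 *
 *	dev = l2arc_dev_get_next();	(also acquires SCL_L2ARC)
 *	size = l2arc_write_size();
 *	l2arc_evict(dev, size, B_FALSE);
 *	wrote = l2arc_write_buffers(spa, dev, size);
 *	next = l2arc_write_interval(begin, size, wrote);
 *	spa_config_exit(spa, SCL_L2ARC, dev);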
7844 */ 7845/* ARGSUSED */ 7846static void 7847l2arc_feed_thread(void *unused __unused) 7848{ 7849 callb_cpr_t cpr; 7850 l2arc_dev_t *dev; 7851 spa_t *spa; 7852 uint64_t size, wrote; 7853 clock_t begin, next = ddi_get_lbolt(); 7854 7855 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 7856 7857 mutex_enter(&l2arc_feed_thr_lock); 7858 7859 while (l2arc_thread_exit == 0) { 7860 CALLB_CPR_SAFE_BEGIN(&cpr); 7861 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 7862 next - ddi_get_lbolt()); 7863 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 7864 next = ddi_get_lbolt() + hz; 7865 7866 /* 7867 * Quick check for L2ARC devices. 7868 */ 7869 mutex_enter(&l2arc_dev_mtx); 7870 if (l2arc_ndev == 0) { 7871 mutex_exit(&l2arc_dev_mtx); 7872 continue; 7873 } 7874 mutex_exit(&l2arc_dev_mtx); 7875 begin = ddi_get_lbolt(); 7876 7877 /* 7878 * This selects the next l2arc device to write to, and in 7879 * doing so the next spa to feed from: dev->l2ad_spa. This 7880 * will return NULL if there are now no l2arc devices or if 7881 * they are all faulted. 7882 * 7883 * If a device is returned, its spa's config lock is also 7884 * held to prevent device removal. l2arc_dev_get_next() 7885 * will grab and release l2arc_dev_mtx. 7886 */ 7887 if ((dev = l2arc_dev_get_next()) == NULL) 7888 continue; 7889 7890 spa = dev->l2ad_spa; 7891 ASSERT3P(spa, !=, NULL); 7892 7893 /* 7894 * If the pool is read-only then force the feed thread to 7895 * sleep a little longer. 7896 */ 7897 if (!spa_writeable(spa)) { 7898 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; 7899 spa_config_exit(spa, SCL_L2ARC, dev); 7900 continue; 7901 } 7902 7903 /* 7904 * Avoid contributing to memory pressure. 7905 */ 7906 if (arc_reclaim_needed()) { 7907 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 7908 spa_config_exit(spa, SCL_L2ARC, dev); 7909 continue; 7910 } 7911 7912 ARCSTAT_BUMP(arcstat_l2_feeds); 7913 7914 size = l2arc_write_size(); 7915 7916 /* 7917 * Evict L2ARC buffers that will be overwritten. 7918 */ 7919 l2arc_evict(dev, size, B_FALSE); 7920 7921 /* 7922 * Write ARC buffers. 7923 */ 7924 wrote = l2arc_write_buffers(spa, dev, size); 7925 7926 /* 7927 * Calculate interval between writes. 7928 */ 7929 next = l2arc_write_interval(begin, size, wrote); 7930 spa_config_exit(spa, SCL_L2ARC, dev); 7931 } 7932 7933 l2arc_thread_exit = 0; 7934 cv_broadcast(&l2arc_feed_thr_cv); 7935 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ 7936 thread_exit(); 7937} 7938 7939boolean_t 7940l2arc_vdev_present(vdev_t *vd) 7941{ 7942 l2arc_dev_t *dev; 7943 7944 mutex_enter(&l2arc_dev_mtx); 7945 for (dev = list_head(l2arc_dev_list); dev != NULL; 7946 dev = list_next(l2arc_dev_list, dev)) { 7947 if (dev->l2ad_vdev == vd) 7948 break; 7949 } 7950 mutex_exit(&l2arc_dev_mtx); 7951 7952 return (dev != NULL); 7953} 7954 7955/* 7956 * Add a vdev for use by the L2ARC. By this point the spa has already 7957 * validated the vdev and opened it. 7958 */ 7959void 7960l2arc_add_vdev(spa_t *spa, vdev_t *vd) 7961{ 7962 l2arc_dev_t *adddev; 7963 7964 ASSERT(!l2arc_vdev_present(vd)); 7965 7966 vdev_ashift_optimize(vd); 7967 7968 /* 7969 * Create a new l2arc device entry. 
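 * The usable region begins past the vdev labels (VDEV_LABEL_START_SIZE)
 * and spans vdev_get_min_asize(vd); the write hand starts at
 * l2ad_start, and l2ad_first tells l2arc_evict() that the first sweep
 * has nothing to evict.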
7970 */ 7971 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 7972 adddev->l2ad_spa = spa; 7973 adddev->l2ad_vdev = vd; 7974 adddev->l2ad_start = VDEV_LABEL_START_SIZE; 7975 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); 7976 adddev->l2ad_hand = adddev->l2ad_start; 7977 adddev->l2ad_first = B_TRUE; 7978 adddev->l2ad_writing = B_FALSE; 7979 7980 mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); 7981 /* 7982 * This is a list of all ARC buffers that are still valid on the 7983 * device. 7984 */ 7985 list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 7986 offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); 7987 7988 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); 7989 refcount_create(&adddev->l2ad_alloc); 7990 7991 /* 7992 * Add device to global list 7993 */ 7994 mutex_enter(&l2arc_dev_mtx); 7995 list_insert_head(l2arc_dev_list, adddev); 7996 atomic_inc_64(&l2arc_ndev); 7997 mutex_exit(&l2arc_dev_mtx); 7998} 7999 8000/* 8001 * Remove a vdev from the L2ARC. 8002 */ 8003void 8004l2arc_remove_vdev(vdev_t *vd) 8005{ 8006 l2arc_dev_t *dev, *nextdev, *remdev = NULL; 8007 8008 /* 8009 * Find the device by vdev 8010 */ 8011 mutex_enter(&l2arc_dev_mtx); 8012 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 8013 nextdev = list_next(l2arc_dev_list, dev); 8014 if (vd == dev->l2ad_vdev) { 8015 remdev = dev; 8016 break; 8017 } 8018 } 8019 ASSERT3P(remdev, !=, NULL); 8020 8021 /* 8022 * Remove device from global list 8023 */ 8024 list_remove(l2arc_dev_list, remdev); 8025 l2arc_dev_last = NULL; /* may have been invalidated */ 8026 atomic_dec_64(&l2arc_ndev); 8027 mutex_exit(&l2arc_dev_mtx); 8028 8029 /* 8030 * Clear all buflists and ARC references. L2ARC device flush. 8031 */ 8032 l2arc_evict(remdev, 0, B_TRUE); 8033 list_destroy(&remdev->l2ad_buflist); 8034 mutex_destroy(&remdev->l2ad_mtx); 8035 refcount_destroy(&remdev->l2ad_alloc); 8036 kmem_free(remdev, sizeof (l2arc_dev_t)); 8037} 8038 8039void 8040l2arc_init(void) 8041{ 8042 l2arc_thread_exit = 0; 8043 l2arc_ndev = 0; 8044 l2arc_writes_sent = 0; 8045 l2arc_writes_done = 0; 8046 8047 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 8048 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 8049 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 8050 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 8051 8052 l2arc_dev_list = &L2ARC_dev_list; 8053 l2arc_free_on_write = &L2ARC_free_on_write; 8054 list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 8055 offsetof(l2arc_dev_t, l2ad_node)); 8056 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 8057 offsetof(l2arc_data_free_t, l2df_list_node)); 8058} 8059 8060void 8061l2arc_fini(void) 8062{ 8063 /* 8064 * This is called from dmu_fini(), which is called from spa_fini(); 8065 * Because of this, we can assume that all l2arc devices have 8066 * already been removed when the pools themselves were removed. 
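 * All that remains is to drain the free-on-write list and tear down the
 * locks and global lists created in l2arc_init().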
8067 */ 8068 8069 l2arc_do_free_on_write(); 8070 8071 mutex_destroy(&l2arc_feed_thr_lock); 8072 cv_destroy(&l2arc_feed_thr_cv); 8073 mutex_destroy(&l2arc_dev_mtx); 8074 mutex_destroy(&l2arc_free_on_write_mtx); 8075 8076 list_destroy(l2arc_dev_list); 8077 list_destroy(l2arc_free_on_write); 8078} 8079 8080void 8081l2arc_start(void) 8082{ 8083 if (!(spa_mode_global & FWRITE)) 8084 return; 8085 8086 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 8087 TS_RUN, minclsyspri); 8088} 8089 8090void 8091l2arc_stop(void) 8092{ 8093 if (!(spa_mode_global & FWRITE)) 8094 return; 8095 8096 mutex_enter(&l2arc_feed_thr_lock); 8097 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ 8098 l2arc_thread_exit = 1; 8099 while (l2arc_thread_exit != 0) 8100 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 8101 mutex_exit(&l2arc_feed_thr_lock); 8102} 8103