arc.c revision 286576
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 *    Pages in its cache cannot be "locked" into memory.  This makes
 *    the eviction algorithm simple: evict the last page in the list.
 *    This also makes the performance characteristics easy to reason
 *    about.  Our cache is not so simple.  At any given moment, some
 *    subset of the blocks in the cache are un-evictable because we
 *    have handed out a reference to them.  Blocks are only evictable
 *    when there are no external references active.  This makes
 *    eviction far more problematic: we choose to evict the evictable
 *    blocks that are the "lowest" in the list.
 *
 *    There are times when it is not possible to evict the requested
 *    space.  In these circumstances we are unable to adjust the cache
 *    size.  To prevent the cache growing unbounded at these times we
 *    implement a "cache throttle" that slows the flow of new data
 *    into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 *    Pages are evicted when the cache is full and there is a cache
 *    miss.  Our model has a variable sized cache.  It grows with
 *    high use, but also tries to react to memory pressure from the
 *    operating system: decreasing its size when system memory is
 *    tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size.  All
 *    elements of the cache are therefore exactly the same size.  So
 *    when adjusting the cache size following a cache miss, it's simply
 *    a matter of choosing a single page to evict.  In our model, we
 *    have variable sized cache blocks (ranging from 512 bytes to
 *    128K bytes).  We therefore choose a set of blocks to evict to make
 *    space for a cache miss that approximates as closely as possible
 *    the space used by the new block.
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */
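/*
 * Illustrative sketch (not part of the build): the consequence of points
 * 1 and 3 above is that eviction walks a list from its "lowest" end,
 * skips anything that still has external references, and keeps releasing
 * variable-sized blocks until roughly the requested number of bytes has
 * been recovered.  The type and helper below are hypothetical and only
 * convey that shape; the real logic lives in the arc_evict_*() functions
 * later in this file.
 */
#if 0
typedef struct example_blk {
	struct example_blk	*eb_next;	/* next block in the list */
	uint64_t		eb_size;	/* variable size, 512B..128KB */
	int			eb_refs;	/* external references held */
} example_blk_t;

/* Unlink unreferenced blocks from 'list' until 'wanted' bytes are recovered. */
static uint64_t
example_evict_bytes(example_blk_t **list, uint64_t wanted)
{
	uint64_t freed = 0;
	example_blk_t **pp = list;

	while (*pp != NULL && freed < wanted) {
		example_blk_t *eb = *pp;

		if (eb->eb_refs != 0) {		/* un-evictable: skip it */
			pp = &eb->eb_next;
			continue;
		}
		*pp = eb->eb_next;		/* evictable: unlink it */
		freed += eb->eb_size;
	}
	return (freed);
}
#endif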
/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).  Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_clear_callback()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2ad_mtx on each vdev for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */
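/*
 * Illustrative sketch (not part of the build): the lock ordering rule
 * above in practice.  A thread that already holds an arc list lock may
 * only *try* to take a buffer's hash lock; if the trylock fails it must
 * skip that buffer rather than block, because another thread may hold
 * the hash lock and be waiting for the list lock.  The function name is
 * hypothetical; the real eviction code follows this pattern and records
 * a "mutex_miss" when the trylock fails.
 */
#if 0
static boolean_t
example_grab_hash_lock(kmutex_t *list_lock, kmutex_t *hash_lock)
{
	ASSERT(MUTEX_HELD(list_lock));

	if (!mutex_tryenter(hash_lock)) {
		/*
		 * Blocking here could deadlock against a reader that took
		 * the hash lock first and now wants the list lock.
		 */
		return (B_FALSE);
	}
	return (B_TRUE);
}
#endif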
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_compress.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <sys/trim_map.h>
#include <zfs_fletcher.h>
#include <sys/sdt.h>

#include <vm/vm_pageout.h>
#include <machine/vmparam.h>

#ifdef illumos
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif
#endif /* illumos */

static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

#define	ARC_REDUCE_DNLC_PERCENT	3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/*
 * The number of iterations through arc_evict_*() before we
 * drop & reacquire the lock.
 */
int arc_evict_iterations = 100;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/* shift of arc_c for calculating both min and max arc_p */
static int		arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim) */
static int		arc_shrink_shift = 5;

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int		arc_min_prefetch_lifespan;

/*
 * If this percent of memory is free, don't throttle.
 */
int arc_lotsfree_percent = 10;

static int arc_dead;
extern int zfs_prefetch_disable;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;
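/*
 * Illustrative sketch (not part of the build): what a "log2(fraction)"
 * tunable such as arc_shrink_shift above means in bytes.  A shift of 5
 * targets 1/32 of the current cache size per shrink step, e.g. with a
 * cache size of 16 GiB one step aims to give back 512 MiB.  The helper
 * name is hypothetical; the real use of these shifts is in the reclaim
 * code later in this file.
 */
#if 0
static uint64_t
example_shrink_target(uint64_t c, int shrink_shift)
{
	/* (1 << shrink_shift) is the denominator of the reclaimed fraction */
	return (c >> shrink_shift);
}
/* example_shrink_target(16ULL << 30, 5) == 512ULL << 20 */
#endif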
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
uint64_t zfs_arc_meta_min = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
int zfs_disable_dup_eviction = 0;
uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
u_int zfs_arc_free_target = 0;

static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);

#ifdef _KERNEL
static void
arc_free_target_init(void *unused __unused)
{

	zfs_arc_free_target = vm_pageout_wakeup_thresh;
}
SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
    arc_free_target_init, NULL);

TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
SYSCTL_DECL(_vfs_zfs);
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
    "Maximum ARC size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
    "Minimum ARC size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
    &zfs_arc_average_blocksize, 0,
    "ARC average blocksize");
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
    &arc_shrink_shift, 0,
    "log2(fraction of arc to reclaim)");

/*
 * We don't have a tunable for arc_free_target due to the dependency on
 * pagedaemon initialisation.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
    sysctl_vfs_zfs_arc_free_target, "IU",
    "Desired number of free pages below which ARC triggers reclaim");

static int
sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
{
	u_int val;
	int err;

	val = zfs_arc_free_target;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < minfree)
		return (EINVAL);
	if (val > vm_cnt.v_page_count)
		return (EINVAL);

	zfs_arc_free_target = val;

	return (0);
}

/*
 * This must be declared here, before the corresponding kstat macro is
 * defined; the macro reuses the same names and would otherwise confuse
 * the compiler.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_vfs_zfs_arc_meta_limit, "QU",
    "ARC metadata limit");
#endif
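/*
 * Illustrative sketch (not part of the build): the validate-then-store
 * shape used by the handler above.  sysctl_handle_int() copies the
 * current value out to userland and, on a write, copies the proposed
 * value back in; only after range-checking it do we publish it to the
 * live variable.  The tunable name and bounds below are hypothetical.
 */
#if 0
static u_int example_tunable = 100;

static int
sysctl_example_tunable(SYSCTL_HANDLER_ARGS)
{
	u_int val;
	int err;

	val = example_tunable;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);		/* read-only access, or copy error */

	if (val == 0 || val > 1000)	/* hypothetical valid range */
		return (EINVAL);

	example_tunable = val;
	return (0);
}
#endif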
/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states.  These are
 * the only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */

#define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
struct arcs_lock {
	kmutex_t	arcs_lock;
#ifdef _KERNEL
	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

/*
 * must be power of two for mask use to work
 *
 */
#define	ARC_BUFC_NUMDATALISTS		16
#define	ARC_BUFC_NUMMETADATALISTS	16
#define	ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)

typedef struct arc_state {
	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
	uint64_t arcs_size;	/* total amount of data in this state */
	list_t	arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
} arc_state_t;

#define	ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))

/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;

typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_allocated;
	kstat_named_t arcstat_deleted;
	kstat_named_t arcstat_stolen;
	kstat_named_t arcstat_recycle_miss;
	/*
	 * Number of buffers that could not be evicted because the hash lock
	 * was held by another thread.  The lock may not necessarily be held
	 * by something using the same buffer, since hash locks are shared
	 * by multiple buffers.
	 */
	kstat_named_t arcstat_mutex_miss;
	/*
	 * Number of buffers skipped because they have I/O in progress, are
	 * indirect prefetch buffers that have not lived long enough, or are
	 * not from the spa we're trying to evict from.
372 */ 373 kstat_named_t arcstat_evict_skip; 374 kstat_named_t arcstat_evict_l2_cached; 375 kstat_named_t arcstat_evict_l2_eligible; 376 kstat_named_t arcstat_evict_l2_ineligible; 377 kstat_named_t arcstat_hash_elements; 378 kstat_named_t arcstat_hash_elements_max; 379 kstat_named_t arcstat_hash_collisions; 380 kstat_named_t arcstat_hash_chains; 381 kstat_named_t arcstat_hash_chain_max; 382 kstat_named_t arcstat_p; 383 kstat_named_t arcstat_c; 384 kstat_named_t arcstat_c_min; 385 kstat_named_t arcstat_c_max; 386 kstat_named_t arcstat_size; 387 /* 388 * Number of bytes consumed by internal ARC structures necessary 389 * for tracking purposes; these structures are not actually 390 * backed by ARC buffers. This includes arc_buf_hdr_t structures 391 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only 392 * caches), and arc_buf_t structures (allocated via arc_buf_t 393 * cache). 394 */ 395 kstat_named_t arcstat_hdr_size; 396 /* 397 * Number of bytes consumed by ARC buffers of type equal to 398 * ARC_BUFC_DATA. This is generally consumed by buffers backing 399 * on disk user data (e.g. plain file contents). 400 */ 401 kstat_named_t arcstat_data_size; 402 /* 403 * Number of bytes consumed by ARC buffers of type equal to 404 * ARC_BUFC_METADATA. This is generally consumed by buffers 405 * backing on disk data that is used for internal ZFS 406 * structures (e.g. ZAP, dnode, indirect blocks, etc). 407 */ 408 kstat_named_t arcstat_metadata_size; 409 /* 410 * Number of bytes consumed by various buffers and structures 411 * not actually backed with ARC buffers. This includes bonus 412 * buffers (allocated directly via zio_buf_* functions), 413 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t 414 * cache), and dnode_t structures (allocated via dnode_t cache). 415 */ 416 kstat_named_t arcstat_other_size; 417 /* 418 * Total number of bytes consumed by ARC buffers residing in the 419 * arc_anon state. This includes *all* buffers in the arc_anon 420 * state; e.g. data, metadata, evictable, and unevictable buffers 421 * are all included in this value. 422 */ 423 kstat_named_t arcstat_anon_size; 424 /* 425 * Number of bytes consumed by ARC buffers that meet the 426 * following criteria: backing buffers of type ARC_BUFC_DATA, 427 * residing in the arc_anon state, and are eligible for eviction 428 * (e.g. have no outstanding holds on the buffer). 429 */ 430 kstat_named_t arcstat_anon_evictable_data; 431 /* 432 * Number of bytes consumed by ARC buffers that meet the 433 * following criteria: backing buffers of type ARC_BUFC_METADATA, 434 * residing in the arc_anon state, and are eligible for eviction 435 * (e.g. have no outstanding holds on the buffer). 436 */ 437 kstat_named_t arcstat_anon_evictable_metadata; 438 /* 439 * Total number of bytes consumed by ARC buffers residing in the 440 * arc_mru state. This includes *all* buffers in the arc_mru 441 * state; e.g. data, metadata, evictable, and unevictable buffers 442 * are all included in this value. 443 */ 444 kstat_named_t arcstat_mru_size; 445 /* 446 * Number of bytes consumed by ARC buffers that meet the 447 * following criteria: backing buffers of type ARC_BUFC_DATA, 448 * residing in the arc_mru state, and are eligible for eviction 449 * (e.g. have no outstanding holds on the buffer). 
450 */ 451 kstat_named_t arcstat_mru_evictable_data; 452 /* 453 * Number of bytes consumed by ARC buffers that meet the 454 * following criteria: backing buffers of type ARC_BUFC_METADATA, 455 * residing in the arc_mru state, and are eligible for eviction 456 * (e.g. have no outstanding holds on the buffer). 457 */ 458 kstat_named_t arcstat_mru_evictable_metadata; 459 /* 460 * Total number of bytes that *would have been* consumed by ARC 461 * buffers in the arc_mru_ghost state. The key thing to note 462 * here, is the fact that this size doesn't actually indicate 463 * RAM consumption. The ghost lists only consist of headers and 464 * don't actually have ARC buffers linked off of these headers. 465 * Thus, *if* the headers had associated ARC buffers, these 466 * buffers *would have* consumed this number of bytes. 467 */ 468 kstat_named_t arcstat_mru_ghost_size; 469 /* 470 * Number of bytes that *would have been* consumed by ARC 471 * buffers that are eligible for eviction, of type 472 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. 473 */ 474 kstat_named_t arcstat_mru_ghost_evictable_data; 475 /* 476 * Number of bytes that *would have been* consumed by ARC 477 * buffers that are eligible for eviction, of type 478 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 479 */ 480 kstat_named_t arcstat_mru_ghost_evictable_metadata; 481 /* 482 * Total number of bytes consumed by ARC buffers residing in the 483 * arc_mfu state. This includes *all* buffers in the arc_mfu 484 * state; e.g. data, metadata, evictable, and unevictable buffers 485 * are all included in this value. 486 */ 487 kstat_named_t arcstat_mfu_size; 488 /* 489 * Number of bytes consumed by ARC buffers that are eligible for 490 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu 491 * state. 492 */ 493 kstat_named_t arcstat_mfu_evictable_data; 494 /* 495 * Number of bytes consumed by ARC buffers that are eligible for 496 * eviction, of type ARC_BUFC_METADATA, and reside in the 497 * arc_mfu state. 498 */ 499 kstat_named_t arcstat_mfu_evictable_metadata; 500 /* 501 * Total number of bytes that *would have been* consumed by ARC 502 * buffers in the arc_mfu_ghost state. See the comment above 503 * arcstat_mru_ghost_size for more details. 504 */ 505 kstat_named_t arcstat_mfu_ghost_size; 506 /* 507 * Number of bytes that *would have been* consumed by ARC 508 * buffers that are eligible for eviction, of type 509 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. 510 */ 511 kstat_named_t arcstat_mfu_ghost_evictable_data; 512 /* 513 * Number of bytes that *would have been* consumed by ARC 514 * buffers that are eligible for eviction, of type 515 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 
516 */ 517 kstat_named_t arcstat_mfu_ghost_evictable_metadata; 518 kstat_named_t arcstat_l2_hits; 519 kstat_named_t arcstat_l2_misses; 520 kstat_named_t arcstat_l2_feeds; 521 kstat_named_t arcstat_l2_rw_clash; 522 kstat_named_t arcstat_l2_read_bytes; 523 kstat_named_t arcstat_l2_write_bytes; 524 kstat_named_t arcstat_l2_writes_sent; 525 kstat_named_t arcstat_l2_writes_done; 526 kstat_named_t arcstat_l2_writes_error; 527 kstat_named_t arcstat_l2_writes_hdr_miss; 528 kstat_named_t arcstat_l2_evict_lock_retry; 529 kstat_named_t arcstat_l2_evict_reading; 530 kstat_named_t arcstat_l2_evict_l1cached; 531 kstat_named_t arcstat_l2_free_on_write; 532 kstat_named_t arcstat_l2_cdata_free_on_write; 533 kstat_named_t arcstat_l2_abort_lowmem; 534 kstat_named_t arcstat_l2_cksum_bad; 535 kstat_named_t arcstat_l2_io_error; 536 kstat_named_t arcstat_l2_size; 537 kstat_named_t arcstat_l2_asize; 538 kstat_named_t arcstat_l2_hdr_size; 539 kstat_named_t arcstat_l2_compress_successes; 540 kstat_named_t arcstat_l2_compress_zeros; 541 kstat_named_t arcstat_l2_compress_failures; 542 kstat_named_t arcstat_l2_write_trylock_fail; 543 kstat_named_t arcstat_l2_write_passed_headroom; 544 kstat_named_t arcstat_l2_write_spa_mismatch; 545 kstat_named_t arcstat_l2_write_in_l2; 546 kstat_named_t arcstat_l2_write_hdr_io_in_progress; 547 kstat_named_t arcstat_l2_write_not_cacheable; 548 kstat_named_t arcstat_l2_write_full; 549 kstat_named_t arcstat_l2_write_buffer_iter; 550 kstat_named_t arcstat_l2_write_pios; 551 kstat_named_t arcstat_l2_write_buffer_bytes_scanned; 552 kstat_named_t arcstat_l2_write_buffer_list_iter; 553 kstat_named_t arcstat_l2_write_buffer_list_null_iter; 554 kstat_named_t arcstat_memory_throttle_count; 555 kstat_named_t arcstat_duplicate_buffers; 556 kstat_named_t arcstat_duplicate_buffers_size; 557 kstat_named_t arcstat_duplicate_reads; 558 kstat_named_t arcstat_meta_used; 559 kstat_named_t arcstat_meta_limit; 560 kstat_named_t arcstat_meta_max; 561 kstat_named_t arcstat_meta_min; 562} arc_stats_t; 563 564static arc_stats_t arc_stats = { 565 { "hits", KSTAT_DATA_UINT64 }, 566 { "misses", KSTAT_DATA_UINT64 }, 567 { "demand_data_hits", KSTAT_DATA_UINT64 }, 568 { "demand_data_misses", KSTAT_DATA_UINT64 }, 569 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 570 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 571 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 572 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 573 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 574 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 575 { "mru_hits", KSTAT_DATA_UINT64 }, 576 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 577 { "mfu_hits", KSTAT_DATA_UINT64 }, 578 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 579 { "allocated", KSTAT_DATA_UINT64 }, 580 { "deleted", KSTAT_DATA_UINT64 }, 581 { "stolen", KSTAT_DATA_UINT64 }, 582 { "recycle_miss", KSTAT_DATA_UINT64 }, 583 { "mutex_miss", KSTAT_DATA_UINT64 }, 584 { "evict_skip", KSTAT_DATA_UINT64 }, 585 { "evict_l2_cached", KSTAT_DATA_UINT64 }, 586 { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 587 { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 588 { "hash_elements", KSTAT_DATA_UINT64 }, 589 { "hash_elements_max", KSTAT_DATA_UINT64 }, 590 { "hash_collisions", KSTAT_DATA_UINT64 }, 591 { "hash_chains", KSTAT_DATA_UINT64 }, 592 { "hash_chain_max", KSTAT_DATA_UINT64 }, 593 { "p", KSTAT_DATA_UINT64 }, 594 { "c", KSTAT_DATA_UINT64 }, 595 { "c_min", KSTAT_DATA_UINT64 }, 596 { "c_max", KSTAT_DATA_UINT64 }, 597 { "size", KSTAT_DATA_UINT64 }, 598 { "hdr_size", KSTAT_DATA_UINT64 }, 599 { "data_size", 
KSTAT_DATA_UINT64 }, 600 { "metadata_size", KSTAT_DATA_UINT64 }, 601 { "other_size", KSTAT_DATA_UINT64 }, 602 { "anon_size", KSTAT_DATA_UINT64 }, 603 { "anon_evictable_data", KSTAT_DATA_UINT64 }, 604 { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, 605 { "mru_size", KSTAT_DATA_UINT64 }, 606 { "mru_evictable_data", KSTAT_DATA_UINT64 }, 607 { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, 608 { "mru_ghost_size", KSTAT_DATA_UINT64 }, 609 { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, 610 { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 611 { "mfu_size", KSTAT_DATA_UINT64 }, 612 { "mfu_evictable_data", KSTAT_DATA_UINT64 }, 613 { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, 614 { "mfu_ghost_size", KSTAT_DATA_UINT64 }, 615 { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, 616 { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 617 { "l2_hits", KSTAT_DATA_UINT64 }, 618 { "l2_misses", KSTAT_DATA_UINT64 }, 619 { "l2_feeds", KSTAT_DATA_UINT64 }, 620 { "l2_rw_clash", KSTAT_DATA_UINT64 }, 621 { "l2_read_bytes", KSTAT_DATA_UINT64 }, 622 { "l2_write_bytes", KSTAT_DATA_UINT64 }, 623 { "l2_writes_sent", KSTAT_DATA_UINT64 }, 624 { "l2_writes_done", KSTAT_DATA_UINT64 }, 625 { "l2_writes_error", KSTAT_DATA_UINT64 }, 626 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, 627 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 628 { "l2_evict_reading", KSTAT_DATA_UINT64 }, 629 { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, 630 { "l2_free_on_write", KSTAT_DATA_UINT64 }, 631 { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, 632 { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 633 { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 634 { "l2_io_error", KSTAT_DATA_UINT64 }, 635 { "l2_size", KSTAT_DATA_UINT64 }, 636 { "l2_asize", KSTAT_DATA_UINT64 }, 637 { "l2_hdr_size", KSTAT_DATA_UINT64 }, 638 { "l2_compress_successes", KSTAT_DATA_UINT64 }, 639 { "l2_compress_zeros", KSTAT_DATA_UINT64 }, 640 { "l2_compress_failures", KSTAT_DATA_UINT64 }, 641 { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, 642 { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, 643 { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, 644 { "l2_write_in_l2", KSTAT_DATA_UINT64 }, 645 { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, 646 { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, 647 { "l2_write_full", KSTAT_DATA_UINT64 }, 648 { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, 649 { "l2_write_pios", KSTAT_DATA_UINT64 }, 650 { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, 651 { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, 652 { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, 653 { "memory_throttle_count", KSTAT_DATA_UINT64 }, 654 { "duplicate_buffers", KSTAT_DATA_UINT64 }, 655 { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, 656 { "duplicate_reads", KSTAT_DATA_UINT64 }, 657 { "arc_meta_used", KSTAT_DATA_UINT64 }, 658 { "arc_meta_limit", KSTAT_DATA_UINT64 }, 659 { "arc_meta_max", KSTAT_DATA_UINT64 }, 660 { "arc_meta_min", KSTAT_DATA_UINT64 } 661}; 662 663#define ARCSTAT(stat) (arc_stats.stat.value.ui64) 664 665#define ARCSTAT_INCR(stat, val) \ 666 atomic_add_64(&arc_stats.stat.value.ui64, (val)) 667 668#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 669#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 670 671#define ARCSTAT_MAX(stat, val) { \ 672 uint64_t m; \ 673 while ((val) > (m = arc_stats.stat.value.ui64) && \ 674 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 675 continue; \ 676} 677 678#define ARCSTAT_MAXSTAT(stat) \ 679 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 680 681/* 682 * We define a macro to allow ARC hits/misses 
 * to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		}							\
	} else {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		}							\
	}
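/*
 * Illustrative sketch (not part of the build): how ARCSTAT_CONDSTAT is
 * meant to be invoked.  Given a demand-vs-prefetch condition and a
 * metadata-vs-data condition, one call site fans out to the four
 * arcstat_{demand,prefetch}_{data,metadata}_hits counters declared in
 * arc_stats above.  The wrapper function is hypothetical; the real call
 * sites are in the read path elsewhere in this file.
 */
#if 0
static void
example_count_hit(arc_buf_hdr_t *hdr)
{
	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch,
	    HDR_ISTYPE_METADATA(hdr), metadata, data, hits);
	/*
	 * e.g. a non-prefetch metadata hit expands to
	 * ARCSTAT_BUMP(arcstat_demand_metadata_hits).
	 */
}
#endif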
kstat_t			*arc_ksp;
static arc_state_t	*arc_anon;
static arc_state_t	*arc_mru;
static arc_state_t	*arc_mru_ghost;
static arc_state_t	*arc_mfu;
static arc_state_t	*arc_mfu_ghost;
static arc_state_t	*arc_l2c_only;

/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them.  For these variables, we therefore define them to be in
 * terms of the statistic variable.  This assures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
 */
#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
#define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
#define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
#define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */

#define	L2ARC_IS_VALID_COMPRESS(_c_) \
	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)

static int		arc_no_grow;	/* Don't try to grow cache size */
static uint64_t		arc_tempreserve;
static uint64_t		arc_loaned_bytes;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
	void			*acb_private;
	arc_done_func_t		*acb_done;
	arc_buf_t		*acb_buf;
	zio_t			*acb_zio_dummy;
	arc_callback_t		*acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

struct arc_write_callback {
	void		*awcb_private;
	arc_done_func_t	*awcb_ready;
	arc_done_func_t	*awcb_physdone;
	arc_done_func_t	*awcb_done;
	arc_buf_t	*awcb_buf;
};

/*
 * ARC buffers are separated into multiple structs as a memory saving measure:
 *   - Common fields struct, always defined, and embedded within it:
 *       - L2-only fields, always allocated but undefined when not in L2ARC
 *       - L1-only fields, only allocated when in L1ARC
 *
 *           Buffer in L1                     Buffer only in L2
 *    +------------------------+          +------------------------+
 *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    +------------------------+          +------------------------+
 *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
 *    | (undefined if L1-only) |          |                        |
 *    +------------------------+          +------------------------+
 *    | l1arc_buf_hdr_t        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    +------------------------+
 *
 * Because it's possible for the L2ARC to become extremely large, we can wind
 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 * is minimized by only allocating the fields necessary for an L1-cached buffer
 * when a header is actually in the L1 cache.  The sub-headers (l1arc_buf_hdr
 * and l2arc_buf_hdr) are embedded rather than allocated separately to save a
 * couple words in pointers.  arc_hdr_realloc() is used to switch a header
 * between these two allocation states.
 */
typedef struct l1arc_buf_hdr {
	kmutex_t		b_freeze_lock;
#ifdef ZFS_DEBUG
	/*
	 * used for debugging with kmem_flags - by allocating and freeing
	 * b_thawed when the buffer is thawed, we get a record of the stack
	 * trace that thawed it.
	 */
	void			*b_thawed;
#endif

	arc_buf_t		*b_buf;
	uint32_t		b_datacnt;
	/* for waiting on writes to complete */
	kcondvar_t		b_cv;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	list_node_t		b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;

	/* self protecting */
	refcount_t		b_refcnt;

	arc_callback_t		*b_acb;
	/* temporary buffer holder for in-flight compressed data */
	void			*b_tmp_cdata;
} l1arc_buf_hdr_t;

typedef struct l2arc_dev l2arc_dev_t;

typedef struct l2arc_buf_hdr {
	/* protected by arc_buf_hdr mutex */
	l2arc_dev_t		*b_dev;		/* L2ARC device */
	uint64_t		b_daddr;	/* disk address, offset byte */
	/* real alloc'd buffer size depending on b_compress applied */
	int32_t			b_asize;

	list_node_t		b_l2node;
} l2arc_buf_hdr_t;

struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;
	/*
	 * Even though this checksum is only set/verified when a buffer is in
	 * the L1 cache, it needs to be in the set of common fields because it
	 * must be preserved from the time before a buffer is written out to
	 * L2ARC until after it is read back in.
	 */
	zio_cksum_t		*b_freeze_cksum;

	arc_buf_hdr_t		*b_hash_next;
	arc_flags_t		b_flags;

	/* immutable */
	int32_t			b_size;
	uint64_t		b_spa;

	/* L2ARC fields. Undefined when not in L2ARC. */
	l2arc_buf_hdr_t		b_l2hdr;
	/* L1ARC fields.
Undefined when in l2arc_only state */ 850 l1arc_buf_hdr_t b_l1hdr; 851}; 852 853#ifdef _KERNEL 854static int 855sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) 856{ 857 uint64_t val; 858 int err; 859 860 val = arc_meta_limit; 861 err = sysctl_handle_64(oidp, &val, 0, req); 862 if (err != 0 || req->newptr == NULL) 863 return (err); 864 865 if (val <= 0 || val > arc_c_max) 866 return (EINVAL); 867 868 arc_meta_limit = val; 869 return (0); 870} 871#endif 872 873static arc_buf_t *arc_eviction_list; 874static kmutex_t arc_eviction_mtx; 875static arc_buf_hdr_t arc_eviction_hdr; 876 877#define GHOST_STATE(state) \ 878 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 879 (state) == arc_l2c_only) 880 881#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) 882#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) 883#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) 884#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) 885#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) 886#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) 887 888#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) 889#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) 890#define HDR_L2_READING(hdr) \ 891 (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ 892 ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) 893#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) 894#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) 895#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) 896 897#define HDR_ISTYPE_METADATA(hdr) \ 898 ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) 899#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) 900 901#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) 902#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) 903 904/* For storing compression mode in b_flags */ 905#define HDR_COMPRESS_OFFSET 24 906#define HDR_COMPRESS_NBITS 7 907 908#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET(hdr->b_flags, \ 909 HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS)) 910#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \ 911 HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp)) 912 913/* 914 * Other sizes 915 */ 916 917#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 918#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) 919 920/* 921 * Hash table routines 922 */ 923 924#define HT_LOCK_PAD CACHE_LINE_SIZE 925 926struct ht_lock { 927 kmutex_t ht_lock; 928#ifdef _KERNEL 929 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 930#endif 931}; 932 933#define BUF_LOCKS 256 934typedef struct buf_hash_table { 935 uint64_t ht_mask; 936 arc_buf_hdr_t **ht_table; 937 struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); 938} buf_hash_table_t; 939 940static buf_hash_table_t buf_hash_table; 941 942#define BUF_HASH_INDEX(spa, dva, birth) \ 943 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 944#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 945#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 946#define HDR_LOCK(hdr) \ 947 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) 948 949uint64_t zfs_crc64_table[256]; 950 951/* 952 * Level 2 ARC 953 */ 954 955#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 956#define L2ARC_HEADROOM 2 /* num of writes */ 957/* 958 * If we discover during ARC scan any buffers to be 
compressed, we boost 959 * our headroom for the next scanning cycle by this percentage multiple. 960 */ 961#define L2ARC_HEADROOM_BOOST 200 962#define L2ARC_FEED_SECS 1 /* caching interval secs */ 963#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ 964 965#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 966#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 967 968/* L2ARC Performance Tunables */ 969uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 970uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 971uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 972uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; 973uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 974uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ 975boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 976boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ 977boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ 978 979SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, 980 &l2arc_write_max, 0, "max write size"); 981SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, 982 &l2arc_write_boost, 0, "extra write during warmup"); 983SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, 984 &l2arc_headroom, 0, "number of dev writes"); 985SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, 986 &l2arc_feed_secs, 0, "interval seconds"); 987SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, 988 &l2arc_feed_min_ms, 0, "min interval milliseconds"); 989 990SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, 991 &l2arc_noprefetch, 0, "don't cache prefetch bufs"); 992SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, 993 &l2arc_feed_again, 0, "turbo warmup"); 994SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, 995 &l2arc_norw, 0, "no reads during writes"); 996 997SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, 998 &ARC_anon.arcs_size, 0, "size of anonymous state"); 999SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD, 1000 &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state"); 1001SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD, 1002 &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state"); 1003 1004SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, 1005 &ARC_mru.arcs_size, 0, "size of mru state"); 1006SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD, 1007 &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state"); 1008SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD, 1009 &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state"); 1010 1011SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, 1012 &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state"); 1013SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD, 1014 &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 1015 "size of metadata in mru ghost state"); 1016SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD, 1017 &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 1018 "size of data in mru ghost state"); 1019 1020SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, 1021 &ARC_mfu.arcs_size, 0, "size of mfu state"); 1022SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD, 1023 &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state"); 1024SYSCTL_UQUAD(_vfs_zfs, 
OID_AUTO, mfu_data_lsize, CTLFLAG_RD, 1025 &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state"); 1026 1027SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, 1028 &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state"); 1029SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD, 1030 &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 1031 "size of metadata in mfu ghost state"); 1032SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD, 1033 &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 1034 "size of data in mfu ghost state"); 1035 1036SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, 1037 &ARC_l2c_only.arcs_size, 0, "size of mru state"); 1038 1039/* 1040 * L2ARC Internals 1041 */ 1042struct l2arc_dev { 1043 vdev_t *l2ad_vdev; /* vdev */ 1044 spa_t *l2ad_spa; /* spa */ 1045 uint64_t l2ad_hand; /* next write location */ 1046 uint64_t l2ad_start; /* first addr on device */ 1047 uint64_t l2ad_end; /* last addr on device */ 1048 uint64_t l2ad_evict; /* last addr eviction reached */ 1049 boolean_t l2ad_first; /* first sweep through */ 1050 boolean_t l2ad_writing; /* currently writing */ 1051 kmutex_t l2ad_mtx; /* lock for buffer list */ 1052 list_t l2ad_buflist; /* buffer list */ 1053 list_node_t l2ad_node; /* device list node */ 1054}; 1055 1056static list_t L2ARC_dev_list; /* device list */ 1057static list_t *l2arc_dev_list; /* device list pointer */ 1058static kmutex_t l2arc_dev_mtx; /* device list mutex */ 1059static l2arc_dev_t *l2arc_dev_last; /* last device used */ 1060static list_t L2ARC_free_on_write; /* free after write buf list */ 1061static list_t *l2arc_free_on_write; /* free after write list ptr */ 1062static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 1063static uint64_t l2arc_ndev; /* number of devices */ 1064 1065typedef struct l2arc_read_callback { 1066 arc_buf_t *l2rcb_buf; /* read buffer */ 1067 spa_t *l2rcb_spa; /* spa */ 1068 blkptr_t l2rcb_bp; /* original blkptr */ 1069 zbookmark_phys_t l2rcb_zb; /* original bookmark */ 1070 int l2rcb_flags; /* original flags */ 1071 enum zio_compress l2rcb_compress; /* applied compress */ 1072} l2arc_read_callback_t; 1073 1074typedef struct l2arc_write_callback { 1075 l2arc_dev_t *l2wcb_dev; /* device info */ 1076 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 1077} l2arc_write_callback_t; 1078 1079typedef struct l2arc_data_free { 1080 /* protected by l2arc_free_on_write_mtx */ 1081 void *l2df_data; 1082 size_t l2df_size; 1083 void (*l2df_func)(void *, size_t); 1084 list_node_t l2df_list_node; 1085} l2arc_data_free_t; 1086 1087static kmutex_t l2arc_feed_thr_lock; 1088static kcondvar_t l2arc_feed_thr_cv; 1089static uint8_t l2arc_thread_exit; 1090 1091static void arc_get_data_buf(arc_buf_t *); 1092static void arc_access(arc_buf_hdr_t *, kmutex_t *); 1093static int arc_evict_needed(arc_buf_contents_t); 1094static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t); 1095static void arc_buf_watch(arc_buf_t *); 1096 1097static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); 1098static uint32_t arc_bufc_to_flags(arc_buf_contents_t); 1099 1100static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); 1101static void l2arc_read_done(zio_t *); 1102 1103static boolean_t l2arc_compress_buf(arc_buf_hdr_t *); 1104static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); 1105static void l2arc_release_cdata_buf(arc_buf_hdr_t *); 1106 1107static uint64_t 1108buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 1109{ 1110 uint8_t *vdva = 
(uint8_t *)dva; 1111 uint64_t crc = -1ULL; 1112 int i; 1113 1114 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 1115 1116 for (i = 0; i < sizeof (dva_t); i++) 1117 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 1118 1119 crc ^= (spa>>8) ^ birth; 1120 1121 return (crc); 1122} 1123 1124#define BUF_EMPTY(buf) \ 1125 ((buf)->b_dva.dva_word[0] == 0 && \ 1126 (buf)->b_dva.dva_word[1] == 0) 1127 1128#define BUF_EQUAL(spa, dva, birth, buf) \ 1129 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 1130 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 1131 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 1132 1133static void 1134buf_discard_identity(arc_buf_hdr_t *hdr) 1135{ 1136 hdr->b_dva.dva_word[0] = 0; 1137 hdr->b_dva.dva_word[1] = 0; 1138 hdr->b_birth = 0; 1139} 1140 1141static arc_buf_hdr_t * 1142buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) 1143{ 1144 const dva_t *dva = BP_IDENTITY(bp); 1145 uint64_t birth = BP_PHYSICAL_BIRTH(bp); 1146 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 1147 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1148 arc_buf_hdr_t *hdr; 1149 1150 mutex_enter(hash_lock); 1151 for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; 1152 hdr = hdr->b_hash_next) { 1153 if (BUF_EQUAL(spa, dva, birth, hdr)) { 1154 *lockp = hash_lock; 1155 return (hdr); 1156 } 1157 } 1158 mutex_exit(hash_lock); 1159 *lockp = NULL; 1160 return (NULL); 1161} 1162 1163/* 1164 * Insert an entry into the hash table. If there is already an element 1165 * equal to elem in the hash table, then the already existing element 1166 * will be returned and the new element will not be inserted. 1167 * Otherwise returns NULL. 1168 * If lockp == NULL, the caller is assumed to already hold the hash lock. 1169 */ 1170static arc_buf_hdr_t * 1171buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) 1172{ 1173 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1174 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1175 arc_buf_hdr_t *fhdr; 1176 uint32_t i; 1177 1178 ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); 1179 ASSERT(hdr->b_birth != 0); 1180 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1181 1182 if (lockp != NULL) { 1183 *lockp = hash_lock; 1184 mutex_enter(hash_lock); 1185 } else { 1186 ASSERT(MUTEX_HELD(hash_lock)); 1187 } 1188 1189 for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; 1190 fhdr = fhdr->b_hash_next, i++) { 1191 if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) 1192 return (fhdr); 1193 } 1194 1195 hdr->b_hash_next = buf_hash_table.ht_table[idx]; 1196 buf_hash_table.ht_table[idx] = hdr; 1197 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 1198 1199 /* collect some hash table performance data */ 1200 if (i > 0) { 1201 ARCSTAT_BUMP(arcstat_hash_collisions); 1202 if (i == 1) 1203 ARCSTAT_BUMP(arcstat_hash_chains); 1204 1205 ARCSTAT_MAX(arcstat_hash_chain_max, i); 1206 } 1207 1208 ARCSTAT_BUMP(arcstat_hash_elements); 1209 ARCSTAT_MAXSTAT(arcstat_hash_elements); 1210 1211 return (NULL); 1212} 1213 1214static void 1215buf_hash_remove(arc_buf_hdr_t *hdr) 1216{ 1217 arc_buf_hdr_t *fhdr, **hdrp; 1218 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1219 1220 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 1221 ASSERT(HDR_IN_HASH_TABLE(hdr)); 1222 1223 hdrp = &buf_hash_table.ht_table[idx]; 1224 while ((fhdr = *hdrp) != hdr) { 1225 ASSERT(fhdr != NULL); 1226 hdrp = &fhdr->b_hash_next; 1227 } 1228 *hdrp = hdr->b_hash_next; 1229 hdr->b_hash_next = NULL; 1230 hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; 1231 1232 /* collect some hash table performance data */ 
1233 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 1234 1235 if (buf_hash_table.ht_table[idx] && 1236 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 1237 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 1238} 1239 1240/* 1241 * Global data structures and functions for the buf kmem cache. 1242 */ 1243static kmem_cache_t *hdr_full_cache; 1244static kmem_cache_t *hdr_l2only_cache; 1245static kmem_cache_t *buf_cache; 1246 1247static void 1248buf_fini(void) 1249{ 1250 int i; 1251 1252 kmem_free(buf_hash_table.ht_table, 1253 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 1254 for (i = 0; i < BUF_LOCKS; i++) 1255 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 1256 kmem_cache_destroy(hdr_full_cache); 1257 kmem_cache_destroy(hdr_l2only_cache); 1258 kmem_cache_destroy(buf_cache); 1259} 1260 1261/* 1262 * Constructor callback - called when the cache is empty 1263 * and a new buf is requested. 1264 */ 1265/* ARGSUSED */ 1266static int 1267hdr_full_cons(void *vbuf, void *unused, int kmflag) 1268{ 1269 arc_buf_hdr_t *hdr = vbuf; 1270 1271 bzero(hdr, HDR_FULL_SIZE); 1272 cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); 1273 refcount_create(&hdr->b_l1hdr.b_refcnt); 1274 mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 1275 arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1276 1277 return (0); 1278} 1279 1280/* ARGSUSED */ 1281static int 1282hdr_l2only_cons(void *vbuf, void *unused, int kmflag) 1283{ 1284 arc_buf_hdr_t *hdr = vbuf; 1285 1286 bzero(hdr, HDR_L2ONLY_SIZE); 1287 arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1288 1289 return (0); 1290} 1291 1292/* ARGSUSED */ 1293static int 1294buf_cons(void *vbuf, void *unused, int kmflag) 1295{ 1296 arc_buf_t *buf = vbuf; 1297 1298 bzero(buf, sizeof (arc_buf_t)); 1299 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 1300 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1301 1302 return (0); 1303} 1304 1305/* 1306 * Destructor callback - called when a cached buf is 1307 * no longer required. 1308 */ 1309/* ARGSUSED */ 1310static void 1311hdr_full_dest(void *vbuf, void *unused) 1312{ 1313 arc_buf_hdr_t *hdr = vbuf; 1314 1315 ASSERT(BUF_EMPTY(hdr)); 1316 cv_destroy(&hdr->b_l1hdr.b_cv); 1317 refcount_destroy(&hdr->b_l1hdr.b_refcnt); 1318 mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); 1319 arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1320} 1321 1322/* ARGSUSED */ 1323static void 1324hdr_l2only_dest(void *vbuf, void *unused) 1325{ 1326 arc_buf_hdr_t *hdr = vbuf; 1327 1328 ASSERT(BUF_EMPTY(hdr)); 1329 arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1330} 1331 1332/* ARGSUSED */ 1333static void 1334buf_dest(void *vbuf, void *unused) 1335{ 1336 arc_buf_t *buf = vbuf; 1337 1338 mutex_destroy(&buf->b_evict_lock); 1339 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1340} 1341 1342/* 1343 * Reclaim callback -- invoked when memory is low. 1344 */ 1345/* ARGSUSED */ 1346static void 1347hdr_recl(void *unused) 1348{ 1349 dprintf("hdr_recl called\n"); 1350 /* 1351 * umem calls the reclaim func when we destroy the buf cache, 1352 * which is after we do arc_fini(). 1353 */ 1354 if (!arc_dead) 1355 cv_signal(&arc_reclaim_thr_cv); 1356} 1357 1358static void 1359buf_init(void) 1360{ 1361 uint64_t *ct; 1362 uint64_t hsize = 1ULL << 12; 1363 int i, j; 1364 1365 /* 1366 * The hash table is big enough to fill all of physical memory 1367 * with an average block size of zfs_arc_average_blocksize (default 8K). 1368 * By default, the table will take up 1369 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 
1370 */ 1371 while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) 1372 hsize <<= 1; 1373retry: 1374 buf_hash_table.ht_mask = hsize - 1; 1375 buf_hash_table.ht_table = 1376 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 1377 if (buf_hash_table.ht_table == NULL) { 1378 ASSERT(hsize > (1ULL << 8)); 1379 hsize >>= 1; 1380 goto retry; 1381 } 1382 1383 hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 1384 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); 1385 hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", 1386 HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, 1387 NULL, NULL, 0); 1388 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 1389 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1390 1391 for (i = 0; i < 256; i++) 1392 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 1393 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 1394 1395 for (i = 0; i < BUF_LOCKS; i++) { 1396 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1397 NULL, MUTEX_DEFAULT, NULL); 1398 } 1399} 1400 1401/* 1402 * Transition between the two allocation states for the arc_buf_hdr struct. 1403 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without 1404 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller 1405 * version is used when a cache buffer is only in the L2ARC in order to reduce 1406 * memory usage. 1407 */ 1408static arc_buf_hdr_t * 1409arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) 1410{ 1411 ASSERT(HDR_HAS_L2HDR(hdr)); 1412 1413 arc_buf_hdr_t *nhdr; 1414 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 1415 1416 ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || 1417 (old == hdr_l2only_cache && new == hdr_full_cache)); 1418 1419 nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); 1420 1421 ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); 1422 buf_hash_remove(hdr); 1423 1424 bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); 1425 if (new == hdr_full_cache) { 1426 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 1427 /* 1428 * arc_access and arc_change_state need to be aware that a 1429 * header has just come out of L2ARC, so we set its state to 1430 * l2c_only even though it's about to change. 1431 */ 1432 nhdr->b_l1hdr.b_state = arc_l2c_only; 1433 } else { 1434 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1435 ASSERT0(hdr->b_l1hdr.b_datacnt); 1436 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 1437 /* 1438 * We might be removing the L1hdr of a buffer which was just 1439 * written out to L2ARC. If such a buffer is compressed then we 1440 * need to free its b_tmp_cdata before destroying the header. 1441 */ 1442 if (hdr->b_l1hdr.b_tmp_cdata != NULL && 1443 HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) 1444 l2arc_release_cdata_buf(hdr); 1445 nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; 1446 } 1447 /* 1448 * The header has been reallocated so we need to re-insert it into any 1449 * lists it was on. 1450 */ 1451 (void) buf_hash_insert(nhdr, NULL); 1452 1453 ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); 1454 1455 mutex_enter(&dev->l2ad_mtx); 1456 1457 /* 1458 * We must place the realloc'ed header back into the list at 1459 * the same spot. Otherwise, if it's placed earlier in the list, 1460 * l2arc_write_buffers() could find it during the function's 1461 * write phase, and try to write it out to the l2arc. 
1462 */ 1463 list_insert_after(&dev->l2ad_buflist, hdr, nhdr); 1464 list_remove(&dev->l2ad_buflist, hdr); 1465 1466 mutex_exit(&dev->l2ad_mtx); 1467 1468 buf_discard_identity(hdr); 1469 hdr->b_freeze_cksum = NULL; 1470 kmem_cache_free(old, hdr); 1471 1472 return (nhdr); 1473} 1474 1475 1476#define ARC_MINTIME (hz>>4) /* 62 ms */ 1477 1478static void 1479arc_cksum_verify(arc_buf_t *buf) 1480{ 1481 zio_cksum_t zc; 1482 1483 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1484 return; 1485 1486 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1487 if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { 1488 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1489 return; 1490 } 1491 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1492 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 1493 panic("buffer modified while frozen!"); 1494 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1495} 1496 1497static int 1498arc_cksum_equal(arc_buf_t *buf) 1499{ 1500 zio_cksum_t zc; 1501 int equal; 1502 1503 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1504 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1505 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 1506 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1507 1508 return (equal); 1509} 1510 1511static void 1512arc_cksum_compute(arc_buf_t *buf, boolean_t force) 1513{ 1514 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 1515 return; 1516 1517 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1518 if (buf->b_hdr->b_freeze_cksum != NULL) { 1519 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1520 return; 1521 } 1522 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 1523 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 1524 buf->b_hdr->b_freeze_cksum); 1525 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1526#ifdef illumos 1527 arc_buf_watch(buf); 1528#endif 1529} 1530 1531#ifdef illumos 1532#ifndef _KERNEL 1533typedef struct procctl { 1534 long cmd; 1535 prwatch_t prwatch; 1536} procctl_t; 1537#endif 1538 1539/* ARGSUSED */ 1540static void 1541arc_buf_unwatch(arc_buf_t *buf) 1542{ 1543#ifndef _KERNEL 1544 if (arc_watch) { 1545 int result; 1546 procctl_t ctl; 1547 ctl.cmd = PCWATCH; 1548 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1549 ctl.prwatch.pr_size = 0; 1550 ctl.prwatch.pr_wflags = 0; 1551 result = write(arc_procfd, &ctl, sizeof (ctl)); 1552 ASSERT3U(result, ==, sizeof (ctl)); 1553 } 1554#endif 1555} 1556 1557/* ARGSUSED */ 1558static void 1559arc_buf_watch(arc_buf_t *buf) 1560{ 1561#ifndef _KERNEL 1562 if (arc_watch) { 1563 int result; 1564 procctl_t ctl; 1565 ctl.cmd = PCWATCH; 1566 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1567 ctl.prwatch.pr_size = buf->b_hdr->b_size; 1568 ctl.prwatch.pr_wflags = WA_WRITE; 1569 result = write(arc_procfd, &ctl, sizeof (ctl)); 1570 ASSERT3U(result, ==, sizeof (ctl)); 1571 } 1572#endif 1573} 1574#endif /* illumos */ 1575 1576static arc_buf_contents_t 1577arc_buf_type(arc_buf_hdr_t *hdr) 1578{ 1579 if (HDR_ISTYPE_METADATA(hdr)) { 1580 return (ARC_BUFC_METADATA); 1581 } else { 1582 return (ARC_BUFC_DATA); 1583 } 1584} 1585 1586static uint32_t 1587arc_bufc_to_flags(arc_buf_contents_t type) 1588{ 1589 switch (type) { 1590 case ARC_BUFC_DATA: 1591 /* metadata field is 0 if buffer contains normal data */ 1592 return (0); 1593 case ARC_BUFC_METADATA: 1594 return (ARC_FLAG_BUFC_METADATA); 1595 default: 1596 break; 1597 } 1598 panic("undefined ARC buffer type!"); 1599 return ((uint32_t)-1); 1600} 1601 1602void 1603arc_buf_thaw(arc_buf_t *buf) 1604{ 1605 
if (zfs_flags & ZFS_DEBUG_MODIFY) { 1606 if (buf->b_hdr->b_l1hdr.b_state != arc_anon) 1607 panic("modifying non-anon buffer!"); 1608 if (HDR_IO_IN_PROGRESS(buf->b_hdr)) 1609 panic("modifying buffer while i/o in progress!"); 1610 arc_cksum_verify(buf); 1611 } 1612 1613 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1614 if (buf->b_hdr->b_freeze_cksum != NULL) { 1615 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1616 buf->b_hdr->b_freeze_cksum = NULL; 1617 } 1618 1619#ifdef ZFS_DEBUG 1620 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1621 if (buf->b_hdr->b_l1hdr.b_thawed != NULL) 1622 kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1); 1623 buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); 1624 } 1625#endif 1626 1627 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1628 1629#ifdef illumos 1630 arc_buf_unwatch(buf); 1631#endif 1632} 1633 1634void 1635arc_buf_freeze(arc_buf_t *buf) 1636{ 1637 kmutex_t *hash_lock; 1638 1639 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1640 return; 1641 1642 hash_lock = HDR_LOCK(buf->b_hdr); 1643 mutex_enter(hash_lock); 1644 1645 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 1646 buf->b_hdr->b_l1hdr.b_state == arc_anon); 1647 arc_cksum_compute(buf, B_FALSE); 1648 mutex_exit(hash_lock); 1649 1650} 1651 1652static void 1653get_buf_info(arc_buf_hdr_t *hdr, arc_state_t *state, list_t **list, kmutex_t **lock) 1654{ 1655 uint64_t buf_hashid = buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1656 1657 if (arc_buf_type(hdr) == ARC_BUFC_METADATA) 1658 buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1); 1659 else { 1660 buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1); 1661 buf_hashid += ARC_BUFC_NUMMETADATALISTS; 1662 } 1663 1664 *list = &state->arcs_lists[buf_hashid]; 1665 *lock = ARCS_LOCK(state, buf_hashid); 1666} 1667 1668 1669static void 1670add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1671{ 1672 ASSERT(HDR_HAS_L1HDR(hdr)); 1673 ASSERT(MUTEX_HELD(hash_lock)); 1674 arc_state_t *state = hdr->b_l1hdr.b_state; 1675 1676 if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && 1677 (state != arc_anon)) { 1678 /* We don't use the L2-only state list. */ 1679 if (state != arc_l2c_only) { 1680 uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; 1681 uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; 1682 list_t *list; 1683 kmutex_t *lock; 1684 1685 get_buf_info(hdr, state, &list, &lock); 1686 ASSERT(!MUTEX_HELD(lock)); 1687 mutex_enter(lock); 1688 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 1689 list_remove(list, hdr); 1690 if (GHOST_STATE(state)) { 1691 ASSERT0(hdr->b_l1hdr.b_datacnt); 1692 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 1693 delta = hdr->b_size; 1694 } 1695 ASSERT(delta > 0); 1696 ASSERT3U(*size, >=, delta); 1697 atomic_add_64(size, -delta); 1698 mutex_exit(lock); 1699 } 1700 /* remove the prefetch flag if we get a reference */ 1701 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 1702 } 1703} 1704 1705static int 1706remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1707{ 1708 int cnt; 1709 arc_state_t *state = hdr->b_l1hdr.b_state; 1710 1711 ASSERT(HDR_HAS_L1HDR(hdr)); 1712 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1713 ASSERT(!GHOST_STATE(state)); 1714 1715 /* 1716 * arc_l2c_only counts as a ghost state so we don't need to explicitly 1717 * check to prevent usage of the arc_l2c_only list. 
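	 *
	 * Together with add_reference() above, the rule being maintained is,
	 * roughly (illustrative summary only):
	 *
	 *	refcount 0 -> 1 (add_reference):	list_remove(list, hdr)
	 *	refcount 1 -> 0 (this function):	list_insert_head(list, hdr)
	 *
	 * so a header sits on its state's list, and is therefore evictable,
	 * exactly while it has no external references.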
1718 */ 1719 if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && 1720 (state != arc_anon)) { 1721 uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; 1722 list_t *list; 1723 kmutex_t *lock; 1724 1725 get_buf_info(hdr, state, &list, &lock); 1726 ASSERT(!MUTEX_HELD(lock)); 1727 mutex_enter(lock); 1728 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 1729 list_insert_head(list, hdr); 1730 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 1731 atomic_add_64(size, hdr->b_size * 1732 hdr->b_l1hdr.b_datacnt); 1733 mutex_exit(lock); 1734 } 1735 return (cnt); 1736} 1737 1738/* 1739 * Move the supplied buffer to the indicated state. The mutex 1740 * for the buffer must be held by the caller. 1741 */ 1742static void 1743arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, 1744 kmutex_t *hash_lock) 1745{ 1746 arc_state_t *old_state; 1747 int64_t refcnt; 1748 uint32_t datacnt; 1749 uint64_t from_delta, to_delta; 1750 arc_buf_contents_t buftype = arc_buf_type(hdr); 1751 list_t *list; 1752 kmutex_t *lock; 1753 1754 /* 1755 * We almost always have an L1 hdr here, since we call arc_hdr_realloc() 1756 * in arc_read() when bringing a buffer out of the L2ARC. However, the 1757 * L1 hdr doesn't always exist when we change state to arc_anon before 1758 * destroying a header, in which case reallocating to add the L1 hdr is 1759 * pointless. 1760 */ 1761 if (HDR_HAS_L1HDR(hdr)) { 1762 old_state = hdr->b_l1hdr.b_state; 1763 refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); 1764 datacnt = hdr->b_l1hdr.b_datacnt; 1765 } else { 1766 old_state = arc_l2c_only; 1767 refcnt = 0; 1768 datacnt = 0; 1769 } 1770 1771 ASSERT(MUTEX_HELD(hash_lock)); 1772 ASSERT3P(new_state, !=, old_state); 1773 ASSERT(refcnt == 0 || datacnt > 0); 1774 ASSERT(!GHOST_STATE(new_state) || datacnt == 0); 1775 ASSERT(old_state != arc_anon || datacnt <= 1); 1776 1777 from_delta = to_delta = datacnt * hdr->b_size; 1778 1779 /* 1780 * If this buffer is evictable, transfer it from the 1781 * old state list to the new state list. 1782 */ 1783 if (refcnt == 0) { 1784 if (old_state != arc_anon && old_state != arc_l2c_only) { 1785 int use_mutex; 1786 uint64_t *size = &old_state->arcs_lsize[buftype]; 1787 1788 get_buf_info(hdr, old_state, &list, &lock); 1789 use_mutex = !MUTEX_HELD(lock); 1790 if (use_mutex) 1791 mutex_enter(lock); 1792 1793 ASSERT(HDR_HAS_L1HDR(hdr)); 1794 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 1795 list_remove(list, hdr); 1796 1797 /* 1798 * If prefetching out of the ghost cache, 1799 * we will have a non-zero datacnt. 1800 */ 1801 if (GHOST_STATE(old_state) && datacnt == 0) { 1802 /* ghost elements have a ghost size */ 1803 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1804 from_delta = hdr->b_size; 1805 } 1806 ASSERT3U(*size, >=, from_delta); 1807 atomic_add_64(size, -from_delta); 1808 1809 if (use_mutex) 1810 mutex_exit(lock); 1811 } 1812 if (new_state != arc_anon && new_state != arc_l2c_only) { 1813 int use_mutex; 1814 uint64_t *size = &new_state->arcs_lsize[buftype]; 1815 1816 /* 1817 * An L1 header always exists here, since if we're 1818 * moving to some L1-cached state (i.e. not l2c_only or 1819 * anonymous), we realloc the header to add an L1hdr 1820 * beforehand. 
1821 */ 1822 ASSERT(HDR_HAS_L1HDR(hdr)); 1823 get_buf_info(hdr, new_state, &list, &lock); 1824 use_mutex = !MUTEX_HELD(lock); 1825 if (use_mutex) 1826 mutex_enter(lock); 1827 1828 list_insert_head(list, hdr); 1829 1830 /* ghost elements have a ghost size */ 1831 if (GHOST_STATE(new_state)) { 1832 ASSERT(datacnt == 0); 1833 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1834 to_delta = hdr->b_size; 1835 } 1836 atomic_add_64(size, to_delta); 1837 1838 if (use_mutex) 1839 mutex_exit(lock); 1840 } 1841 } 1842 1843 ASSERT(!BUF_EMPTY(hdr)); 1844 if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) 1845 buf_hash_remove(hdr); 1846 1847 /* adjust state sizes (ignore arc_l2c_only) */ 1848 if (to_delta && new_state != arc_l2c_only) 1849 atomic_add_64(&new_state->arcs_size, to_delta); 1850 if (from_delta && old_state != arc_l2c_only) { 1851 ASSERT3U(old_state->arcs_size, >=, from_delta); 1852 atomic_add_64(&old_state->arcs_size, -from_delta); 1853 } 1854 if (HDR_HAS_L1HDR(hdr)) 1855 hdr->b_l1hdr.b_state = new_state; 1856 1857 /* 1858 * L2 headers should never be on the L2 state list since they don't 1859 * have L1 headers allocated. 1860 */ 1861#ifdef illumos 1862 ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && 1863 list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); 1864#endif 1865} 1866 1867void 1868arc_space_consume(uint64_t space, arc_space_type_t type) 1869{ 1870 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1871 1872 switch (type) { 1873 case ARC_SPACE_DATA: 1874 ARCSTAT_INCR(arcstat_data_size, space); 1875 break; 1876 case ARC_SPACE_META: 1877 ARCSTAT_INCR(arcstat_metadata_size, space); 1878 break; 1879 case ARC_SPACE_OTHER: 1880 ARCSTAT_INCR(arcstat_other_size, space); 1881 break; 1882 case ARC_SPACE_HDRS: 1883 ARCSTAT_INCR(arcstat_hdr_size, space); 1884 break; 1885 case ARC_SPACE_L2HDRS: 1886 ARCSTAT_INCR(arcstat_l2_hdr_size, space); 1887 break; 1888 } 1889 1890 if (type != ARC_SPACE_DATA) 1891 ARCSTAT_INCR(arcstat_meta_used, space); 1892 1893 atomic_add_64(&arc_size, space); 1894} 1895 1896void 1897arc_space_return(uint64_t space, arc_space_type_t type) 1898{ 1899 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1900 1901 switch (type) { 1902 case ARC_SPACE_DATA: 1903 ARCSTAT_INCR(arcstat_data_size, -space); 1904 break; 1905 case ARC_SPACE_META: 1906 ARCSTAT_INCR(arcstat_metadata_size, -space); 1907 break; 1908 case ARC_SPACE_OTHER: 1909 ARCSTAT_INCR(arcstat_other_size, -space); 1910 break; 1911 case ARC_SPACE_HDRS: 1912 ARCSTAT_INCR(arcstat_hdr_size, -space); 1913 break; 1914 case ARC_SPACE_L2HDRS: 1915 ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 1916 break; 1917 } 1918 1919 if (type != ARC_SPACE_DATA) { 1920 ASSERT(arc_meta_used >= space); 1921 if (arc_meta_max < arc_meta_used) 1922 arc_meta_max = arc_meta_used; 1923 ARCSTAT_INCR(arcstat_meta_used, -space); 1924 } 1925 1926 ASSERT(arc_size >= space); 1927 atomic_add_64(&arc_size, -space); 1928} 1929 1930arc_buf_t * 1931arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) 1932{ 1933 arc_buf_hdr_t *hdr; 1934 arc_buf_t *buf; 1935 1936 ASSERT3U(size, >, 0); 1937 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 1938 ASSERT(BUF_EMPTY(hdr)); 1939 ASSERT3P(hdr->b_freeze_cksum, ==, NULL); 1940 hdr->b_size = size; 1941 hdr->b_spa = spa_load_guid(spa); 1942 1943 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1944 buf->b_hdr = hdr; 1945 buf->b_data = NULL; 1946 buf->b_efunc = NULL; 1947 buf->b_private = NULL; 1948 buf->b_next = NULL; 1949 1950 hdr->b_flags = arc_bufc_to_flags(type); 1951 hdr->b_flags |= 
ARC_FLAG_HAS_L1HDR; 1952 1953 hdr->b_l1hdr.b_buf = buf; 1954 hdr->b_l1hdr.b_state = arc_anon; 1955 hdr->b_l1hdr.b_arc_access = 0; 1956 hdr->b_l1hdr.b_datacnt = 1; 1957 1958 arc_get_data_buf(buf); 1959 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 1960 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 1961 1962 return (buf); 1963} 1964 1965static char *arc_onloan_tag = "onloan"; 1966 1967/* 1968 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 1969 * flight data by arc_tempreserve_space() until they are "returned". Loaned 1970 * buffers must be returned to the arc before they can be used by the DMU or 1971 * freed. 1972 */ 1973arc_buf_t * 1974arc_loan_buf(spa_t *spa, int size) 1975{ 1976 arc_buf_t *buf; 1977 1978 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 1979 1980 atomic_add_64(&arc_loaned_bytes, size); 1981 return (buf); 1982} 1983 1984/* 1985 * Return a loaned arc buffer to the arc. 1986 */ 1987void 1988arc_return_buf(arc_buf_t *buf, void *tag) 1989{ 1990 arc_buf_hdr_t *hdr = buf->b_hdr; 1991 1992 ASSERT(buf->b_data != NULL); 1993 ASSERT(HDR_HAS_L1HDR(hdr)); 1994 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 1995 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 1996 1997 atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 1998} 1999 2000/* Detach an arc_buf from a dbuf (tag) */ 2001void 2002arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 2003{ 2004 arc_buf_hdr_t *hdr = buf->b_hdr; 2005 2006 ASSERT(buf->b_data != NULL); 2007 ASSERT(HDR_HAS_L1HDR(hdr)); 2008 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2009 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); 2010 buf->b_efunc = NULL; 2011 buf->b_private = NULL; 2012 2013 atomic_add_64(&arc_loaned_bytes, hdr->b_size); 2014} 2015 2016static arc_buf_t * 2017arc_buf_clone(arc_buf_t *from) 2018{ 2019 arc_buf_t *buf; 2020 arc_buf_hdr_t *hdr = from->b_hdr; 2021 uint64_t size = hdr->b_size; 2022 2023 ASSERT(HDR_HAS_L1HDR(hdr)); 2024 ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2025 2026 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2027 buf->b_hdr = hdr; 2028 buf->b_data = NULL; 2029 buf->b_efunc = NULL; 2030 buf->b_private = NULL; 2031 buf->b_next = hdr->b_l1hdr.b_buf; 2032 hdr->b_l1hdr.b_buf = buf; 2033 arc_get_data_buf(buf); 2034 bcopy(from->b_data, buf->b_data, size); 2035 2036 /* 2037 * This buffer already exists in the arc so create a duplicate 2038 * copy for the caller. If the buffer is associated with user data 2039 * then track the size and number of duplicates. These stats will be 2040 * updated as duplicate buffers are created and destroyed. 2041 */ 2042 if (HDR_ISTYPE_DATA(hdr)) { 2043 ARCSTAT_BUMP(arcstat_duplicate_buffers); 2044 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 2045 } 2046 hdr->b_l1hdr.b_datacnt += 1; 2047 return (buf); 2048} 2049 2050void 2051arc_buf_add_ref(arc_buf_t *buf, void* tag) 2052{ 2053 arc_buf_hdr_t *hdr; 2054 kmutex_t *hash_lock; 2055 2056 /* 2057 * Check to see if this buffer is evicted. Callers 2058 * must verify b_data != NULL to know if the add_ref 2059 * was successful. 
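	 *
	 * A typical caller sequence might therefore be (illustrative sketch
	 * only, not lifted from a specific caller):
	 *
	 *	arc_buf_add_ref(buf, tag);
	 *	if (buf->b_data == NULL)
	 *		... buffer was evicted; fall back to arc_read() ...
	 *	else
	 *		... reference held; b_data stays valid until released ...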
2060 */
2061 	mutex_enter(&buf->b_evict_lock);
2062 	if (buf->b_data == NULL) {
2063 		mutex_exit(&buf->b_evict_lock);
2064 		return;
2065 	}
2066 	hash_lock = HDR_LOCK(buf->b_hdr);
2067 	mutex_enter(hash_lock);
2068 	hdr = buf->b_hdr;
2069 	ASSERT(HDR_HAS_L1HDR(hdr));
2070 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2071 	mutex_exit(&buf->b_evict_lock);
2072 
2073 	ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
2074 	    hdr->b_l1hdr.b_state == arc_mfu);
2075 
2076 	add_reference(hdr, hash_lock, tag);
2077 	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2078 	arc_access(hdr, hash_lock);
2079 	mutex_exit(hash_lock);
2080 	ARCSTAT_BUMP(arcstat_hits);
2081 	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
2082 	    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
2083 	    data, metadata, hits);
2084 }
2085 
2086 static void
2087 arc_buf_free_on_write(void *data, size_t size,
2088     void (*free_func)(void *, size_t))
2089 {
2090 	l2arc_data_free_t *df;
2091 
2092 	df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
2093 	df->l2df_data = data;
2094 	df->l2df_size = size;
2095 	df->l2df_func = free_func;
2096 	mutex_enter(&l2arc_free_on_write_mtx);
2097 	list_insert_head(l2arc_free_on_write, df);
2098 	mutex_exit(&l2arc_free_on_write_mtx);
2099 }
2100 
2101 /*
2102  * Free the arc data buffer.  If an L2ARC write is still in progress for
2103  * this buffer, it is placed on l2arc_free_on_write to be freed later.
2104  */
2105 static void
2106 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
2107 {
2108 	arc_buf_hdr_t *hdr = buf->b_hdr;
2109 
2110 	if (HDR_L2_WRITING(hdr)) {
2111 		arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
2112 		ARCSTAT_BUMP(arcstat_l2_free_on_write);
2113 	} else {
2114 		free_func(buf->b_data, hdr->b_size);
2115 	}
2116 }
2117 
2118 /*
2119  * Free the temporary b_tmp_cdata buffer staged for an L2ARC write of this
2120  * header (buf->b_data itself is freed by arc_buf_destroy() below).
2121  */
2122 static void
2123 arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
2124 {
2125 	ASSERT(HDR_HAS_L2HDR(hdr));
2126 	ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));
2127 
2128 	/*
2129 	 * The b_tmp_cdata field is linked off of the b_l1hdr, so if
2130 	 * that doesn't exist, the header is in the arc_l2c_only state,
2131 	 * and there isn't anything to free (it's already been freed).
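	 *
	 * (b_tmp_cdata is the temporary buffer allocated by
	 * l2arc_write_buffers(); arc_hdr_destroy() below calls this function
	 * for exactly that reason, so the staged buffer is not leaked.)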
2132 */ 2133 if (!HDR_HAS_L1HDR(hdr)) 2134 return; 2135 2136 if (hdr->b_l1hdr.b_tmp_cdata == NULL) 2137 return; 2138 2139 ASSERT(HDR_L2_WRITING(hdr)); 2140 arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size, 2141 zio_data_buf_free); 2142 2143 ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); 2144 hdr->b_l1hdr.b_tmp_cdata = NULL; 2145} 2146 2147static void 2148arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) 2149{ 2150 arc_buf_t **bufp; 2151 2152 /* free up data associated with the buf */ 2153 if (buf->b_data != NULL) { 2154 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; 2155 uint64_t size = buf->b_hdr->b_size; 2156 arc_buf_contents_t type = arc_buf_type(buf->b_hdr); 2157 2158 arc_cksum_verify(buf); 2159#ifdef illumos 2160 arc_buf_unwatch(buf); 2161#endif 2162 2163 if (!recycle) { 2164 if (type == ARC_BUFC_METADATA) { 2165 arc_buf_data_free(buf, zio_buf_free); 2166 arc_space_return(size, ARC_SPACE_META); 2167 } else { 2168 ASSERT(type == ARC_BUFC_DATA); 2169 arc_buf_data_free(buf, zio_data_buf_free); 2170 arc_space_return(size, ARC_SPACE_DATA); 2171 } 2172 } 2173 if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { 2174 uint64_t *cnt = &state->arcs_lsize[type]; 2175 2176 ASSERT(refcount_is_zero( 2177 &buf->b_hdr->b_l1hdr.b_refcnt)); 2178 ASSERT(state != arc_anon && state != arc_l2c_only); 2179 2180 ASSERT3U(*cnt, >=, size); 2181 atomic_add_64(cnt, -size); 2182 } 2183 ASSERT3U(state->arcs_size, >=, size); 2184 atomic_add_64(&state->arcs_size, -size); 2185 buf->b_data = NULL; 2186 2187 /* 2188 * If we're destroying a duplicate buffer make sure 2189 * that the appropriate statistics are updated. 2190 */ 2191 if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && 2192 HDR_ISTYPE_DATA(buf->b_hdr)) { 2193 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 2194 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); 2195 } 2196 ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); 2197 buf->b_hdr->b_l1hdr.b_datacnt -= 1; 2198 } 2199 2200 /* only remove the buf if requested */ 2201 if (!remove) 2202 return; 2203 2204 /* remove the buf from the hdr list */ 2205 for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; 2206 bufp = &(*bufp)->b_next) 2207 continue; 2208 *bufp = buf->b_next; 2209 buf->b_next = NULL; 2210 2211 ASSERT(buf->b_efunc == NULL); 2212 2213 /* clean up the buf */ 2214 buf->b_hdr = NULL; 2215 kmem_cache_free(buf_cache, buf); 2216} 2217 2218static void 2219arc_hdr_destroy(arc_buf_hdr_t *hdr) 2220{ 2221 if (HDR_HAS_L1HDR(hdr)) { 2222 ASSERT(hdr->b_l1hdr.b_buf == NULL || 2223 hdr->b_l1hdr.b_datacnt > 0); 2224 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2225 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 2226 } 2227 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2228 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 2229 2230 if (HDR_HAS_L2HDR(hdr)) { 2231 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 2232 boolean_t buflist_held = MUTEX_HELD(&l2hdr->b_dev->l2ad_mtx); 2233 2234 if (!buflist_held) { 2235 mutex_enter(&l2hdr->b_dev->l2ad_mtx); 2236 l2hdr = &hdr->b_l2hdr; 2237 } 2238 2239 trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr, 2240 l2hdr->b_asize, 0); 2241 list_remove(&l2hdr->b_dev->l2ad_buflist, hdr); 2242 2243 /* 2244 * We don't want to leak the b_tmp_cdata buffer that was 2245 * allocated in l2arc_write_buffers() 2246 */ 2247 arc_buf_l2_cdata_free(hdr); 2248 2249 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 2250 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 2251 2252 if (!buflist_held) 2253 mutex_exit(&l2hdr->b_dev->l2ad_mtx); 2254 2255 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 2256 } 2257 2258 if 
(!BUF_EMPTY(hdr)) 2259 buf_discard_identity(hdr); 2260 if (hdr->b_freeze_cksum != NULL) { 2261 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 2262 hdr->b_freeze_cksum = NULL; 2263 } 2264 2265 if (HDR_HAS_L1HDR(hdr)) { 2266 while (hdr->b_l1hdr.b_buf) { 2267 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2268 2269 if (buf->b_efunc != NULL) { 2270 mutex_enter(&arc_eviction_mtx); 2271 mutex_enter(&buf->b_evict_lock); 2272 ASSERT(buf->b_hdr != NULL); 2273 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, 2274 FALSE); 2275 hdr->b_l1hdr.b_buf = buf->b_next; 2276 buf->b_hdr = &arc_eviction_hdr; 2277 buf->b_next = arc_eviction_list; 2278 arc_eviction_list = buf; 2279 mutex_exit(&buf->b_evict_lock); 2280 mutex_exit(&arc_eviction_mtx); 2281 } else { 2282 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE, 2283 TRUE); 2284 } 2285 } 2286#ifdef ZFS_DEBUG 2287 if (hdr->b_l1hdr.b_thawed != NULL) { 2288 kmem_free(hdr->b_l1hdr.b_thawed, 1); 2289 hdr->b_l1hdr.b_thawed = NULL; 2290 } 2291#endif 2292 } 2293 2294 ASSERT3P(hdr->b_hash_next, ==, NULL); 2295 if (HDR_HAS_L1HDR(hdr)) { 2296 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 2297 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 2298 kmem_cache_free(hdr_full_cache, hdr); 2299 } else { 2300 kmem_cache_free(hdr_l2only_cache, hdr); 2301 } 2302} 2303 2304void 2305arc_buf_free(arc_buf_t *buf, void *tag) 2306{ 2307 arc_buf_hdr_t *hdr = buf->b_hdr; 2308 int hashed = hdr->b_l1hdr.b_state != arc_anon; 2309 2310 ASSERT(buf->b_efunc == NULL); 2311 ASSERT(buf->b_data != NULL); 2312 2313 if (hashed) { 2314 kmutex_t *hash_lock = HDR_LOCK(hdr); 2315 2316 mutex_enter(hash_lock); 2317 hdr = buf->b_hdr; 2318 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2319 2320 (void) remove_reference(hdr, hash_lock, tag); 2321 if (hdr->b_l1hdr.b_datacnt > 1) { 2322 arc_buf_destroy(buf, FALSE, TRUE); 2323 } else { 2324 ASSERT(buf == hdr->b_l1hdr.b_buf); 2325 ASSERT(buf->b_efunc == NULL); 2326 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2327 } 2328 mutex_exit(hash_lock); 2329 } else if (HDR_IO_IN_PROGRESS(hdr)) { 2330 int destroy_hdr; 2331 /* 2332 * We are in the middle of an async write. Don't destroy 2333 * this buffer unless the write completes before we finish 2334 * decrementing the reference count. 
2335 */ 2336 mutex_enter(&arc_eviction_mtx); 2337 (void) remove_reference(hdr, NULL, tag); 2338 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2339 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 2340 mutex_exit(&arc_eviction_mtx); 2341 if (destroy_hdr) 2342 arc_hdr_destroy(hdr); 2343 } else { 2344 if (remove_reference(hdr, NULL, tag) > 0) 2345 arc_buf_destroy(buf, FALSE, TRUE); 2346 else 2347 arc_hdr_destroy(hdr); 2348 } 2349} 2350 2351boolean_t 2352arc_buf_remove_ref(arc_buf_t *buf, void* tag) 2353{ 2354 arc_buf_hdr_t *hdr = buf->b_hdr; 2355 kmutex_t *hash_lock = HDR_LOCK(hdr); 2356 boolean_t no_callback = (buf->b_efunc == NULL); 2357 2358 if (hdr->b_l1hdr.b_state == arc_anon) { 2359 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 2360 arc_buf_free(buf, tag); 2361 return (no_callback); 2362 } 2363 2364 mutex_enter(hash_lock); 2365 hdr = buf->b_hdr; 2366 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 2367 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2368 ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2369 ASSERT(buf->b_data != NULL); 2370 2371 (void) remove_reference(hdr, hash_lock, tag); 2372 if (hdr->b_l1hdr.b_datacnt > 1) { 2373 if (no_callback) 2374 arc_buf_destroy(buf, FALSE, TRUE); 2375 } else if (no_callback) { 2376 ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); 2377 ASSERT(buf->b_efunc == NULL); 2378 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2379 } 2380 ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 || 2381 refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2382 mutex_exit(hash_lock); 2383 return (no_callback); 2384} 2385 2386int32_t 2387arc_buf_size(arc_buf_t *buf) 2388{ 2389 return (buf->b_hdr->b_size); 2390} 2391 2392/* 2393 * Called from the DMU to determine if the current buffer should be 2394 * evicted. In order to ensure proper locking, the eviction must be initiated 2395 * from the DMU. Return true if the buffer is associated with user data and 2396 * duplicate buffers still exist. 2397 */ 2398boolean_t 2399arc_buf_eviction_needed(arc_buf_t *buf) 2400{ 2401 arc_buf_hdr_t *hdr; 2402 boolean_t evict_needed = B_FALSE; 2403 2404 if (zfs_disable_dup_eviction) 2405 return (B_FALSE); 2406 2407 mutex_enter(&buf->b_evict_lock); 2408 hdr = buf->b_hdr; 2409 if (hdr == NULL) { 2410 /* 2411 * We are in arc_do_user_evicts(); let that function 2412 * perform the eviction. 2413 */ 2414 ASSERT(buf->b_data == NULL); 2415 mutex_exit(&buf->b_evict_lock); 2416 return (B_FALSE); 2417 } else if (buf->b_data == NULL) { 2418 /* 2419 * We have already been added to the arc eviction list; 2420 * recommend eviction. 2421 */ 2422 ASSERT3P(hdr, ==, &arc_eviction_hdr); 2423 mutex_exit(&buf->b_evict_lock); 2424 return (B_TRUE); 2425 } 2426 2427 if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr)) 2428 evict_needed = B_TRUE; 2429 2430 mutex_exit(&buf->b_evict_lock); 2431 return (evict_needed); 2432} 2433 2434/* 2435 * Evict buffers from list until we've removed the specified number of 2436 * bytes. Move the removed buffers to the appropriate evict state. 2437 * If the recycle flag is set, then attempt to "recycle" a buffer: 2438 * - look for a buffer to evict that is `bytes' long. 2439 * - return the data block from this buffer rather than freeing it. 2440 * This flag is used by callers that are trying to make space for a 2441 * new buffer in a full arc cache. 2442 * 2443 * This function makes a "best effort". It skips over any buffers 2444 * it can't get a hash_lock on, and so may not catch all candidates. 2445 * It may also return without evicting as much space as requested. 
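 *
 * As an illustration of the recycle path, arc_get_data_buf() below calls
 * this function roughly as (sketch of that call site, not new logic):
 *
 *	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
 *		... no block of exactly `size' bytes could be stolen, so
 *		    allocate fresh via zio_buf_alloc()/zio_data_buf_alloc() ...
 *	}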
2446 */
2447 static void *
2448 arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
2449     arc_buf_contents_t type)
2450 {
2451 	arc_state_t *evicted_state;
2452 	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
2453 	int64_t bytes_remaining;
2454 	arc_buf_hdr_t *hdr, *hdr_prev = NULL;
2455 	list_t *evicted_list, *list, *evicted_list_start, *list_start;
2456 	kmutex_t *lock, *evicted_lock;
2457 	kmutex_t *hash_lock;
2458 	boolean_t have_lock;
2459 	void *stolen = NULL;
2460 	arc_buf_hdr_t marker = { 0 };
2461 	int count = 0;
2462 	static int evict_metadata_offset, evict_data_offset;
2463 	int i, idx, offset, list_count, lists;
2464 
2465 	ASSERT(state == arc_mru || state == arc_mfu);
2466 
2467 	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2468 
2469 	/*
2470 	 * Decide which "type" (data vs metadata) to recycle from.
2471 	 *
2472 	 * If we are over the metadata limit, recycle from metadata.
2473 	 * If we are under the metadata minimum, recycle from data.
2474 	 * Otherwise, recycle from whichever type has the oldest (least recently
2475 	 * accessed) header; not yet implemented on FreeBSD (see TODO below).
2476 	 */
2477 	if (recycle) {
2478 		arc_buf_contents_t realtype;
2479 		if (state->arcs_lsize[ARC_BUFC_DATA] == 0) {
2480 			realtype = ARC_BUFC_METADATA;
2481 		} else if (state->arcs_lsize[ARC_BUFC_METADATA] == 0) {
2482 			realtype = ARC_BUFC_DATA;
2483 		} else if (arc_meta_used >= arc_meta_limit) {
2484 			realtype = ARC_BUFC_METADATA;
2485 		} else if (arc_meta_used <= arc_meta_min) {
2486 			realtype = ARC_BUFC_DATA;
2487 #ifdef illumos
2488 		} else if (HDR_HAS_L1HDR(data_hdr) &&
2489 		    HDR_HAS_L1HDR(metadata_hdr) &&
2490 		    data_hdr->b_l1hdr.b_arc_access <
2491 		    metadata_hdr->b_l1hdr.b_arc_access) {
2492 			realtype = ARC_BUFC_DATA;
2493 		} else {
2494 			realtype = ARC_BUFC_METADATA;
2495 #else
2496 		} else {
2497 			/* TODO */
2498 			realtype = type;
2499 #endif
2500 		}
2501 		if (realtype != type) {
2502 			/*
2503 			 * If we want to evict from a different list,
2504 			 * we cannot recycle, because DATA vs METADATA
2505 			 * buffers are segregated into different kmem
2506 			 * caches (and vmem arenas).
2507 			 */
2508 			type = realtype;
2509 			recycle = B_FALSE;
2510 		}
2511 	}
2512 
2513 	if (type == ARC_BUFC_METADATA) {
2514 		offset = 0;
2515 		list_count = ARC_BUFC_NUMMETADATALISTS;
2516 		list_start = &state->arcs_lists[0];
2517 		evicted_list_start = &evicted_state->arcs_lists[0];
2518 		idx = evict_metadata_offset;
2519 	} else {
2520 		offset = ARC_BUFC_NUMMETADATALISTS;
2521 		list_start = &state->arcs_lists[offset];
2522 		evicted_list_start = &evicted_state->arcs_lists[offset];
2523 		list_count = ARC_BUFC_NUMDATALISTS;
2524 		idx = evict_data_offset;
2525 	}
2526 	bytes_remaining = evicted_state->arcs_lsize[type];
2527 	lists = 0;
2528 
2529 evict_start:
2530 	list = &list_start[idx];
2531 	evicted_list = &evicted_list_start[idx];
2532 	lock = ARCS_LOCK(state, (offset + idx));
2533 	evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
2534 
2535 	/*
2536 	 * The ghost list lock must be acquired first in order to prevent
2537 	 * a three-party deadlock:
2538 	 *
2539 	 *  - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by
2540 	 *    l2ad_mtx in arc_hdr_realloc
2541 	 *  - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx
2542 	 *  - arc_evict would otherwise acquire arc_*->arcs_mtx, followed by
2543 	 *    arc_*_ghost->arcs_mtx, and form a deadlock cycle.
2544 	 *
2545 	 * This situation is avoided by acquiring the ghost list lock first.
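	 *
	 * Concretely, that is why the acquisition order used just below is
	 * (sketch of the code that follows):
	 *
	 *	mutex_enter(evicted_lock);	ghost list lock first
	 *	mutex_enter(lock);		then the regular list lock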
2546 */ 2547 mutex_enter(evicted_lock); 2548 mutex_enter(lock); 2549 2550 for (hdr = list_tail(list); hdr; hdr = hdr_prev) { 2551 hdr_prev = list_prev(list, hdr); 2552 if (HDR_HAS_L1HDR(hdr)) { 2553 bytes_remaining -= 2554 (hdr->b_size * hdr->b_l1hdr.b_datacnt); 2555 } 2556 /* prefetch buffers have a minimum lifespan */ 2557 if (HDR_IO_IN_PROGRESS(hdr) || 2558 (spa && hdr->b_spa != spa) || 2559 ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && 2560 ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < 2561 arc_min_prefetch_lifespan)) { 2562 skipped++; 2563 continue; 2564 } 2565 /* "lookahead" for better eviction candidate */ 2566 if (recycle && hdr->b_size != bytes && 2567 hdr_prev && hdr_prev->b_size == bytes) 2568 continue; 2569 2570 /* ignore markers */ 2571 if (hdr->b_spa == 0) 2572 continue; 2573 2574 /* 2575 * It may take a long time to evict all the bufs requested. 2576 * To avoid blocking all arc activity, periodically drop 2577 * the arcs_mtx and give other threads a chance to run 2578 * before reacquiring the lock. 2579 * 2580 * If we are looking for a buffer to recycle, we are in 2581 * the hot code path, so don't sleep. 2582 */ 2583 if (!recycle && count++ > arc_evict_iterations) { 2584 list_insert_after(list, hdr, &marker); 2585 mutex_exit(lock); 2586 mutex_exit(evicted_lock); 2587 kpreempt(KPREEMPT_SYNC); 2588 mutex_enter(evicted_lock); 2589 mutex_enter(lock); 2590 hdr_prev = list_prev(list, &marker); 2591 list_remove(list, &marker); 2592 count = 0; 2593 continue; 2594 } 2595 2596 hash_lock = HDR_LOCK(hdr); 2597 have_lock = MUTEX_HELD(hash_lock); 2598 if (have_lock || mutex_tryenter(hash_lock)) { 2599 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 2600 ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); 2601 while (hdr->b_l1hdr.b_buf) { 2602 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2603 if (!mutex_tryenter(&buf->b_evict_lock)) { 2604 missed += 1; 2605 break; 2606 } 2607 if (buf->b_data != NULL) { 2608 bytes_evicted += hdr->b_size; 2609 if (recycle && 2610 arc_buf_type(hdr) == type && 2611 hdr->b_size == bytes && 2612 !HDR_L2_WRITING(hdr)) { 2613 stolen = buf->b_data; 2614 recycle = FALSE; 2615 } 2616 } 2617 if (buf->b_efunc != NULL) { 2618 mutex_enter(&arc_eviction_mtx); 2619 arc_buf_destroy(buf, 2620 buf->b_data == stolen, FALSE); 2621 hdr->b_l1hdr.b_buf = buf->b_next; 2622 buf->b_hdr = &arc_eviction_hdr; 2623 buf->b_next = arc_eviction_list; 2624 arc_eviction_list = buf; 2625 mutex_exit(&arc_eviction_mtx); 2626 mutex_exit(&buf->b_evict_lock); 2627 } else { 2628 mutex_exit(&buf->b_evict_lock); 2629 arc_buf_destroy(buf, 2630 buf->b_data == stolen, TRUE); 2631 } 2632 } 2633 2634 if (HDR_HAS_L2HDR(hdr)) { 2635 ARCSTAT_INCR(arcstat_evict_l2_cached, 2636 hdr->b_size); 2637 } else { 2638 if (l2arc_write_eligible(hdr->b_spa, hdr)) { 2639 ARCSTAT_INCR(arcstat_evict_l2_eligible, 2640 hdr->b_size); 2641 } else { 2642 ARCSTAT_INCR( 2643 arcstat_evict_l2_ineligible, 2644 hdr->b_size); 2645 } 2646 } 2647 2648 if (hdr->b_l1hdr.b_datacnt == 0) { 2649 arc_change_state(evicted_state, hdr, hash_lock); 2650 ASSERT(HDR_IN_HASH_TABLE(hdr)); 2651 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 2652 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 2653 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); 2654 } 2655 if (!have_lock) 2656 mutex_exit(hash_lock); 2657 if (bytes >= 0 && bytes_evicted >= bytes) 2658 break; 2659 if (bytes_remaining > 0) { 2660 mutex_exit(evicted_lock); 2661 mutex_exit(lock); 2662 idx = ((idx + 1) & (list_count - 1)); 2663 lists++; 2664 goto evict_start; 2665 } 2666 } else { 2667 missed += 1; 2668 } 
2669 } 2670 2671 mutex_exit(lock); 2672 mutex_exit(evicted_lock); 2673 2674 idx = ((idx + 1) & (list_count - 1)); 2675 lists++; 2676 2677 if (bytes_evicted < bytes) { 2678 if (lists < list_count) 2679 goto evict_start; 2680 else 2681 dprintf("only evicted %lld bytes from %x", 2682 (longlong_t)bytes_evicted, state); 2683 } 2684 if (type == ARC_BUFC_METADATA) 2685 evict_metadata_offset = idx; 2686 else 2687 evict_data_offset = idx; 2688 2689 if (skipped) 2690 ARCSTAT_INCR(arcstat_evict_skip, skipped); 2691 2692 if (missed) 2693 ARCSTAT_INCR(arcstat_mutex_miss, missed); 2694 2695 /* 2696 * Note: we have just evicted some data into the ghost state, 2697 * potentially putting the ghost size over the desired size. Rather 2698 * that evicting from the ghost list in this hot code path, leave 2699 * this chore to the arc_reclaim_thread(). 2700 */ 2701 2702 if (stolen) 2703 ARCSTAT_BUMP(arcstat_stolen); 2704 return (stolen); 2705} 2706 2707/* 2708 * Remove buffers from list until we've removed the specified number of 2709 * bytes. Destroy the buffers that are removed. 2710 */ 2711static void 2712arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) 2713{ 2714 arc_buf_hdr_t *hdr, *hdr_prev; 2715 arc_buf_hdr_t marker = { 0 }; 2716 list_t *list, *list_start; 2717 kmutex_t *hash_lock, *lock; 2718 uint64_t bytes_deleted = 0; 2719 uint64_t bufs_skipped = 0; 2720 int count = 0; 2721 static int evict_offset; 2722 int list_count, idx = evict_offset; 2723 int offset, lists = 0; 2724 2725 ASSERT(GHOST_STATE(state)); 2726 2727 /* 2728 * data lists come after metadata lists 2729 */ 2730 list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS]; 2731 list_count = ARC_BUFC_NUMDATALISTS; 2732 offset = ARC_BUFC_NUMMETADATALISTS; 2733 2734evict_start: 2735 list = &list_start[idx]; 2736 lock = ARCS_LOCK(state, idx + offset); 2737 2738 mutex_enter(lock); 2739 for (hdr = list_tail(list); hdr; hdr = hdr_prev) { 2740 hdr_prev = list_prev(list, hdr); 2741 if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES) 2742 panic("invalid hdr=%p", (void *)hdr); 2743 if (spa && hdr->b_spa != spa) 2744 continue; 2745 2746 /* ignore markers */ 2747 if (hdr->b_spa == 0) 2748 continue; 2749 2750 hash_lock = HDR_LOCK(hdr); 2751 /* caller may be trying to modify this buffer, skip it */ 2752 if (MUTEX_HELD(hash_lock)) 2753 continue; 2754 2755 /* 2756 * It may take a long time to evict all the bufs requested. 2757 * To avoid blocking all arc activity, periodically drop 2758 * the arcs_mtx and give other threads a chance to run 2759 * before reacquiring the lock. 2760 */ 2761 if (count++ > arc_evict_iterations) { 2762 list_insert_after(list, hdr, &marker); 2763 mutex_exit(lock); 2764 kpreempt(KPREEMPT_SYNC); 2765 mutex_enter(lock); 2766 hdr_prev = list_prev(list, &marker); 2767 list_remove(list, &marker); 2768 count = 0; 2769 continue; 2770 } 2771 if (mutex_tryenter(hash_lock)) { 2772 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2773 ASSERT(!HDR_HAS_L1HDR(hdr) || 2774 hdr->b_l1hdr.b_buf == NULL); 2775 ARCSTAT_BUMP(arcstat_deleted); 2776 bytes_deleted += hdr->b_size; 2777 2778 if (HDR_HAS_L2HDR(hdr)) { 2779 /* 2780 * This buffer is cached on the 2nd Level ARC; 2781 * don't destroy the header. 2782 */ 2783 arc_change_state(arc_l2c_only, hdr, hash_lock); 2784 /* 2785 * dropping from L1+L2 cached to L2-only, 2786 * realloc to remove the L1 header. 
2787 */ 2788 hdr = arc_hdr_realloc(hdr, hdr_full_cache, 2789 hdr_l2only_cache); 2790 mutex_exit(hash_lock); 2791 } else { 2792 arc_change_state(arc_anon, hdr, hash_lock); 2793 mutex_exit(hash_lock); 2794 arc_hdr_destroy(hdr); 2795 } 2796 2797 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); 2798 if (bytes >= 0 && bytes_deleted >= bytes) 2799 break; 2800 } else if (bytes < 0) { 2801 /* 2802 * Insert a list marker and then wait for the 2803 * hash lock to become available. Once its 2804 * available, restart from where we left off. 2805 */ 2806 list_insert_after(list, hdr, &marker); 2807 mutex_exit(lock); 2808 mutex_enter(hash_lock); 2809 mutex_exit(hash_lock); 2810 mutex_enter(lock); 2811 hdr_prev = list_prev(list, &marker); 2812 list_remove(list, &marker); 2813 } else { 2814 bufs_skipped += 1; 2815 } 2816 2817 } 2818 mutex_exit(lock); 2819 idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1)); 2820 lists++; 2821 2822 if (lists < list_count) 2823 goto evict_start; 2824 2825 evict_offset = idx; 2826 if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] && 2827 (bytes < 0 || bytes_deleted < bytes)) { 2828 list_start = &state->arcs_lists[0]; 2829 list_count = ARC_BUFC_NUMMETADATALISTS; 2830 offset = lists = 0; 2831 goto evict_start; 2832 } 2833 2834 if (bufs_skipped) { 2835 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 2836 ASSERT(bytes >= 0); 2837 } 2838 2839 if (bytes_deleted < bytes) 2840 dprintf("only deleted %lld bytes from %p", 2841 (longlong_t)bytes_deleted, state); 2842} 2843 2844static void 2845arc_adjust(void) 2846{ 2847 int64_t adjustment, delta; 2848 2849 /* 2850 * Adjust MRU size 2851 */ 2852 2853 adjustment = MIN((int64_t)(arc_size - arc_c), 2854 (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - 2855 arc_p)); 2856 2857 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 2858 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); 2859 (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); 2860 adjustment -= delta; 2861 } 2862 2863 if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2864 delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); 2865 (void) arc_evict(arc_mru, 0, delta, FALSE, 2866 ARC_BUFC_METADATA); 2867 } 2868 2869 /* 2870 * Adjust MFU size 2871 */ 2872 2873 adjustment = arc_size - arc_c; 2874 2875 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 2876 delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); 2877 (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); 2878 adjustment -= delta; 2879 } 2880 2881 if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 2882 int64_t delta = MIN(adjustment, 2883 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); 2884 (void) arc_evict(arc_mfu, 0, delta, FALSE, 2885 ARC_BUFC_METADATA); 2886 } 2887 2888 /* 2889 * Adjust ghost lists 2890 */ 2891 2892 adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; 2893 2894 if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { 2895 delta = MIN(arc_mru_ghost->arcs_size, adjustment); 2896 arc_evict_ghost(arc_mru_ghost, 0, delta); 2897 } 2898 2899 adjustment = 2900 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; 2901 2902 if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { 2903 delta = MIN(arc_mfu_ghost->arcs_size, adjustment); 2904 arc_evict_ghost(arc_mfu_ghost, 0, delta); 2905 } 2906} 2907 2908static void 2909arc_do_user_evicts(void) 2910{ 2911 static arc_buf_t *tmp_arc_eviction_list; 2912 2913 /* 2914 * Move list over to avoid LOR 2915 */ 
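	/*
	 * ("LOR" = lock order reversal.  The pending buffers are detached onto
	 * a private list while arc_eviction_mtx is held, and that mutex is
	 * dropped again before the loop below invokes any b_efunc callback.)
	 */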
2916restart: 2917 mutex_enter(&arc_eviction_mtx); 2918 tmp_arc_eviction_list = arc_eviction_list; 2919 arc_eviction_list = NULL; 2920 mutex_exit(&arc_eviction_mtx); 2921 2922 while (tmp_arc_eviction_list != NULL) { 2923 arc_buf_t *buf = tmp_arc_eviction_list; 2924 tmp_arc_eviction_list = buf->b_next; 2925 mutex_enter(&buf->b_evict_lock); 2926 buf->b_hdr = NULL; 2927 mutex_exit(&buf->b_evict_lock); 2928 2929 if (buf->b_efunc != NULL) 2930 VERIFY0(buf->b_efunc(buf->b_private)); 2931 2932 buf->b_efunc = NULL; 2933 buf->b_private = NULL; 2934 kmem_cache_free(buf_cache, buf); 2935 } 2936 2937 if (arc_eviction_list != NULL) 2938 goto restart; 2939} 2940 2941/* 2942 * Flush all *evictable* data from the cache for the given spa. 2943 * NOTE: this will not touch "active" (i.e. referenced) data. 2944 */ 2945void 2946arc_flush(spa_t *spa) 2947{ 2948 uint64_t guid = 0; 2949 2950 if (spa != NULL) 2951 guid = spa_load_guid(spa); 2952 2953 while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) { 2954 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); 2955 if (spa != NULL) 2956 break; 2957 } 2958 while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) { 2959 (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); 2960 if (spa != NULL) 2961 break; 2962 } 2963 while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) { 2964 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); 2965 if (spa != NULL) 2966 break; 2967 } 2968 while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) { 2969 (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); 2970 if (spa != NULL) 2971 break; 2972 } 2973 2974 arc_evict_ghost(arc_mru_ghost, guid, -1); 2975 arc_evict_ghost(arc_mfu_ghost, guid, -1); 2976 2977 mutex_enter(&arc_reclaim_thr_lock); 2978 arc_do_user_evicts(); 2979 mutex_exit(&arc_reclaim_thr_lock); 2980 ASSERT(spa || arc_eviction_list == NULL); 2981} 2982 2983void 2984arc_shrink(void) 2985{ 2986 2987 if (arc_c > arc_c_min) { 2988 uint64_t to_free; 2989 2990 to_free = arc_c >> arc_shrink_shift; 2991 DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, 2992 arc_c_min, uint64_t, arc_p, uint64_t, to_free); 2993 if (arc_c > arc_c_min + to_free) 2994 atomic_add_64(&arc_c, -to_free); 2995 else 2996 arc_c = arc_c_min; 2997 2998 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 2999 if (arc_c > arc_size) 3000 arc_c = MAX(arc_size, arc_c_min); 3001 if (arc_p > arc_c) 3002 arc_p = (arc_c >> 1); 3003 3004 DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, 3005 arc_p); 3006 3007 ASSERT(arc_c >= arc_c_min); 3008 ASSERT((int64_t)arc_p >= 0); 3009 } 3010 3011 if (arc_size > arc_c) { 3012 DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size, 3013 uint64_t, arc_c); 3014 arc_adjust(); 3015 } 3016} 3017 3018static int needfree = 0; 3019 3020static int 3021arc_reclaim_needed(void) 3022{ 3023 3024#ifdef _KERNEL 3025 3026 if (needfree) { 3027 DTRACE_PROBE(arc__reclaim_needfree); 3028 return (1); 3029 } 3030 3031 /* 3032 * Cooperate with pagedaemon when it's time for it to scan 3033 * and reclaim some pages. 3034 */ 3035 if (freemem < zfs_arc_free_target) { 3036 DTRACE_PROBE2(arc__reclaim_freemem, uint64_t, 3037 freemem, uint64_t, zfs_arc_free_target); 3038 return (1); 3039 } 3040 3041#ifdef illumos 3042 /* 3043 * take 'desfree' extra pages, so we reclaim sooner, rather than later 3044 */ 3045 extra = desfree; 3046 3047 /* 3048 * check that we're out of range of the pageout scanner. It starts to 3049 * schedule paging if freemem is less than lotsfree and needfree. 
3050 * lotsfree is the high-water mark for pageout, and needfree is the 3051 * number of needed free pages. We add extra pages here to make sure 3052 * the scanner doesn't start up while we're freeing memory. 3053 */ 3054 if (freemem < lotsfree + needfree + extra) 3055 return (1); 3056 3057 /* 3058 * check to make sure that swapfs has enough space so that anon 3059 * reservations can still succeed. anon_resvmem() checks that the 3060 * availrmem is greater than swapfs_minfree, and the number of reserved 3061 * swap pages. We also add a bit of extra here just to prevent 3062 * circumstances from getting really dire. 3063 */ 3064 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 3065 return (1); 3066 3067 /* 3068 * Check that we have enough availrmem that memory locking (e.g., via 3069 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum 3070 * stores the number of pages that cannot be locked; when availrmem 3071 * drops below pages_pp_maximum, page locking mechanisms such as 3072 * page_pp_lock() will fail.) 3073 */ 3074 if (availrmem <= pages_pp_maximum) 3075 return (1); 3076 3077#endif /* illumos */ 3078#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 3079 /* 3080 * If we're on an i386 platform, it's possible that we'll exhaust the 3081 * kernel heap space before we ever run out of available physical 3082 * memory. Most checks of the size of the heap_area compare against 3083 * tune.t_minarmem, which is the minimum available real memory that we 3084 * can have in the system. However, this is generally fixed at 25 pages 3085 * which is so low that it's useless. In this comparison, we seek to 3086 * calculate the total heap-size, and reclaim if more than 3/4ths of the 3087 * heap is allocated. (Or, in the calculation, if less than 1/4th is 3088 * free) 3089 */ 3090 if (vmem_size(heap_arena, VMEM_FREE) < 3091 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) { 3092 DTRACE_PROBE2(arc__reclaim_used, uint64_t, 3093 vmem_size(heap_arena, VMEM_FREE), uint64_t, 3094 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2); 3095 return (1); 3096 } 3097#define zio_arena NULL 3098#else 3099#define zio_arena heap_arena 3100#endif 3101 3102 /* 3103 * If zio data pages are being allocated out of a separate heap segment, 3104 * then enforce that the size of available vmem for this arena remains 3105 * above about 1/16th free. 3106 * 3107 * Note: The 1/16th arena free requirement was put in place 3108 * to aggressively evict memory from the arc in order to avoid 3109 * memory fragmentation issues. 3110 */ 3111 if (zio_arena != NULL && 3112 vmem_size(zio_arena, VMEM_FREE) < 3113 (vmem_size(zio_arena, VMEM_ALLOC) >> 4)) 3114 return (1); 3115 3116 /* 3117 * Above limits know nothing about real level of KVA fragmentation. 3118 * Start aggressive reclamation if too little sequential KVA left. 
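	 *
	 * For example (illustrative numbers only): with a 1 MB
	 * zfs_max_recordsize, reclamation is requested as soon as the kernel
	 * heap no longer contains a single contiguous free run of at least
	 * 1 MB, even if plenty of fragmented address space remains free.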
3119 */ 3120 if (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) { 3121 DTRACE_PROBE2(arc__reclaim_maxfree, uint64_t, 3122 vmem_size(heap_arena, VMEM_MAXFREE), 3123 uint64_t, zfs_max_recordsize); 3124 return (1); 3125 } 3126 3127#else /* _KERNEL */ 3128 if (spa_get_random(100) == 0) 3129 return (1); 3130#endif /* _KERNEL */ 3131 DTRACE_PROBE(arc__reclaim_no); 3132 3133 return (0); 3134} 3135 3136extern kmem_cache_t *zio_buf_cache[]; 3137extern kmem_cache_t *zio_data_buf_cache[]; 3138extern kmem_cache_t *range_seg_cache; 3139 3140static __noinline void 3141arc_kmem_reap_now(arc_reclaim_strategy_t strat) 3142{ 3143 size_t i; 3144 kmem_cache_t *prev_cache = NULL; 3145 kmem_cache_t *prev_data_cache = NULL; 3146 3147 DTRACE_PROBE(arc__kmem_reap_start); 3148#ifdef _KERNEL 3149 if (arc_meta_used >= arc_meta_limit) { 3150 /* 3151 * We are exceeding our meta-data cache limit. 3152 * Purge some DNLC entries to release holds on meta-data. 3153 */ 3154 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 3155 } 3156#if defined(__i386) 3157 /* 3158 * Reclaim unused memory from all kmem caches. 3159 */ 3160 kmem_reap(); 3161#endif 3162#endif 3163 3164 /* 3165 * An aggressive reclamation will shrink the cache size as well as 3166 * reap free buffers from the arc kmem caches. 3167 */ 3168 if (strat == ARC_RECLAIM_AGGR) 3169 arc_shrink(); 3170 3171 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 3172 if (zio_buf_cache[i] != prev_cache) { 3173 prev_cache = zio_buf_cache[i]; 3174 kmem_cache_reap_now(zio_buf_cache[i]); 3175 } 3176 if (zio_data_buf_cache[i] != prev_data_cache) { 3177 prev_data_cache = zio_data_buf_cache[i]; 3178 kmem_cache_reap_now(zio_data_buf_cache[i]); 3179 } 3180 } 3181 kmem_cache_reap_now(buf_cache); 3182 kmem_cache_reap_now(hdr_full_cache); 3183 kmem_cache_reap_now(hdr_l2only_cache); 3184 kmem_cache_reap_now(range_seg_cache); 3185 3186#ifdef illumos 3187 /* 3188 * Ask the vmem arena to reclaim unused memory from its 3189 * quantum caches. 3190 */ 3191 if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR) 3192 vmem_qcache_reap(zio_arena); 3193#endif 3194 DTRACE_PROBE(arc__kmem_reap_end); 3195} 3196 3197static void 3198arc_reclaim_thread(void *dummy __unused) 3199{ 3200 clock_t growtime = 0; 3201 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 3202 callb_cpr_t cpr; 3203 3204 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 3205 3206 mutex_enter(&arc_reclaim_thr_lock); 3207 while (arc_thread_exit == 0) { 3208 if (arc_reclaim_needed()) { 3209 3210 if (arc_no_grow) { 3211 if (last_reclaim == ARC_RECLAIM_CONS) { 3212 DTRACE_PROBE(arc__reclaim_aggr_no_grow); 3213 last_reclaim = ARC_RECLAIM_AGGR; 3214 } else { 3215 last_reclaim = ARC_RECLAIM_CONS; 3216 } 3217 } else { 3218 arc_no_grow = TRUE; 3219 last_reclaim = ARC_RECLAIM_AGGR; 3220 DTRACE_PROBE(arc__reclaim_aggr); 3221 membar_producer(); 3222 } 3223 3224 /* reset the growth delay for every reclaim */ 3225 growtime = ddi_get_lbolt() + (arc_grow_retry * hz); 3226 3227 if (needfree && last_reclaim == ARC_RECLAIM_CONS) { 3228 /* 3229 * If needfree is TRUE our vm_lowmem hook 3230 * was called and in that case we must free some 3231 * memory, so switch to aggressive mode. 
3232 */ 3233 arc_no_grow = TRUE; 3234 last_reclaim = ARC_RECLAIM_AGGR; 3235 } 3236 arc_kmem_reap_now(last_reclaim); 3237 arc_warm = B_TRUE; 3238 3239 } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { 3240 arc_no_grow = FALSE; 3241 } 3242 3243 arc_adjust(); 3244 3245 if (arc_eviction_list != NULL) 3246 arc_do_user_evicts(); 3247 3248#ifdef _KERNEL 3249 if (needfree) { 3250 needfree = 0; 3251 wakeup(&needfree); 3252 } 3253#endif 3254 3255 /* 3256 * This is necessary in order for the mdb ::arc dcmd to 3257 * show up to date information. Since the ::arc command 3258 * does not call the kstat's update function, without 3259 * this call, the command may show stale stats for the 3260 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even 3261 * with this change, the data might be up to 1 second 3262 * out of date; but that should suffice. The arc_state_t 3263 * structures can be queried directly if more accurate 3264 * information is needed. 3265 */ 3266 if (arc_ksp != NULL) 3267 arc_ksp->ks_update(arc_ksp, KSTAT_READ); 3268 3269 /* block until needed, or one second, whichever is shorter */ 3270 CALLB_CPR_SAFE_BEGIN(&cpr); 3271 (void) cv_timedwait(&arc_reclaim_thr_cv, 3272 &arc_reclaim_thr_lock, hz); 3273 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 3274 } 3275 3276 arc_thread_exit = 0; 3277 cv_broadcast(&arc_reclaim_thr_cv); 3278 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 3279 thread_exit(); 3280} 3281 3282/* 3283 * Adapt arc info given the number of bytes we are trying to add and 3284 * the state that we are comming from. This function is only called 3285 * when we are adding new content to the cache. 3286 */ 3287static void 3288arc_adapt(int bytes, arc_state_t *state) 3289{ 3290 int mult; 3291 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 3292 3293 if (state == arc_l2c_only) 3294 return; 3295 3296 ASSERT(bytes > 0); 3297 /* 3298 * Adapt the target size of the MRU list: 3299 * - if we just hit in the MRU ghost list, then increase 3300 * the target size of the MRU list. 3301 * - if we just hit in the MFU ghost list, then increase 3302 * the target size of the MFU list by decreasing the 3303 * target size of the MRU list. 3304 */ 3305 if (state == arc_mru_ghost) { 3306 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 3307 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 3308 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 3309 3310 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 3311 } else if (state == arc_mfu_ghost) { 3312 uint64_t delta; 3313 3314 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 
3315 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 3316 mult = MIN(mult, 10); 3317 3318 delta = MIN(bytes * mult, arc_p); 3319 arc_p = MAX(arc_p_min, arc_p - delta); 3320 } 3321 ASSERT((int64_t)arc_p >= 0); 3322 3323 if (arc_reclaim_needed()) { 3324 cv_signal(&arc_reclaim_thr_cv); 3325 return; 3326 } 3327 3328 if (arc_no_grow) 3329 return; 3330 3331 if (arc_c >= arc_c_max) 3332 return; 3333 3334 /* 3335 * If we're within (2 * maxblocksize) bytes of the target 3336 * cache size, increment the target cache size 3337 */ 3338 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 3339 DTRACE_PROBE1(arc__inc_adapt, int, bytes); 3340 atomic_add_64(&arc_c, (int64_t)bytes); 3341 if (arc_c > arc_c_max) 3342 arc_c = arc_c_max; 3343 else if (state == arc_anon) 3344 atomic_add_64(&arc_p, (int64_t)bytes); 3345 if (arc_p > arc_c) 3346 arc_p = arc_c; 3347 } 3348 ASSERT((int64_t)arc_p >= 0); 3349} 3350 3351/* 3352 * Check if the cache has reached its limits and eviction is required 3353 * prior to insert. 3354 */ 3355static int 3356arc_evict_needed(arc_buf_contents_t type) 3357{ 3358 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 3359 return (1); 3360 3361 if (arc_reclaim_needed()) 3362 return (1); 3363 3364 return (arc_size > arc_c); 3365} 3366 3367/* 3368 * The buffer, supplied as the first argument, needs a data block. 3369 * So, if we are at cache max, determine which cache should be victimized. 3370 * We have the following cases: 3371 * 3372 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 3373 * In this situation if we're out of space, but the resident size of the MFU is 3374 * under the limit, victimize the MFU cache to satisfy this insertion request. 3375 * 3376 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 3377 * Here, we've used up all of the available space for the MRU, so we need to 3378 * evict from our own cache instead. Evict from the set of resident MRU 3379 * entries. 3380 * 3381 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 3382 * c minus p represents the MFU space in the cache, since p is the size of the 3383 * cache that is dedicated to the MRU. In this situation there's still space on 3384 * the MFU side, so the MRU side needs to be victimized. 3385 * 3386 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 3387 * MFU's resident set is consuming more space than it has been allotted. In 3388 * this situation, we must victimize our own cache, the MFU, for this insertion. 3389 */ 3390static void 3391arc_get_data_buf(arc_buf_t *buf) 3392{ 3393 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; 3394 uint64_t size = buf->b_hdr->b_size; 3395 arc_buf_contents_t type = arc_buf_type(buf->b_hdr); 3396 3397 arc_adapt(size, state); 3398 3399 /* 3400 * We have not yet reached cache maximum size, 3401 * just allocate a new buffer. 3402 */ 3403 if (!arc_evict_needed(type)) { 3404 if (type == ARC_BUFC_METADATA) { 3405 buf->b_data = zio_buf_alloc(size); 3406 arc_space_consume(size, ARC_SPACE_META); 3407 } else { 3408 ASSERT(type == ARC_BUFC_DATA); 3409 buf->b_data = zio_data_buf_alloc(size); 3410 arc_space_consume(size, ARC_SPACE_DATA); 3411 } 3412 goto out; 3413 } 3414 3415 /* 3416 * If we are prefetching from the mfu ghost list, this buffer 3417 * will end up on the mru list; so steal space from there. 3418 */ 3419 if (state == arc_mfu_ghost) 3420 state = HDR_PREFETCH(buf->b_hdr) ? 
arc_mru : arc_mfu; 3421 else if (state == arc_mru_ghost) 3422 state = arc_mru; 3423 3424 if (state == arc_mru || state == arc_anon) { 3425 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 3426 state = (arc_mfu->arcs_lsize[type] >= size && 3427 arc_p > mru_used) ? arc_mfu : arc_mru; 3428 } else { 3429 /* MFU cases */ 3430 uint64_t mfu_space = arc_c - arc_p; 3431 state = (arc_mru->arcs_lsize[type] >= size && 3432 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 3433 } 3434 if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) { 3435 if (type == ARC_BUFC_METADATA) { 3436 buf->b_data = zio_buf_alloc(size); 3437 arc_space_consume(size, ARC_SPACE_META); 3438 } else { 3439 ASSERT(type == ARC_BUFC_DATA); 3440 buf->b_data = zio_data_buf_alloc(size); 3441 arc_space_consume(size, ARC_SPACE_DATA); 3442 } 3443 ARCSTAT_BUMP(arcstat_recycle_miss); 3444 } 3445 ASSERT(buf->b_data != NULL); 3446out: 3447 /* 3448 * Update the state size. Note that ghost states have a 3449 * "ghost size" and so don't need to be updated. 3450 */ 3451 if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { 3452 arc_buf_hdr_t *hdr = buf->b_hdr; 3453 3454 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size); 3455 if (list_link_active(&hdr->b_l1hdr.b_arc_node)) { 3456 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3457 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], 3458 size); 3459 } 3460 /* 3461 * If we are growing the cache, and we are adding anonymous 3462 * data, and we have outgrown arc_p, update arc_p 3463 */ 3464 if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && 3465 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 3466 arc_p = MIN(arc_c, arc_p + size); 3467 } 3468 ARCSTAT_BUMP(arcstat_allocated); 3469} 3470 3471/* 3472 * This routine is called whenever a buffer is accessed. 3473 * NOTE: the hash lock is dropped in this function. 3474 */ 3475static void 3476arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 3477{ 3478 clock_t now; 3479 3480 ASSERT(MUTEX_HELD(hash_lock)); 3481 ASSERT(HDR_HAS_L1HDR(hdr)); 3482 3483 if (hdr->b_l1hdr.b_state == arc_anon) { 3484 /* 3485 * This buffer is not in the cache, and does not 3486 * appear in our "ghost" list. Add the new buffer 3487 * to the MRU state. 3488 */ 3489 3490 ASSERT0(hdr->b_l1hdr.b_arc_access); 3491 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3492 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 3493 arc_change_state(arc_mru, hdr, hash_lock); 3494 3495 } else if (hdr->b_l1hdr.b_state == arc_mru) { 3496 now = ddi_get_lbolt(); 3497 3498 /* 3499 * If this buffer is here because of a prefetch, then either: 3500 * - clear the flag if this is a "referencing" read 3501 * (any subsequent access will bump this into the MFU state). 3502 * or 3503 * - move the buffer to the head of the list if this is 3504 * another prefetch (to make it less likely to be evicted). 3505 */ 3506 if (HDR_PREFETCH(hdr)) { 3507 if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 3508 ASSERT(list_link_active( 3509 &hdr->b_l1hdr.b_arc_node)); 3510 } else { 3511 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 3512 ARCSTAT_BUMP(arcstat_mru_hits); 3513 } 3514 hdr->b_l1hdr.b_arc_access = now; 3515 return; 3516 } 3517 3518 /* 3519 * This buffer has been "accessed" only once so far, 3520 * but it is still in the cache. Move it to the MFU 3521 * state. 3522 */ 3523 if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { 3524 /* 3525 * More than 125ms have passed since we 3526 * instantiated this buffer. Move it to the 3527 * most frequently used state. 
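			 * (See ARC_MINTIME above, currently (hz>>4), i.e.
			 * about 62 ms.)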
3528 */ 3529 hdr->b_l1hdr.b_arc_access = now; 3530 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3531 arc_change_state(arc_mfu, hdr, hash_lock); 3532 } 3533 ARCSTAT_BUMP(arcstat_mru_hits); 3534 } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { 3535 arc_state_t *new_state; 3536 /* 3537 * This buffer has been "accessed" recently, but 3538 * was evicted from the cache. Move it to the 3539 * MFU state. 3540 */ 3541 3542 if (HDR_PREFETCH(hdr)) { 3543 new_state = arc_mru; 3544 if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) 3545 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 3546 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 3547 } else { 3548 new_state = arc_mfu; 3549 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3550 } 3551 3552 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3553 arc_change_state(new_state, hdr, hash_lock); 3554 3555 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 3556 } else if (hdr->b_l1hdr.b_state == arc_mfu) { 3557 /* 3558 * This buffer has been accessed more than once and is 3559 * still in the cache. Keep it in the MFU state. 3560 * 3561 * NOTE: an add_reference() that occurred when we did 3562 * the arc_read() will have kicked this off the list. 3563 * If it was a prefetch, we will explicitly move it to 3564 * the head of the list now. 3565 */ 3566 if ((HDR_PREFETCH(hdr)) != 0) { 3567 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3568 ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 3569 } 3570 ARCSTAT_BUMP(arcstat_mfu_hits); 3571 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3572 } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { 3573 arc_state_t *new_state = arc_mfu; 3574 /* 3575 * This buffer has been accessed more than once but has 3576 * been evicted from the cache. Move it back to the 3577 * MFU state. 3578 */ 3579 3580 if (HDR_PREFETCH(hdr)) { 3581 /* 3582 * This is a prefetch access... 3583 * move this block back to the MRU state. 3584 */ 3585 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 3586 new_state = arc_mru; 3587 } 3588 3589 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3590 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3591 arc_change_state(new_state, hdr, hash_lock); 3592 3593 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 3594 } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { 3595 /* 3596 * This buffer is on the 2nd Level ARC. 
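 * There is no recent L1 history to consult, so just stamp a fresh
 * access time and move it straight to the MFU state below.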
3597 */ 3598 3599 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3600 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3601 arc_change_state(arc_mfu, hdr, hash_lock); 3602 } else { 3603 ASSERT(!"invalid arc state"); 3604 } 3605} 3606 3607/* a generic arc_done_func_t which you can use */ 3608/* ARGSUSED */ 3609void 3610arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 3611{ 3612 if (zio == NULL || zio->io_error == 0) 3613 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 3614 VERIFY(arc_buf_remove_ref(buf, arg)); 3615} 3616 3617/* a generic arc_done_func_t */ 3618void 3619arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 3620{ 3621 arc_buf_t **bufp = arg; 3622 if (zio && zio->io_error) { 3623 VERIFY(arc_buf_remove_ref(buf, arg)); 3624 *bufp = NULL; 3625 } else { 3626 *bufp = buf; 3627 ASSERT(buf->b_data); 3628 } 3629} 3630 3631static void 3632arc_read_done(zio_t *zio) 3633{ 3634 arc_buf_hdr_t *hdr; 3635 arc_buf_t *buf; 3636 arc_buf_t *abuf; /* buffer we're assigning to callback */ 3637 kmutex_t *hash_lock = NULL; 3638 arc_callback_t *callback_list, *acb; 3639 int freeable = FALSE; 3640 3641 buf = zio->io_private; 3642 hdr = buf->b_hdr; 3643 3644 /* 3645 * The hdr was inserted into hash-table and removed from lists 3646 * prior to starting I/O. We should find this header, since 3647 * it's in the hash table, and it should be legit since it's 3648 * not possible to evict it during the I/O. The only possible 3649 * reason for it not to be found is if we were freed during the 3650 * read. 3651 */ 3652 if (HDR_IN_HASH_TABLE(hdr)) { 3653 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); 3654 ASSERT3U(hdr->b_dva.dva_word[0], ==, 3655 BP_IDENTITY(zio->io_bp)->dva_word[0]); 3656 ASSERT3U(hdr->b_dva.dva_word[1], ==, 3657 BP_IDENTITY(zio->io_bp)->dva_word[1]); 3658 3659 arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, 3660 &hash_lock); 3661 3662 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && 3663 hash_lock == NULL) || 3664 (found == hdr && 3665 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 3666 (found == hdr && HDR_L2_READING(hdr))); 3667 } 3668 3669 hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; 3670 if (l2arc_noprefetch && HDR_PREFETCH(hdr)) 3671 hdr->b_flags &= ~ARC_FLAG_L2CACHE; 3672 3673 /* byteswap if necessary */ 3674 callback_list = hdr->b_l1hdr.b_acb; 3675 ASSERT(callback_list != NULL); 3676 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 3677 dmu_object_byteswap_t bswap = 3678 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 3679 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 3680 byteswap_uint64_array : 3681 dmu_ot_byteswap[bswap].ob_func; 3682 func(buf->b_data, hdr->b_size); 3683 } 3684 3685 arc_cksum_compute(buf, B_FALSE); 3686#ifdef illumos 3687 arc_buf_watch(buf); 3688#endif 3689 3690 if (hash_lock && zio->io_error == 0 && 3691 hdr->b_l1hdr.b_state == arc_anon) { 3692 /* 3693 * Only call arc_access on anonymous buffers. This is because 3694 * if we've issued an I/O for an evicted buffer, we've already 3695 * called arc_access (to prevent any simultaneous readers from 3696 * getting confused). 
3697 */ 3698 arc_access(hdr, hash_lock); 3699 } 3700 3701 /* create copies of the data buffer for the callers */ 3702 abuf = buf; 3703 for (acb = callback_list; acb; acb = acb->acb_next) { 3704 if (acb->acb_done) { 3705 if (abuf == NULL) { 3706 ARCSTAT_BUMP(arcstat_duplicate_reads); 3707 abuf = arc_buf_clone(buf); 3708 } 3709 acb->acb_buf = abuf; 3710 abuf = NULL; 3711 } 3712 } 3713 hdr->b_l1hdr.b_acb = NULL; 3714 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 3715 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 3716 if (abuf == buf) { 3717 ASSERT(buf->b_efunc == NULL); 3718 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 3719 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 3720 } 3721 3722 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || 3723 callback_list != NULL); 3724 3725 if (zio->io_error != 0) { 3726 hdr->b_flags |= ARC_FLAG_IO_ERROR; 3727 if (hdr->b_l1hdr.b_state != arc_anon) 3728 arc_change_state(arc_anon, hdr, hash_lock); 3729 if (HDR_IN_HASH_TABLE(hdr)) 3730 buf_hash_remove(hdr); 3731 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 3732 } 3733 3734 /* 3735 * Broadcast before we drop the hash_lock to avoid the possibility 3736 * that the hdr (and hence the cv) might be freed before we get to 3737 * the cv_broadcast(). 3738 */ 3739 cv_broadcast(&hdr->b_l1hdr.b_cv); 3740 3741 if (hash_lock != NULL) { 3742 mutex_exit(hash_lock); 3743 } else { 3744 /* 3745 * This block was freed while we waited for the read to 3746 * complete. It has been removed from the hash table and 3747 * moved to the anonymous state (so that it won't show up 3748 * in the cache). 3749 */ 3750 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 3751 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 3752 } 3753 3754 /* execute each callback and free its structure */ 3755 while ((acb = callback_list) != NULL) { 3756 if (acb->acb_done) 3757 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 3758 3759 if (acb->acb_zio_dummy != NULL) { 3760 acb->acb_zio_dummy->io_error = zio->io_error; 3761 zio_nowait(acb->acb_zio_dummy); 3762 } 3763 3764 callback_list = acb->acb_next; 3765 kmem_free(acb, sizeof (arc_callback_t)); 3766 } 3767 3768 if (freeable) 3769 arc_hdr_destroy(hdr); 3770} 3771 3772/* 3773 * "Read" the block block at the specified DVA (in bp) via the 3774 * cache. If the block is found in the cache, invoke the provided 3775 * callback immediately and return. Note that the `zio' parameter 3776 * in the callback will be NULL in this case, since no IO was 3777 * required. If the block is not in the cache pass the read request 3778 * on to the spa with a substitute callback function, so that the 3779 * requested block will be added to the cache. 3780 * 3781 * If a read request arrives for a block that has a read in-progress, 3782 * either wait for the in-progress read to complete (and return the 3783 * results); or, if this is a read with a "done" func, add a record 3784 * to the read to invoke the "done" func when the read completes, 3785 * and return; or just return. 3786 * 3787 * arc_read_done() will invoke all the requested "done" functions 3788 * for readers of this block. 
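 *
 * A minimal synchronous-read sketch (illustrative only; the spa, bp and
 * zb variables are assumed to exist in the caller and are not defined
 * here):
 *
 *	arc_flags_t aflags = ARC_FLAG_WAIT;
 *	arc_buf_t *abuf = NULL;
 *	int err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 *	if (err == 0) {
 *		... use abuf->b_data ...
 *		(void) arc_buf_remove_ref(abuf, &abuf);
 *	}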
3789 */ 3790int 3791arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, 3792 void *private, zio_priority_t priority, int zio_flags, 3793 arc_flags_t *arc_flags, const zbookmark_phys_t *zb) 3794{ 3795 arc_buf_hdr_t *hdr = NULL; 3796 arc_buf_t *buf = NULL; 3797 kmutex_t *hash_lock = NULL; 3798 zio_t *rzio; 3799 uint64_t guid = spa_load_guid(spa); 3800 3801 ASSERT(!BP_IS_EMBEDDED(bp) || 3802 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 3803 3804top: 3805 if (!BP_IS_EMBEDDED(bp)) { 3806 /* 3807 * Embedded BP's have no DVA and require no I/O to "read". 3808 * Create an anonymous arc buf to back it. 3809 */ 3810 hdr = buf_hash_find(guid, bp, &hash_lock); 3811 } 3812 3813 if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { 3814 3815 *arc_flags |= ARC_FLAG_CACHED; 3816 3817 if (HDR_IO_IN_PROGRESS(hdr)) { 3818 3819 if (*arc_flags & ARC_FLAG_WAIT) { 3820 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); 3821 mutex_exit(hash_lock); 3822 goto top; 3823 } 3824 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 3825 3826 if (done) { 3827 arc_callback_t *acb = NULL; 3828 3829 acb = kmem_zalloc(sizeof (arc_callback_t), 3830 KM_SLEEP); 3831 acb->acb_done = done; 3832 acb->acb_private = private; 3833 if (pio != NULL) 3834 acb->acb_zio_dummy = zio_null(pio, 3835 spa, NULL, NULL, NULL, zio_flags); 3836 3837 ASSERT(acb->acb_done != NULL); 3838 acb->acb_next = hdr->b_l1hdr.b_acb; 3839 hdr->b_l1hdr.b_acb = acb; 3840 add_reference(hdr, hash_lock, private); 3841 mutex_exit(hash_lock); 3842 return (0); 3843 } 3844 mutex_exit(hash_lock); 3845 return (0); 3846 } 3847 3848 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 3849 hdr->b_l1hdr.b_state == arc_mfu); 3850 3851 if (done) { 3852 add_reference(hdr, hash_lock, private); 3853 /* 3854 * If this block is already in use, create a new 3855 * copy of the data so that we will be guaranteed 3856 * that arc_release() will always succeed. 
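 * (If the existing buffer is already claimed, arc_buf_clone() below
 * hands out a private b_data copy that still shares this header.)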
3857 */ 3858 buf = hdr->b_l1hdr.b_buf; 3859 ASSERT(buf); 3860 ASSERT(buf->b_data); 3861 if (HDR_BUF_AVAILABLE(hdr)) { 3862 ASSERT(buf->b_efunc == NULL); 3863 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 3864 } else { 3865 buf = arc_buf_clone(buf); 3866 } 3867 3868 } else if (*arc_flags & ARC_FLAG_PREFETCH && 3869 refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 3870 hdr->b_flags |= ARC_FLAG_PREFETCH; 3871 } 3872 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 3873 arc_access(hdr, hash_lock); 3874 if (*arc_flags & ARC_FLAG_L2CACHE) 3875 hdr->b_flags |= ARC_FLAG_L2CACHE; 3876 if (*arc_flags & ARC_FLAG_L2COMPRESS) 3877 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3878 mutex_exit(hash_lock); 3879 ARCSTAT_BUMP(arcstat_hits); 3880 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 3881 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 3882 data, metadata, hits); 3883 3884 if (done) 3885 done(NULL, buf, private); 3886 } else { 3887 uint64_t size = BP_GET_LSIZE(bp); 3888 arc_callback_t *acb; 3889 vdev_t *vd = NULL; 3890 uint64_t addr = 0; 3891 boolean_t devw = B_FALSE; 3892 enum zio_compress b_compress = ZIO_COMPRESS_OFF; 3893 int32_t b_asize = 0; 3894 3895 if (hdr == NULL) { 3896 /* this block is not in the cache */ 3897 arc_buf_hdr_t *exists = NULL; 3898 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 3899 buf = arc_buf_alloc(spa, size, private, type); 3900 hdr = buf->b_hdr; 3901 if (!BP_IS_EMBEDDED(bp)) { 3902 hdr->b_dva = *BP_IDENTITY(bp); 3903 hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 3904 exists = buf_hash_insert(hdr, &hash_lock); 3905 } 3906 if (exists != NULL) { 3907 /* somebody beat us to the hash insert */ 3908 mutex_exit(hash_lock); 3909 buf_discard_identity(hdr); 3910 (void) arc_buf_remove_ref(buf, private); 3911 goto top; /* restart the IO request */ 3912 } 3913 3914 /* if this is a prefetch, we don't have a reference */ 3915 if (*arc_flags & ARC_FLAG_PREFETCH) { 3916 (void) remove_reference(hdr, hash_lock, 3917 private); 3918 hdr->b_flags |= ARC_FLAG_PREFETCH; 3919 } 3920 if (*arc_flags & ARC_FLAG_L2CACHE) 3921 hdr->b_flags |= ARC_FLAG_L2CACHE; 3922 if (*arc_flags & ARC_FLAG_L2COMPRESS) 3923 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3924 if (BP_GET_LEVEL(bp) > 0) 3925 hdr->b_flags |= ARC_FLAG_INDIRECT; 3926 } else { 3927 /* 3928 * This block is in the ghost cache. If it was L2-only 3929 * (and thus didn't have an L1 hdr), we realloc the 3930 * header to add an L1 hdr. 
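 * The L1 fields (b_buf, b_acb, b_refcnt, b_datacnt) are needed below
 * to attach a buffer and track the in-flight read.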
3931 */ 3932 if (!HDR_HAS_L1HDR(hdr)) { 3933 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, 3934 hdr_full_cache); 3935 } 3936 3937 ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); 3938 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3939 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3940 ASSERT(hdr->b_l1hdr.b_buf == NULL); 3941 3942 /* if this is a prefetch, we don't have a reference */ 3943 if (*arc_flags & ARC_FLAG_PREFETCH) 3944 hdr->b_flags |= ARC_FLAG_PREFETCH; 3945 else 3946 add_reference(hdr, hash_lock, private); 3947 if (*arc_flags & ARC_FLAG_L2CACHE) 3948 hdr->b_flags |= ARC_FLAG_L2CACHE; 3949 if (*arc_flags & ARC_FLAG_L2COMPRESS) 3950 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3951 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 3952 buf->b_hdr = hdr; 3953 buf->b_data = NULL; 3954 buf->b_efunc = NULL; 3955 buf->b_private = NULL; 3956 buf->b_next = NULL; 3957 hdr->b_l1hdr.b_buf = buf; 3958 ASSERT0(hdr->b_l1hdr.b_datacnt); 3959 hdr->b_l1hdr.b_datacnt = 1; 3960 arc_get_data_buf(buf); 3961 arc_access(hdr, hash_lock); 3962 } 3963 3964 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); 3965 3966 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 3967 acb->acb_done = done; 3968 acb->acb_private = private; 3969 3970 ASSERT(hdr->b_l1hdr.b_acb == NULL); 3971 hdr->b_l1hdr.b_acb = acb; 3972 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 3973 3974 if (HDR_HAS_L2HDR(hdr) && 3975 (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { 3976 devw = hdr->b_l2hdr.b_dev->l2ad_writing; 3977 addr = hdr->b_l2hdr.b_daddr; 3978 b_compress = HDR_GET_COMPRESS(hdr); 3979 b_asize = hdr->b_l2hdr.b_asize; 3980 /* 3981 * Lock out device removal. 3982 */ 3983 if (vdev_is_dead(vd) || 3984 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 3985 vd = NULL; 3986 } 3987 3988 if (hash_lock != NULL) 3989 mutex_exit(hash_lock); 3990 3991 /* 3992 * At this point, we have a level 1 cache miss. Try again in 3993 * L2ARC if possible. 3994 */ 3995 ASSERT3U(hdr->b_size, ==, size); 3996 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 3997 uint64_t, size, zbookmark_phys_t *, zb); 3998 ARCSTAT_BUMP(arcstat_misses); 3999 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 4000 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 4001 data, metadata, misses); 4002#ifdef _KERNEL 4003 curthread->td_ru.ru_inblock++; 4004#endif 4005 4006 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 4007 /* 4008 * Read from the L2ARC if the following are true: 4009 * 1. The L2ARC vdev was previously cached. 4010 * 2. This buffer still has L2ARC metadata. 4011 * 3. This buffer isn't currently writing to the L2ARC. 4012 * 4. The L2ARC entry wasn't evicted, which may 4013 * also have invalidated the vdev. 4014 * 5. This isn't prefetch and l2arc_noprefetch is set. 4015 */ 4016 if (HDR_HAS_L2HDR(hdr) && 4017 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 4018 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 4019 l2arc_read_callback_t *cb; 4020 4021 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 4022 ARCSTAT_BUMP(arcstat_l2_hits); 4023 4024 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 4025 KM_SLEEP); 4026 cb->l2rcb_buf = buf; 4027 cb->l2rcb_spa = spa; 4028 cb->l2rcb_bp = *bp; 4029 cb->l2rcb_zb = *zb; 4030 cb->l2rcb_flags = zio_flags; 4031 cb->l2rcb_compress = b_compress; 4032 4033 ASSERT(addr >= VDEV_LABEL_START_SIZE && 4034 addr + size < vd->vdev_psize - 4035 VDEV_LABEL_END_SIZE); 4036 4037 /* 4038 * l2arc read. The SCL_L2ARC lock will be 4039 * released by l2arc_read_done(). 4040 * Issue a null zio if the underlying buffer 4041 * was squashed to zero size by compression. 
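 * (ZIO_COMPRESS_EMPTY means the buffer compressed away entirely, so
 * nothing was stored on the device; l2arc_read_done() reconstructs the
 * zero-filled data during decompression without touching the vdev.)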
4042 */ 4043 if (b_compress == ZIO_COMPRESS_EMPTY) { 4044 rzio = zio_null(pio, spa, vd, 4045 l2arc_read_done, cb, 4046 zio_flags | ZIO_FLAG_DONT_CACHE | 4047 ZIO_FLAG_CANFAIL | 4048 ZIO_FLAG_DONT_PROPAGATE | 4049 ZIO_FLAG_DONT_RETRY); 4050 } else { 4051 rzio = zio_read_phys(pio, vd, addr, 4052 b_asize, buf->b_data, 4053 ZIO_CHECKSUM_OFF, 4054 l2arc_read_done, cb, priority, 4055 zio_flags | ZIO_FLAG_DONT_CACHE | 4056 ZIO_FLAG_CANFAIL | 4057 ZIO_FLAG_DONT_PROPAGATE | 4058 ZIO_FLAG_DONT_RETRY, B_FALSE); 4059 } 4060 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 4061 zio_t *, rzio); 4062 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); 4063 4064 if (*arc_flags & ARC_FLAG_NOWAIT) { 4065 zio_nowait(rzio); 4066 return (0); 4067 } 4068 4069 ASSERT(*arc_flags & ARC_FLAG_WAIT); 4070 if (zio_wait(rzio) == 0) 4071 return (0); 4072 4073 /* l2arc read error; goto zio_read() */ 4074 } else { 4075 DTRACE_PROBE1(l2arc__miss, 4076 arc_buf_hdr_t *, hdr); 4077 ARCSTAT_BUMP(arcstat_l2_misses); 4078 if (HDR_L2_WRITING(hdr)) 4079 ARCSTAT_BUMP(arcstat_l2_rw_clash); 4080 spa_config_exit(spa, SCL_L2ARC, vd); 4081 } 4082 } else { 4083 if (vd != NULL) 4084 spa_config_exit(spa, SCL_L2ARC, vd); 4085 if (l2arc_ndev != 0) { 4086 DTRACE_PROBE1(l2arc__miss, 4087 arc_buf_hdr_t *, hdr); 4088 ARCSTAT_BUMP(arcstat_l2_misses); 4089 } 4090 } 4091 4092 rzio = zio_read(pio, spa, bp, buf->b_data, size, 4093 arc_read_done, buf, priority, zio_flags, zb); 4094 4095 if (*arc_flags & ARC_FLAG_WAIT) 4096 return (zio_wait(rzio)); 4097 4098 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 4099 zio_nowait(rzio); 4100 } 4101 return (0); 4102} 4103 4104void 4105arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 4106{ 4107 ASSERT(buf->b_hdr != NULL); 4108 ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); 4109 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || 4110 func == NULL); 4111 ASSERT(buf->b_efunc == NULL); 4112 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 4113 4114 buf->b_efunc = func; 4115 buf->b_private = private; 4116} 4117 4118/* 4119 * Notify the arc that a block was freed, and thus will never be used again. 4120 */ 4121void 4122arc_freed(spa_t *spa, const blkptr_t *bp) 4123{ 4124 arc_buf_hdr_t *hdr; 4125 kmutex_t *hash_lock; 4126 uint64_t guid = spa_load_guid(spa); 4127 4128 ASSERT(!BP_IS_EMBEDDED(bp)); 4129 4130 hdr = buf_hash_find(guid, bp, &hash_lock); 4131 if (hdr == NULL) 4132 return; 4133 if (HDR_BUF_AVAILABLE(hdr)) { 4134 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 4135 add_reference(hdr, hash_lock, FTAG); 4136 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 4137 mutex_exit(hash_lock); 4138 4139 arc_release(buf, FTAG); 4140 (void) arc_buf_remove_ref(buf, FTAG); 4141 } else { 4142 mutex_exit(hash_lock); 4143 } 4144 4145} 4146 4147/* 4148 * Clear the user eviction callback set by arc_set_callback(), first calling 4149 * it if it exists. Because the presence of a callback keeps an arc_buf cached 4150 * clearing the callback may result in the arc_buf being destroyed. However, 4151 * it will not result in the *last* arc_buf being destroyed, hence the data 4152 * will remain cached in the ARC. We make a copy of the arc buffer here so 4153 * that we can process the callback without holding any locks. 4154 * 4155 * It's possible that the callback is already in the process of being cleared 4156 * by another thread. In this case we can not clear the callback. 4157 * 4158 * Returns B_TRUE if the callback was successfully called and cleared. 
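 *
 * A hedged pairing sketch (my_evict_func and my_private are illustrative
 * names, not taken from this file):
 *
 *	arc_set_callback(buf, my_evict_func, my_private);
 *	...
 *	if (arc_clear_callback(buf))
 *		... my_evict_func(my_private) has been called and cleared ...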
4159 */ 4160boolean_t 4161arc_clear_callback(arc_buf_t *buf) 4162{ 4163 arc_buf_hdr_t *hdr; 4164 kmutex_t *hash_lock; 4165 arc_evict_func_t *efunc = buf->b_efunc; 4166 void *private = buf->b_private; 4167 list_t *list, *evicted_list; 4168 kmutex_t *lock, *evicted_lock; 4169 4170 mutex_enter(&buf->b_evict_lock); 4171 hdr = buf->b_hdr; 4172 if (hdr == NULL) { 4173 /* 4174 * We are in arc_do_user_evicts(). 4175 */ 4176 ASSERT(buf->b_data == NULL); 4177 mutex_exit(&buf->b_evict_lock); 4178 return (B_FALSE); 4179 } else if (buf->b_data == NULL) { 4180 /* 4181 * We are on the eviction list; process this buffer now 4182 * but let arc_do_user_evicts() do the reaping. 4183 */ 4184 buf->b_efunc = NULL; 4185 mutex_exit(&buf->b_evict_lock); 4186 VERIFY0(efunc(private)); 4187 return (B_TRUE); 4188 } 4189 hash_lock = HDR_LOCK(hdr); 4190 mutex_enter(hash_lock); 4191 hdr = buf->b_hdr; 4192 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4193 4194 ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, 4195 hdr->b_l1hdr.b_datacnt); 4196 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 4197 hdr->b_l1hdr.b_state == arc_mfu); 4198 4199 buf->b_efunc = NULL; 4200 buf->b_private = NULL; 4201 4202 if (hdr->b_l1hdr.b_datacnt > 1) { 4203 mutex_exit(&buf->b_evict_lock); 4204 arc_buf_destroy(buf, FALSE, TRUE); 4205 } else { 4206 ASSERT(buf == hdr->b_l1hdr.b_buf); 4207 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 4208 mutex_exit(&buf->b_evict_lock); 4209 } 4210 4211 mutex_exit(hash_lock); 4212 VERIFY0(efunc(private)); 4213 return (B_TRUE); 4214} 4215 4216/* 4217 * Release this buffer from the cache, making it an anonymous buffer. This 4218 * must be done after a read and prior to modifying the buffer contents. 4219 * If the buffer has more than one reference, we must make 4220 * a new hdr for the buffer. 4221 */ 4222void 4223arc_release(arc_buf_t *buf, void *tag) 4224{ 4225 arc_buf_hdr_t *hdr = buf->b_hdr; 4226 4227 /* 4228 * It would be nice to assert that if it's DMU metadata (level > 4229 * 0 || it's the dnode file), then it must be syncing context. 4230 * But we don't know that information at this level. 4231 */ 4232 4233 mutex_enter(&buf->b_evict_lock); 4234 /* 4235 * We don't grab the hash lock prior to this check, because if 4236 * the buffer's header is in the arc_anon state, it won't be 4237 * linked into the hash table. 4238 */ 4239 if (hdr->b_l1hdr.b_state == arc_anon) { 4240 mutex_exit(&buf->b_evict_lock); 4241 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4242 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 4243 ASSERT(!HDR_HAS_L2HDR(hdr)); 4244 ASSERT(BUF_EMPTY(hdr)); 4245 ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); 4246 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); 4247 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4248 4249 ASSERT3P(buf->b_efunc, ==, NULL); 4250 ASSERT3P(buf->b_private, ==, NULL); 4251 4252 hdr->b_l1hdr.b_arc_access = 0; 4253 arc_buf_thaw(buf); 4254 4255 return; 4256 } 4257 4258 kmutex_t *hash_lock = HDR_LOCK(hdr); 4259 mutex_enter(hash_lock); 4260 4261 /* 4262 * This assignment is only valid as long as the hash_lock is 4263 * held, we must be careful not to reference state or the 4264 * b_state field after dropping the lock. 
4265 */ 4266 arc_state_t *state = hdr->b_l1hdr.b_state; 4267 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4268 ASSERT3P(state, !=, arc_anon); 4269 4270 /* this buffer is not on any list */ 4271 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); 4272 4273 if (HDR_HAS_L2HDR(hdr)) { 4274 ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); 4275 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 4276 4277 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4278 trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, 4279 hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0); 4280 list_remove(&hdr->b_l2hdr.b_dev->l2ad_buflist, hdr); 4281 4282 /* 4283 * We don't want to leak the b_tmp_cdata buffer that was 4284 * allocated in l2arc_write_buffers() 4285 */ 4286 arc_buf_l2_cdata_free(hdr); 4287 4288 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4289 4290 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 4291 } 4292 4293 /* 4294 * Do we have more than one buf? 4295 */ 4296 if (hdr->b_l1hdr.b_datacnt > 1) { 4297 arc_buf_hdr_t *nhdr; 4298 arc_buf_t **bufp; 4299 uint64_t blksz = hdr->b_size; 4300 uint64_t spa = hdr->b_spa; 4301 arc_buf_contents_t type = arc_buf_type(hdr); 4302 uint32_t flags = hdr->b_flags; 4303 4304 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); 4305 /* 4306 * Pull the data off of this hdr and attach it to 4307 * a new anonymous hdr. 4308 */ 4309 (void) remove_reference(hdr, hash_lock, tag); 4310 bufp = &hdr->b_l1hdr.b_buf; 4311 while (*bufp != buf) 4312 bufp = &(*bufp)->b_next; 4313 *bufp = buf->b_next; 4314 buf->b_next = NULL; 4315 4316 ASSERT3P(state, !=, arc_l2c_only); 4317 ASSERT3U(state->arcs_size, >=, hdr->b_size); 4318 atomic_add_64(&state->arcs_size, -hdr->b_size); 4319 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { 4320 ASSERT3P(state, !=, arc_l2c_only); 4321 uint64_t *size = &state->arcs_lsize[type]; 4322 ASSERT3U(*size, >=, hdr->b_size); 4323 atomic_add_64(size, -hdr->b_size); 4324 } 4325 4326 /* 4327 * We're releasing a duplicate user data buffer, update 4328 * our statistics accordingly. 
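 * (For data buffers, arcstat_duplicate_buffers and
 * arcstat_duplicate_buffers_size both shrink below.)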
4329 */ 4330 if (HDR_ISTYPE_DATA(hdr)) { 4331 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 4332 ARCSTAT_INCR(arcstat_duplicate_buffers_size, 4333 -hdr->b_size); 4334 } 4335 hdr->b_l1hdr.b_datacnt -= 1; 4336 arc_cksum_verify(buf); 4337#ifdef illumos 4338 arc_buf_unwatch(buf); 4339#endif 4340 4341 mutex_exit(hash_lock); 4342 4343 nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 4344 nhdr->b_size = blksz; 4345 nhdr->b_spa = spa; 4346 4347 nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; 4348 nhdr->b_flags |= arc_bufc_to_flags(type); 4349 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 4350 4351 nhdr->b_l1hdr.b_buf = buf; 4352 nhdr->b_l1hdr.b_datacnt = 1; 4353 nhdr->b_l1hdr.b_state = arc_anon; 4354 nhdr->b_l1hdr.b_arc_access = 0; 4355 nhdr->b_freeze_cksum = NULL; 4356 4357 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); 4358 buf->b_hdr = nhdr; 4359 mutex_exit(&buf->b_evict_lock); 4360 atomic_add_64(&arc_anon->arcs_size, blksz); 4361 } else { 4362 mutex_exit(&buf->b_evict_lock); 4363 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); 4364 /* protected by hash lock */ 4365 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4366 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4367 arc_change_state(arc_anon, hdr, hash_lock); 4368 hdr->b_l1hdr.b_arc_access = 0; 4369 mutex_exit(hash_lock); 4370 4371 buf_discard_identity(hdr); 4372 arc_buf_thaw(buf); 4373 } 4374 buf->b_efunc = NULL; 4375 buf->b_private = NULL; 4376} 4377 4378int 4379arc_released(arc_buf_t *buf) 4380{ 4381 int released; 4382 4383 mutex_enter(&buf->b_evict_lock); 4384 released = (buf->b_data != NULL && 4385 buf->b_hdr->b_l1hdr.b_state == arc_anon); 4386 mutex_exit(&buf->b_evict_lock); 4387 return (released); 4388} 4389 4390#ifdef ZFS_DEBUG 4391int 4392arc_referenced(arc_buf_t *buf) 4393{ 4394 int referenced; 4395 4396 mutex_enter(&buf->b_evict_lock); 4397 referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); 4398 mutex_exit(&buf->b_evict_lock); 4399 return (referenced); 4400} 4401#endif 4402 4403static void 4404arc_write_ready(zio_t *zio) 4405{ 4406 arc_write_callback_t *callback = zio->io_private; 4407 arc_buf_t *buf = callback->awcb_buf; 4408 arc_buf_hdr_t *hdr = buf->b_hdr; 4409 4410 ASSERT(HDR_HAS_L1HDR(hdr)); 4411 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); 4412 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 4413 callback->awcb_ready(zio, buf, callback->awcb_private); 4414 4415 /* 4416 * If the IO is already in progress, then this is a re-write 4417 * attempt, so we need to thaw and re-compute the cksum. 4418 * It is the responsibility of the callback to handle the 4419 * accounting for any re-write attempt. 4420 */ 4421 if (HDR_IO_IN_PROGRESS(hdr)) { 4422 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 4423 if (hdr->b_freeze_cksum != NULL) { 4424 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 4425 hdr->b_freeze_cksum = NULL; 4426 } 4427 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 4428 } 4429 arc_cksum_compute(buf, B_FALSE); 4430 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 4431} 4432 4433/* 4434 * The SPA calls this callback for each physical write that happens on behalf 4435 * of a logical write. See the comment in dbuf_write_physdone() for details. 
4436 */ 4437static void 4438arc_write_physdone(zio_t *zio) 4439{ 4440 arc_write_callback_t *cb = zio->io_private; 4441 if (cb->awcb_physdone != NULL) 4442 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 4443} 4444 4445static void 4446arc_write_done(zio_t *zio) 4447{ 4448 arc_write_callback_t *callback = zio->io_private; 4449 arc_buf_t *buf = callback->awcb_buf; 4450 arc_buf_hdr_t *hdr = buf->b_hdr; 4451 4452 ASSERT(hdr->b_l1hdr.b_acb == NULL); 4453 4454 if (zio->io_error == 0) { 4455 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 4456 buf_discard_identity(hdr); 4457 } else { 4458 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 4459 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 4460 } 4461 } else { 4462 ASSERT(BUF_EMPTY(hdr)); 4463 } 4464 4465 /* 4466 * If the block to be written was all-zero or compressed enough to be 4467 * embedded in the BP, no write was performed so there will be no 4468 * dva/birth/checksum. The buffer must therefore remain anonymous 4469 * (and uncached). 4470 */ 4471 if (!BUF_EMPTY(hdr)) { 4472 arc_buf_hdr_t *exists; 4473 kmutex_t *hash_lock; 4474 4475 ASSERT(zio->io_error == 0); 4476 4477 arc_cksum_verify(buf); 4478 4479 exists = buf_hash_insert(hdr, &hash_lock); 4480 if (exists != NULL) { 4481 /* 4482 * This can only happen if we overwrite for 4483 * sync-to-convergence, because we remove 4484 * buffers from the hash table when we arc_free(). 4485 */ 4486 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 4487 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 4488 panic("bad overwrite, hdr=%p exists=%p", 4489 (void *)hdr, (void *)exists); 4490 ASSERT(refcount_is_zero( 4491 &exists->b_l1hdr.b_refcnt)); 4492 arc_change_state(arc_anon, exists, hash_lock); 4493 mutex_exit(hash_lock); 4494 arc_hdr_destroy(exists); 4495 exists = buf_hash_insert(hdr, &hash_lock); 4496 ASSERT3P(exists, ==, NULL); 4497 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 4498 /* nopwrite */ 4499 ASSERT(zio->io_prop.zp_nopwrite); 4500 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 4501 panic("bad nopwrite, hdr=%p exists=%p", 4502 (void *)hdr, (void *)exists); 4503 } else { 4504 /* Dedup */ 4505 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 4506 ASSERT(hdr->b_l1hdr.b_state == arc_anon); 4507 ASSERT(BP_GET_DEDUP(zio->io_bp)); 4508 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 4509 } 4510 } 4511 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4512 /* if it's not anon, we are doing a scrub */ 4513 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) 4514 arc_access(hdr, hash_lock); 4515 mutex_exit(hash_lock); 4516 } else { 4517 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4518 } 4519 4520 ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4521 callback->awcb_done(zio, buf, callback->awcb_private); 4522 4523 kmem_free(callback, sizeof (arc_write_callback_t)); 4524} 4525 4526zio_t * 4527arc_write(zio_t *pio, spa_t *spa, uint64_t txg, 4528 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, 4529 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, 4530 arc_done_func_t *done, void *private, zio_priority_t priority, 4531 int zio_flags, const zbookmark_phys_t *zb) 4532{ 4533 arc_buf_hdr_t *hdr = buf->b_hdr; 4534 arc_write_callback_t *callback; 4535 zio_t *zio; 4536 4537 ASSERT(ready != NULL); 4538 ASSERT(done != NULL); 4539 ASSERT(!HDR_IO_ERROR(hdr)); 4540 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4541 ASSERT(hdr->b_l1hdr.b_acb == NULL); 4542 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 4543 if (l2arc) 4544 hdr->b_flags |= ARC_FLAG_L2CACHE; 4545 if (l2arc_compress) 4546 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 
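
	/*
	 * Package the caller's ready/physdone/done callbacks; the write
	 * zio created below invokes them via arc_write_ready(),
	 * arc_write_physdone() and arc_write_done().
	 */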
4547 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 4548 callback->awcb_ready = ready; 4549 callback->awcb_physdone = physdone; 4550 callback->awcb_done = done; 4551 callback->awcb_private = private; 4552 callback->awcb_buf = buf; 4553 4554 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 4555 arc_write_ready, arc_write_physdone, arc_write_done, callback, 4556 priority, zio_flags, zb); 4557 4558 return (zio); 4559} 4560 4561static int 4562arc_memory_throttle(uint64_t reserve, uint64_t txg) 4563{ 4564#ifdef _KERNEL 4565 uint64_t available_memory = ptob(freemem); 4566 static uint64_t page_load = 0; 4567 static uint64_t last_txg = 0; 4568 4569#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 4570 available_memory = 4571 MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); 4572#endif 4573 4574 if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) 4575 return (0); 4576 4577 if (txg > last_txg) { 4578 last_txg = txg; 4579 page_load = 0; 4580 } 4581 /* 4582 * If we are in pageout, we know that memory is already tight, 4583 * the arc is already going to be evicting, so we just want to 4584 * continue to let page writes occur as quickly as possible. 4585 */ 4586 if (curproc == pageproc) { 4587 if (page_load > MAX(ptob(minfree), available_memory) / 4) 4588 return (SET_ERROR(ERESTART)); 4589 /* Note: reserve is inflated, so we deflate */ 4590 page_load += reserve / 8; 4591 return (0); 4592 } else if (page_load > 0 && arc_reclaim_needed()) { 4593 /* memory is low, delay before restarting */ 4594 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 4595 return (SET_ERROR(EAGAIN)); 4596 } 4597 page_load = 0; 4598#endif 4599 return (0); 4600} 4601 4602static void 4603arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, 4604 kstat_named_t *evict_data, kstat_named_t *evict_metadata) 4605{ 4606 size->value.ui64 = state->arcs_size; 4607 evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; 4608 evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; 4609} 4610 4611static int 4612arc_kstat_update(kstat_t *ksp, int rw) 4613{ 4614 arc_stats_t *as = ksp->ks_data; 4615 4616 if (rw == KSTAT_WRITE) { 4617 return (EACCES); 4618 } else { 4619 arc_kstat_update_state(arc_anon, 4620 &as->arcstat_anon_size, 4621 &as->arcstat_anon_evictable_data, 4622 &as->arcstat_anon_evictable_metadata); 4623 arc_kstat_update_state(arc_mru, 4624 &as->arcstat_mru_size, 4625 &as->arcstat_mru_evictable_data, 4626 &as->arcstat_mru_evictable_metadata); 4627 arc_kstat_update_state(arc_mru_ghost, 4628 &as->arcstat_mru_ghost_size, 4629 &as->arcstat_mru_ghost_evictable_data, 4630 &as->arcstat_mru_ghost_evictable_metadata); 4631 arc_kstat_update_state(arc_mfu, 4632 &as->arcstat_mfu_size, 4633 &as->arcstat_mfu_evictable_data, 4634 &as->arcstat_mfu_evictable_metadata); 4635 arc_kstat_update_state(arc_mfu_ghost, 4636 &as->arcstat_mfu_ghost_size, 4637 &as->arcstat_mfu_ghost_evictable_data, 4638 &as->arcstat_mfu_ghost_evictable_metadata); 4639 } 4640 4641 return (0); 4642} 4643 4644void 4645arc_tempreserve_clear(uint64_t reserve) 4646{ 4647 atomic_add_64(&arc_tempreserve, -reserve); 4648 ASSERT((int64_t)arc_tempreserve >= 0); 4649} 4650 4651int 4652arc_tempreserve_space(uint64_t reserve, uint64_t txg) 4653{ 4654 int error; 4655 uint64_t anon_size; 4656 4657 if (reserve > arc_c/4 && !arc_no_grow) { 4658 arc_c = MIN(arc_c_max, reserve * 4); 4659 DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); 4660 } 4661 if (reserve > arc_c) 4662 return (SET_ERROR(ENOMEM)); 4663 4664 /* 4665 * 
Don't count loaned bufs as in flight dirty data to prevent long 4666 * network delays from blocking transactions that are ready to be 4667 * assigned to a txg. 4668 */ 4669 anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); 4670 4671 /* 4672 * Writes will, almost always, require additional memory allocations 4673 * in order to compress/encrypt/etc the data. We therefore need to 4674 * make sure that there is sufficient available memory for this. 4675 */ 4676 error = arc_memory_throttle(reserve, txg); 4677 if (error != 0) 4678 return (error); 4679 4680 /* 4681 * Throttle writes when the amount of dirty data in the cache 4682 * gets too large. We try to keep the cache less than half full 4683 * of dirty blocks so that our sync times don't grow too large. 4684 * Note: if two requests come in concurrently, we might let them 4685 * both succeed, when one of them should fail. Not a huge deal. 4686 */ 4687 4688 if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 4689 anon_size > arc_c / 4) { 4690 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 4691 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 4692 arc_tempreserve>>10, 4693 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 4694 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 4695 reserve>>10, arc_c>>10); 4696 return (SET_ERROR(ERESTART)); 4697 } 4698 atomic_add_64(&arc_tempreserve, reserve); 4699 return (0); 4700} 4701 4702static kmutex_t arc_lowmem_lock; 4703#ifdef _KERNEL 4704static eventhandler_tag arc_event_lowmem = NULL; 4705 4706static void 4707arc_lowmem(void *arg __unused, int howto __unused) 4708{ 4709 4710 /* Serialize access via arc_lowmem_lock. */ 4711 mutex_enter(&arc_lowmem_lock); 4712 mutex_enter(&arc_reclaim_thr_lock); 4713 needfree = 1; 4714 DTRACE_PROBE(arc__needfree); 4715 cv_signal(&arc_reclaim_thr_cv); 4716 4717 /* 4718 * It is unsafe to block here in arbitrary threads, because we can come 4719 * here from ARC itself and may hold ARC locks and thus risk a deadlock 4720 * with ARC reclaim thread. 4721 */ 4722 if (curproc == pageproc) { 4723 while (needfree) 4724 msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0); 4725 } 4726 mutex_exit(&arc_reclaim_thr_lock); 4727 mutex_exit(&arc_lowmem_lock); 4728} 4729#endif 4730 4731void 4732arc_init(void) 4733{ 4734 int i, prefetch_tunable_set = 0; 4735 4736 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 4737 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 4738 mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL); 4739 4740 /* Convert seconds to clock ticks */ 4741 arc_min_prefetch_lifespan = 1 * hz; 4742 4743 /* Start out with 1/8 of all memory */ 4744 arc_c = kmem_size() / 8; 4745 4746#ifdef illumos 4747#ifdef _KERNEL 4748 /* 4749 * On architectures where the physical memory can be larger 4750 * than the addressable space (intel in 32-bit mode), we may 4751 * need to limit the cache to 1/8 of VM size. 4752 */ 4753 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 4754#endif 4755#endif /* illumos */ 4756 /* set min cache to 1/32 of all memory, or 16MB, whichever is more */ 4757 arc_c_min = MAX(arc_c / 4, 16 << 20); 4758 /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ 4759 if (arc_c * 8 >= 1 << 30) 4760 arc_c_max = (arc_c * 8) - (1 << 30); 4761 else 4762 arc_c_max = arc_c_min; 4763 arc_c_max = MAX(arc_c * 5, arc_c_max); 4764 4765#ifdef _KERNEL 4766 /* 4767 * Allow the tunables to override our calculations if they are 4768 * reasonable (ie. 
over 16MB) 4769 */ 4770 if (zfs_arc_max > 16 << 20 && zfs_arc_max < kmem_size()) 4771 arc_c_max = zfs_arc_max; 4772 if (zfs_arc_min > 16 << 20 && zfs_arc_min <= arc_c_max) 4773 arc_c_min = zfs_arc_min; 4774#endif 4775 4776 arc_c = arc_c_max; 4777 arc_p = (arc_c >> 1); 4778 4779 /* limit meta-data to 1/4 of the arc capacity */ 4780 arc_meta_limit = arc_c_max / 4; 4781 4782 /* Allow the tunable to override if it is reasonable */ 4783 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 4784 arc_meta_limit = zfs_arc_meta_limit; 4785 4786 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 4787 arc_c_min = arc_meta_limit / 2; 4788 4789 if (zfs_arc_meta_min > 0) { 4790 arc_meta_min = zfs_arc_meta_min; 4791 } else { 4792 arc_meta_min = arc_c_min / 2; 4793 } 4794 4795 if (zfs_arc_grow_retry > 0) 4796 arc_grow_retry = zfs_arc_grow_retry; 4797 4798 if (zfs_arc_shrink_shift > 0) 4799 arc_shrink_shift = zfs_arc_shrink_shift; 4800 4801 if (zfs_arc_p_min_shift > 0) 4802 arc_p_min_shift = zfs_arc_p_min_shift; 4803 4804 /* if kmem_flags are set, lets try to use less memory */ 4805 if (kmem_debugging()) 4806 arc_c = arc_c / 2; 4807 if (arc_c < arc_c_min) 4808 arc_c = arc_c_min; 4809 4810 zfs_arc_min = arc_c_min; 4811 zfs_arc_max = arc_c_max; 4812 4813 arc_anon = &ARC_anon; 4814 arc_mru = &ARC_mru; 4815 arc_mru_ghost = &ARC_mru_ghost; 4816 arc_mfu = &ARC_mfu; 4817 arc_mfu_ghost = &ARC_mfu_ghost; 4818 arc_l2c_only = &ARC_l2c_only; 4819 arc_size = 0; 4820 4821 for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 4822 mutex_init(&arc_anon->arcs_locks[i].arcs_lock, 4823 NULL, MUTEX_DEFAULT, NULL); 4824 mutex_init(&arc_mru->arcs_locks[i].arcs_lock, 4825 NULL, MUTEX_DEFAULT, NULL); 4826 mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock, 4827 NULL, MUTEX_DEFAULT, NULL); 4828 mutex_init(&arc_mfu->arcs_locks[i].arcs_lock, 4829 NULL, MUTEX_DEFAULT, NULL); 4830 mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock, 4831 NULL, MUTEX_DEFAULT, NULL); 4832 mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock, 4833 NULL, MUTEX_DEFAULT, NULL); 4834 4835 list_create(&arc_mru->arcs_lists[i], 4836 sizeof (arc_buf_hdr_t), 4837 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4838 list_create(&arc_mru_ghost->arcs_lists[i], 4839 sizeof (arc_buf_hdr_t), 4840 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4841 list_create(&arc_mfu->arcs_lists[i], 4842 sizeof (arc_buf_hdr_t), 4843 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4844 list_create(&arc_mfu_ghost->arcs_lists[i], 4845 sizeof (arc_buf_hdr_t), 4846 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4847 list_create(&arc_mfu_ghost->arcs_lists[i], 4848 sizeof (arc_buf_hdr_t), 4849 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4850 list_create(&arc_l2c_only->arcs_lists[i], 4851 sizeof (arc_buf_hdr_t), 4852 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4853 } 4854 4855 buf_init(); 4856 4857 arc_thread_exit = 0; 4858 arc_eviction_list = NULL; 4859 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 4860 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 4861 4862 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 4863 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 4864 4865 if (arc_ksp != NULL) { 4866 arc_ksp->ks_data = &arc_stats; 4867 arc_ksp->ks_update = arc_kstat_update; 4868 kstat_install(arc_ksp); 4869 } 4870 4871 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 4872 TS_RUN, minclsyspri); 4873 4874#ifdef _KERNEL 4875 arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, 4876 EVENTHANDLER_PRI_FIRST); 4877#endif 
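
	/*
	 * The ARC is now live.  arc_warm remains false until the cache
	 * has warmed up; l2arc_write_size() adds l2arc_write_boost to its
	 * write size while that is still the case.
	 */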
4878 4879 arc_dead = FALSE; 4880 arc_warm = B_FALSE; 4881 4882 /* 4883 * Calculate maximum amount of dirty data per pool. 4884 * 4885 * If it has been set by /etc/system, take that. 4886 * Otherwise, use a percentage of physical memory defined by 4887 * zfs_dirty_data_max_percent (default 10%) with a cap at 4888 * zfs_dirty_data_max_max (default 4GB). 4889 */ 4890 if (zfs_dirty_data_max == 0) { 4891 zfs_dirty_data_max = ptob(physmem) * 4892 zfs_dirty_data_max_percent / 100; 4893 zfs_dirty_data_max = MIN(zfs_dirty_data_max, 4894 zfs_dirty_data_max_max); 4895 } 4896 4897#ifdef _KERNEL 4898 if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 4899 prefetch_tunable_set = 1; 4900 4901#ifdef __i386__ 4902 if (prefetch_tunable_set == 0) { 4903 printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 4904 "-- to enable,\n"); 4905 printf(" add \"vfs.zfs.prefetch_disable=0\" " 4906 "to /boot/loader.conf.\n"); 4907 zfs_prefetch_disable = 1; 4908 } 4909#else 4910 if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 4911 prefetch_tunable_set == 0) { 4912 printf("ZFS NOTICE: Prefetch is disabled by default if less " 4913 "than 4GB of RAM is present;\n" 4914 " to enable, add \"vfs.zfs.prefetch_disable=0\" " 4915 "to /boot/loader.conf.\n"); 4916 zfs_prefetch_disable = 1; 4917 } 4918#endif 4919 /* Warn about ZFS memory and address space requirements. */ 4920 if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { 4921 printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " 4922 "expect unstable behavior.\n"); 4923 } 4924 if (kmem_size() < 512 * (1 << 20)) { 4925 printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " 4926 "expect unstable behavior.\n"); 4927 printf(" Consider tuning vm.kmem_size and " 4928 "vm.kmem_size_max\n"); 4929 printf(" in /boot/loader.conf.\n"); 4930 } 4931#endif 4932} 4933 4934void 4935arc_fini(void) 4936{ 4937 int i; 4938 4939 mutex_enter(&arc_reclaim_thr_lock); 4940 arc_thread_exit = 1; 4941 cv_signal(&arc_reclaim_thr_cv); 4942 while (arc_thread_exit != 0) 4943 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 4944 mutex_exit(&arc_reclaim_thr_lock); 4945 4946 arc_flush(NULL); 4947 4948 arc_dead = TRUE; 4949 4950 if (arc_ksp != NULL) { 4951 kstat_delete(arc_ksp); 4952 arc_ksp = NULL; 4953 } 4954 4955 mutex_destroy(&arc_eviction_mtx); 4956 mutex_destroy(&arc_reclaim_thr_lock); 4957 cv_destroy(&arc_reclaim_thr_cv); 4958 4959 for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { 4960 list_destroy(&arc_mru->arcs_lists[i]); 4961 list_destroy(&arc_mru_ghost->arcs_lists[i]); 4962 list_destroy(&arc_mfu->arcs_lists[i]); 4963 list_destroy(&arc_mfu_ghost->arcs_lists[i]); 4964 list_destroy(&arc_l2c_only->arcs_lists[i]); 4965 4966 mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock); 4967 mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock); 4968 mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock); 4969 mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock); 4970 mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock); 4971 mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock); 4972 } 4973 4974 buf_fini(); 4975 4976 ASSERT0(arc_loaned_bytes); 4977 4978 mutex_destroy(&arc_lowmem_lock); 4979#ifdef _KERNEL 4980 if (arc_event_lowmem != NULL) 4981 EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); 4982#endif 4983} 4984 4985/* 4986 * Level 2 ARC 4987 * 4988 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 4989 * It uses dedicated storage devices to hold cached data, which are populated 4990 * using large infrequent writes. 
The main role of this cache is to boost 4991 * the performance of random read workloads. The intended L2ARC devices 4992 * include short-stroked disks, solid state disks, and other media with 4993 * substantially faster read latency than disk. 4994 * 4995 * +-----------------------+ 4996 * | ARC | 4997 * +-----------------------+ 4998 * | ^ ^ 4999 * | | | 5000 * l2arc_feed_thread() arc_read() 5001 * | | | 5002 * | l2arc read | 5003 * V | | 5004 * +---------------+ | 5005 * | L2ARC | | 5006 * +---------------+ | 5007 * | ^ | 5008 * l2arc_write() | | 5009 * | | | 5010 * V | | 5011 * +-------+ +-------+ 5012 * | vdev | | vdev | 5013 * | cache | | cache | 5014 * +-------+ +-------+ 5015 * +=========+ .-----. 5016 * : L2ARC : |-_____-| 5017 * : devices : | Disks | 5018 * +=========+ `-_____-' 5019 * 5020 * Read requests are satisfied from the following sources, in order: 5021 * 5022 * 1) ARC 5023 * 2) vdev cache of L2ARC devices 5024 * 3) L2ARC devices 5025 * 4) vdev cache of disks 5026 * 5) disks 5027 * 5028 * Some L2ARC device types exhibit extremely slow write performance. 5029 * To accommodate for this there are some significant differences between 5030 * the L2ARC and traditional cache design: 5031 * 5032 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 5033 * the ARC behave as usual, freeing buffers and placing headers on ghost 5034 * lists. The ARC does not send buffers to the L2ARC during eviction as 5035 * this would add inflated write latencies for all ARC memory pressure. 5036 * 5037 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 5038 * It does this by periodically scanning buffers from the eviction-end of 5039 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 5040 * not already there. It scans until a headroom of buffers is satisfied, 5041 * which itself is a buffer for ARC eviction. If a compressible buffer is 5042 * found during scanning and selected for writing to an L2ARC device, we 5043 * temporarily boost scanning headroom during the next scan cycle to make 5044 * sure we adapt to compression effects (which might significantly reduce 5045 * the data volume we write to L2ARC). The thread that does this is 5046 * l2arc_feed_thread(), illustrated below; example sizes are included to 5047 * provide a better sense of ratio than this diagram: 5048 * 5049 * head --> tail 5050 * +---------------------+----------+ 5051 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 5052 * +---------------------+----------+ | o L2ARC eligible 5053 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 5054 * +---------------------+----------+ | 5055 * 15.9 Gbytes ^ 32 Mbytes | 5056 * headroom | 5057 * l2arc_feed_thread() 5058 * | 5059 * l2arc write hand <--[oooo]--' 5060 * | 8 Mbyte 5061 * | write max 5062 * V 5063 * +==============================+ 5064 * L2ARC dev |####|#|###|###| |####| ... | 5065 * +==============================+ 5066 * 32 Gbytes 5067 * 5068 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 5069 * evicted, then the L2ARC has cached a buffer much sooner than it probably 5070 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 5071 * safe to say that this is an uncommon case, since buffers at the end of 5072 * the ARC lists have moved there due to inactivity. 5073 * 5074 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 5075 * then the L2ARC simply misses copying some buffers. 
This serves as a 5076 * pressure valve to prevent heavy read workloads from both stalling the ARC 5077 * with waits and clogging the L2ARC with writes. This also helps prevent 5078 * the potential for the L2ARC to churn if it attempts to cache content too 5079 * quickly, such as during backups of the entire pool. 5080 * 5081 * 5. After system boot and before the ARC has filled main memory, there are 5082 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 5083 * lists can remain mostly static. Instead of searching from tail of these 5084 * lists as pictured, the l2arc_feed_thread() will search from the list heads 5085 * for eligible buffers, greatly increasing its chance of finding them. 5086 * 5087 * The L2ARC device write speed is also boosted during this time so that 5088 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 5089 * there are no L2ARC reads, and no fear of degrading read performance 5090 * through increased writes. 5091 * 5092 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 5093 * the vdev queue can aggregate them into larger and fewer writes. Each 5094 * device is written to in a rotor fashion, sweeping writes through 5095 * available space then repeating. 5096 * 5097 * 7. The L2ARC does not store dirty content. It never needs to flush 5098 * write buffers back to disk based storage. 5099 * 5100 * 8. If an ARC buffer is written (and dirtied) which also exists in the 5101 * L2ARC, the now stale L2ARC buffer is immediately dropped. 5102 * 5103 * The performance of the L2ARC can be tweaked by a number of tunables, which 5104 * may be necessary for different workloads: 5105 * 5106 * l2arc_write_max max write bytes per interval 5107 * l2arc_write_boost extra write bytes during device warmup 5108 * l2arc_noprefetch skip caching prefetched buffers 5109 * l2arc_headroom number of max device writes to precache 5110 * l2arc_headroom_boost when we find compressed buffers during ARC 5111 * scanning, we multiply headroom by this 5112 * percentage factor for the next scan cycle, 5113 * since more compressed buffers are likely to 5114 * be present 5115 * l2arc_feed_secs seconds between L2ARC writing 5116 * 5117 * Tunables may be removed or added as future performance improvements are 5118 * integrated, and also may become zpool properties. 5119 * 5120 * There are three key functions that control how the L2ARC warms up: 5121 * 5122 * l2arc_write_eligible() check if a buffer is eligible to cache 5123 * l2arc_write_size() calculate how much to write 5124 * l2arc_write_interval() calculate sleep delay between writes 5125 * 5126 * These three functions determine what to write, how much, and how quickly 5127 * to send writes. 5128 */ 5129 5130static boolean_t 5131l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) 5132{ 5133 /* 5134 * A buffer is *not* eligible for the L2ARC if it: 5135 * 1. belongs to a different spa. 5136 * 2. is already cached on the L2ARC. 5137 * 3. has an I/O in progress (it may be an incomplete read). 5138 * 4. is flagged not eligible (zfs property). 
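 * Each rejected case bumps the matching arcstat_l2_write_* counter
 * below.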
5139 */ 5140 if (hdr->b_spa != spa_guid) { 5141 ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); 5142 return (B_FALSE); 5143 } 5144 if (HDR_HAS_L2HDR(hdr)) { 5145 ARCSTAT_BUMP(arcstat_l2_write_in_l2); 5146 return (B_FALSE); 5147 } 5148 if (HDR_IO_IN_PROGRESS(hdr)) { 5149 ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); 5150 return (B_FALSE); 5151 } 5152 if (!HDR_L2CACHE(hdr)) { 5153 ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); 5154 return (B_FALSE); 5155 } 5156 5157 return (B_TRUE); 5158} 5159 5160static uint64_t 5161l2arc_write_size(void) 5162{ 5163 uint64_t size; 5164 5165 /* 5166 * Make sure our globals have meaningful values in case the user 5167 * altered them. 5168 */ 5169 size = l2arc_write_max; 5170 if (size == 0) { 5171 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 5172 "be greater than zero, resetting it to the default (%d)", 5173 L2ARC_WRITE_SIZE); 5174 size = l2arc_write_max = L2ARC_WRITE_SIZE; 5175 } 5176 5177 if (arc_warm == B_FALSE) 5178 size += l2arc_write_boost; 5179 5180 return (size); 5181 5182} 5183 5184static clock_t 5185l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 5186{ 5187 clock_t interval, next, now; 5188 5189 /* 5190 * If the ARC lists are busy, increase our write rate; if the 5191 * lists are stale, idle back. This is achieved by checking 5192 * how much we previously wrote - if it was more than half of 5193 * what we wanted, schedule the next write much sooner. 5194 */ 5195 if (l2arc_feed_again && wrote > (wanted / 2)) 5196 interval = (hz * l2arc_feed_min_ms) / 1000; 5197 else 5198 interval = hz * l2arc_feed_secs; 5199 5200 now = ddi_get_lbolt(); 5201 next = MAX(now, MIN(now + interval, began + interval)); 5202 5203 return (next); 5204} 5205 5206/* 5207 * Cycle through L2ARC devices. This is how L2ARC load balances. 5208 * If a device is returned, this also returns holding the spa config lock. 5209 */ 5210static l2arc_dev_t * 5211l2arc_dev_get_next(void) 5212{ 5213 l2arc_dev_t *first, *next = NULL; 5214 5215 /* 5216 * Lock out the removal of spas (spa_namespace_lock), then removal 5217 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 5218 * both locks will be dropped and a spa config lock held instead. 5219 */ 5220 mutex_enter(&spa_namespace_lock); 5221 mutex_enter(&l2arc_dev_mtx); 5222 5223 /* if there are no vdevs, there is nothing to do */ 5224 if (l2arc_ndev == 0) 5225 goto out; 5226 5227 first = NULL; 5228 next = l2arc_dev_last; 5229 do { 5230 /* loop around the list looking for a non-faulted vdev */ 5231 if (next == NULL) { 5232 next = list_head(l2arc_dev_list); 5233 } else { 5234 next = list_next(l2arc_dev_list, next); 5235 if (next == NULL) 5236 next = list_head(l2arc_dev_list); 5237 } 5238 5239 /* if we have come back to the start, bail out */ 5240 if (first == NULL) 5241 first = next; 5242 else if (next == first) 5243 break; 5244 5245 } while (vdev_is_dead(next->l2ad_vdev)); 5246 5247 /* if we were unable to find any usable vdevs, return NULL */ 5248 if (vdev_is_dead(next->l2ad_vdev)) 5249 next = NULL; 5250 5251 l2arc_dev_last = next; 5252 5253out: 5254 mutex_exit(&l2arc_dev_mtx); 5255 5256 /* 5257 * Grab the config lock to prevent the 'next' device from being 5258 * removed while we are writing to it. 5259 */ 5260 if (next != NULL) 5261 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 5262 mutex_exit(&spa_namespace_lock); 5263 5264 return (next); 5265} 5266 5267/* 5268 * Free buffers that were tagged for destruction. 
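 * These frees were deferred because an in-flight L2ARC write could
 * still have been reading the data; l2arc_write_done() drains the
 * list once that write has completed.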
5269 */ 5270static void 5271l2arc_do_free_on_write() 5272{ 5273 list_t *buflist; 5274 l2arc_data_free_t *df, *df_prev; 5275 5276 mutex_enter(&l2arc_free_on_write_mtx); 5277 buflist = l2arc_free_on_write; 5278 5279 for (df = list_tail(buflist); df; df = df_prev) { 5280 df_prev = list_prev(buflist, df); 5281 ASSERT(df->l2df_data != NULL); 5282 ASSERT(df->l2df_func != NULL); 5283 df->l2df_func(df->l2df_data, df->l2df_size); 5284 list_remove(buflist, df); 5285 kmem_free(df, sizeof (l2arc_data_free_t)); 5286 } 5287 5288 mutex_exit(&l2arc_free_on_write_mtx); 5289} 5290 5291/* 5292 * A write to a cache device has completed. Update all headers to allow 5293 * reads from these buffers to begin. 5294 */ 5295static void 5296l2arc_write_done(zio_t *zio) 5297{ 5298 l2arc_write_callback_t *cb; 5299 l2arc_dev_t *dev; 5300 list_t *buflist; 5301 arc_buf_hdr_t *head, *hdr, *hdr_prev; 5302 kmutex_t *hash_lock; 5303 int64_t bytes_dropped = 0; 5304 5305 cb = zio->io_private; 5306 ASSERT(cb != NULL); 5307 dev = cb->l2wcb_dev; 5308 ASSERT(dev != NULL); 5309 head = cb->l2wcb_head; 5310 ASSERT(head != NULL); 5311 buflist = &dev->l2ad_buflist; 5312 ASSERT(buflist != NULL); 5313 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 5314 l2arc_write_callback_t *, cb); 5315 5316 if (zio->io_error != 0) 5317 ARCSTAT_BUMP(arcstat_l2_writes_error); 5318 5319 mutex_enter(&dev->l2ad_mtx); 5320 5321 /* 5322 * All writes completed, or an error was hit. 5323 */ 5324 for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { 5325 hdr_prev = list_prev(buflist, hdr); 5326 5327 hash_lock = HDR_LOCK(hdr); 5328 if (!mutex_tryenter(hash_lock)) { 5329 /* 5330 * This buffer misses out. It may be in a stage 5331 * of eviction. Its ARC_FLAG_L2_WRITING flag will be 5332 * left set, denying reads to this buffer. 5333 */ 5334 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); 5335 continue; 5336 } 5337 5338 /* 5339 * It's possible that this buffer got evicted from the L1 cache 5340 * before we grabbed the vdev + hash locks, in which case 5341 * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated. 5342 * Only free the buffer if we still have an L1 hdr. 5343 */ 5344 if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL && 5345 HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) 5346 l2arc_release_cdata_buf(hdr); 5347 5348 if (zio->io_error != 0) { 5349 /* 5350 * Error - drop L2ARC entry. 5351 */ 5352 trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, 5353 hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0); 5354 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 5355 5356 ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); 5357 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 5358 } 5359 5360 /* 5361 * Allow ARC to begin reads to this L2ARC entry. 5362 */ 5363 hdr->b_flags &= ~ARC_FLAG_L2_WRITING; 5364 5365 mutex_exit(hash_lock); 5366 } 5367 5368 atomic_inc_64(&l2arc_writes_done); 5369 list_remove(buflist, head); 5370 ASSERT(!HDR_HAS_L1HDR(head)); 5371 kmem_cache_free(hdr_l2only_cache, head); 5372 mutex_exit(&dev->l2ad_mtx); 5373 5374 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); 5375 5376 l2arc_do_free_on_write(); 5377 5378 kmem_free(cb, sizeof (l2arc_write_callback_t)); 5379} 5380 5381/* 5382 * A read to a cache device completed. Validate buffer contents before 5383 * handing over to the regular ARC routines. 
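 * On a checksum mismatch or device error the read is retried against
 * the primary pool storage: asynchronously here when there is no
 * waiter, otherwise by the blocked caller in arc_read().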

/*
 * A read to a cache device completed.  Validate buffer contents before
 * handing over to the regular ARC routines.
 */
static void
l2arc_read_done(zio_t *zio)
{
	l2arc_read_callback_t *cb;
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;
	kmutex_t *hash_lock;
	int equal;

	ASSERT(zio->io_vd != NULL);
	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);

	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);

	cb = zio->io_private;
	ASSERT(cb != NULL);
	buf = cb->l2rcb_buf;
	ASSERT(buf != NULL);

	hash_lock = HDR_LOCK(buf->b_hdr);
	mutex_enter(hash_lock);
	hdr = buf->b_hdr;
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));

	/*
	 * If the buffer was compressed, decompress it first.
	 */
	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
	ASSERT(zio->io_data != NULL);

	/*
	 * Check this survived the L2ARC journey.
	 */
	equal = arc_cksum_equal(buf);
	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
		mutex_exit(hash_lock);
		zio->io_private = buf;
		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
		arc_read_done(zio);
	} else {
		mutex_exit(hash_lock);
		/*
		 * Buffer didn't survive caching.  Increment stats and
		 * reissue to the original storage device.
		 */
		if (zio->io_error != 0) {
			ARCSTAT_BUMP(arcstat_l2_io_error);
		} else {
			zio->io_error = SET_ERROR(EIO);
		}
		if (!equal)
			ARCSTAT_BUMP(arcstat_l2_cksum_bad);

		/*
		 * If there's no waiter, issue an async i/o to the primary
		 * storage now.  If there *is* a waiter, the caller must
		 * issue the i/o in a context where it's OK to block.
		 */
		if (zio->io_waiter == NULL) {
			zio_t *pio = zio_unique_parent(zio);

			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);

			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
			    buf->b_data, zio->io_size, arc_read_done, buf,
			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
		}
	}

	kmem_free(cb, sizeof (l2arc_read_callback_t));
}
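
/*
 * For example, if the data read back from the cache device no longer matches
 * the ARC checksum, arcstat_l2_cksum_bad is bumped and (when there is no
 * waiter) the same block pointer is quietly reissued to the primary pool
 * devices, so the original arc_read() caller still gets good data.
 */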

/*
 * This is the list priority from which the L2ARC will search for pages to
 * cache.  This is used within loops (0..2 * ARC_BUFC_NUMLISTS - 1) to cycle
 * through lists in the desired order.  This order can have a significant
 * effect on cache performance.
 *
 * Currently the metadata lists are hit first, MFU then MRU, followed by
 * the data lists.  This function returns a locked list, and also returns
 * the lock pointer.
 */
static list_t *
l2arc_list_locked(int list_num, kmutex_t **lock)
{
	list_t *list = NULL;
	int idx;

	ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);

	if (list_num < ARC_BUFC_NUMMETADATALISTS) {
		idx = list_num;
		list = &arc_mfu->arcs_lists[idx];
		*lock = ARCS_LOCK(arc_mfu, idx);
	} else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
		list = &arc_mru->arcs_lists[idx];
		*lock = ARCS_LOCK(arc_mru, idx);
	} else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
	    ARC_BUFC_NUMDATALISTS)) {
		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
		list = &arc_mfu->arcs_lists[idx];
		*lock = ARCS_LOCK(arc_mfu, idx);
	} else {
		idx = list_num - ARC_BUFC_NUMLISTS;
		list = &arc_mru->arcs_lists[idx];
		*lock = ARCS_LOCK(arc_mru, idx);
	}

	ASSERT(!(MUTEX_HELD(*lock)));
	mutex_enter(*lock);
	return (list);
}

/*
 * Evict buffers from the device write hand to the distance specified in
 * bytes.  This distance may span populated buffers, it may span nothing.
 * This is clearing a region on the L2ARC device ready for writing.
 * If the 'all' boolean is set, every buffer is evicted.
 */
static void
l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
{
	list_t *buflist;
	arc_buf_hdr_t *hdr, *hdr_prev;
	kmutex_t *hash_lock;
	uint64_t taddr;
	int64_t bytes_evicted = 0;

	buflist = &dev->l2ad_buflist;

	if (!all && dev->l2ad_first) {
		/*
		 * This is the first sweep through the device.  There is
		 * nothing to evict.
		 */
		return;
	}

	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
		/*
		 * When nearing the end of the device, evict to the end
		 * before the device write hand jumps to the start.
		 */
		taddr = dev->l2ad_end;
	} else {
		taddr = dev->l2ad_hand + distance;
	}
	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
	    uint64_t, taddr, boolean_t, all);

top:
	mutex_enter(&dev->l2ad_mtx);
	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
		hdr_prev = list_prev(buflist, hdr);

		hash_lock = HDR_LOCK(hdr);
		if (!mutex_tryenter(hash_lock)) {
			/*
			 * Missed the hash lock.  Retry.
			 */
			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
			mutex_exit(&dev->l2ad_mtx);
			mutex_enter(hash_lock);
			mutex_exit(hash_lock);
			goto top;
		}

		if (HDR_L2_WRITE_HEAD(hdr)) {
			/*
			 * We hit a write head node.  Leave it for
			 * l2arc_write_done().
			 */
			list_remove(buflist, hdr);
			mutex_exit(hash_lock);
			continue;
		}

		if (!all && HDR_HAS_L2HDR(hdr) &&
		    (hdr->b_l2hdr.b_daddr > taddr ||
		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
			/*
			 * We've evicted to the target address,
			 * or the end of the device.
			 */
			mutex_exit(hash_lock);
			break;
		}

		ASSERT(HDR_HAS_L2HDR(hdr));
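		/*
		 * Two cases remain.  A header with no L1 portion is an
		 * L2-only header and can be destroyed outright; a header
		 * that is still cached in the L1 ARC only has its L2
		 * state torn down below.
		 */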
		if (!HDR_HAS_L1HDR(hdr)) {
			ASSERT(!HDR_L2_READING(hdr));
			/*
			 * This doesn't exist in the ARC.  Destroy.
			 * arc_hdr_destroy() will call list_remove()
			 * and decrement arcstat_l2_size.
			 */
			arc_change_state(arc_anon, hdr, hash_lock);
			arc_hdr_destroy(hdr);
		} else {
			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
			/*
			 * Invalidate issued or about to be issued
			 * reads, since we may be about to write
			 * over this location.
			 */
			if (HDR_L2_READING(hdr)) {
				ARCSTAT_BUMP(arcstat_l2_evict_reading);
				hdr->b_flags |= ARC_FLAG_L2_EVICTED;
			}

			/* Tell ARC this no longer exists in L2ARC. */
			ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
			hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
			list_remove(buflist, hdr);

			/* This may have been leftover after a failed write. */
			hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
		}
		mutex_exit(hash_lock);
	}
	mutex_exit(&dev->l2ad_mtx);

	vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0);
	dev->l2ad_evict = taddr;
}
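
/*
 * Rough example of the distance logic above: with the write hand at 900 MB
 * on a 1 GB cache device and a 64 MB target, the hand is within 2 * distance
 * of l2ad_end, so eviction clears everything out to the end of the device in
 * preparation for the hand wrapping back to l2ad_start.
 */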

/*
 * Find and write ARC buffers to the L2ARC device.
 *
 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
 * for reading until they have completed writing.
 * The headroom_boost is an in-out parameter used to maintain headroom boost
 * state between calls to this function.
 *
 * Returns the number of bytes actually written (which may be smaller than
 * the delta by which the device hand has changed due to alignment).
 */
static uint64_t
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
    boolean_t *headroom_boost)
{
	arc_buf_hdr_t *hdr, *hdr_prev, *head;
	list_t *list;
	uint64_t write_asize, write_psize, write_sz, headroom,
	    buf_compress_minsz;
	void *buf_data;
	kmutex_t *list_lock;
	boolean_t full;
	l2arc_write_callback_t *cb;
	zio_t *pio, *wzio;
	uint64_t guid = spa_load_guid(spa);
	const boolean_t do_headroom_boost = *headroom_boost;
	int try;

	ASSERT(dev->l2ad_vdev != NULL);

	/* Lower the flag now, we might want to raise it again later. */
	*headroom_boost = B_FALSE;

	pio = NULL;
	write_sz = write_asize = write_psize = 0;
	full = B_FALSE;
	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
	head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
	head->b_flags |= ARC_FLAG_HAS_L2HDR;

	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
	/*
	 * We will want to try to compress buffers that are at least 2x the
	 * device sector size.
	 */
	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;

	/*
	 * Copy buffers for L2ARC writing.
	 */
	mutex_enter(&dev->l2ad_mtx);
	for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
		uint64_t passed_sz = 0;

		list = l2arc_list_locked(try, &list_lock);
		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);

		/*
		 * L2ARC fast warmup.
		 *
		 * Until the ARC is warm and starts to evict, read from the
		 * head of the ARC lists rather than the tail.
		 */
		if (arc_warm == B_FALSE)
			hdr = list_head(list);
		else
			hdr = list_tail(list);
		if (hdr == NULL)
			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);

		headroom = target_sz * l2arc_headroom * 2 / ARC_BUFC_NUMLISTS;
		if (do_headroom_boost)
			headroom = (headroom * l2arc_headroom_boost) / 100;

		for (; hdr; hdr = hdr_prev) {
			kmutex_t *hash_lock;
			uint64_t buf_sz;

			if (arc_warm == B_FALSE)
				hdr_prev = list_next(list, hdr);
			else
				hdr_prev = list_prev(list, hdr);
			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size);

			hash_lock = HDR_LOCK(hdr);
			if (!mutex_tryenter(hash_lock)) {
				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
				/*
				 * Skip this buffer rather than waiting.
				 */
				continue;
			}

			passed_sz += hdr->b_size;
			if (passed_sz > headroom) {
				/*
				 * Searched too far.
				 */
				mutex_exit(hash_lock);
				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
				break;
			}

			if (!l2arc_write_eligible(guid, hdr)) {
				mutex_exit(hash_lock);
				continue;
			}

			if ((write_sz + hdr->b_size) > target_sz) {
				full = B_TRUE;
				mutex_exit(hash_lock);
				ARCSTAT_BUMP(arcstat_l2_write_full);
				break;
			}

			if (pio == NULL) {
				/*
				 * Insert a dummy header on the buflist so
				 * l2arc_write_done() can find where the
				 * write buffers begin without searching.
				 */
				list_insert_head(&dev->l2ad_buflist, head);

				cb = kmem_alloc(
				    sizeof (l2arc_write_callback_t), KM_SLEEP);
				cb->l2wcb_dev = dev;
				cb->l2wcb_head = head;
				pio = zio_root(spa, l2arc_write_done, cb,
				    ZIO_FLAG_CANFAIL);
				ARCSTAT_BUMP(arcstat_l2_write_pios);
			}

			/*
			 * Create and add a new L2ARC header.
			 */
			hdr->b_l2hdr.b_dev = dev;
			hdr->b_flags |= ARC_FLAG_L2_WRITING;
			/*
			 * Temporarily stash the data buffer in b_tmp_cdata.
			 * The subsequent write step will pick it up from
			 * there.  This is because we can't access
			 * b_l1hdr.b_buf without holding the hash_lock, which
			 * we in turn can't access without holding the ARC
			 * list locks (which we want to avoid during
			 * compression/writing).
			 */
			HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
			hdr->b_l2hdr.b_asize = hdr->b_size;
			hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;

			buf_sz = hdr->b_size;
			hdr->b_flags |= ARC_FLAG_HAS_L2HDR;

			list_insert_head(&dev->l2ad_buflist, hdr);

			/*
			 * Compute and store the buffer cksum before
			 * writing.  On debug the cksum is verified first.
			 */
			arc_cksum_verify(hdr->b_l1hdr.b_buf);
			arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);

			mutex_exit(hash_lock);

			write_sz += buf_sz;
		}

		mutex_exit(list_lock);

		if (full == B_TRUE)
			break;
	}

	/* No buffers selected for writing? */
	if (pio == NULL) {
		ASSERT0(write_sz);
		mutex_exit(&dev->l2ad_mtx);
		ASSERT(!HDR_HAS_L1HDR(head));
		kmem_cache_free(hdr_l2only_cache, head);
		return (0);
	}
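
	/*
	 * At this point every selected header is linked behind the write
	 * head on l2ad_buflist and carries ARC_FLAG_L2_WRITING, so the
	 * write phase below can walk the device list without taking the
	 * ARC list locks again.
	 */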

	/*
	 * Now start writing the buffers.  We're starting at the write head
	 * and work backwards, retracing the course of the buffer selector
	 * loop above.
	 */
	for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
	    hdr = list_prev(&dev->l2ad_buflist, hdr)) {
		uint64_t buf_sz;

		/*
		 * We shouldn't need to lock the buffer here, since we flagged
		 * it as ARC_FLAG_L2_WRITING in the previous step, but we must
		 * take care to only access its L2 cache parameters.  In
		 * particular, hdr->b_l1hdr.b_buf may be invalid by now due to
		 * ARC eviction.
		 */
		hdr->b_l2hdr.b_daddr = dev->l2ad_hand;

		if ((HDR_L2COMPRESS(hdr)) &&
		    hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
			if (l2arc_compress_buf(hdr)) {
				/*
				 * If compression succeeded, enable headroom
				 * boost on the next scan cycle.
				 */
				*headroom_boost = B_TRUE;
			}
		}

		/*
		 * Pick up the buffer data we had previously stashed away
		 * (and now potentially also compressed).
		 */
		buf_data = hdr->b_l1hdr.b_tmp_cdata;
		buf_sz = hdr->b_l2hdr.b_asize;

		/*
		 * If the data has not been compressed, then clear b_tmp_cdata
		 * to make sure that it points only to a temporary compression
		 * buffer.
		 */
		if (!L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)))
			hdr->b_l1hdr.b_tmp_cdata = NULL;

		/* Compression may have squashed the buffer to zero length. */
		if (buf_sz != 0) {
			uint64_t buf_p_sz;

			wzio = zio_write_phys(pio, dev->l2ad_vdev,
			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_CANFAIL, B_FALSE);

			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
			    zio_t *, wzio);
			(void) zio_nowait(wzio);

			write_asize += buf_sz;
			/*
			 * Keep the clock hand suitably device-aligned.
			 */
			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
			write_psize += buf_p_sz;
			dev->l2ad_hand += buf_p_sz;
		}
	}

	mutex_exit(&dev->l2ad_mtx);

	ASSERT3U(write_asize, <=, target_sz);
	ARCSTAT_BUMP(arcstat_l2_writes_sent);
	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
	ARCSTAT_INCR(arcstat_l2_size, write_sz);
	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
	vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);

	/*
	 * Bump device hand to the device start if it is approaching the end.
	 * l2arc_evict() will already have evicted ahead for this case.
	 */
	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
		dev->l2ad_hand = dev->l2ad_start;
		dev->l2ad_evict = dev->l2ad_start;
		dev->l2ad_first = B_FALSE;
	}

	dev->l2ad_writing = B_TRUE;
	(void) zio_wait(pio);
	dev->l2ad_writing = B_FALSE;

	return (write_asize);
}
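
/*
 * Example of the accounting above: a 128 KB buffer that compresses to 40 KB
 * adds 128 KB to write_sz (and hence arcstat_l2_size) but only the 40 KB of
 * compressed data to write_asize (arcstat_l2_asize), while the device write
 * hand advances by that amount rounded up to the device's allocation size.
 */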

/*
 * Compresses an L2ARC buffer.
 * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its
 * size in l2hdr->b_asize.  This routine tries to compress the data and
 * depending on the compression result there are three possible outcomes:
 * *) The buffer was incompressible.  The original l2hdr contents were left
 *    untouched and are ready for writing to an L2 device.
 * *) The buffer was all-zeros, so there is no need to write it to an L2
 *    device.  To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
 *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
 *    data buffer which holds the compressed data to be written, and b_asize
 *    tells us how much data there is.  b_compress is set to the appropriate
 *    compression algorithm.  Once writing is done, invoke
 *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
 *
 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
 * buffer was incompressible).
 */
static boolean_t
l2arc_compress_buf(arc_buf_hdr_t *hdr)
{
	void *cdata;
	size_t csize, len, rounded;
	ASSERT(HDR_HAS_L2HDR(hdr));
	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;

	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF);
	ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);

	len = l2hdr->b_asize;
	cdata = zio_data_buf_alloc(len);
	ASSERT3P(cdata, !=, NULL);
	csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
	    cdata, l2hdr->b_asize);

	if (csize == 0) {
		/* zero block, indicate that there's nothing to write */
		zio_data_buf_free(cdata, len);
		HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY);
		l2hdr->b_asize = 0;
		hdr->b_l1hdr.b_tmp_cdata = NULL;
		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
		return (B_TRUE);
	}

	rounded = P2ROUNDUP(csize,
	    (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift);
	if (rounded < len) {
		/*
		 * Compression succeeded, we'll keep the cdata around for
		 * writing and release it afterwards.
		 */
		if (rounded > csize) {
			bzero((char *)cdata + csize, rounded - csize);
			csize = rounded;
		}
		HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4);
		l2hdr->b_asize = csize;
		hdr->b_l1hdr.b_tmp_cdata = cdata;
		ARCSTAT_BUMP(arcstat_l2_compress_successes);
		return (B_TRUE);
	} else {
		/*
		 * Compression failed, release the compressed buffer.
		 * l2hdr will be left unmodified.
		 */
		zio_data_buf_free(cdata, len);
		ARCSTAT_BUMP(arcstat_l2_compress_failures);
		return (B_FALSE);
	}
}
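
/*
 * Worked example of the rounding above on a device with a 512-byte ashift:
 * a 100 KB buffer that LZ4-compresses to 65,000 bytes is padded up to
 * 65,024 bytes (the next multiple of 512) and written compressed, whereas
 * one that only shrinks to 102,200 bytes rounds back up to the original
 * 102,400 bytes, so it is counted as incompressible and written as-is.
 */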

/*
 * Decompresses a zio read back from an l2arc device.  On success, the
 * underlying zio's io_data buffer is overwritten by the uncompressed
 * version.  On decompression error (corrupt compressed stream), the
 * zio->io_error value is set to signal an I/O error.
 *
 * Please note that the compressed data stream is not checksummed, so
 * if the underlying device is experiencing data corruption, we may feed
 * corrupt data to the decompressor, so the decompressor needs to be
 * able to handle this situation (LZ4 does).
 */
static void
l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
{
	ASSERT(L2ARC_IS_VALID_COMPRESS(c));

	if (zio->io_error != 0) {
		/*
		 * An I/O error has occurred, just restore the original I/O
		 * size in preparation for a main pool read.
		 */
		zio->io_orig_size = zio->io_size = hdr->b_size;
		return;
	}

	if (c == ZIO_COMPRESS_EMPTY) {
		/*
		 * An empty buffer results in a null zio, which means we
		 * need to fill its io_data after we're done restoring the
		 * buffer's contents.
		 */
		ASSERT(hdr->b_l1hdr.b_buf != NULL);
		bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
		zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
	} else {
		ASSERT(zio->io_data != NULL);
		/*
		 * We copy the compressed data from the start of the arc buffer
		 * (the zio_read will have pulled in only what we need, the
		 * rest is garbage which we will overwrite at decompression)
		 * and then decompress back to the ARC data buffer.  This way we
		 * can minimize copying by simply decompressing back over the
		 * original compressed data (rather than decompressing to an
		 * aux buffer and then copying back the uncompressed buffer,
		 * which is likely to be much larger).
		 */
		uint64_t csize;
		void *cdata;

		csize = zio->io_size;
		cdata = zio_data_buf_alloc(csize);
		bcopy(zio->io_data, cdata, csize);
		if (zio_decompress_data(c, cdata, zio->io_data, csize,
		    hdr->b_size) != 0)
			zio->io_error = EIO;
		zio_data_buf_free(cdata, csize);
	}

	/* Restore the expected uncompressed IO size. */
	zio->io_orig_size = zio->io_size = hdr->b_size;
}

/*
 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
 * This buffer serves as a temporary holder of compressed data while
 * the buffer entry is being written to an l2arc device.  Once that is
 * done, we can dispose of it.
 */
static void
l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
{
	ASSERT(HDR_HAS_L1HDR(hdr));
	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) {
		/*
		 * If the data was compressed, then we've allocated a
		 * temporary buffer for it, so now we need to release it.
		 */
		ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
		zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
		    hdr->b_size);
		hdr->b_l1hdr.b_tmp_cdata = NULL;
	} else {
		ASSERT(hdr->b_l1hdr.b_tmp_cdata == NULL);
	}
}
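
/*
 * Rough timing example for the feed loop below, assuming the default
 * tunables defined earlier in this file (l2arc_feed_secs of one second and
 * l2arc_feed_min_ms of 200 milliseconds): a pass that wrote more than half
 * of its target schedules the next pass roughly 200 ms after the previous
 * one began, while an idle or mostly-empty pass waits the full second.
 */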

/*
 * This thread feeds the L2ARC at regular intervals.  This is the beating
 * heart of the L2ARC.
 */
static void
l2arc_feed_thread(void *dummy __unused)
{
	callb_cpr_t cpr;
	l2arc_dev_t *dev;
	spa_t *spa;
	uint64_t size, wrote;
	clock_t begin, next = ddi_get_lbolt();
	boolean_t headroom_boost = B_FALSE;

	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&l2arc_feed_thr_lock);

	while (l2arc_thread_exit == 0) {
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
		    next - ddi_get_lbolt());
		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
		next = ddi_get_lbolt() + hz;

		/*
		 * Quick check for L2ARC devices.
		 */
		mutex_enter(&l2arc_dev_mtx);
		if (l2arc_ndev == 0) {
			mutex_exit(&l2arc_dev_mtx);
			continue;
		}
		mutex_exit(&l2arc_dev_mtx);
		begin = ddi_get_lbolt();

		/*
		 * This selects the next l2arc device to write to, and in
		 * doing so the next spa to feed from: dev->l2ad_spa.  This
		 * will return NULL if there are now no l2arc devices or if
		 * they are all faulted.
		 *
		 * If a device is returned, its spa's config lock is also
		 * held to prevent device removal.  l2arc_dev_get_next()
		 * will grab and release l2arc_dev_mtx.
		 */
		if ((dev = l2arc_dev_get_next()) == NULL)
			continue;

		spa = dev->l2ad_spa;
		ASSERT(spa != NULL);

		/*
		 * If the pool is read-only then force the feed thread to
		 * sleep a little longer.
		 */
		if (!spa_writeable(spa)) {
			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		/*
		 * Avoid contributing to memory pressure.
		 */
		if (arc_reclaim_needed()) {
			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		ARCSTAT_BUMP(arcstat_l2_feeds);

		size = l2arc_write_size();

		/*
		 * Evict L2ARC buffers that will be overwritten.
		 */
		l2arc_evict(dev, size, B_FALSE);

		/*
		 * Write ARC buffers.
		 */
		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);

		/*
		 * Calculate interval between writes.
		 */
		next = l2arc_write_interval(begin, size, wrote);
		spa_config_exit(spa, SCL_L2ARC, dev);
	}

	l2arc_thread_exit = 0;
	cv_broadcast(&l2arc_feed_thr_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
	thread_exit();
}

boolean_t
l2arc_vdev_present(vdev_t *vd)
{
	l2arc_dev_t *dev;

	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev != NULL;
	    dev = list_next(l2arc_dev_list, dev)) {
		if (dev->l2ad_vdev == vd)
			break;
	}
	mutex_exit(&l2arc_dev_mtx);

	return (dev != NULL);
}

/*
 * Add a vdev for use by the L2ARC.  By this point the spa has already
 * validated the vdev and opened it.
 */
void
l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{
	l2arc_dev_t *adddev;

	ASSERT(!l2arc_vdev_present(vd));

	vdev_ashift_optimize(vd);

	/*
	 * Create a new l2arc device entry.
	 */
	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
	adddev->l2ad_spa = spa;
	adddev->l2ad_vdev = vd;
	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
	adddev->l2ad_hand = adddev->l2ad_start;
	adddev->l2ad_evict = adddev->l2ad_start;
	adddev->l2ad_first = B_TRUE;
	adddev->l2ad_writing = B_FALSE;

	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
	/*
	 * This is a list of all ARC buffers that are still valid on the
	 * device.
	 */
	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));

	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);

	/*
	 * Add device to global list
	 */
	mutex_enter(&l2arc_dev_mtx);
	list_insert_head(l2arc_dev_list, adddev);
	atomic_inc_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);
}
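
/*
 * For example, a freshly added cache device starts with both the write hand
 * and the evict pointer at l2ad_start (just past the vdev labels), with
 * l2ad_first set so the first sweep writes sequentially without evicting
 * anything; only the usable region (l2ad_end - l2ad_hand) is reported to
 * vdev_space_update().
 */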

/*
 * Remove a vdev from the L2ARC.
 */
void
l2arc_remove_vdev(vdev_t *vd)
{
	l2arc_dev_t *dev, *nextdev, *remdev = NULL;

	/*
	 * Find the device by vdev
	 */
	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
		nextdev = list_next(l2arc_dev_list, dev);
		if (vd == dev->l2ad_vdev) {
			remdev = dev;
			break;
		}
	}
	ASSERT(remdev != NULL);

	/*
	 * Remove device from global list
	 */
	list_remove(l2arc_dev_list, remdev);
	l2arc_dev_last = NULL;		/* may have been invalidated */
	atomic_dec_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);

	/*
	 * Clear all buflists and ARC references.  L2ARC device flush.
	 */
	l2arc_evict(remdev, 0, B_TRUE);
	list_destroy(&remdev->l2ad_buflist);
	mutex_destroy(&remdev->l2ad_mtx);
	kmem_free(remdev, sizeof (l2arc_dev_t));
}

void
l2arc_init(void)
{
	l2arc_thread_exit = 0;
	l2arc_ndev = 0;
	l2arc_writes_sent = 0;
	l2arc_writes_done = 0;

	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);

	l2arc_dev_list = &L2ARC_dev_list;
	l2arc_free_on_write = &L2ARC_free_on_write;
	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
	    offsetof(l2arc_dev_t, l2ad_node));
	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
	    offsetof(l2arc_data_free_t, l2df_list_node));
}

void
l2arc_fini(void)
{
	/*
	 * This is called from dmu_fini(), which is called from spa_fini().
	 * Because of this, we can assume that all l2arc devices have
	 * already been removed when the pools themselves were removed.
	 */

	l2arc_do_free_on_write();

	mutex_destroy(&l2arc_feed_thr_lock);
	cv_destroy(&l2arc_feed_thr_cv);
	mutex_destroy(&l2arc_dev_mtx);
	mutex_destroy(&l2arc_free_on_write_mtx);

	list_destroy(l2arc_dev_list);
	list_destroy(l2arc_free_on_write);
}

void
l2arc_start(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);
}

void
l2arc_stop(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	mutex_enter(&l2arc_feed_thr_lock);
	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
	l2arc_thread_exit = 1;
	while (l2arc_thread_exit != 0)
		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
	mutex_exit(&l2arc_feed_thr_lock);
}