/* arc.c, FreeBSD revision 304138 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 *    Pages in its cache cannot be "locked" into memory. This makes
 *    the eviction algorithm simple: evict the last page in the list.
 *    This also makes the performance characteristics easy to reason
 *    about. Our cache is not so simple. At any given moment, some
 *    subset of the blocks in the cache are un-evictable because we
 *    have handed out a reference to them. Blocks are only evictable
 *    when there are no external references active. This makes
 *    eviction far more problematic: we choose to evict the evictable
 *    blocks that are the "lowest" in the list.
 *
 *    There are times when it is not possible to evict the requested
 *    space. In these circumstances we are unable to adjust the cache
 *    size. To prevent the cache growing unbounded at these times we
 *    implement a "cache throttle" that slows the flow of new data
 *    into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 *    Pages are evicted when the cache is full and there is a cache
 *    miss. Our model has a variable sized cache. It grows with
 *    high use, but also tries to react to memory pressure from the
 *    operating system: decreasing its size when system memory is
 *    tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 *    elements of the cache are therefore exactly the same size. So
 *    when adjusting the cache size following a cache miss, it's simply
 *    a matter of choosing a single page to evict. In our model, we
 *    have variable sized cache blocks (ranging from 512 bytes to
 *    128K bytes). We therefore choose a set of blocks to evict to make
 *    space for a cache miss that approximates as closely as possible
 *    the space used by the new block.
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

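/*
 * Illustrative sketch of point 3 above (not code from this file; the
 * helpers evictable_tail() and evict_block() are hypothetical): evict a
 * set of variable-sized blocks until the space freed covers the
 * incoming block:
 *
 *	uint64_t freed = 0;
 *	arc_buf_hdr_t *hdr;
 *
 *	while (freed < new_block_size &&
 *	    (hdr = evictable_tail(state)) != NULL) {
 *		freed += hdr->b_size;
 *		evict_block(hdr);
 *	}
 *
 * If the loop ends with freed < new_block_size, nothing more is
 * evictable and the cache throttle from point 1 slows new data instead.
 */
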
/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists. The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2. We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table. It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state. When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use: mutex_tryenter() to avoid deadlock. Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()). Note however that the data associated
 * with the buffer may be evicted prior to the callback. The callback
 * must be made with *no locks held* (to prevent deadlock). Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_clear_callback()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2ad_mtx on each vdev for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */

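/*
 * Illustrative sketch of the lock-ordering rule above (not code from
 * this file): with an arc list lock held, a hash table lock may only be
 * tried, never blocked on:
 *
 *	if (!mutex_tryenter(hash_lock)) {
 *		ARCSTAT_BUMP(arcstat_mutex_miss);
 *		return (B_FALSE);
 *	}
 *
 * A failed tryenter just skips the buffer (counted in
 * arcstat_mutex_miss) instead of risking deadlock against a thread
 * taking the locks in hash-then-list order.
 */
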
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_compress.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#include <sys/multilist.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#include <sys/racct.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <sys/trim_map.h>
#include <zfs_fletcher.h>
#include <sys/sdt.h>

#include <machine/vmparam.h>

#ifdef illumos
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif
#endif /* illumos */

static kmutex_t		arc_reclaim_lock;
static kcondvar_t	arc_reclaim_thread_cv;
static boolean_t	arc_reclaim_thread_exit;
static kcondvar_t	arc_reclaim_waiters_cv;

static kmutex_t		arc_user_evicts_lock;
static kcondvar_t	arc_user_evicts_cv;
static boolean_t	arc_user_evicts_thread_exit;

static kmutex_t		arc_dnlc_evicts_lock;
static kcondvar_t	arc_dnlc_evicts_cv;
static boolean_t	arc_dnlc_evicts_thread_exit;

uint_t arc_reduce_dnlc_percent = 3;

/*
 * The number of headers to evict in arc_evict_state_impl() before
 * dropping the sublist lock and evicting from another sublist. A lower
 * value means we're more likely to evict the "correct" header (i.e. the
 * oldest header in the arc state), but comes with higher overhead
 * (i.e. more invocations of arc_evict_state_impl()).
 */
int zfs_arc_evict_batch_limit = 10;

/*
 * The number of sublists used for each of the arc state lists. If this
 * is not set to a suitable value by the user, it will be configured to
 * the number of CPUs on the system in arc_init().
 */
int zfs_arc_num_sublists_per_state = 0;

/* number of seconds before growing cache again */
static int arc_grow_retry = 60;

/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
int zfs_arc_overflow_shift = 8;

/* shift of arc_c for calculating both min and max arc_p */
static int arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim) */
static int arc_shrink_shift = 7;

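/*
 * Worked example for the shift-style tunables above: a shift expresses a
 * fraction of arc_c as a power of two. With the default arc_shrink_shift
 * of 7, one shrink step reclaims arc_c >> 7, i.e. 1/128th (~0.8%) of the
 * target size; for a 16 GB target that is 128 MB. Likewise, assuming the
 * overflow limit is computed as arc_c >> zfs_arc_overflow_shift (per the
 * comment above), the default of 8 lets arc_size overshoot arc_c by
 * roughly 0.4% before arc_get_data_buf() must wait for eviction.
 */
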
/*
 * log2(fraction of ARC which must be free to allow growing).
 * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
 * when reading a new block into the ARC, we will evict an equal-sized block
 * from the ARC.
 *
 * This must be less than arc_shrink_shift, so that when we shrink the ARC,
 * we will still not allow it to grow.
 */
int arc_no_grow_shift = 5;


/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int arc_min_prefetch_lifespan;

/*
 * If this percent of memory is free, don't throttle.
 */
int arc_lotsfree_percent = 10;

static int arc_dead;
extern boolean_t zfs_prefetch_disable;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
uint64_t zfs_arc_meta_min = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
int zfs_disable_dup_eviction = 0;
uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
u_int zfs_arc_free_target = 0;

/* Absolute min for arc min / max is 16MB. */
static uint64_t arc_abs_min = 16 << 20;

static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS);

#if defined(__FreeBSD__) && defined(_KERNEL)
static void
arc_free_target_init(void *unused __unused)
{

	zfs_arc_free_target = vm_pageout_wakeup_thresh;
}
SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
    arc_free_target_init, NULL);

TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
SYSCTL_DECL(_vfs_zfs);
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 | CTLFLAG_RWTUN,
    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size");
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 | CTLFLAG_RWTUN,
    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
    &zfs_arc_average_blocksize, 0,
    "ARC average blocksize");
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
    &arc_shrink_shift, 0,
    "log2(fraction of arc to reclaim)");

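/*
 * Boot-time usage example (illustrative values, assuming a FreeBSD
 * system): the TUNABLE_* entries above are read from the loader
 * environment, so they can be seeded from loader.conf(5), e.g.
 *
 *	vfs.zfs.arc_meta_limit="1073741824"
 *
 * while CTLFLAG_RW entries such as vfs.zfs.arc_shrink_shift may also be
 * changed at runtime with sysctl(8).
 */
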
/*
 * We don't have a tunable for arc_free_target due to the dependency on
 * pagedaemon initialisation.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
    sysctl_vfs_zfs_arc_free_target, "IU",
    "Desired number of free pages below which ARC triggers reclaim");

static int
sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
{
	u_int val;
	int err;

	val = zfs_arc_free_target;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < minfree)
		return (EINVAL);
	if (val > vm_cnt.v_page_count)
		return (EINVAL);

	zfs_arc_free_target = val;

	return (0);
}

/*
 * Must be declared here, before the definition of the corresponding
 * kstat macro, as the macro's use of the same name would confuse the
 * compiler.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_vfs_zfs_arc_meta_limit, "QU",
    "ARC metadata limit");
#endif

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states. These are the
 * only buffers that can be evicted or deleted. Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA. These are buffers that hold dirty block copies
 * before they are written to stable storage. By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed. Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists. The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places. The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */

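/*
 * Illustrative sketch of the state machine above (an assumption, not
 * code from this excerpt; the helper name arc_change_state() is assumed
 * here): a repeat access to an MRU-resident buffer is what promotes it
 * to MFU, conceptually:
 *
 *	if (hdr->b_l1hdr.b_state == arc_mru)
 *		arc_change_state(arc_mfu, hdr, hash_lock);
 *
 * Hits on the ghost states do not restore any data; they instead
 * rebalance the MRU/MFU split of the cache target.
 */
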
typedef struct arc_state {
	/*
	 * list of evictable buffers
	 */
	multilist_t arcs_list[ARC_BUFC_NUMTYPES];
	/*
	 * total amount of evictable data in this state
	 */
	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];
	/*
	 * total amount of data in this state; this includes: evictable,
	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
	 */
	refcount_t arcs_size;
} arc_state_t;

/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;

typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_allocated;
	kstat_named_t arcstat_deleted;
	/*
	 * Number of buffers that could not be evicted because the hash lock
	 * was held by another thread. The lock may not necessarily be held
	 * by something using the same buffer, since hash locks are shared
	 * by multiple buffers.
	 */
	kstat_named_t arcstat_mutex_miss;
	/*
	 * Number of buffers skipped because they have I/O in progress, are
	 * indirect prefetch buffers that have not lived long enough, or are
	 * not from the spa we're trying to evict from.
	 */
	kstat_named_t arcstat_evict_skip;
	/*
	 * Number of times arc_evict_state() was unable to evict enough
	 * buffers to reach its target amount.
	 */
	kstat_named_t arcstat_evict_not_enough;
	kstat_named_t arcstat_evict_l2_cached;
	kstat_named_t arcstat_evict_l2_eligible;
	kstat_named_t arcstat_evict_l2_ineligible;
	kstat_named_t arcstat_evict_l2_skip;
	kstat_named_t arcstat_hash_elements;
	kstat_named_t arcstat_hash_elements_max;
	kstat_named_t arcstat_hash_collisions;
	kstat_named_t arcstat_hash_chains;
	kstat_named_t arcstat_hash_chain_max;
	kstat_named_t arcstat_p;
	kstat_named_t arcstat_c;
	kstat_named_t arcstat_c_min;
	kstat_named_t arcstat_c_max;
	kstat_named_t arcstat_size;
	/*
	 * Number of bytes consumed by internal ARC structures necessary
	 * for tracking purposes; these structures are not actually
	 * backed by ARC buffers. This includes arc_buf_hdr_t structures
	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
	 * caches), and arc_buf_t structures (allocated via arc_buf_t
	 * cache).
	 */
	kstat_named_t arcstat_hdr_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_DATA. This is generally consumed by buffers backing
	 * on disk user data (e.g. plain file contents).
	 */
	kstat_named_t arcstat_data_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_METADATA. This is generally consumed by buffers
	 * backing on disk data that is used for internal ZFS
	 * structures (e.g. ZAP, dnode, indirect blocks, etc).
	 */
	kstat_named_t arcstat_metadata_size;
	/*
	 * Number of bytes consumed by various buffers and structures
	 * not actually backed with ARC buffers. This includes bonus
	 * buffers (allocated directly via zio_buf_* functions),
	 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
	 * cache), and dnode_t structures (allocated via dnode_t cache).
	 */
	kstat_named_t arcstat_other_size;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_anon state. This includes *all* buffers in the arc_anon
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 */
	kstat_named_t arcstat_anon_size;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_DATA,
	 * residing in the arc_anon state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_anon_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
	 * residing in the arc_anon state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_anon_evictable_metadata;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_mru state. This includes *all* buffers in the arc_mru
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 */
	kstat_named_t arcstat_mru_size;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_DATA,
	 * residing in the arc_mru state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_mru_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
	 * residing in the arc_mru state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_mru_evictable_metadata;
	/*
	 * Total number of bytes that *would have been* consumed by ARC
	 * buffers in the arc_mru_ghost state. The key thing to note
	 * here is that this size doesn't actually indicate RAM
	 * consumption. The ghost lists only consist of headers and
	 * don't actually have ARC buffers linked off of these headers.
	 * Thus, *if* the headers had associated ARC buffers, these
	 * buffers *would have* consumed this number of bytes.
	 */
	kstat_named_t arcstat_mru_ghost_size;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
	 */
	kstat_named_t arcstat_mru_ghost_evictable_data;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
	 */
	kstat_named_t arcstat_mru_ghost_evictable_metadata;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_mfu state. This includes *all* buffers in the arc_mfu
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 */
	kstat_named_t arcstat_mfu_size;
	/*
	 * Number of bytes consumed by ARC buffers that are eligible for
	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
	 * state.
	 */
	kstat_named_t arcstat_mfu_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that are eligible for
	 * eviction, of type ARC_BUFC_METADATA, and reside in the
	 * arc_mfu state.
	 */
	kstat_named_t arcstat_mfu_evictable_metadata;
	/*
	 * Total number of bytes that *would have been* consumed by ARC
	 * buffers in the arc_mfu_ghost state. See the comment above
	 * arcstat_mru_ghost_size for more details.
	 */
	kstat_named_t arcstat_mfu_ghost_size;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
	 */
	kstat_named_t arcstat_mfu_ghost_evictable_data;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
	 */
	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
	kstat_named_t arcstat_l2_hits;
	kstat_named_t arcstat_l2_misses;
	kstat_named_t arcstat_l2_feeds;
	kstat_named_t arcstat_l2_rw_clash;
	kstat_named_t arcstat_l2_read_bytes;
	kstat_named_t arcstat_l2_write_bytes;
	kstat_named_t arcstat_l2_writes_sent;
	kstat_named_t arcstat_l2_writes_done;
	kstat_named_t arcstat_l2_writes_error;
	kstat_named_t arcstat_l2_writes_lock_retry;
	kstat_named_t arcstat_l2_evict_lock_retry;
	kstat_named_t arcstat_l2_evict_reading;
	kstat_named_t arcstat_l2_evict_l1cached;
	kstat_named_t arcstat_l2_free_on_write;
	kstat_named_t arcstat_l2_cdata_free_on_write;
	kstat_named_t arcstat_l2_abort_lowmem;
	kstat_named_t arcstat_l2_cksum_bad;
	kstat_named_t arcstat_l2_io_error;
	kstat_named_t arcstat_l2_size;
	kstat_named_t arcstat_l2_asize;
	kstat_named_t arcstat_l2_hdr_size;
	kstat_named_t arcstat_l2_compress_successes;
	kstat_named_t arcstat_l2_compress_zeros;
	kstat_named_t arcstat_l2_compress_failures;
	kstat_named_t arcstat_l2_padding_needed;
	kstat_named_t arcstat_l2_write_trylock_fail;
	kstat_named_t arcstat_l2_write_passed_headroom;
	kstat_named_t arcstat_l2_write_spa_mismatch;
	kstat_named_t arcstat_l2_write_in_l2;
	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
	kstat_named_t arcstat_l2_write_not_cacheable;
	kstat_named_t arcstat_l2_write_full;
	kstat_named_t arcstat_l2_write_buffer_iter;
	kstat_named_t arcstat_l2_write_pios;
	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
	kstat_named_t arcstat_l2_write_buffer_list_iter;
	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
	kstat_named_t arcstat_memory_throttle_count;
	kstat_named_t arcstat_duplicate_buffers;
	kstat_named_t arcstat_duplicate_buffers_size;
	kstat_named_t arcstat_duplicate_reads;
	kstat_named_t arcstat_meta_used;
	kstat_named_t arcstat_meta_limit;
	kstat_named_t arcstat_meta_max;
	kstat_named_t arcstat_meta_min;
	kstat_named_t arcstat_sync_wait_for_async;
	kstat_named_t arcstat_demand_hit_predictive_prefetch;
} arc_stats_t;

static arc_stats_t arc_stats = {
	{ "hits",			KSTAT_DATA_UINT64 },
	{ "misses",			KSTAT_DATA_UINT64 },
	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "mru_hits",			KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "mfu_hits",			KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "allocated",			KSTAT_DATA_UINT64 },
	{ "deleted",			KSTAT_DATA_UINT64 },
	{ "mutex_miss",			KSTAT_DATA_UINT64 },
	{ "evict_skip",			KSTAT_DATA_UINT64 },
	{ "evict_not_enough",		KSTAT_DATA_UINT64 },
	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
	{ "evict_l2_skip",		KSTAT_DATA_UINT64 },
	{ "hash_elements",		KSTAT_DATA_UINT64 },
	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
	{ "hash_collisions",		KSTAT_DATA_UINT64 },
	{ "hash_chains",		KSTAT_DATA_UINT64 },
	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
	{ "p",				KSTAT_DATA_UINT64 },
	{ "c",				KSTAT_DATA_UINT64 },
	{ "c_min",			KSTAT_DATA_UINT64 },
	{ "c_max",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 },
	{ "hdr_size",			KSTAT_DATA_UINT64 },
	{ "data_size",			KSTAT_DATA_UINT64 },
	{ "metadata_size",		KSTAT_DATA_UINT64 },
	{ "other_size",			KSTAT_DATA_UINT64 },
	{ "anon_size",			KSTAT_DATA_UINT64 },
	{ "anon_evictable_data",	KSTAT_DATA_UINT64 },
	{ "anon_evictable_metadata",	KSTAT_DATA_UINT64 },
	{ "mru_size",			KSTAT_DATA_UINT64 },
	{ "mru_evictable_data",		KSTAT_DATA_UINT64 },
	{ "mru_evictable_metadata",	KSTAT_DATA_UINT64 },
	{ "mru_ghost_size",		KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_data",	KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_size",			KSTAT_DATA_UINT64 },
	{ "mfu_evictable_data",		KSTAT_DATA_UINT64 },
	{ "mfu_evictable_metadata",	KSTAT_DATA_UINT64 },
	{ "mfu_ghost_size",		KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_data",	KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "l2_hits",			KSTAT_DATA_UINT64 },
	{ "l2_misses",			KSTAT_DATA_UINT64 },
	{ "l2_feeds",			KSTAT_DATA_UINT64 },
	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
	{ "l2_writes_lock_retry",	KSTAT_DATA_UINT64 },
	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
	{ "l2_cdata_free_on_write",	KSTAT_DATA_UINT64 },
	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
	{ "l2_io_error",		KSTAT_DATA_UINT64 },
	{ "l2_size",			KSTAT_DATA_UINT64 },
	{ "l2_asize",			KSTAT_DATA_UINT64 },
	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
	{ "l2_padding_needed",		KSTAT_DATA_UINT64 },
	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
	{ "l2_write_full",		KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
	{ "duplicate_reads",		KSTAT_DATA_UINT64 },
	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
	{ "arc_meta_min",		KSTAT_DATA_UINT64 },
	{ "sync_wait_for_async",	KSTAT_DATA_UINT64 },
	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
};

#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val))

#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

#define	ARCSTAT_MAX(stat, val) {					\
	uint64_t m;							\
	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
		continue;						\
}

#define	ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)

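/*
 * Note on ARCSTAT_MAX() above: it maintains a running maximum without a
 * lock. The atomic_cas_64() loop only retries when another thread
 * updates the stat between the read and the swap, and e.g.
 * ARCSTAT_MAX(arcstat_hash_chain_max, 7) leaves the stat untouched if
 * it already holds a value >= 7.
 */
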
/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		}							\
	} else {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		}							\
	}

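/*
 * Usage sketch for ARCSTAT_CONDSTAT() (shown here for illustration,
 * patterned on how the read path classifies accesses): split a hit by
 * demand vs. prefetch and by data vs. metadata, bumping exactly one of
 * the eight arcstat_*_hits counters declared above:
 *
 *	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch,
 *	    !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
 */
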
kstat_t			*arc_ksp;
static arc_state_t	*arc_anon;
static arc_state_t	*arc_mru;
static arc_state_t	*arc_mru_ghost;
static arc_state_t	*arc_mfu;
static arc_state_t	*arc_mfu_ghost;
static arc_state_t	*arc_l2c_only;

/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them. For these variables, we therefore define them to be in
 * terms of the statistic variable. This assures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
 */
#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
#define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
#define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
#define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */

#define	L2ARC_IS_VALID_COMPRESS(_c_) \
	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)

static int		arc_no_grow;	/* Don't try to grow cache size */
static uint64_t		arc_tempreserve;
static uint64_t		arc_loaned_bytes;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
	void			*acb_private;
	arc_done_func_t		*acb_done;
	arc_buf_t		*acb_buf;
	zio_t			*acb_zio_dummy;
	arc_callback_t		*acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

struct arc_write_callback {
	void			*awcb_private;
	arc_done_func_t		*awcb_ready;
	arc_done_func_t		*awcb_children_ready;
	arc_done_func_t		*awcb_physdone;
	arc_done_func_t		*awcb_done;
	arc_buf_t		*awcb_buf;
};

/*
 * ARC buffers are separated into multiple structs as a memory saving measure:
 *   - Common fields struct, always defined, and embedded within it:
 *       - L2-only fields, always allocated but undefined when not in L2ARC
 *       - L1-only fields, only allocated when in L1ARC
 *
 *   Buffer in L1                     Buffer only in L2
 *    +------------------------+      +------------------------+
 *    | arc_buf_hdr_t          |      | arc_buf_hdr_t          |
 *    |                        |      |                        |
 *    |                        |      |                        |
 *    |                        |      |                        |
 *    +------------------------+      +------------------------+
 *    | l2arc_buf_hdr_t        |      | l2arc_buf_hdr_t        |
 *    | (undefined if L1-only) |      |                        |
 *    +------------------------+      +------------------------+
 *    | l1arc_buf_hdr_t        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    +------------------------+
 *
 * Because it's possible for the L2ARC to become extremely large, we can wind
 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 * is minimized by only allocating the fields necessary for an L1-cached buffer
 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
 * words in pointers. arc_hdr_realloc() is used to switch a header between
 * these two allocation states.
 */
typedef struct l1arc_buf_hdr {
	kmutex_t		b_freeze_lock;
#ifdef ZFS_DEBUG
	/*
	 * used for debugging with kmem_flags - by allocating and freeing
	 * b_thawed when the buffer is thawed, we get a record of the stack
	 * trace that thawed it.
	 */
	void			*b_thawed;
#endif

	arc_buf_t		*b_buf;
	uint32_t		b_datacnt;
	/* for waiting on writes to complete */
	kcondvar_t		b_cv;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	multilist_node_t	b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;

	/* self protecting */
	refcount_t		b_refcnt;

	arc_callback_t		*b_acb;
	/* temporary buffer holder for in-flight compressed or padded data */
	void			*b_tmp_cdata;
} l1arc_buf_hdr_t;

typedef struct l2arc_dev l2arc_dev_t;

typedef struct l2arc_buf_hdr {
	/* protected by arc_buf_hdr mutex */
	l2arc_dev_t		*b_dev;		/* L2ARC device */
	uint64_t		b_daddr;	/* disk address, offset byte */
	/* real alloc'd buffer size depending on b_compress applied */
	int32_t			b_asize;
	uint8_t			b_compress;

	list_node_t		b_l2node;
} l2arc_buf_hdr_t;

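/*
 * Worked example of the savings described above: an L1-cached header is
 * allocated at HDR_FULL_SIZE (sizeof (arc_buf_hdr_t), defined below),
 * while an L2-only header stops after b_l2hdr and is allocated at
 * HDR_L2ONLY_SIZE (offsetof(arc_buf_hdr_t, b_l1hdr)). Demoting a header
 * to L2-only therefore sheds the entire l1arc_buf_hdr_t, and
 * arc_hdr_realloc() performs that switch between the two kmem caches.
 */
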
struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;
	/*
	 * Even though this checksum is only set/verified when a buffer is in
	 * the L1 cache, it needs to be in the set of common fields because it
	 * must be preserved from the time before a buffer is written out to
	 * L2ARC until after it is read back in.
	 */
	zio_cksum_t		*b_freeze_cksum;

	arc_buf_hdr_t		*b_hash_next;
	arc_flags_t		b_flags;

	/* immutable */
	int32_t			b_size;
	uint64_t		b_spa;

	/* L2ARC fields. Undefined when not in L2ARC. */
	l2arc_buf_hdr_t		b_l2hdr;
	/* L1ARC fields. Undefined when in l2arc_only state */
	l1arc_buf_hdr_t		b_l1hdr;
};

#if defined(__FreeBSD__) && defined(_KERNEL)
static int
sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = arc_meta_limit;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val <= 0 || val > arc_c_max)
		return (EINVAL);

	arc_meta_limit = val;
	return (0);
}

static int
sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = zfs_arc_max;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (zfs_arc_max == 0) {
		/* Loader tunable so blindly set */
		zfs_arc_max = val;
		return (0);
	}

	if (val < arc_abs_min || val > kmem_size())
		return (EINVAL);
	if (val < arc_c_min)
		return (EINVAL);
	if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit)
		return (EINVAL);

	arc_c_max = val;

	arc_c = arc_c_max;
	arc_p = (arc_c >> 1);

	if (zfs_arc_meta_limit == 0) {
		/* limit meta-data to 1/4 of the arc capacity */
		arc_meta_limit = arc_c_max / 4;
	}

	/* if kmem_flags are set, let's try to use less memory */
	if (kmem_debugging())
		arc_c = arc_c / 2;

	zfs_arc_max = arc_c;

	return (0);
}

static int
sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = zfs_arc_min;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (zfs_arc_min == 0) {
		/* Loader tunable so blindly set */
		zfs_arc_min = val;
		return (0);
	}

	if (val < arc_abs_min || val > arc_c_max)
		return (EINVAL);

	arc_c_min = val;

	if (zfs_arc_meta_min == 0)
		arc_meta_min = arc_c_min / 2;

	if (arc_c < arc_c_min)
		arc_c = arc_c_min;

	zfs_arc_min = arc_c_min;

	return (0);
}
#endif

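/*
 * Administrative usage example for the handlers above (illustrative
 * value): the maximum ARC size can be lowered at runtime, e.g.
 *
 *	# sysctl vfs.zfs.arc_max=8589934592
 *
 * which takes effect immediately, subject to the arc_abs_min, arc_c_min
 * and zfs_arc_meta_limit sanity checks in sysctl_vfs_zfs_arc_max().
 */
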
static arc_buf_t *arc_eviction_list;
static arc_buf_hdr_t arc_eviction_hdr;

#define	GHOST_STATE(state)	\
	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
	(state) == arc_l2c_only)

#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)

#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
#define	HDR_L2COMPRESS(hdr)	((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
#define	HDR_L2_READING(hdr)	\
	(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)

#define	HDR_ISTYPE_METADATA(hdr)	\
	((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
#define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))

#define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
#define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)

/*
 * Other sizes
 */

#define	HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
#define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))

/*
 * Hash table routines
 */

#define	HT_LOCK_PAD	CACHE_LINE_SIZE

struct ht_lock {
	kmutex_t	ht_lock;
#ifdef _KERNEL
	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

#define	BUF_LOCKS 256
typedef struct buf_hash_table {
	uint64_t ht_mask;
	arc_buf_hdr_t **ht_table;
	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;

#define	BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define	HDR_LOCK(hdr) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))

uint64_t zfs_crc64_table[256];

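/*
 * Note on the lock striping above: BUF_LOCKS must stay a power of two,
 * since BUF_HASH_LOCK_NTRY() selects a lock with "idx & (BUF_LOCKS - 1)".
 * All headers whose hash indices are congruent modulo 256 share one of
 * the 256 cache-line-aligned locks, which is also why arcstat_mutex_miss
 * can be bumped by contention from an unrelated buffer.
 */
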
/*
 * Level 2 ARC
 */

#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
#define	L2ARC_HEADROOM		2			/* num of writes */
/*
 * If we discover during ARC scan any buffers to be compressed, we boost
 * our headroom for the next scanning cycle by this percentage multiple.
 */
#define	L2ARC_HEADROOM_BOOST	200
#define	L2ARC_FEED_SECS		1		/* caching interval secs */
#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */

/*
 * Used to distinguish headers that are being processed by
 * l2arc_write_buffers(), but have yet to be assigned to an l2arc disk
 * address. This can happen when the header is added to the l2arc's list
 * of buffers to write in the first stage of l2arc_write_buffers(), but
 * has not yet been written out, which happens in the second stage of
 * l2arc_write_buffers().
 */
#define	L2ARC_ADDR_UNSET	((uint64_t)(-1))

#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)

/* L2ARC Performance Tunables */
uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */

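/*
 * Worked example of the defaults above: a feed cycle runs at most once
 * per l2arc_feed_secs (1 s) and may issue up to l2arc_write_max (8 MB)
 * of writes to a cache device, plus up to l2arc_write_boost extra bytes
 * while the ARC is still warming up. With l2arc_headroom of 2, a scan
 * covers twice the write size (16 MB) of the eviction lists, and per
 * the L2ARC_HEADROOM_BOOST comment above, finding compressed buffers
 * scales the next cycle's headroom to 200% of that.
 */
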
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
    &l2arc_write_max, 0, "max write size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
    &l2arc_write_boost, 0, "extra write during warmup");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
    &l2arc_headroom, 0, "number of dev writes");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
    &l2arc_feed_secs, 0, "interval seconds");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
    &l2arc_feed_min_ms, 0, "min interval milliseconds");

SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
    &l2arc_feed_again, 0, "turbo warmup");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
    &l2arc_norw, 0, "no reads during writes");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
    &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
    &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
    "size of metadata in mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
    "size of data in mru ghost state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
    &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
    &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
    "size of metadata in mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
    "size of data in mfu ghost state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
    &ARC_l2c_only.arcs_size.rc_count, 0, "size of l2c_only state");

/*
 * L2ARC Internals
 */
struct l2arc_dev {
	vdev_t			*l2ad_vdev;	/* vdev */
	spa_t			*l2ad_spa;	/* spa */
	uint64_t		l2ad_hand;	/* next write location */
	uint64_t		l2ad_start;	/* first addr on device */
	uint64_t		l2ad_end;	/* last addr on device */
	boolean_t		l2ad_first;	/* first sweep through */
	boolean_t		l2ad_writing;	/* currently writing */
	kmutex_t		l2ad_mtx;	/* lock for buffer list */
	list_t			l2ad_buflist;	/* buffer list */
	list_node_t		l2ad_node;	/* device list node */
	refcount_t		l2ad_alloc;	/* allocated bytes */
};

static list_t L2ARC_dev_list;			/* device list */
static list_t *l2arc_dev_list;			/* device list pointer */
static kmutex_t l2arc_dev_mtx;			/* device list mutex */
static l2arc_dev_t *l2arc_dev_last;		/* last device used */
static list_t L2ARC_free_on_write;		/* free after write buf list */
static list_t *l2arc_free_on_write;		/* free after write list ptr */
static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
static uint64_t l2arc_ndev;			/* number of devices */
*/ 1180185029Spjd 1181185029Spjdtypedef struct l2arc_read_callback { 1182251478Sdelphij arc_buf_t *l2rcb_buf; /* read buffer */ 1183251478Sdelphij spa_t *l2rcb_spa; /* spa */ 1184251478Sdelphij blkptr_t l2rcb_bp; /* original blkptr */ 1185268123Sdelphij zbookmark_phys_t l2rcb_zb; /* original bookmark */ 1186251478Sdelphij int l2rcb_flags; /* original flags */ 1187251478Sdelphij enum zio_compress l2rcb_compress; /* applied compress */ 1188297848Savg void *l2rcb_data; /* temporary buffer */ 1189185029Spjd} l2arc_read_callback_t; 1190185029Spjd 1191185029Spjdtypedef struct l2arc_write_callback { 1192185029Spjd l2arc_dev_t *l2wcb_dev; /* device info */ 1193185029Spjd arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 1194185029Spjd} l2arc_write_callback_t; 1195185029Spjd 1196185029Spjdtypedef struct l2arc_data_free { 1197185029Spjd /* protected by l2arc_free_on_write_mtx */ 1198185029Spjd void *l2df_data; 1199185029Spjd size_t l2df_size; 1200185029Spjd void (*l2df_func)(void *, size_t); 1201185029Spjd list_node_t l2df_list_node; 1202185029Spjd} l2arc_data_free_t; 1203185029Spjd 1204185029Spjdstatic kmutex_t l2arc_feed_thr_lock; 1205185029Spjdstatic kcondvar_t l2arc_feed_thr_cv; 1206185029Spjdstatic uint8_t l2arc_thread_exit; 1207185029Spjd 1208275811Sdelphijstatic void arc_get_data_buf(arc_buf_t *); 1209275811Sdelphijstatic void arc_access(arc_buf_hdr_t *, kmutex_t *); 1210286763Smavstatic boolean_t arc_is_overflowing(); 1211275811Sdelphijstatic void arc_buf_watch(arc_buf_t *); 1212275811Sdelphij 1213286570Smavstatic arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); 1214286570Smavstatic uint32_t arc_bufc_to_flags(arc_buf_contents_t); 1215286570Smav 1216275811Sdelphijstatic boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); 1217275811Sdelphijstatic void l2arc_read_done(zio_t *); 1218185029Spjd 1219297848Savgstatic boolean_t l2arc_transform_buf(arc_buf_hdr_t *, boolean_t); 1220275811Sdelphijstatic void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); 1221275811Sdelphijstatic void l2arc_release_cdata_buf(arc_buf_hdr_t *); 1222251478Sdelphij 1223290191Savgstatic void 1224290191Savgl2arc_trim(const arc_buf_hdr_t *hdr) 1225290191Savg{ 1226290191Savg l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 1227290191Savg 1228290191Savg ASSERT(HDR_HAS_L2HDR(hdr)); 1229290191Savg ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); 1230290191Savg 1231290191Savg if (hdr->b_l2hdr.b_daddr == L2ARC_ADDR_UNSET) 1232290191Savg return; 1233290191Savg if (hdr->b_l2hdr.b_asize != 0) { 1234290191Savg trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, 1235290191Savg hdr->b_l2hdr.b_asize, 0); 1236290191Savg } else { 1237290191Savg ASSERT3U(hdr->b_l2hdr.b_compress, ==, ZIO_COMPRESS_EMPTY); 1238290191Savg } 1239290191Savg} 1240290191Savg 1241168404Spjdstatic uint64_t 1242209962Smmbuf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 1243168404Spjd{ 1244168404Spjd uint8_t *vdva = (uint8_t *)dva; 1245168404Spjd uint64_t crc = -1ULL; 1246168404Spjd int i; 1247168404Spjd 1248168404Spjd ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 1249168404Spjd 1250168404Spjd for (i = 0; i < sizeof (dva_t); i++) 1251168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 1252168404Spjd 1253209962Smm crc ^= (spa>>8) ^ birth; 1254168404Spjd 1255168404Spjd return (crc); 1256168404Spjd} 1257168404Spjd 1258168404Spjd#define BUF_EMPTY(buf) \ 1259168404Spjd ((buf)->b_dva.dva_word[0] == 0 && \ 1260286570Smav (buf)->b_dva.dva_word[1] == 0) 1261168404Spjd 1262168404Spjd#define BUF_EQUAL(spa, dva, birth, buf) \ 1263168404Spjd 
((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 1264168404Spjd ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 1265168404Spjd ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 1266168404Spjd 1267219089Spjdstatic void 1268219089Spjdbuf_discard_identity(arc_buf_hdr_t *hdr) 1269219089Spjd{ 1270219089Spjd hdr->b_dva.dva_word[0] = 0; 1271219089Spjd hdr->b_dva.dva_word[1] = 0; 1272219089Spjd hdr->b_birth = 0; 1273219089Spjd} 1274219089Spjd 1275168404Spjdstatic arc_buf_hdr_t * 1276268075Sdelphijbuf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) 1277168404Spjd{ 1278268075Sdelphij const dva_t *dva = BP_IDENTITY(bp); 1279268075Sdelphij uint64_t birth = BP_PHYSICAL_BIRTH(bp); 1280168404Spjd uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 1281168404Spjd kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1282275811Sdelphij arc_buf_hdr_t *hdr; 1283168404Spjd 1284168404Spjd mutex_enter(hash_lock); 1285275811Sdelphij for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; 1286275811Sdelphij hdr = hdr->b_hash_next) { 1287275811Sdelphij if (BUF_EQUAL(spa, dva, birth, hdr)) { 1288168404Spjd *lockp = hash_lock; 1289275811Sdelphij return (hdr); 1290168404Spjd } 1291168404Spjd } 1292168404Spjd mutex_exit(hash_lock); 1293168404Spjd *lockp = NULL; 1294168404Spjd return (NULL); 1295168404Spjd} 1296168404Spjd 1297168404Spjd/* 1298168404Spjd * Insert an entry into the hash table. If there is already an element 1299168404Spjd * equal to elem in the hash table, then the already existing element 1300168404Spjd * will be returned and the new element will not be inserted. 1301168404Spjd * Otherwise returns NULL. 1302286570Smav * If lockp == NULL, the caller is assumed to already hold the hash lock. 1303168404Spjd */ 1304168404Spjdstatic arc_buf_hdr_t * 1305275811Sdelphijbuf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) 1306168404Spjd{ 1307275811Sdelphij uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1308168404Spjd kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1309275811Sdelphij arc_buf_hdr_t *fhdr; 1310168404Spjd uint32_t i; 1311168404Spjd 1312275811Sdelphij ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); 1313275811Sdelphij ASSERT(hdr->b_birth != 0); 1314275811Sdelphij ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1315286570Smav 1316286570Smav if (lockp != NULL) { 1317286570Smav *lockp = hash_lock; 1318286570Smav mutex_enter(hash_lock); 1319286570Smav } else { 1320286570Smav ASSERT(MUTEX_HELD(hash_lock)); 1321286570Smav } 1322286570Smav 1323275811Sdelphij for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; 1324275811Sdelphij fhdr = fhdr->b_hash_next, i++) { 1325275811Sdelphij if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) 1326275811Sdelphij return (fhdr); 1327168404Spjd } 1328168404Spjd 1329275811Sdelphij hdr->b_hash_next = buf_hash_table.ht_table[idx]; 1330275811Sdelphij buf_hash_table.ht_table[idx] = hdr; 1331275811Sdelphij hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 1332168404Spjd 1333168404Spjd /* collect some hash table performance data */ 1334168404Spjd if (i > 0) { 1335168404Spjd ARCSTAT_BUMP(arcstat_hash_collisions); 1336168404Spjd if (i == 1) 1337168404Spjd ARCSTAT_BUMP(arcstat_hash_chains); 1338168404Spjd 1339168404Spjd ARCSTAT_MAX(arcstat_hash_chain_max, i); 1340168404Spjd } 1341168404Spjd 1342168404Spjd ARCSTAT_BUMP(arcstat_hash_elements); 1343168404Spjd ARCSTAT_MAXSTAT(arcstat_hash_elements); 1344168404Spjd 1345168404Spjd return (NULL); 1346168404Spjd} 1347168404Spjd 1348168404Spjdstatic void 1349275811Sdelphijbuf_hash_remove(arc_buf_hdr_t *hdr) 1350168404Spjd{ 
1351275811Sdelphij arc_buf_hdr_t *fhdr, **hdrp; 1352275811Sdelphij uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1353168404Spjd 1354168404Spjd ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 1355275811Sdelphij ASSERT(HDR_IN_HASH_TABLE(hdr)); 1356168404Spjd 1357275811Sdelphij hdrp = &buf_hash_table.ht_table[idx]; 1358275811Sdelphij while ((fhdr = *hdrp) != hdr) { 1359275811Sdelphij ASSERT(fhdr != NULL); 1360275811Sdelphij hdrp = &fhdr->b_hash_next; 1361168404Spjd } 1362275811Sdelphij *hdrp = hdr->b_hash_next; 1363275811Sdelphij hdr->b_hash_next = NULL; 1364275811Sdelphij hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; 1365168404Spjd 1366168404Spjd /* collect some hash table performance data */ 1367168404Spjd ARCSTAT_BUMPDOWN(arcstat_hash_elements); 1368168404Spjd 1369168404Spjd if (buf_hash_table.ht_table[idx] && 1370168404Spjd buf_hash_table.ht_table[idx]->b_hash_next == NULL) 1371168404Spjd ARCSTAT_BUMPDOWN(arcstat_hash_chains); 1372168404Spjd} 1373168404Spjd 1374168404Spjd/* 1375168404Spjd * Global data structures and functions for the buf kmem cache. 1376168404Spjd */ 1377286570Smavstatic kmem_cache_t *hdr_full_cache; 1378286570Smavstatic kmem_cache_t *hdr_l2only_cache; 1379168404Spjdstatic kmem_cache_t *buf_cache; 1380168404Spjd 1381168404Spjdstatic void 1382168404Spjdbuf_fini(void) 1383168404Spjd{ 1384168404Spjd int i; 1385168404Spjd 1386168404Spjd kmem_free(buf_hash_table.ht_table, 1387168404Spjd (buf_hash_table.ht_mask + 1) * sizeof (void *)); 1388168404Spjd for (i = 0; i < BUF_LOCKS; i++) 1389168404Spjd mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 1390286570Smav kmem_cache_destroy(hdr_full_cache); 1391286570Smav kmem_cache_destroy(hdr_l2only_cache); 1392168404Spjd kmem_cache_destroy(buf_cache); 1393168404Spjd} 1394168404Spjd 1395168404Spjd/* 1396168404Spjd * Constructor callback - called when the cache is empty 1397168404Spjd * and a new buf is requested. 1398168404Spjd */ 1399168404Spjd/* ARGSUSED */ 1400168404Spjdstatic int 1401286570Smavhdr_full_cons(void *vbuf, void *unused, int kmflag) 1402168404Spjd{ 1403275811Sdelphij arc_buf_hdr_t *hdr = vbuf; 1404168404Spjd 1405286570Smav bzero(hdr, HDR_FULL_SIZE); 1406286570Smav cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); 1407286570Smav refcount_create(&hdr->b_l1hdr.b_refcnt); 1408286570Smav mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 1409286763Smav multilist_link_init(&hdr->b_l1hdr.b_arc_node); 1410286570Smav arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1411185029Spjd 1412168404Spjd return (0); 1413168404Spjd} 1414168404Spjd 1415185029Spjd/* ARGSUSED */ 1416185029Spjdstatic int 1417286570Smavhdr_l2only_cons(void *vbuf, void *unused, int kmflag) 1418286570Smav{ 1419286570Smav arc_buf_hdr_t *hdr = vbuf; 1420286570Smav 1421286570Smav bzero(hdr, HDR_L2ONLY_SIZE); 1422286570Smav arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1423286570Smav 1424286570Smav return (0); 1425286570Smav} 1426286570Smav 1427286570Smav/* ARGSUSED */ 1428286570Smavstatic int 1429185029Spjdbuf_cons(void *vbuf, void *unused, int kmflag) 1430185029Spjd{ 1431185029Spjd arc_buf_t *buf = vbuf; 1432185029Spjd 1433185029Spjd bzero(buf, sizeof (arc_buf_t)); 1434219089Spjd mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 1435208373Smm arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1436208373Smm 1437185029Spjd return (0); 1438185029Spjd} 1439185029Spjd 1440168404Spjd/* 1441168404Spjd * Destructor callback - called when a cached buf is 1442168404Spjd * no longer required. 
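 *
 * Each destructor must undo exactly what its constructor set up:
 * hdr_full_dest() tears down the cv, refcount, and mutex created in
 * hdr_full_cons(), and buf_dest() mirrors buf_cons(); the paired
 * arc_space_return() calls balance the arc_space_consume() ones.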
1443168404Spjd */ 1444168404Spjd/* ARGSUSED */ 1445168404Spjdstatic void 1446286570Smavhdr_full_dest(void *vbuf, void *unused) 1447168404Spjd{ 1448275811Sdelphij arc_buf_hdr_t *hdr = vbuf; 1449168404Spjd 1450275811Sdelphij ASSERT(BUF_EMPTY(hdr)); 1451286570Smav cv_destroy(&hdr->b_l1hdr.b_cv); 1452286570Smav refcount_destroy(&hdr->b_l1hdr.b_refcnt); 1453286570Smav mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); 1454286763Smav ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 1455286570Smav arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1456168404Spjd} 1457168404Spjd 1458185029Spjd/* ARGSUSED */ 1459185029Spjdstatic void 1460286570Smavhdr_l2only_dest(void *vbuf, void *unused) 1461286570Smav{ 1462286570Smav arc_buf_hdr_t *hdr = vbuf; 1463286570Smav 1464286570Smav ASSERT(BUF_EMPTY(hdr)); 1465286570Smav arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1466286570Smav} 1467286570Smav 1468286570Smav/* ARGSUSED */ 1469286570Smavstatic void 1470185029Spjdbuf_dest(void *vbuf, void *unused) 1471185029Spjd{ 1472185029Spjd arc_buf_t *buf = vbuf; 1473185029Spjd 1474219089Spjd mutex_destroy(&buf->b_evict_lock); 1475208373Smm arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1476185029Spjd} 1477185029Spjd 1478168404Spjd/* 1479168404Spjd * Reclaim callback -- invoked when memory is low. 1480168404Spjd */ 1481168404Spjd/* ARGSUSED */ 1482168404Spjdstatic void 1483168404Spjdhdr_recl(void *unused) 1484168404Spjd{ 1485168404Spjd dprintf("hdr_recl called\n"); 1486168404Spjd /* 1487168404Spjd * umem calls the reclaim func when we destroy the buf cache, 1488168404Spjd * which is after we do arc_fini(). 1489168404Spjd */ 1490168404Spjd if (!arc_dead) 1491286763Smav cv_signal(&arc_reclaim_thread_cv); 1492168404Spjd} 1493168404Spjd 1494168404Spjdstatic void 1495168404Spjdbuf_init(void) 1496168404Spjd{ 1497168404Spjd uint64_t *ct; 1498168404Spjd uint64_t hsize = 1ULL << 12; 1499168404Spjd int i, j; 1500168404Spjd 1501168404Spjd /* 1502168404Spjd * The hash table is big enough to fill all of physical memory 1503269230Sdelphij * with an average block size of zfs_arc_average_blocksize (default 8K). 1504269230Sdelphij * By default, the table will take up 1505269230Sdelphij * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 
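 *
 * As a worked example (assuming the default 8K average block size and
 * 8-byte pointers): on a 16GB machine, hsize doubles from 2^12 until
 * hsize * 8K reaches 16GB, giving hsize = 2^21 entries; the table then
 * occupies 2^21 * 8 bytes = 16MB, matching the 1MB-per-GB estimate.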
1506168404Spjd */ 1507269230Sdelphij while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) 1508168404Spjd hsize <<= 1; 1509168404Spjdretry: 1510168404Spjd buf_hash_table.ht_mask = hsize - 1; 1511168404Spjd buf_hash_table.ht_table = 1512168404Spjd kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 1513168404Spjd if (buf_hash_table.ht_table == NULL) { 1514168404Spjd ASSERT(hsize > (1ULL << 8)); 1515168404Spjd hsize >>= 1; 1516168404Spjd goto retry; 1517168404Spjd } 1518168404Spjd 1519286570Smav hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 1520286570Smav 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); 1521286570Smav hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", 1522286570Smav HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, 1523286570Smav NULL, NULL, 0); 1524168404Spjd buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 1525185029Spjd 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1526168404Spjd 1527168404Spjd for (i = 0; i < 256; i++) 1528168404Spjd for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 1529168404Spjd *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 1530168404Spjd 1531168404Spjd for (i = 0; i < BUF_LOCKS; i++) { 1532168404Spjd mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1533168404Spjd NULL, MUTEX_DEFAULT, NULL); 1534168404Spjd } 1535168404Spjd} 1536168404Spjd 1537286570Smav/* 1538286570Smav * Transition between the two allocation states for the arc_buf_hdr struct. 1539286570Smav * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without 1540286570Smav * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller 1541286570Smav * version is used when a cache buffer is only in the L2ARC in order to reduce 1542286570Smav * memory usage. 1543286570Smav */ 1544286570Smavstatic arc_buf_hdr_t * 1545286570Smavarc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) 1546286570Smav{ 1547286570Smav ASSERT(HDR_HAS_L2HDR(hdr)); 1548286570Smav 1549286570Smav arc_buf_hdr_t *nhdr; 1550286570Smav l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 1551286570Smav 1552286570Smav ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || 1553286570Smav (old == hdr_l2only_cache && new == hdr_full_cache)); 1554286570Smav 1555286570Smav nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); 1556286570Smav 1557286570Smav ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); 1558286570Smav buf_hash_remove(hdr); 1559286570Smav 1560286570Smav bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); 1561286598Smav 1562286570Smav if (new == hdr_full_cache) { 1563286570Smav nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 1564286570Smav /* 1565286570Smav * arc_access and arc_change_state need to be aware that a 1566286570Smav * header has just come out of L2ARC, so we set its state to 1567286570Smav * l2c_only even though it's about to change. 
1568286570Smav */
1569286570Smav nhdr->b_l1hdr.b_state = arc_l2c_only;
1570286763Smav
1571286763Smav /* Verify previous threads set to NULL before freeing */
1572286763Smav ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1573286570Smav } else {
1574286570Smav ASSERT(hdr->b_l1hdr.b_buf == NULL);
1575286570Smav ASSERT0(hdr->b_l1hdr.b_datacnt);
1576286763Smav
1577286570Smav /*
1578286763Smav * If we've reached here, we must have been called from
1579286763Smav * arc_evict_hdr(); as such, we should have already been
1580286763Smav * removed from any ghost list we were previously on
1581286763Smav * (which protects us from racing with arc_evict_state),
1582286763Smav * thus no locking is needed during this check.
1583286570Smav */
1584286763Smav ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1585286763Smav
1586286763Smav /*
1587286763Smav * A buffer must not be moved into the arc_l2c_only
1588286763Smav * state if it's not finished being written out to the
1589286763Smav * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
1590286763Smav * might still be accessed, even though it has been removed.
1591286763Smav */
1592286763Smav VERIFY(!HDR_L2_WRITING(hdr));
1593286763Smav VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1594286763Smav
1595288064Savg#ifdef ZFS_DEBUG
1596288064Savg if (hdr->b_l1hdr.b_thawed != NULL) {
1597288064Savg kmem_free(hdr->b_l1hdr.b_thawed, 1);
1598288064Savg hdr->b_l1hdr.b_thawed = NULL;
1599288064Savg }
1600288064Savg#endif
1601288064Savg
1602286570Smav nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
1603286570Smav }
1604286570Smav /*
1605286570Smav * The header has been reallocated, so we need to re-insert it into any
1606286570Smav * lists it was on.
1607286570Smav */
1608286570Smav (void) buf_hash_insert(nhdr, NULL);
1609286570Smav
1610286570Smav ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
1611286570Smav
1612286570Smav mutex_enter(&dev->l2ad_mtx);
1613286570Smav
1614286570Smav /*
1615286570Smav * We must place the realloc'ed header back into the list at
1616286570Smav * the same spot. Otherwise, if it's placed earlier in the list,
1617286570Smav * l2arc_write_buffers() could find it during the function's
1618286570Smav * write phase, and try to write it out to the l2arc.
1619286570Smav */
1620286570Smav list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
1621286570Smav list_remove(&dev->l2ad_buflist, hdr);
1622286570Smav
1623286570Smav mutex_exit(&dev->l2ad_mtx);
1624286570Smav
1625286598Smav /*
1626286598Smav * Since we're using the pointer address as the tag when
1627286598Smav * incrementing and decrementing the l2ad_alloc refcount, we
1628286598Smav * must remove the old pointer (that we're about to destroy) and
1629286598Smav * add the new pointer to the refcount. Otherwise we'd remove
1630286598Smav * the wrong pointer address when calling arc_hdr_destroy() later.
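 *
 * Schematically, the tag swap below is just:
 *
 *     refcount_remove_many(&dev->l2ad_alloc, asize, hdr);
 *     refcount_add_many(&dev->l2ad_alloc, asize, nhdr);
 *
 * leaving the byte total unchanged while the tag follows the
 * reallocated header.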
1631286598Smav */ 1632286598Smav 1633286598Smav (void) refcount_remove_many(&dev->l2ad_alloc, 1634286598Smav hdr->b_l2hdr.b_asize, hdr); 1635286598Smav 1636286598Smav (void) refcount_add_many(&dev->l2ad_alloc, 1637286598Smav nhdr->b_l2hdr.b_asize, nhdr); 1638286598Smav 1639286570Smav buf_discard_identity(hdr); 1640286570Smav hdr->b_freeze_cksum = NULL; 1641286570Smav kmem_cache_free(old, hdr); 1642286570Smav 1643286570Smav return (nhdr); 1644286570Smav} 1645286570Smav 1646286570Smav 1647168404Spjd#define ARC_MINTIME (hz>>4) /* 62 ms */ 1648168404Spjd 1649168404Spjdstatic void 1650168404Spjdarc_cksum_verify(arc_buf_t *buf) 1651168404Spjd{ 1652168404Spjd zio_cksum_t zc; 1653168404Spjd 1654168404Spjd if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1655168404Spjd return; 1656168404Spjd 1657286570Smav mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1658286570Smav if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { 1659286570Smav mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1660168404Spjd return; 1661168404Spjd } 1662289422Smav fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc); 1663168404Spjd if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 1664168404Spjd panic("buffer modified while frozen!"); 1665286570Smav mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1666168404Spjd} 1667168404Spjd 1668185029Spjdstatic int 1669185029Spjdarc_cksum_equal(arc_buf_t *buf) 1670185029Spjd{ 1671185029Spjd zio_cksum_t zc; 1672185029Spjd int equal; 1673185029Spjd 1674286570Smav mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1675289422Smav fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc); 1676185029Spjd equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 1677286570Smav mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1678185029Spjd 1679185029Spjd return (equal); 1680185029Spjd} 1681185029Spjd 1682168404Spjdstatic void 1683185029Spjdarc_cksum_compute(arc_buf_t *buf, boolean_t force) 1684168404Spjd{ 1685185029Spjd if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 1686168404Spjd return; 1687168404Spjd 1688286570Smav mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1689168404Spjd if (buf->b_hdr->b_freeze_cksum != NULL) { 1690286570Smav mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1691168404Spjd return; 1692168404Spjd } 1693168404Spjd buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 1694168404Spjd fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 1695289422Smav NULL, buf->b_hdr->b_freeze_cksum); 1696286570Smav mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1697240133Smm#ifdef illumos 1698240133Smm arc_buf_watch(buf); 1699277300Ssmh#endif 1700168404Spjd} 1701168404Spjd 1702240133Smm#ifdef illumos 1703240133Smm#ifndef _KERNEL 1704240133Smmtypedef struct procctl { 1705240133Smm long cmd; 1706240133Smm prwatch_t prwatch; 1707240133Smm} procctl_t; 1708240133Smm#endif 1709240133Smm 1710240133Smm/* ARGSUSED */ 1711240133Smmstatic void 1712240133Smmarc_buf_unwatch(arc_buf_t *buf) 1713240133Smm{ 1714240133Smm#ifndef _KERNEL 1715240133Smm if (arc_watch) { 1716240133Smm int result; 1717240133Smm procctl_t ctl; 1718240133Smm ctl.cmd = PCWATCH; 1719240133Smm ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1720240133Smm ctl.prwatch.pr_size = 0; 1721240133Smm ctl.prwatch.pr_wflags = 0; 1722240133Smm result = write(arc_procfd, &ctl, sizeof (ctl)); 1723240133Smm ASSERT3U(result, ==, sizeof (ctl)); 1724240133Smm } 1725240133Smm#endif 1726240133Smm} 1727240133Smm 1728240133Smm/* ARGSUSED */ 1729240133Smmstatic void 1730240133Smmarc_buf_watch(arc_buf_t *buf) 
1731240133Smm{ 1732240133Smm#ifndef _KERNEL 1733240133Smm if (arc_watch) { 1734240133Smm int result; 1735240133Smm procctl_t ctl; 1736240133Smm ctl.cmd = PCWATCH; 1737240133Smm ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1738240133Smm ctl.prwatch.pr_size = buf->b_hdr->b_size; 1739240133Smm ctl.prwatch.pr_wflags = WA_WRITE; 1740240133Smm result = write(arc_procfd, &ctl, sizeof (ctl)); 1741240133Smm ASSERT3U(result, ==, sizeof (ctl)); 1742240133Smm } 1743240133Smm#endif 1744240133Smm} 1745240133Smm#endif /* illumos */ 1746240133Smm 1747286570Smavstatic arc_buf_contents_t 1748286570Smavarc_buf_type(arc_buf_hdr_t *hdr) 1749286570Smav{ 1750286570Smav if (HDR_ISTYPE_METADATA(hdr)) { 1751286570Smav return (ARC_BUFC_METADATA); 1752286570Smav } else { 1753286570Smav return (ARC_BUFC_DATA); 1754286570Smav } 1755286570Smav} 1756286570Smav 1757286570Smavstatic uint32_t 1758286570Smavarc_bufc_to_flags(arc_buf_contents_t type) 1759286570Smav{ 1760286570Smav switch (type) { 1761286570Smav case ARC_BUFC_DATA: 1762286570Smav /* metadata field is 0 if buffer contains normal data */ 1763286570Smav return (0); 1764286570Smav case ARC_BUFC_METADATA: 1765286570Smav return (ARC_FLAG_BUFC_METADATA); 1766286570Smav default: 1767286570Smav break; 1768286570Smav } 1769286570Smav panic("undefined ARC buffer type!"); 1770286570Smav return ((uint32_t)-1); 1771286570Smav} 1772286570Smav 1773168404Spjdvoid 1774168404Spjdarc_buf_thaw(arc_buf_t *buf) 1775168404Spjd{ 1776185029Spjd if (zfs_flags & ZFS_DEBUG_MODIFY) { 1777286570Smav if (buf->b_hdr->b_l1hdr.b_state != arc_anon) 1778185029Spjd panic("modifying non-anon buffer!"); 1779286570Smav if (HDR_IO_IN_PROGRESS(buf->b_hdr)) 1780185029Spjd panic("modifying buffer while i/o in progress!"); 1781185029Spjd arc_cksum_verify(buf); 1782185029Spjd } 1783168404Spjd 1784286570Smav mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1785168404Spjd if (buf->b_hdr->b_freeze_cksum != NULL) { 1786168404Spjd kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1787168404Spjd buf->b_hdr->b_freeze_cksum = NULL; 1788168404Spjd } 1789219089Spjd 1790286570Smav#ifdef ZFS_DEBUG 1791219089Spjd if (zfs_flags & ZFS_DEBUG_MODIFY) { 1792286570Smav if (buf->b_hdr->b_l1hdr.b_thawed != NULL) 1793286570Smav kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1); 1794286570Smav buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); 1795219089Spjd } 1796286570Smav#endif 1797219089Spjd 1798286570Smav mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1799240133Smm 1800240133Smm#ifdef illumos 1801240133Smm arc_buf_unwatch(buf); 1802277300Ssmh#endif 1803168404Spjd} 1804168404Spjd 1805168404Spjdvoid 1806168404Spjdarc_buf_freeze(arc_buf_t *buf) 1807168404Spjd{ 1808219089Spjd kmutex_t *hash_lock; 1809219089Spjd 1810168404Spjd if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1811168404Spjd return; 1812168404Spjd 1813219089Spjd hash_lock = HDR_LOCK(buf->b_hdr); 1814219089Spjd mutex_enter(hash_lock); 1815219089Spjd 1816168404Spjd ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 1817286570Smav buf->b_hdr->b_l1hdr.b_state == arc_anon); 1818185029Spjd arc_cksum_compute(buf, B_FALSE); 1819219089Spjd mutex_exit(hash_lock); 1820240133Smm 1821168404Spjd} 1822168404Spjd 1823168404Spjdstatic void 1824275811Sdelphijadd_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1825168404Spjd{ 1826286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1827168404Spjd ASSERT(MUTEX_HELD(hash_lock)); 1828286570Smav arc_state_t *state = hdr->b_l1hdr.b_state; 1829168404Spjd 1830286570Smav if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && 1831286570Smav 
(state != arc_anon)) { 1832286570Smav /* We don't use the L2-only state list. */ 1833286570Smav if (state != arc_l2c_only) { 1834286763Smav arc_buf_contents_t type = arc_buf_type(hdr); 1835286570Smav uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; 1836286763Smav multilist_t *list = &state->arcs_list[type]; 1837286763Smav uint64_t *size = &state->arcs_lsize[type]; 1838168404Spjd 1839286763Smav multilist_remove(list, hdr); 1840286763Smav 1841286570Smav if (GHOST_STATE(state)) { 1842286570Smav ASSERT0(hdr->b_l1hdr.b_datacnt); 1843286570Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 1844286570Smav delta = hdr->b_size; 1845286570Smav } 1846286570Smav ASSERT(delta > 0); 1847286570Smav ASSERT3U(*size, >=, delta); 1848286570Smav atomic_add_64(size, -delta); 1849168404Spjd } 1850185029Spjd /* remove the prefetch flag if we get a reference */ 1851286570Smav hdr->b_flags &= ~ARC_FLAG_PREFETCH; 1852168404Spjd } 1853168404Spjd} 1854168404Spjd 1855168404Spjdstatic int 1856275811Sdelphijremove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1857168404Spjd{ 1858168404Spjd int cnt; 1859286570Smav arc_state_t *state = hdr->b_l1hdr.b_state; 1860168404Spjd 1861286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1862168404Spjd ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1863168404Spjd ASSERT(!GHOST_STATE(state)); 1864168404Spjd 1865286570Smav /* 1866286570Smav * arc_l2c_only counts as a ghost state so we don't need to explicitly 1867286570Smav * check to prevent usage of the arc_l2c_only list. 1868286570Smav */ 1869286570Smav if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && 1870168404Spjd (state != arc_anon)) { 1871286763Smav arc_buf_contents_t type = arc_buf_type(hdr); 1872286763Smav multilist_t *list = &state->arcs_list[type]; 1873286763Smav uint64_t *size = &state->arcs_lsize[type]; 1874185029Spjd 1875286763Smav multilist_insert(list, hdr); 1876286763Smav 1877286570Smav ASSERT(hdr->b_l1hdr.b_datacnt > 0); 1878286570Smav atomic_add_64(size, hdr->b_size * 1879286570Smav hdr->b_l1hdr.b_datacnt); 1880168404Spjd } 1881168404Spjd return (cnt); 1882168404Spjd} 1883168404Spjd 1884168404Spjd/* 1885286763Smav * Move the supplied buffer to the indicated state. The hash lock 1886168404Spjd * for the buffer must be held by the caller. 1887168404Spjd */ 1888168404Spjdstatic void 1889275811Sdelphijarc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, 1890275811Sdelphij kmutex_t *hash_lock) 1891168404Spjd{ 1892286570Smav arc_state_t *old_state; 1893286570Smav int64_t refcnt; 1894286570Smav uint32_t datacnt; 1895168404Spjd uint64_t from_delta, to_delta; 1896286570Smav arc_buf_contents_t buftype = arc_buf_type(hdr); 1897168404Spjd 1898286570Smav /* 1899286570Smav * We almost always have an L1 hdr here, since we call arc_hdr_realloc() 1900286570Smav * in arc_read() when bringing a buffer out of the L2ARC. However, the 1901286570Smav * L1 hdr doesn't always exist when we change state to arc_anon before 1902286570Smav * destroying a header, in which case reallocating to add the L1 hdr is 1903286570Smav * pointless. 
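 * In the latter case, the else branch below simply treats the header
 * as coming from arc_l2c_only with no references and no data.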
1904286570Smav */
1905286570Smav if (HDR_HAS_L1HDR(hdr)) {
1906286570Smav old_state = hdr->b_l1hdr.b_state;
1907286570Smav refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
1908286570Smav datacnt = hdr->b_l1hdr.b_datacnt;
1909286570Smav } else {
1910286570Smav old_state = arc_l2c_only;
1911286570Smav refcnt = 0;
1912286570Smav datacnt = 0;
1913286570Smav }
1914286570Smav
1915168404Spjd ASSERT(MUTEX_HELD(hash_lock));
1916258632Savg ASSERT3P(new_state, !=, old_state);
1917286570Smav ASSERT(refcnt == 0 || datacnt > 0);
1918286570Smav ASSERT(!GHOST_STATE(new_state) || datacnt == 0);
1919286570Smav ASSERT(old_state != arc_anon || datacnt <= 1);
1920168404Spjd
1921286570Smav from_delta = to_delta = datacnt * hdr->b_size;
1922168404Spjd
1923168404Spjd /*
1924168404Spjd * If this buffer is evictable, transfer it from the
1925168404Spjd * old state list to the new state list.
1926168404Spjd */
1927168404Spjd if (refcnt == 0) {
1928286570Smav if (old_state != arc_anon && old_state != arc_l2c_only) {
1929286570Smav uint64_t *size = &old_state->arcs_lsize[buftype];
1930168404Spjd
1931286570Smav ASSERT(HDR_HAS_L1HDR(hdr));
1932286763Smav multilist_remove(&old_state->arcs_list[buftype], hdr);
1933168404Spjd
1934168404Spjd /*
1935168404Spjd * If prefetching out of the ghost cache,
1936219089Spjd * we will have a non-zero datacnt.
1937168404Spjd */
1938286570Smav if (GHOST_STATE(old_state) && datacnt == 0) {
1939168404Spjd /* ghost elements have a ghost size */
1940286570Smav ASSERT(hdr->b_l1hdr.b_buf == NULL);
1941275811Sdelphij from_delta = hdr->b_size;
1942168404Spjd }
1943185029Spjd ASSERT3U(*size, >=, from_delta);
1944185029Spjd atomic_add_64(size, -from_delta);
1945168404Spjd }
1946286570Smav if (new_state != arc_anon && new_state != arc_l2c_only) {
1947286570Smav uint64_t *size = &new_state->arcs_lsize[buftype];
1948168404Spjd
1949286570Smav /*
1950286570Smav * An L1 header always exists here, since if we're
1951286570Smav * moving to some L1-cached state (i.e. not l2c_only or
1952286570Smav * anonymous), we realloc the header to add an L1hdr
1953286570Smav * beforehand.
1954286570Smav */
1955286570Smav ASSERT(HDR_HAS_L1HDR(hdr));
1956286763Smav multilist_insert(&new_state->arcs_list[buftype], hdr);
1957168404Spjd
1958168404Spjd /* ghost elements have a ghost size */
1959168404Spjd if (GHOST_STATE(new_state)) {
1960286762Smav ASSERT0(datacnt);
1961286570Smav ASSERT(hdr->b_l1hdr.b_buf == NULL);
1962275811Sdelphij to_delta = hdr->b_size;
1963168404Spjd }
1964185029Spjd atomic_add_64(size, to_delta);
1965168404Spjd }
1966168404Spjd }
1967168404Spjd
1968275811Sdelphij ASSERT(!BUF_EMPTY(hdr));
1969275811Sdelphij if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
1970275811Sdelphij buf_hash_remove(hdr);
1971168404Spjd
1972286570Smav /* adjust state sizes (ignore arc_l2c_only) */
1973286766Smav
1974286766Smav if (to_delta && new_state != arc_l2c_only) {
1975286766Smav ASSERT(HDR_HAS_L1HDR(hdr));
1976286766Smav if (GHOST_STATE(new_state)) {
1977286766Smav ASSERT0(datacnt);
1978286766Smav
1979286766Smav /*
1980286766Smav * When moving a header to a ghost state, we first
1981286766Smav * remove all arc buffers. Thus, we'll have a
1982286766Smav * datacnt of zero, and no arc buffer to use for
1983286766Smav * the reference. As a result, we use the arc
1984286766Smav * header pointer for the reference.
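 *
 * Schematically:
 *
 *     ghost:     refcount_add_many(&new_state->arcs_size, size, hdr);
 *     non-ghost: one refcount_add_many(&new_state->arcs_size, size, buf)
 *                for each arc_buf_t hanging off the header.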
1985286766Smav */ 1986286766Smav (void) refcount_add_many(&new_state->arcs_size, 1987286766Smav hdr->b_size, hdr); 1988286766Smav } else { 1989286766Smav ASSERT3U(datacnt, !=, 0); 1990286766Smav 1991286766Smav /* 1992286766Smav * Each individual buffer holds a unique reference, 1993286766Smav * thus we must remove each of these references one 1994286766Smav * at a time. 1995286766Smav */ 1996286766Smav for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 1997286766Smav buf = buf->b_next) { 1998286766Smav (void) refcount_add_many(&new_state->arcs_size, 1999286766Smav hdr->b_size, buf); 2000286766Smav } 2001286766Smav } 2002286766Smav } 2003286766Smav 2004286570Smav if (from_delta && old_state != arc_l2c_only) { 2005286766Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2006286766Smav if (GHOST_STATE(old_state)) { 2007286766Smav /* 2008286766Smav * When moving a header off of a ghost state, 2009286766Smav * there's the possibility for datacnt to be 2010286766Smav * non-zero. This is because we first add the 2011286766Smav * arc buffer to the header prior to changing 2012286766Smav * the header's state. Since we used the header 2013286766Smav * for the reference when putting the header on 2014286766Smav * the ghost state, we must balance that and use 2015286766Smav * the header when removing off the ghost state 2016286766Smav * (even though datacnt is non zero). 2017286766Smav */ 2018286766Smav 2019286766Smav IMPLY(datacnt == 0, new_state == arc_anon || 2020286766Smav new_state == arc_l2c_only); 2021286766Smav 2022286766Smav (void) refcount_remove_many(&old_state->arcs_size, 2023286766Smav hdr->b_size, hdr); 2024286766Smav } else { 2025286766Smav ASSERT3P(datacnt, !=, 0); 2026286766Smav 2027286766Smav /* 2028286766Smav * Each individual buffer holds a unique reference, 2029286766Smav * thus we must remove each of these references one 2030286766Smav * at a time. 2031286766Smav */ 2032286766Smav for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2033286766Smav buf = buf->b_next) { 2034286766Smav (void) refcount_remove_many( 2035286766Smav &old_state->arcs_size, hdr->b_size, buf); 2036286766Smav } 2037286766Smav } 2038168404Spjd } 2039286766Smav 2040286570Smav if (HDR_HAS_L1HDR(hdr)) 2041286570Smav hdr->b_l1hdr.b_state = new_state; 2042185029Spjd 2043286570Smav /* 2044286570Smav * L2 headers should never be on the L2 state list since they don't 2045286570Smav * have L1 headers allocated. 
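 * Hence the assertions below that both of arc_l2c_only's sublists
 * (data and metadata) are always empty.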
2046286570Smav */ 2047286763Smav ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && 2048286763Smav multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); 2049168404Spjd} 2050168404Spjd 2051185029Spjdvoid 2052208373Smmarc_space_consume(uint64_t space, arc_space_type_t type) 2053185029Spjd{ 2054208373Smm ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 2055208373Smm 2056208373Smm switch (type) { 2057208373Smm case ARC_SPACE_DATA: 2058208373Smm ARCSTAT_INCR(arcstat_data_size, space); 2059208373Smm break; 2060286574Smav case ARC_SPACE_META: 2061286574Smav ARCSTAT_INCR(arcstat_metadata_size, space); 2062286574Smav break; 2063208373Smm case ARC_SPACE_OTHER: 2064208373Smm ARCSTAT_INCR(arcstat_other_size, space); 2065208373Smm break; 2066208373Smm case ARC_SPACE_HDRS: 2067208373Smm ARCSTAT_INCR(arcstat_hdr_size, space); 2068208373Smm break; 2069208373Smm case ARC_SPACE_L2HDRS: 2070208373Smm ARCSTAT_INCR(arcstat_l2_hdr_size, space); 2071208373Smm break; 2072208373Smm } 2073208373Smm 2074286574Smav if (type != ARC_SPACE_DATA) 2075286574Smav ARCSTAT_INCR(arcstat_meta_used, space); 2076286574Smav 2077185029Spjd atomic_add_64(&arc_size, space); 2078185029Spjd} 2079185029Spjd 2080185029Spjdvoid 2081208373Smmarc_space_return(uint64_t space, arc_space_type_t type) 2082185029Spjd{ 2083208373Smm ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 2084208373Smm 2085208373Smm switch (type) { 2086208373Smm case ARC_SPACE_DATA: 2087208373Smm ARCSTAT_INCR(arcstat_data_size, -space); 2088208373Smm break; 2089286574Smav case ARC_SPACE_META: 2090286574Smav ARCSTAT_INCR(arcstat_metadata_size, -space); 2091286574Smav break; 2092208373Smm case ARC_SPACE_OTHER: 2093208373Smm ARCSTAT_INCR(arcstat_other_size, -space); 2094208373Smm break; 2095208373Smm case ARC_SPACE_HDRS: 2096208373Smm ARCSTAT_INCR(arcstat_hdr_size, -space); 2097208373Smm break; 2098208373Smm case ARC_SPACE_L2HDRS: 2099208373Smm ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 2100208373Smm break; 2101208373Smm } 2102208373Smm 2103286574Smav if (type != ARC_SPACE_DATA) { 2104286574Smav ASSERT(arc_meta_used >= space); 2105286574Smav if (arc_meta_max < arc_meta_used) 2106286574Smav arc_meta_max = arc_meta_used; 2107286574Smav ARCSTAT_INCR(arcstat_meta_used, -space); 2108286574Smav } 2109286574Smav 2110185029Spjd ASSERT(arc_size >= space); 2111185029Spjd atomic_add_64(&arc_size, -space); 2112185029Spjd} 2113185029Spjd 2114168404Spjdarc_buf_t * 2115286570Smavarc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) 2116168404Spjd{ 2117168404Spjd arc_buf_hdr_t *hdr; 2118168404Spjd arc_buf_t *buf; 2119168404Spjd 2120168404Spjd ASSERT3U(size, >, 0); 2121286570Smav hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 2122168404Spjd ASSERT(BUF_EMPTY(hdr)); 2123286570Smav ASSERT3P(hdr->b_freeze_cksum, ==, NULL); 2124168404Spjd hdr->b_size = size; 2125228103Smm hdr->b_spa = spa_load_guid(spa); 2126286570Smav 2127185029Spjd buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2128168404Spjd buf->b_hdr = hdr; 2129168404Spjd buf->b_data = NULL; 2130168404Spjd buf->b_efunc = NULL; 2131168404Spjd buf->b_private = NULL; 2132168404Spjd buf->b_next = NULL; 2133286570Smav 2134286570Smav hdr->b_flags = arc_bufc_to_flags(type); 2135286570Smav hdr->b_flags |= ARC_FLAG_HAS_L1HDR; 2136286570Smav 2137286570Smav hdr->b_l1hdr.b_buf = buf; 2138286570Smav hdr->b_l1hdr.b_state = arc_anon; 2139286570Smav hdr->b_l1hdr.b_arc_access = 0; 2140286570Smav hdr->b_l1hdr.b_datacnt = 1; 2141286763Smav hdr->b_l1hdr.b_tmp_cdata = NULL; 2142286570Smav 2143168404Spjd 
arc_get_data_buf(buf); 2144286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2145286570Smav (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 2146168404Spjd 2147168404Spjd return (buf); 2148168404Spjd} 2149168404Spjd 2150209962Smmstatic char *arc_onloan_tag = "onloan"; 2151209962Smm 2152209962Smm/* 2153209962Smm * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 2154209962Smm * flight data by arc_tempreserve_space() until they are "returned". Loaned 2155209962Smm * buffers must be returned to the arc before they can be used by the DMU or 2156209962Smm * freed. 2157209962Smm */ 2158209962Smmarc_buf_t * 2159209962Smmarc_loan_buf(spa_t *spa, int size) 2160209962Smm{ 2161209962Smm arc_buf_t *buf; 2162209962Smm 2163209962Smm buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 2164209962Smm 2165209962Smm atomic_add_64(&arc_loaned_bytes, size); 2166209962Smm return (buf); 2167209962Smm} 2168209962Smm 2169209962Smm/* 2170209962Smm * Return a loaned arc buffer to the arc. 2171209962Smm */ 2172209962Smmvoid 2173209962Smmarc_return_buf(arc_buf_t *buf, void *tag) 2174209962Smm{ 2175209962Smm arc_buf_hdr_t *hdr = buf->b_hdr; 2176209962Smm 2177209962Smm ASSERT(buf->b_data != NULL); 2178286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2179286570Smav (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 2180286570Smav (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2181209962Smm 2182209962Smm atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 2183209962Smm} 2184209962Smm 2185219089Spjd/* Detach an arc_buf from a dbuf (tag) */ 2186219089Spjdvoid 2187219089Spjdarc_loan_inuse_buf(arc_buf_t *buf, void *tag) 2188219089Spjd{ 2189286570Smav arc_buf_hdr_t *hdr = buf->b_hdr; 2190219089Spjd 2191219089Spjd ASSERT(buf->b_data != NULL); 2192286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2193286570Smav (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2194286570Smav (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); 2195219089Spjd buf->b_efunc = NULL; 2196219089Spjd buf->b_private = NULL; 2197219089Spjd 2198219089Spjd atomic_add_64(&arc_loaned_bytes, hdr->b_size); 2199219089Spjd} 2200219089Spjd 2201168404Spjdstatic arc_buf_t * 2202168404Spjdarc_buf_clone(arc_buf_t *from) 2203168404Spjd{ 2204168404Spjd arc_buf_t *buf; 2205168404Spjd arc_buf_hdr_t *hdr = from->b_hdr; 2206168404Spjd uint64_t size = hdr->b_size; 2207168404Spjd 2208286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2209286570Smav ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2210219089Spjd 2211185029Spjd buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2212168404Spjd buf->b_hdr = hdr; 2213168404Spjd buf->b_data = NULL; 2214168404Spjd buf->b_efunc = NULL; 2215168404Spjd buf->b_private = NULL; 2216286570Smav buf->b_next = hdr->b_l1hdr.b_buf; 2217286570Smav hdr->b_l1hdr.b_buf = buf; 2218168404Spjd arc_get_data_buf(buf); 2219168404Spjd bcopy(from->b_data, buf->b_data, size); 2220242845Sdelphij 2221242845Sdelphij /* 2222242845Sdelphij * This buffer already exists in the arc so create a duplicate 2223242845Sdelphij * copy for the caller. If the buffer is associated with user data 2224242845Sdelphij * then track the size and number of duplicates. These stats will be 2225242845Sdelphij * updated as duplicate buffers are created and destroyed. 
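 * Both counters are exported with the other arcstats (as
 * duplicate_buffers and duplicate_buffers_size), so excessive
 * duplication is visible to administrators.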
2226242845Sdelphij */ 2227286570Smav if (HDR_ISTYPE_DATA(hdr)) { 2228242845Sdelphij ARCSTAT_BUMP(arcstat_duplicate_buffers); 2229242845Sdelphij ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 2230242845Sdelphij } 2231286570Smav hdr->b_l1hdr.b_datacnt += 1; 2232168404Spjd return (buf); 2233168404Spjd} 2234168404Spjd 2235168404Spjdvoid 2236168404Spjdarc_buf_add_ref(arc_buf_t *buf, void* tag) 2237168404Spjd{ 2238168404Spjd arc_buf_hdr_t *hdr; 2239168404Spjd kmutex_t *hash_lock; 2240168404Spjd 2241168404Spjd /* 2242185029Spjd * Check to see if this buffer is evicted. Callers 2243185029Spjd * must verify b_data != NULL to know if the add_ref 2244185029Spjd * was successful. 2245168404Spjd */ 2246219089Spjd mutex_enter(&buf->b_evict_lock); 2247185029Spjd if (buf->b_data == NULL) { 2248219089Spjd mutex_exit(&buf->b_evict_lock); 2249168404Spjd return; 2250168404Spjd } 2251219089Spjd hash_lock = HDR_LOCK(buf->b_hdr); 2252219089Spjd mutex_enter(hash_lock); 2253185029Spjd hdr = buf->b_hdr; 2254286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2255219089Spjd ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2256219089Spjd mutex_exit(&buf->b_evict_lock); 2257168404Spjd 2258286570Smav ASSERT(hdr->b_l1hdr.b_state == arc_mru || 2259286570Smav hdr->b_l1hdr.b_state == arc_mfu); 2260286570Smav 2261168404Spjd add_reference(hdr, hash_lock, tag); 2262208373Smm DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 2263168404Spjd arc_access(hdr, hash_lock); 2264168404Spjd mutex_exit(hash_lock); 2265168404Spjd ARCSTAT_BUMP(arcstat_hits); 2266286570Smav ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 2267286570Smav demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 2268168404Spjd data, metadata, hits); 2269168404Spjd} 2270168404Spjd 2271274172Savgstatic void 2272274172Savgarc_buf_free_on_write(void *data, size_t size, 2273274172Savg void (*free_func)(void *, size_t)) 2274274172Savg{ 2275274172Savg l2arc_data_free_t *df; 2276274172Savg 2277286763Smav df = kmem_alloc(sizeof (*df), KM_SLEEP); 2278274172Savg df->l2df_data = data; 2279274172Savg df->l2df_size = size; 2280274172Savg df->l2df_func = free_func; 2281274172Savg mutex_enter(&l2arc_free_on_write_mtx); 2282274172Savg list_insert_head(l2arc_free_on_write, df); 2283274172Savg mutex_exit(&l2arc_free_on_write_mtx); 2284274172Savg} 2285274172Savg 2286185029Spjd/* 2287185029Spjd * Free the arc data buffer. If it is an l2arc write in progress, 2288185029Spjd * the buffer is placed on l2arc_free_on_write to be freed later. 2289185029Spjd */ 2290168404Spjdstatic void 2291240133Smmarc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) 2292185029Spjd{ 2293240133Smm arc_buf_hdr_t *hdr = buf->b_hdr; 2294240133Smm 2295185029Spjd if (HDR_L2_WRITING(hdr)) { 2296274172Savg arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); 2297185029Spjd ARCSTAT_BUMP(arcstat_l2_free_on_write); 2298185029Spjd } else { 2299240133Smm free_func(buf->b_data, hdr->b_size); 2300185029Spjd } 2301185029Spjd} 2302185029Spjd 2303185029Spjdstatic void 2304274172Savgarc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) 2305274172Savg{ 2306297848Savg size_t align, asize, len; 2307297848Savg 2308286570Smav ASSERT(HDR_HAS_L2HDR(hdr)); 2309286570Smav ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx)); 2310274172Savg 2311286570Smav /* 2312286570Smav * The b_tmp_cdata field is linked off of the b_l1hdr, so if 2313286570Smav * that doesn't exist, the header is in the arc_l2c_only state, 2314286570Smav * and there isn't anything to free (it's already been freed). 
2315286570Smav */
2316286570Smav if (!HDR_HAS_L1HDR(hdr))
2317286570Smav return;
2318274172Savg
2319286763Smav /*
2320286763Smav * The header isn't being written to the l2arc device, thus it
2321286763Smav * shouldn't have a b_tmp_cdata to free.
2322286763Smav */
2323286763Smav if (!HDR_L2_WRITING(hdr)) {
2324286763Smav ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
2325274172Savg return;
2326286763Smav }
2327274172Savg
2328286763Smav /*
2329297848Savg * The buffer has been chosen for writing to L2ARC, but it's
2330297848Savg * not being written just yet. In other words,
2331297848Savg * b_tmp_cdata points to exactly the same buffer as b_data;
2332297848Savg * l2arc_transform_buf() hasn't been called yet.
2333286763Smav */
2334297848Savg if (hdr->b_l2hdr.b_daddr == L2ARC_ADDR_UNSET) {
2335297848Savg ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==,
2336297848Savg hdr->b_l1hdr.b_buf->b_data);
2337297848Savg ASSERT3U(hdr->b_l2hdr.b_compress, ==, ZIO_COMPRESS_OFF);
2338301873Savg hdr->b_l1hdr.b_tmp_cdata = NULL;
2339286763Smav return;
2340286763Smav }
2341286570Smav
2342286763Smav /*
2343286763Smav * There's nothing to free since the buffer was all zeros and
2344286763Smav * compressed to a zero-length buffer.
2345286763Smav */
2346287706Sdelphij if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_EMPTY) {
2347286763Smav ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
2348286763Smav return;
2349286763Smav }
2350286763Smav
2351297848Savg /*
2352297848Savg * Nothing to do if the temporary buffer was not required.
2353297848Savg */
2354297848Savg if (hdr->b_l1hdr.b_tmp_cdata == NULL)
2355297848Savg return;
2356286763Smav
2357274172Savg ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
2358297848Savg len = hdr->b_size;
2359297848Savg align = (size_t)1 << hdr->b_l2hdr.b_dev->l2ad_vdev->vdev_ashift;
2360297848Savg asize = P2ROUNDUP(len, align);
2361297848Savg arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, asize,
2362297848Savg zio_data_buf_free);
2363286570Smav hdr->b_l1hdr.b_tmp_cdata = NULL;
2364274172Savg}
2365274172Savg
2366286767Smav/*
2367286767Smav * Free up buf->b_data and if 'remove' is set, then pull the
2368286767Smav * arc_buf_t off of the arc_buf_hdr_t's list and free it.
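 * ('remove' is passed as B_FALSE on the eviction paths, where the
 * arc_buf_t is instead parked on arc_eviction_list so that its
 * b_efunc callback can later be invoked with no locks held.)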
2369286767Smav */ 2370274172Savgstatic void 2371286763Smavarc_buf_destroy(arc_buf_t *buf, boolean_t remove) 2372168404Spjd{ 2373168404Spjd arc_buf_t **bufp; 2374168404Spjd 2375168404Spjd /* free up data associated with the buf */ 2376286570Smav if (buf->b_data != NULL) { 2377286570Smav arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; 2378168404Spjd uint64_t size = buf->b_hdr->b_size; 2379286570Smav arc_buf_contents_t type = arc_buf_type(buf->b_hdr); 2380168404Spjd 2381168404Spjd arc_cksum_verify(buf); 2382240133Smm#ifdef illumos 2383240133Smm arc_buf_unwatch(buf); 2384277300Ssmh#endif 2385219089Spjd 2386286763Smav if (type == ARC_BUFC_METADATA) { 2387286763Smav arc_buf_data_free(buf, zio_buf_free); 2388286763Smav arc_space_return(size, ARC_SPACE_META); 2389286763Smav } else { 2390286763Smav ASSERT(type == ARC_BUFC_DATA); 2391286763Smav arc_buf_data_free(buf, zio_data_buf_free); 2392286763Smav arc_space_return(size, ARC_SPACE_DATA); 2393168404Spjd } 2394286763Smav 2395286763Smav /* protected by hash lock, if in the hash table */ 2396286763Smav if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { 2397185029Spjd uint64_t *cnt = &state->arcs_lsize[type]; 2398185029Spjd 2399286570Smav ASSERT(refcount_is_zero( 2400286570Smav &buf->b_hdr->b_l1hdr.b_refcnt)); 2401286570Smav ASSERT(state != arc_anon && state != arc_l2c_only); 2402185029Spjd 2403185029Spjd ASSERT3U(*cnt, >=, size); 2404185029Spjd atomic_add_64(cnt, -size); 2405168404Spjd } 2406286766Smav 2407286766Smav (void) refcount_remove_many(&state->arcs_size, size, buf); 2408168404Spjd buf->b_data = NULL; 2409242845Sdelphij 2410242845Sdelphij /* 2411242845Sdelphij * If we're destroying a duplicate buffer make sure 2412242845Sdelphij * that the appropriate statistics are updated. 2413242845Sdelphij */ 2414286570Smav if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && 2415286570Smav HDR_ISTYPE_DATA(buf->b_hdr)) { 2416242845Sdelphij ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 2417242845Sdelphij ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); 2418242845Sdelphij } 2419286570Smav ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); 2420286570Smav buf->b_hdr->b_l1hdr.b_datacnt -= 1; 2421168404Spjd } 2422168404Spjd 2423168404Spjd /* only remove the buf if requested */ 2424268858Sdelphij if (!remove) 2425168404Spjd return; 2426168404Spjd 2427168404Spjd /* remove the buf from the hdr list */ 2428286570Smav for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; 2429286570Smav bufp = &(*bufp)->b_next) 2430168404Spjd continue; 2431168404Spjd *bufp = buf->b_next; 2432219089Spjd buf->b_next = NULL; 2433168404Spjd 2434168404Spjd ASSERT(buf->b_efunc == NULL); 2435168404Spjd 2436168404Spjd /* clean up the buf */ 2437168404Spjd buf->b_hdr = NULL; 2438168404Spjd kmem_cache_free(buf_cache, buf); 2439168404Spjd} 2440168404Spjd 2441168404Spjdstatic void 2442286598Smavarc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) 2443286598Smav{ 2444286598Smav l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 2445286598Smav l2arc_dev_t *dev = l2hdr->b_dev; 2446286598Smav 2447286598Smav ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); 2448286598Smav ASSERT(HDR_HAS_L2HDR(hdr)); 2449286598Smav 2450286598Smav list_remove(&dev->l2ad_buflist, hdr); 2451286598Smav 2452286598Smav /* 2453286598Smav * We don't want to leak the b_tmp_cdata buffer that was 2454286598Smav * allocated in l2arc_write_buffers() 2455286598Smav */ 2456286598Smav arc_buf_l2_cdata_free(hdr); 2457286598Smav 2458286598Smav /* 2459286598Smav * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then 2460286598Smav * this header is being processed by 
l2arc_write_buffers() (i.e. 2461286598Smav * it's in the first stage of l2arc_write_buffers()). 2462286598Smav * Re-affirming that truth here, just to serve as a reminder. If 2463286598Smav * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or 2464286598Smav * may not have its HDR_L2_WRITING flag set. (the write may have 2465286598Smav * completed, in which case HDR_L2_WRITING will be false and the 2466286598Smav * b_daddr field will point to the address of the buffer on disk). 2467286598Smav */ 2468286598Smav IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr)); 2469286598Smav 2470286598Smav /* 2471286598Smav * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with 2472286598Smav * l2arc_write_buffers(). Since we've just removed this header 2473286598Smav * from the l2arc buffer list, this header will never reach the 2474286598Smav * second stage of l2arc_write_buffers(), which increments the 2475286598Smav * accounting stats for this header. Thus, we must be careful 2476286598Smav * not to decrement them for this header either. 2477286598Smav */ 2478286598Smav if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) { 2479286598Smav ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 2480286598Smav ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 2481286598Smav 2482286598Smav vdev_space_update(dev->l2ad_vdev, 2483286598Smav -l2hdr->b_asize, 0, 0); 2484286598Smav 2485286598Smav (void) refcount_remove_many(&dev->l2ad_alloc, 2486286598Smav l2hdr->b_asize, hdr); 2487286598Smav } 2488286598Smav 2489286598Smav hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 2490286598Smav} 2491286598Smav 2492286598Smavstatic void 2493168404Spjdarc_hdr_destroy(arc_buf_hdr_t *hdr) 2494168404Spjd{ 2495286570Smav if (HDR_HAS_L1HDR(hdr)) { 2496286570Smav ASSERT(hdr->b_l1hdr.b_buf == NULL || 2497286570Smav hdr->b_l1hdr.b_datacnt > 0); 2498286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2499286570Smav ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 2500286570Smav } 2501168404Spjd ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2502286570Smav ASSERT(!HDR_IN_HASH_TABLE(hdr)); 2503168404Spjd 2504286570Smav if (HDR_HAS_L2HDR(hdr)) { 2505286598Smav l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 2506286598Smav boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); 2507286570Smav 2508286598Smav if (!buflist_held) 2509286598Smav mutex_enter(&dev->l2ad_mtx); 2510219089Spjd 2511286570Smav /* 2512286598Smav * Even though we checked this conditional above, we 2513286598Smav * need to check this again now that we have the 2514286598Smav * l2ad_mtx. This is because we could be racing with 2515286598Smav * another thread calling l2arc_evict() which might have 2516286598Smav * destroyed this header's L2 portion as we were waiting 2517286598Smav * to acquire the l2ad_mtx. If that happens, we don't 2518286598Smav * want to re-destroy the header's L2 portion. 
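 *
 * In simplified form, this is the usual check / lock / re-check
 * pattern:
 *
 *     if (HDR_HAS_L2HDR(hdr)) {
 *             mutex_enter(&dev->l2ad_mtx);
 *             if (HDR_HAS_L2HDR(hdr))
 *                     ... tear down the L2 portion ...
 *             mutex_exit(&dev->l2ad_mtx);
 *     }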
2519286570Smav */ 2520286598Smav if (HDR_HAS_L2HDR(hdr)) { 2521290191Savg l2arc_trim(hdr); 2522286598Smav arc_hdr_l2hdr_destroy(hdr); 2523286598Smav } 2524286570Smav 2525219089Spjd if (!buflist_held) 2526286598Smav mutex_exit(&dev->l2ad_mtx); 2527185029Spjd } 2528185029Spjd 2529286570Smav if (!BUF_EMPTY(hdr)) 2530219089Spjd buf_discard_identity(hdr); 2531286776Smav 2532168404Spjd if (hdr->b_freeze_cksum != NULL) { 2533168404Spjd kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 2534168404Spjd hdr->b_freeze_cksum = NULL; 2535168404Spjd } 2536286570Smav 2537286570Smav if (HDR_HAS_L1HDR(hdr)) { 2538286570Smav while (hdr->b_l1hdr.b_buf) { 2539286570Smav arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2540286570Smav 2541286570Smav if (buf->b_efunc != NULL) { 2542286763Smav mutex_enter(&arc_user_evicts_lock); 2543286570Smav mutex_enter(&buf->b_evict_lock); 2544286570Smav ASSERT(buf->b_hdr != NULL); 2545286763Smav arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE); 2546286570Smav hdr->b_l1hdr.b_buf = buf->b_next; 2547286570Smav buf->b_hdr = &arc_eviction_hdr; 2548286570Smav buf->b_next = arc_eviction_list; 2549286570Smav arc_eviction_list = buf; 2550286570Smav mutex_exit(&buf->b_evict_lock); 2551286763Smav cv_signal(&arc_user_evicts_cv); 2552286763Smav mutex_exit(&arc_user_evicts_lock); 2553286570Smav } else { 2554286763Smav arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE); 2555286570Smav } 2556286570Smav } 2557286570Smav#ifdef ZFS_DEBUG 2558286570Smav if (hdr->b_l1hdr.b_thawed != NULL) { 2559286570Smav kmem_free(hdr->b_l1hdr.b_thawed, 1); 2560286570Smav hdr->b_l1hdr.b_thawed = NULL; 2561286570Smav } 2562286570Smav#endif 2563219089Spjd } 2564168404Spjd 2565168404Spjd ASSERT3P(hdr->b_hash_next, ==, NULL); 2566286570Smav if (HDR_HAS_L1HDR(hdr)) { 2567286763Smav ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 2568286570Smav ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 2569286570Smav kmem_cache_free(hdr_full_cache, hdr); 2570286570Smav } else { 2571286570Smav kmem_cache_free(hdr_l2only_cache, hdr); 2572286570Smav } 2573168404Spjd} 2574168404Spjd 2575168404Spjdvoid 2576168404Spjdarc_buf_free(arc_buf_t *buf, void *tag) 2577168404Spjd{ 2578168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 2579286570Smav int hashed = hdr->b_l1hdr.b_state != arc_anon; 2580168404Spjd 2581168404Spjd ASSERT(buf->b_efunc == NULL); 2582168404Spjd ASSERT(buf->b_data != NULL); 2583168404Spjd 2584168404Spjd if (hashed) { 2585168404Spjd kmutex_t *hash_lock = HDR_LOCK(hdr); 2586168404Spjd 2587168404Spjd mutex_enter(hash_lock); 2588219089Spjd hdr = buf->b_hdr; 2589219089Spjd ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2590219089Spjd 2591168404Spjd (void) remove_reference(hdr, hash_lock, tag); 2592286570Smav if (hdr->b_l1hdr.b_datacnt > 1) { 2593286763Smav arc_buf_destroy(buf, TRUE); 2594219089Spjd } else { 2595286570Smav ASSERT(buf == hdr->b_l1hdr.b_buf); 2596219089Spjd ASSERT(buf->b_efunc == NULL); 2597275811Sdelphij hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2598219089Spjd } 2599168404Spjd mutex_exit(hash_lock); 2600168404Spjd } else if (HDR_IO_IN_PROGRESS(hdr)) { 2601168404Spjd int destroy_hdr; 2602168404Spjd /* 2603168404Spjd * We are in the middle of an async write. Don't destroy 2604168404Spjd * this buffer unless the write completes before we finish 2605168404Spjd * decrementing the reference count. 
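 * The HDR_IO_IN_PROGRESS() re-check below (made under
 * arc_user_evicts_lock, after dropping our reference) decides
 * whether it is safe for us to destroy the header here.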
2606168404Spjd */ 2607286763Smav mutex_enter(&arc_user_evicts_lock); 2608168404Spjd (void) remove_reference(hdr, NULL, tag); 2609286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2610168404Spjd destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 2611286763Smav mutex_exit(&arc_user_evicts_lock); 2612168404Spjd if (destroy_hdr) 2613168404Spjd arc_hdr_destroy(hdr); 2614168404Spjd } else { 2615219089Spjd if (remove_reference(hdr, NULL, tag) > 0) 2616286763Smav arc_buf_destroy(buf, TRUE); 2617219089Spjd else 2618168404Spjd arc_hdr_destroy(hdr); 2619168404Spjd } 2620168404Spjd} 2621168404Spjd 2622248571Smmboolean_t 2623168404Spjdarc_buf_remove_ref(arc_buf_t *buf, void* tag) 2624168404Spjd{ 2625168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 2626168404Spjd kmutex_t *hash_lock = HDR_LOCK(hdr); 2627248571Smm boolean_t no_callback = (buf->b_efunc == NULL); 2628168404Spjd 2629286570Smav if (hdr->b_l1hdr.b_state == arc_anon) { 2630286570Smav ASSERT(hdr->b_l1hdr.b_datacnt == 1); 2631168404Spjd arc_buf_free(buf, tag); 2632168404Spjd return (no_callback); 2633168404Spjd } 2634168404Spjd 2635168404Spjd mutex_enter(hash_lock); 2636219089Spjd hdr = buf->b_hdr; 2637286570Smav ASSERT(hdr->b_l1hdr.b_datacnt > 0); 2638219089Spjd ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2639286570Smav ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2640168404Spjd ASSERT(buf->b_data != NULL); 2641168404Spjd 2642168404Spjd (void) remove_reference(hdr, hash_lock, tag); 2643286570Smav if (hdr->b_l1hdr.b_datacnt > 1) { 2644168404Spjd if (no_callback) 2645286763Smav arc_buf_destroy(buf, TRUE); 2646168404Spjd } else if (no_callback) { 2647286570Smav ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); 2648219089Spjd ASSERT(buf->b_efunc == NULL); 2649275811Sdelphij hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2650168404Spjd } 2651286570Smav ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 || 2652286570Smav refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2653168404Spjd mutex_exit(hash_lock); 2654168404Spjd return (no_callback); 2655168404Spjd} 2656168404Spjd 2657286570Smavint32_t 2658168404Spjdarc_buf_size(arc_buf_t *buf) 2659168404Spjd{ 2660168404Spjd return (buf->b_hdr->b_size); 2661168404Spjd} 2662168404Spjd 2663168404Spjd/* 2664242845Sdelphij * Called from the DMU to determine if the current buffer should be 2665242845Sdelphij * evicted. In order to ensure proper locking, the eviction must be initiated 2666242845Sdelphij * from the DMU. Return true if the buffer is associated with user data and 2667242845Sdelphij * duplicate buffers still exist. 2668242845Sdelphij */ 2669242845Sdelphijboolean_t 2670242845Sdelphijarc_buf_eviction_needed(arc_buf_t *buf) 2671242845Sdelphij{ 2672242845Sdelphij arc_buf_hdr_t *hdr; 2673242845Sdelphij boolean_t evict_needed = B_FALSE; 2674242845Sdelphij 2675242845Sdelphij if (zfs_disable_dup_eviction) 2676242845Sdelphij return (B_FALSE); 2677242845Sdelphij 2678242845Sdelphij mutex_enter(&buf->b_evict_lock); 2679242845Sdelphij hdr = buf->b_hdr; 2680242845Sdelphij if (hdr == NULL) { 2681242845Sdelphij /* 2682242845Sdelphij * We are in arc_do_user_evicts(); let that function 2683242845Sdelphij * perform the eviction. 2684242845Sdelphij */ 2685242845Sdelphij ASSERT(buf->b_data == NULL); 2686242845Sdelphij mutex_exit(&buf->b_evict_lock); 2687242845Sdelphij return (B_FALSE); 2688242845Sdelphij } else if (buf->b_data == NULL) { 2689242845Sdelphij /* 2690242845Sdelphij * We have already been added to the arc eviction list; 2691242845Sdelphij * recommend eviction. 
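 * (buf->b_hdr points at the global arc_eviction_hdr placeholder
 * in this case, as asserted below.)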
2692242845Sdelphij	 */
2693242845Sdelphij		ASSERT3P(hdr, ==, &arc_eviction_hdr);
2694242845Sdelphij		mutex_exit(&buf->b_evict_lock);
2695242845Sdelphij		return (B_TRUE);
2696242845Sdelphij	}
2697242845Sdelphij
2698286570Smav	if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
2699242845Sdelphij		evict_needed = B_TRUE;
2700242845Sdelphij
2701242845Sdelphij	mutex_exit(&buf->b_evict_lock);
2702242845Sdelphij	return (evict_needed);
2703242845Sdelphij}
2704242845Sdelphij
2705242845Sdelphij/*
2706286763Smav * Evict the arc_buf_hdr that is provided as a parameter. The resultant
2707286763Smav * state of the header is dependent on its state prior to entering this
2708286763Smav * function. The following transitions are possible:
2709185029Spjd *
2710286763Smav * - arc_mru -> arc_mru_ghost
2711286763Smav * - arc_mfu -> arc_mfu_ghost
2712286763Smav * - arc_mru_ghost -> arc_l2c_only
2713286763Smav * - arc_mru_ghost -> deleted
2714286763Smav * - arc_mfu_ghost -> arc_l2c_only
2715286763Smav * - arc_mfu_ghost -> deleted
2716168404Spjd */
2717286763Smavstatic int64_t
2718286763Smavarc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
2719168404Spjd{
2720286763Smav	arc_state_t *evicted_state, *state;
2721286763Smav	int64_t bytes_evicted = 0;
2722168404Spjd
2723286763Smav	ASSERT(MUTEX_HELD(hash_lock));
2724286763Smav	ASSERT(HDR_HAS_L1HDR(hdr));
2725168404Spjd
2726286763Smav	state = hdr->b_l1hdr.b_state;
2727286763Smav	if (GHOST_STATE(state)) {
2728286763Smav		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2729286763Smav		ASSERT(hdr->b_l1hdr.b_buf == NULL);
2730206796Spjd
2731286763Smav		/*
2732286763Smav		 * l2arc_write_buffers() relies on a header's L1 portion
2733286763Smav		 * (i.e. its b_tmp_cdata field) during its write phase.
2734286763Smav		 * Thus, we cannot push a header onto the arc_l2c_only
2735286763Smav		 * state (removing its L1 piece) until the header is
2736286763Smav		 * done being written to the l2arc.
2737286763Smav		 */
2738286763Smav		if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
2739286763Smav			ARCSTAT_BUMP(arcstat_evict_l2_skip);
2740286763Smav			return (bytes_evicted);
2741286763Smav		}
2742286762Smav
2743286763Smav		ARCSTAT_BUMP(arcstat_deleted);
2744286763Smav		bytes_evicted += hdr->b_size;
2745286762Smav
2746286763Smav		DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
2747286763Smav
2748286763Smav		if (HDR_HAS_L2HDR(hdr)) {
2749275780Sdelphij			/*
2750286763Smav			 * This buffer is cached on the 2nd Level ARC;
2751286763Smav			 * don't destroy the header.
2752275780Sdelphij			 */
2753286763Smav			arc_change_state(arc_l2c_only, hdr, hash_lock);
2754286763Smav			/*
2755286763Smav			 * dropping from L1+L2 cached to L2-only,
2756286763Smav			 * realloc to remove the L1 header.
2757286763Smav			 */
2758286763Smav			hdr = arc_hdr_realloc(hdr, hdr_full_cache,
2759286763Smav			    hdr_l2only_cache);
2760286763Smav		} else {
2761286763Smav			arc_change_state(arc_anon, hdr, hash_lock);
2762286763Smav			arc_hdr_destroy(hdr);
2763275780Sdelphij		}
2764286763Smav		return (bytes_evicted);
2765275780Sdelphij	}
2766275780Sdelphij
2767286763Smav	ASSERT(state == arc_mru || state == arc_mfu);
2768286763Smav	evicted_state = (state == arc_mru) ?
arc_mru_ghost : arc_mfu_ghost; 2769206796Spjd 2770286763Smav /* prefetch buffers have a minimum lifespan */ 2771286763Smav if (HDR_IO_IN_PROGRESS(hdr) || 2772286763Smav ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && 2773286763Smav ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < 2774286763Smav arc_min_prefetch_lifespan)) { 2775286763Smav ARCSTAT_BUMP(arcstat_evict_skip); 2776286763Smav return (bytes_evicted); 2777286763Smav } 2778286763Smav 2779286763Smav ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 2780286763Smav ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); 2781286763Smav while (hdr->b_l1hdr.b_buf) { 2782286763Smav arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2783286763Smav if (!mutex_tryenter(&buf->b_evict_lock)) { 2784286763Smav ARCSTAT_BUMP(arcstat_mutex_miss); 2785286763Smav break; 2786168404Spjd } 2787286763Smav if (buf->b_data != NULL) 2788286763Smav bytes_evicted += hdr->b_size; 2789286763Smav if (buf->b_efunc != NULL) { 2790286763Smav mutex_enter(&arc_user_evicts_lock); 2791286763Smav arc_buf_destroy(buf, FALSE); 2792286763Smav hdr->b_l1hdr.b_buf = buf->b_next; 2793286763Smav buf->b_hdr = &arc_eviction_hdr; 2794286763Smav buf->b_next = arc_eviction_list; 2795286763Smav arc_eviction_list = buf; 2796286763Smav cv_signal(&arc_user_evicts_cv); 2797286763Smav mutex_exit(&arc_user_evicts_lock); 2798286763Smav mutex_exit(&buf->b_evict_lock); 2799286763Smav } else { 2800286763Smav mutex_exit(&buf->b_evict_lock); 2801286763Smav arc_buf_destroy(buf, TRUE); 2802286763Smav } 2803286763Smav } 2804258632Savg 2805286763Smav if (HDR_HAS_L2HDR(hdr)) { 2806286763Smav ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size); 2807286763Smav } else { 2808286763Smav if (l2arc_write_eligible(hdr->b_spa, hdr)) 2809286763Smav ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size); 2810286763Smav else 2811286763Smav ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size); 2812286763Smav } 2813258632Savg 2814286763Smav if (hdr->b_l1hdr.b_datacnt == 0) { 2815286763Smav arc_change_state(evicted_state, hdr, hash_lock); 2816286763Smav ASSERT(HDR_IN_HASH_TABLE(hdr)); 2817286763Smav hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 2818286763Smav hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 2819286763Smav DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); 2820286763Smav } 2821286763Smav 2822286763Smav return (bytes_evicted); 2823286763Smav} 2824286763Smav 2825286763Smavstatic uint64_t 2826286763Smavarc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, 2827286763Smav uint64_t spa, int64_t bytes) 2828286763Smav{ 2829286763Smav multilist_sublist_t *mls; 2830286763Smav uint64_t bytes_evicted = 0; 2831286763Smav arc_buf_hdr_t *hdr; 2832286763Smav kmutex_t *hash_lock; 2833286763Smav int evict_count = 0; 2834286763Smav 2835286763Smav ASSERT3P(marker, !=, NULL); 2836286763Smav IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); 2837286763Smav 2838286763Smav mls = multilist_sublist_lock(ml, idx); 2839286763Smav 2840286763Smav for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; 2841286763Smav hdr = multilist_sublist_prev(mls, marker)) { 2842286763Smav if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || 2843286763Smav (evict_count >= zfs_arc_evict_batch_limit)) 2844286763Smav break; 2845286763Smav 2846258632Savg /* 2847286763Smav * To keep our iteration location, move the marker 2848286763Smav * forward. Since we're not holding hdr's hash lock, we 2849286763Smav * must be very careful and not remove 'hdr' from the 2850286763Smav * sublist. 
Otherwise, other consumers might mistake the 2851286763Smav * 'hdr' as not being on a sublist when they call the 2852286763Smav * multilist_link_active() function (they all rely on 2853286763Smav * the hash lock protecting concurrent insertions and 2854286763Smav * removals). multilist_sublist_move_forward() was 2855286763Smav * specifically implemented to ensure this is the case 2856286763Smav * (only 'marker' will be removed and re-inserted). 2857258632Savg */ 2858286763Smav multilist_sublist_move_forward(mls, marker); 2859286763Smav 2860286763Smav /* 2861286763Smav * The only case where the b_spa field should ever be 2862286763Smav * zero, is the marker headers inserted by 2863286763Smav * arc_evict_state(). It's possible for multiple threads 2864286763Smav * to be calling arc_evict_state() concurrently (e.g. 2865286763Smav * dsl_pool_close() and zio_inject_fault()), so we must 2866286763Smav * skip any markers we see from these other threads. 2867286763Smav */ 2868286763Smav if (hdr->b_spa == 0) 2869258632Savg continue; 2870286763Smav 2871286763Smav /* we're only interested in evicting buffers of a certain spa */ 2872286763Smav if (spa != 0 && hdr->b_spa != spa) { 2873286763Smav ARCSTAT_BUMP(arcstat_evict_skip); 2874286763Smav continue; 2875258632Savg } 2876258632Savg 2877275811Sdelphij hash_lock = HDR_LOCK(hdr); 2878208373Smm 2879286763Smav /* 2880286763Smav * We aren't calling this function from any code path 2881286763Smav * that would already be holding a hash lock, so we're 2882286763Smav * asserting on this assumption to be defensive in case 2883286763Smav * this ever changes. Without this check, it would be 2884286763Smav * possible to incorrectly increment arcstat_mutex_miss 2885286763Smav * below (e.g. if the code changed such that we called 2886286763Smav * this function with a hash lock held). 2887286763Smav */ 2888286763Smav ASSERT(!MUTEX_HELD(hash_lock)); 2889208373Smm 2890286763Smav if (mutex_tryenter(hash_lock)) { 2891286763Smav uint64_t evicted = arc_evict_hdr(hdr, hash_lock); 2892286763Smav mutex_exit(hash_lock); 2893286763Smav 2894286763Smav bytes_evicted += evicted; 2895286763Smav 2896286763Smav /* 2897286763Smav * If evicted is zero, arc_evict_hdr() must have 2898286763Smav * decided to skip this header, don't increment 2899286763Smav * evict_count in this case. 2900286763Smav */ 2901286763Smav if (evicted != 0) 2902286763Smav evict_count++; 2903286763Smav 2904286763Smav /* 2905286763Smav * If arc_size isn't overflowing, signal any 2906286763Smav * threads that might happen to be waiting. 2907286763Smav * 2908286763Smav * For each header evicted, we wake up a single 2909286763Smav * thread. If we used cv_broadcast, we could 2910286763Smav * wake up "too many" threads causing arc_size 2911286763Smav * to significantly overflow arc_c; since 2912286763Smav * arc_get_data_buf() doesn't check for overflow 2913286763Smav * when it's woken up (it doesn't because it's 2914286763Smav * possible for the ARC to be overflowing while 2915286763Smav * full of un-evictable buffers, and the 2916286763Smav * function should proceed in this case). 2917286763Smav * 2918286763Smav * If threads are left sleeping, due to not 2919286763Smav * using cv_broadcast, they will be woken up 2920286763Smav * just before arc_reclaim_thread() sleeps. 
2921286763Smav */ 2922286763Smav mutex_enter(&arc_reclaim_lock); 2923286763Smav if (!arc_is_overflowing()) 2924286763Smav cv_signal(&arc_reclaim_waiters_cv); 2925286763Smav mutex_exit(&arc_reclaim_lock); 2926168404Spjd } else { 2927286763Smav ARCSTAT_BUMP(arcstat_mutex_miss); 2928168404Spjd } 2929168404Spjd } 2930168404Spjd 2931286763Smav multilist_sublist_unlock(mls); 2932206796Spjd 2933286763Smav return (bytes_evicted); 2934286763Smav} 2935168404Spjd 2936286763Smav/* 2937286763Smav * Evict buffers from the given arc state, until we've removed the 2938286763Smav * specified number of bytes. Move the removed buffers to the 2939286763Smav * appropriate evict state. 2940286763Smav * 2941286763Smav * This function makes a "best effort". It skips over any buffers 2942286763Smav * it can't get a hash_lock on, and so, may not catch all candidates. 2943286763Smav * It may also return without evicting as much space as requested. 2944286763Smav * 2945286763Smav * If bytes is specified using the special value ARC_EVICT_ALL, this 2946286763Smav * will evict all available (i.e. unlocked and evictable) buffers from 2947286763Smav * the given arc state; which is used by arc_flush(). 2948286763Smav */ 2949286763Smavstatic uint64_t 2950286763Smavarc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, 2951286763Smav arc_buf_contents_t type) 2952286763Smav{ 2953286763Smav uint64_t total_evicted = 0; 2954286763Smav multilist_t *ml = &state->arcs_list[type]; 2955286763Smav int num_sublists; 2956286763Smav arc_buf_hdr_t **markers; 2957168404Spjd 2958286763Smav IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); 2959168404Spjd 2960286763Smav num_sublists = multilist_get_num_sublists(ml); 2961286763Smav 2962185029Spjd /* 2963286763Smav * If we've tried to evict from each sublist, made some 2964286763Smav * progress, but still have not hit the target number of bytes 2965286763Smav * to evict, we want to keep trying. The markers allow us to 2966286763Smav * pick up where we left off for each individual sublist, rather 2967286763Smav * than starting from the tail each time. 2968185029Spjd */ 2969286763Smav markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); 2970286763Smav for (int i = 0; i < num_sublists; i++) { 2971286763Smav markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); 2972185029Spjd 2973286763Smav /* 2974286763Smav * A b_spa of 0 is used to indicate that this header is 2975286763Smav * a marker. This fact is used in arc_adjust_type() and 2976286763Smav * arc_evict_state_impl(). 2977286763Smav */ 2978286763Smav markers[i]->b_spa = 0; 2979168404Spjd 2980286763Smav multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 2981286763Smav multilist_sublist_insert_tail(mls, markers[i]); 2982286763Smav multilist_sublist_unlock(mls); 2983286763Smav } 2984168404Spjd 2985286763Smav /* 2986286763Smav * While we haven't hit our target number of bytes to evict, or 2987286763Smav * we're evicting all available buffers. 2988286763Smav */ 2989286763Smav while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { 2990286763Smav /* 2991286763Smav * Start eviction using a randomly selected sublist, 2992286763Smav * this is to try and evenly balance eviction across all 2993286763Smav * sublists. Always starting at the same sublist 2994286763Smav * (e.g. index 0) would cause evictions to favor certain 2995286763Smav * sublists over others. 
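		 *
		 * (After the random start, the loop below simply proceeds
		 * round-robin: sublist_idx wraps from num_sublists - 1
		 * back to 0, so each sublist is scanned at most once per
		 * pass.)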
2996286763Smav */ 2997286763Smav int sublist_idx = multilist_get_random_index(ml); 2998286763Smav uint64_t scan_evicted = 0; 2999219089Spjd 3000286763Smav for (int i = 0; i < num_sublists; i++) { 3001286763Smav uint64_t bytes_remaining; 3002286763Smav uint64_t bytes_evicted; 3003219089Spjd 3004286763Smav if (bytes == ARC_EVICT_ALL) 3005286763Smav bytes_remaining = ARC_EVICT_ALL; 3006286763Smav else if (total_evicted < bytes) 3007286763Smav bytes_remaining = bytes - total_evicted; 3008286763Smav else 3009286763Smav break; 3010258632Savg 3011286763Smav bytes_evicted = arc_evict_state_impl(ml, sublist_idx, 3012286763Smav markers[sublist_idx], spa, bytes_remaining); 3013286763Smav 3014286763Smav scan_evicted += bytes_evicted; 3015286763Smav total_evicted += bytes_evicted; 3016286763Smav 3017286763Smav /* we've reached the end, wrap to the beginning */ 3018286763Smav if (++sublist_idx >= num_sublists) 3019286763Smav sublist_idx = 0; 3020286763Smav } 3021286763Smav 3022258632Savg /* 3023286763Smav * If we didn't evict anything during this scan, we have 3024286763Smav * no reason to believe we'll evict more during another 3025286763Smav * scan, so break the loop. 3026258632Savg */ 3027286763Smav if (scan_evicted == 0) { 3028286763Smav /* This isn't possible, let's make that obvious */ 3029286763Smav ASSERT3S(bytes, !=, 0); 3030185029Spjd 3031286763Smav /* 3032286763Smav * When bytes is ARC_EVICT_ALL, the only way to 3033286763Smav * break the loop is when scan_evicted is zero. 3034286763Smav * In that case, we actually have evicted enough, 3035286763Smav * so we don't want to increment the kstat. 3036286763Smav */ 3037286763Smav if (bytes != ARC_EVICT_ALL) { 3038286763Smav ASSERT3S(total_evicted, <, bytes); 3039286763Smav ARCSTAT_BUMP(arcstat_evict_not_enough); 3040185029Spjd } 3041185029Spjd 3042286763Smav break; 3043258632Savg } 3044286763Smav } 3045258632Savg 3046286763Smav for (int i = 0; i < num_sublists; i++) { 3047286763Smav multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 3048286763Smav multilist_sublist_remove(mls, markers[i]); 3049286763Smav multilist_sublist_unlock(mls); 3050286763Smav 3051286763Smav kmem_cache_free(hdr_full_cache, markers[i]); 3052168404Spjd } 3053286763Smav kmem_free(markers, sizeof (*markers) * num_sublists); 3054206796Spjd 3055286763Smav return (total_evicted); 3056286763Smav} 3057286763Smav 3058286763Smav/* 3059286763Smav * Flush all "evictable" data of the given type from the arc state 3060286763Smav * specified. This will not evict any "active" buffers (i.e. referenced). 3061286763Smav * 3062286763Smav * When 'retry' is set to FALSE, the function will make a single pass 3063286763Smav * over the state and evict any buffers that it can. Since it doesn't 3064286763Smav * continually retry the eviction, it might end up leaving some buffers 3065286763Smav * in the ARC due to lock misses. 3066286763Smav * 3067286763Smav * When 'retry' is set to TRUE, the function will continually retry the 3068286763Smav * eviction until *all* evictable buffers have been removed from the 3069286763Smav * state. As a result, if concurrent insertions into the state are 3070286763Smav * allowed (e.g. if the ARC isn't shutting down), this function might 3071286763Smav * wind up in an infinite loop, continually trying to evict buffers. 
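 *
 * For example, arc_flush() below drains one state/type pair at a time
 * with calls of the form:
 *
 *	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
 *
 * and passes retry == TRUE only when no spa is specified, since with a
 * specific spa there is no good way to tell when all of that spa's
 * buffers have been evicted.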
3072286763Smav */
3073286763Smavstatic uint64_t
3074286763Smavarc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
3075286763Smav    boolean_t retry)
3076286763Smav{
3077286763Smav	uint64_t evicted = 0;
3078286763Smav
3079286763Smav	while (state->arcs_lsize[type] != 0) {
3080286763Smav		evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
3081286763Smav
3082286763Smav		if (!retry)
3083286763Smav			break;
3084185029Spjd	}
3085185029Spjd
3086286763Smav	return (evicted);
3087286763Smav}
3088286763Smav
3089286763Smav/*
3090286763Smav * Evict the specified number of bytes from the state specified,
3091286763Smav * restricting eviction to the spa and type given. This function
3092286763Smav * prevents us from trying to evict more from a state's list than
3093286763Smav * is "evictable", and skips evicting altogether when passed a
3094286763Smav * negative value for "bytes". In contrast, arc_evict_state() will
3095286763Smav * evict everything it can, when passed a negative value for "bytes".
3096286763Smav */
3097286763Smavstatic uint64_t
3098286763Smavarc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
3099286763Smav    arc_buf_contents_t type)
3100286763Smav{
3101286763Smav	int64_t delta;
3102286763Smav
3103286763Smav	if (bytes > 0 && state->arcs_lsize[type] > 0) {
3104286763Smav		delta = MIN(state->arcs_lsize[type], bytes);
3105286763Smav		return (arc_evict_state(state, spa, delta, type));
3106168404Spjd	}
3107168404Spjd
3108286763Smav	return (0);
3109168404Spjd}
3110168404Spjd
3111286763Smav/*
3112286763Smav * Evict metadata buffers from the cache, such that arc_meta_used is
3113286763Smav * capped by the arc_meta_limit tunable.
3114286763Smav */
3115286763Smavstatic uint64_t
3116286763Smavarc_adjust_meta(void)
3117286763Smav{
3118286763Smav	uint64_t total_evicted = 0;
3119286763Smav	int64_t target;
3120286763Smav
3121286763Smav	/*
3122286763Smav	 * If we're over the meta limit, we want to evict enough
3123286763Smav	 * metadata to get back under the meta limit. We don't want to
3124286763Smav	 * evict so much that we drop the MRU below arc_p, though. If
3125286763Smav	 * we're over the meta limit more than we're over arc_p, we
3126286763Smav	 * evict some from the MRU here, and some from the MFU below.
3127286763Smav	 */
3128286763Smav	target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
3129286766Smav	    (int64_t)(refcount_count(&arc_anon->arcs_size) +
3130286766Smav	    refcount_count(&arc_mru->arcs_size) - arc_p));
3131286763Smav
3132286763Smav	total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
3133286763Smav
3134286763Smav	/*
3135286763Smav	 * Similar to the above, we want to evict enough bytes to get us
3136286763Smav	 * below the meta limit, but not so much as to drop us below the
3137286763Smav	 * space allotted to the MFU (which is defined as arc_c - arc_p).
3138286763Smav	 */
3139286763Smav	target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
3140286766Smav	    (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
3141286763Smav
3142286763Smav	total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
3143286763Smav
3144286763Smav	return (total_evicted);
3145286763Smav}
3146286763Smav
3147286763Smav/*
3148286763Smav * Return the type of the oldest buffer in the given arc state
3149286763Smav *
3150286763Smav * This function will select a random sublist of type ARC_BUFC_DATA and
3151286763Smav * a random sublist of type ARC_BUFC_METADATA.
The tail of each sublist 3152286763Smav * is compared, and the type which contains the "older" buffer will be 3153286763Smav * returned. 3154286763Smav */ 3155286763Smavstatic arc_buf_contents_t 3156286763Smavarc_adjust_type(arc_state_t *state) 3157286763Smav{ 3158286763Smav multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA]; 3159286763Smav multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA]; 3160286763Smav int data_idx = multilist_get_random_index(data_ml); 3161286763Smav int meta_idx = multilist_get_random_index(meta_ml); 3162286763Smav multilist_sublist_t *data_mls; 3163286763Smav multilist_sublist_t *meta_mls; 3164286763Smav arc_buf_contents_t type; 3165286763Smav arc_buf_hdr_t *data_hdr; 3166286763Smav arc_buf_hdr_t *meta_hdr; 3167286763Smav 3168286763Smav /* 3169286763Smav * We keep the sublist lock until we're finished, to prevent 3170286763Smav * the headers from being destroyed via arc_evict_state(). 3171286763Smav */ 3172286763Smav data_mls = multilist_sublist_lock(data_ml, data_idx); 3173286763Smav meta_mls = multilist_sublist_lock(meta_ml, meta_idx); 3174286763Smav 3175286763Smav /* 3176286763Smav * These two loops are to ensure we skip any markers that 3177286763Smav * might be at the tail of the lists due to arc_evict_state(). 3178286763Smav */ 3179286763Smav 3180286763Smav for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; 3181286763Smav data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { 3182286763Smav if (data_hdr->b_spa != 0) 3183286763Smav break; 3184286763Smav } 3185286763Smav 3186286763Smav for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; 3187286763Smav meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { 3188286763Smav if (meta_hdr->b_spa != 0) 3189286763Smav break; 3190286763Smav } 3191286763Smav 3192286763Smav if (data_hdr == NULL && meta_hdr == NULL) { 3193286763Smav type = ARC_BUFC_DATA; 3194286763Smav } else if (data_hdr == NULL) { 3195286763Smav ASSERT3P(meta_hdr, !=, NULL); 3196286763Smav type = ARC_BUFC_METADATA; 3197286763Smav } else if (meta_hdr == NULL) { 3198286763Smav ASSERT3P(data_hdr, !=, NULL); 3199286763Smav type = ARC_BUFC_DATA; 3200286763Smav } else { 3201286763Smav ASSERT3P(data_hdr, !=, NULL); 3202286763Smav ASSERT3P(meta_hdr, !=, NULL); 3203286763Smav 3204286763Smav /* The headers can't be on the sublist without an L1 header */ 3205286763Smav ASSERT(HDR_HAS_L1HDR(data_hdr)); 3206286763Smav ASSERT(HDR_HAS_L1HDR(meta_hdr)); 3207286763Smav 3208286763Smav if (data_hdr->b_l1hdr.b_arc_access < 3209286763Smav meta_hdr->b_l1hdr.b_arc_access) { 3210286763Smav type = ARC_BUFC_DATA; 3211286763Smav } else { 3212286763Smav type = ARC_BUFC_METADATA; 3213286763Smav } 3214286763Smav } 3215286763Smav 3216286763Smav multilist_sublist_unlock(meta_mls); 3217286763Smav multilist_sublist_unlock(data_mls); 3218286763Smav 3219286763Smav return (type); 3220286763Smav} 3221286763Smav 3222286763Smav/* 3223286763Smav * Evict buffers from the cache, such that arc_size is capped by arc_c. 3224286763Smav */ 3225286763Smavstatic uint64_t 3226168404Spjdarc_adjust(void) 3227168404Spjd{ 3228286763Smav uint64_t total_evicted = 0; 3229286763Smav uint64_t bytes; 3230286763Smav int64_t target; 3231168404Spjd 3232208373Smm /* 3233286763Smav * If we're over arc_meta_limit, we want to correct that before 3234286763Smav * potentially evicting data buffers below. 
3235286763Smav */ 3236286763Smav total_evicted += arc_adjust_meta(); 3237286763Smav 3238286763Smav /* 3239208373Smm * Adjust MRU size 3240286763Smav * 3241286763Smav * If we're over the target cache size, we want to evict enough 3242286763Smav * from the list to get back to our target size. We don't want 3243286763Smav * to evict too much from the MRU, such that it drops below 3244286763Smav * arc_p. So, if we're over our target cache size more than 3245286763Smav * the MRU is over arc_p, we'll evict enough to get back to 3246286763Smav * arc_p here, and then evict more from the MFU below. 3247208373Smm */ 3248286763Smav target = MIN((int64_t)(arc_size - arc_c), 3249286766Smav (int64_t)(refcount_count(&arc_anon->arcs_size) + 3250286766Smav refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p)); 3251208373Smm 3252286763Smav /* 3253286763Smav * If we're below arc_meta_min, always prefer to evict data. 3254286763Smav * Otherwise, try to satisfy the requested number of bytes to 3255286763Smav * evict from the type which contains older buffers; in an 3256286763Smav * effort to keep newer buffers in the cache regardless of their 3257286763Smav * type. If we cannot satisfy the number of bytes from this 3258286763Smav * type, spill over into the next type. 3259286763Smav */ 3260286763Smav if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && 3261286763Smav arc_meta_used > arc_meta_min) { 3262286763Smav bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 3263286763Smav total_evicted += bytes; 3264168404Spjd 3265286763Smav /* 3266286763Smav * If we couldn't evict our target number of bytes from 3267286763Smav * metadata, we try to get the rest from data. 3268286763Smav */ 3269286763Smav target -= bytes; 3270286763Smav 3271286763Smav total_evicted += 3272286763Smav arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 3273286763Smav } else { 3274286763Smav bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 3275286763Smav total_evicted += bytes; 3276286763Smav 3277286763Smav /* 3278286763Smav * If we couldn't evict our target number of bytes from 3279286763Smav * data, we try to get the rest from metadata. 3280286763Smav */ 3281286763Smav target -= bytes; 3282286763Smav 3283286763Smav total_evicted += 3284286763Smav arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 3285185029Spjd } 3286185029Spjd 3287208373Smm /* 3288208373Smm * Adjust MFU size 3289286763Smav * 3290286763Smav * Now that we've tried to evict enough from the MRU to get its 3291286763Smav * size back to arc_p, if we're still above the target cache 3292286763Smav * size, we evict the rest from the MFU. 3293208373Smm */ 3294286763Smav target = arc_size - arc_c; 3295168404Spjd 3296286764Smav if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA && 3297286763Smav arc_meta_used > arc_meta_min) { 3298286763Smav bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 3299286763Smav total_evicted += bytes; 3300208373Smm 3301286763Smav /* 3302286763Smav * If we couldn't evict our target number of bytes from 3303286763Smav * metadata, we try to get the rest from data. 
3304286763Smav		 */
3305286763Smav		target -= bytes;
3306168404Spjd
3307286763Smav		total_evicted +=
3308286763Smav		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
3309286763Smav	} else {
3310286763Smav		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
3311286763Smav		total_evicted += bytes;
3312286763Smav
3313286763Smav		/*
3314286763Smav		 * If we couldn't evict our target number of bytes from
3315286763Smav		 * data, we try to get the rest from metadata.
3316286763Smav		 */
3317286763Smav		target -= bytes;
3318286763Smav
3319286763Smav		total_evicted +=
3320286763Smav		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
3321208373Smm	}
3322168404Spjd
3323208373Smm	/*
3324208373Smm	 * Adjust ghost lists
3325286763Smav	 *
3326286763Smav	 * In addition to the above, the ARC also defines target values
3327286763Smav	 * for the ghost lists. The sum of the mru list and mru ghost
3328286763Smav	 * list should never exceed the target size of the cache, and
3329286763Smav	 * the sum of the mru list, mfu list, mru ghost list, and mfu
3330286763Smav	 * ghost list should never exceed twice the target size of the
3331286763Smav	 * cache. The following logic enforces these limits on the ghost
3332286763Smav	 * caches, and evicts from them as needed.
3333208373Smm	 */
3334286766Smav	target = refcount_count(&arc_mru->arcs_size) +
3335286766Smav	    refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
3336168404Spjd
3337286763Smav	bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
3338286763Smav	total_evicted += bytes;
3339168404Spjd
3340286763Smav	target -= bytes;
3341185029Spjd
3342286763Smav	total_evicted +=
3343286763Smav	    arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
3344208373Smm
3345286763Smav	/*
3346286763Smav	 * We assume the sum of the mru list and mfu list is less than
3347286763Smav	 * or equal to arc_c (we enforced this above), which means we
3348286763Smav	 * can use the simpler of the two equations below:
3349286763Smav	 *
3350286763Smav	 *	mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
3351286763Smav	 *	mru ghost + mfu ghost <= arc_c
3352286763Smav	 */
3353286766Smav	target = refcount_count(&arc_mru_ghost->arcs_size) +
3354286766Smav	    refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
3355286763Smav
3356286763Smav	bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
3357286763Smav	total_evicted += bytes;
3358286763Smav
3359286763Smav	target -= bytes;
3360286763Smav
3361286763Smav	total_evicted +=
3362286763Smav	    arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
3363286763Smav
3364286763Smav	return (total_evicted);
3365168404Spjd}
3366168404Spjd
3367168404Spjdstatic void
3368168404Spjdarc_do_user_evicts(void)
3369168404Spjd{
3370286763Smav	mutex_enter(&arc_user_evicts_lock);
3371286762Smav	while (arc_eviction_list != NULL) {
3372286762Smav		arc_buf_t *buf = arc_eviction_list;
3373286762Smav		arc_eviction_list = buf->b_next;
3374219089Spjd		mutex_enter(&buf->b_evict_lock);
3375168404Spjd		buf->b_hdr = NULL;
3376219089Spjd		mutex_exit(&buf->b_evict_lock);
3377286763Smav		mutex_exit(&arc_user_evicts_lock);
3378168404Spjd
3379168404Spjd		if (buf->b_efunc != NULL)
3380268858Sdelphij			VERIFY0(buf->b_efunc(buf->b_private));
3381168404Spjd
3382168404Spjd		buf->b_efunc = NULL;
3383168404Spjd		buf->b_private = NULL;
3384168404Spjd		kmem_cache_free(buf_cache, buf);
3385286763Smav		mutex_enter(&arc_user_evicts_lock);
3386168404Spjd	}
3387286763Smav	mutex_exit(&arc_user_evicts_lock);
3388168404Spjd}
3389168404Spjd
3390168404Spjdvoid
3391286763Smavarc_flush(spa_t *spa, boolean_t retry)
3392168404Spjd{ 3393209962Smm uint64_t guid = 0; 3394209962Smm 3395286763Smav /* 3396286763Smav * If retry is TRUE, a spa must not be specified since we have 3397286763Smav * no good way to determine if all of a spa's buffers have been 3398286763Smav * evicted from an arc state. 3399286763Smav */ 3400286763Smav ASSERT(!retry || spa == 0); 3401286763Smav 3402286570Smav if (spa != NULL) 3403228103Smm guid = spa_load_guid(spa); 3404209962Smm 3405286763Smav (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); 3406286763Smav (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); 3407168404Spjd 3408286763Smav (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); 3409286763Smav (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); 3410168404Spjd 3411286763Smav (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); 3412286763Smav (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); 3413286763Smav 3414286763Smav (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); 3415286763Smav (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); 3416286763Smav 3417168404Spjd arc_do_user_evicts(); 3418185029Spjd ASSERT(spa || arc_eviction_list == NULL); 3419168404Spjd} 3420168404Spjd 3421168404Spjdvoid 3422286625Smavarc_shrink(int64_t to_free) 3423168404Spjd{ 3424168404Spjd if (arc_c > arc_c_min) { 3425272483Ssmh DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, 3426272483Ssmh arc_c_min, uint64_t, arc_p, uint64_t, to_free); 3427168404Spjd if (arc_c > arc_c_min + to_free) 3428168404Spjd atomic_add_64(&arc_c, -to_free); 3429168404Spjd else 3430168404Spjd arc_c = arc_c_min; 3431168404Spjd 3432168404Spjd atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 3433168404Spjd if (arc_c > arc_size) 3434168404Spjd arc_c = MAX(arc_size, arc_c_min); 3435168404Spjd if (arc_p > arc_c) 3436168404Spjd arc_p = (arc_c >> 1); 3437272483Ssmh 3438272483Ssmh DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, 3439272483Ssmh arc_p); 3440272483Ssmh 3441168404Spjd ASSERT(arc_c >= arc_c_min); 3442168404Spjd ASSERT((int64_t)arc_p >= 0); 3443168404Spjd } 3444168404Spjd 3445270759Ssmh if (arc_size > arc_c) { 3446270759Ssmh DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size, 3447270759Ssmh uint64_t, arc_c); 3448286763Smav (void) arc_adjust(); 3449270759Ssmh } 3450168404Spjd} 3451168404Spjd 3452286625Smavstatic long needfree = 0; 3453168404Spjd 3454286625Smavtypedef enum free_memory_reason_t { 3455286625Smav FMR_UNKNOWN, 3456286625Smav FMR_NEEDFREE, 3457286625Smav FMR_LOTSFREE, 3458286625Smav FMR_SWAPFS_MINFREE, 3459286625Smav FMR_PAGES_PP_MAXIMUM, 3460286625Smav FMR_HEAP_ARENA, 3461286625Smav FMR_ZIO_ARENA, 3462286625Smav FMR_ZIO_FRAG, 3463286625Smav} free_memory_reason_t; 3464286625Smav 3465286625Smavint64_t last_free_memory; 3466286625Smavfree_memory_reason_t last_free_reason; 3467286625Smav 3468286625Smav/* 3469286625Smav * Additional reserve of pages for pp_reserve. 3470286625Smav */ 3471286625Smavint64_t arc_pages_pp_reserve = 64; 3472286625Smav 3473286625Smav/* 3474286625Smav * Additional reserve of pages for swapfs. 3475286625Smav */ 3476286625Smavint64_t arc_swapfs_reserve = 64; 3477286625Smav 3478286625Smav/* 3479286625Smav * Return the amount of memory that can be consumed before reclaim will be 3480286625Smav * needed. Positive if there is sufficient free memory, negative indicates 3481286625Smav * the amount of memory that needs to be freed up. 
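 *
 * Each check below follows the same pattern (sketched here in pseudo-form;
 * FMR_<reason> stands for whichever constant applies):
 *
 *	n = <bytes to spare under this particular limit>;
 *	if (n < lowest) {
 *		lowest = n;
 *		r = FMR_<reason>;
 *	}
 *
 * so the return value is the headroom of the most constrained resource,
 * and last_free_reason records which limit that was.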
3482286625Smav */ 3483286625Smavstatic int64_t 3484286625Smavarc_available_memory(void) 3485168404Spjd{ 3486286625Smav int64_t lowest = INT64_MAX; 3487286625Smav int64_t n; 3488286625Smav free_memory_reason_t r = FMR_UNKNOWN; 3489168404Spjd 3490168404Spjd#ifdef _KERNEL 3491286625Smav if (needfree > 0) { 3492286625Smav n = PAGESIZE * (-needfree); 3493286625Smav if (n < lowest) { 3494286625Smav lowest = n; 3495286625Smav r = FMR_NEEDFREE; 3496286625Smav } 3497270759Ssmh } 3498168404Spjd 3499191902Skmacy /* 3500212780Savg * Cooperate with pagedaemon when it's time for it to scan 3501212780Savg * and reclaim some pages. 3502191902Skmacy */ 3503286655Smav n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); 3504286625Smav if (n < lowest) { 3505286625Smav lowest = n; 3506286625Smav r = FMR_LOTSFREE; 3507270759Ssmh } 3508191902Skmacy 3509277300Ssmh#ifdef illumos 3510168404Spjd /* 3511185029Spjd * check that we're out of range of the pageout scanner. It starts to 3512185029Spjd * schedule paging if freemem is less than lotsfree and needfree. 3513185029Spjd * lotsfree is the high-water mark for pageout, and needfree is the 3514185029Spjd * number of needed free pages. We add extra pages here to make sure 3515185029Spjd * the scanner doesn't start up while we're freeing memory. 3516185029Spjd */ 3517286625Smav n = PAGESIZE * (freemem - lotsfree - needfree - desfree); 3518286625Smav if (n < lowest) { 3519286625Smav lowest = n; 3520286625Smav r = FMR_LOTSFREE; 3521286625Smav } 3522185029Spjd 3523185029Spjd /* 3524168404Spjd * check to make sure that swapfs has enough space so that anon 3525185029Spjd * reservations can still succeed. anon_resvmem() checks that the 3526168404Spjd * availrmem is greater than swapfs_minfree, and the number of reserved 3527168404Spjd * swap pages. We also add a bit of extra here just to prevent 3528168404Spjd * circumstances from getting really dire. 3529168404Spjd */ 3530286625Smav n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - 3531286625Smav desfree - arc_swapfs_reserve); 3532286625Smav if (n < lowest) { 3533286625Smav lowest = n; 3534286625Smav r = FMR_SWAPFS_MINFREE; 3535286625Smav } 3536168404Spjd 3537286625Smav 3538168404Spjd /* 3539272483Ssmh * Check that we have enough availrmem that memory locking (e.g., via 3540272483Ssmh * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum 3541272483Ssmh * stores the number of pages that cannot be locked; when availrmem 3542272483Ssmh * drops below pages_pp_maximum, page locking mechanisms such as 3543272483Ssmh * page_pp_lock() will fail.) 3544272483Ssmh */ 3545286625Smav n = PAGESIZE * (availrmem - pages_pp_maximum - 3546286625Smav arc_pages_pp_reserve); 3547286625Smav if (n < lowest) { 3548286625Smav lowest = n; 3549286625Smav r = FMR_PAGES_PP_MAXIMUM; 3550286625Smav } 3551272483Ssmh 3552277300Ssmh#endif /* illumos */ 3553272483Ssmh#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 3554272483Ssmh /* 3555168404Spjd * If we're on an i386 platform, it's possible that we'll exhaust the 3556168404Spjd * kernel heap space before we ever run out of available physical 3557168404Spjd * memory. Most checks of the size of the heap_area compare against 3558168404Spjd * tune.t_minarmem, which is the minimum available real memory that we 3559168404Spjd * can have in the system. However, this is generally fixed at 25 pages 3560168404Spjd * which is so low that it's useless. 
In this comparison, we seek to 3561168404Spjd * calculate the total heap-size, and reclaim if more than 3/4ths of the 3562185029Spjd * heap is allocated. (Or, in the calculation, if less than 1/4th is 3563168404Spjd * free) 3564168404Spjd */ 3565286655Smav n = (int64_t)vmem_size(heap_arena, VMEM_FREE) - 3566286628Smav (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2); 3567286625Smav if (n < lowest) { 3568286625Smav lowest = n; 3569286625Smav r = FMR_HEAP_ARENA; 3570270861Ssmh } 3571281026Smav#define zio_arena NULL 3572281026Smav#else 3573281026Smav#define zio_arena heap_arena 3574270861Ssmh#endif 3575281026Smav 3576272483Ssmh /* 3577272483Ssmh * If zio data pages are being allocated out of a separate heap segment, 3578272483Ssmh * then enforce that the size of available vmem for this arena remains 3579272483Ssmh * above about 1/16th free. 3580272483Ssmh * 3581272483Ssmh * Note: The 1/16th arena free requirement was put in place 3582272483Ssmh * to aggressively evict memory from the arc in order to avoid 3583272483Ssmh * memory fragmentation issues. 3584272483Ssmh */ 3585286625Smav if (zio_arena != NULL) { 3586286655Smav n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - 3587286625Smav (vmem_size(zio_arena, VMEM_ALLOC) >> 4); 3588286625Smav if (n < lowest) { 3589286625Smav lowest = n; 3590286625Smav r = FMR_ZIO_ARENA; 3591286625Smav } 3592286625Smav } 3593281026Smav 3594281026Smav /* 3595281026Smav * Above limits know nothing about real level of KVA fragmentation. 3596281026Smav * Start aggressive reclamation if too little sequential KVA left. 3597281026Smav */ 3598286625Smav if (lowest > 0) { 3599286625Smav n = (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) ? 3600286655Smav -((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) : 3601286655Smav INT64_MAX; 3602286625Smav if (n < lowest) { 3603286625Smav lowest = n; 3604286625Smav r = FMR_ZIO_FRAG; 3605286625Smav } 3606281109Smav } 3607281026Smav 3608272483Ssmh#else /* _KERNEL */ 3609286625Smav /* Every 100 calls, free a small amount */ 3610168404Spjd if (spa_get_random(100) == 0) 3611286625Smav lowest = -1024; 3612272483Ssmh#endif /* _KERNEL */ 3613270759Ssmh 3614286625Smav last_free_memory = lowest; 3615286625Smav last_free_reason = r; 3616286625Smav DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); 3617286625Smav return (lowest); 3618168404Spjd} 3619168404Spjd 3620286625Smav 3621286625Smav/* 3622286625Smav * Determine if the system is under memory pressure and is asking 3623286625Smav * to reclaim memory. A return value of TRUE indicates that the system 3624286625Smav * is under memory pressure and that the arc should adjust accordingly. 3625286625Smav */ 3626286625Smavstatic boolean_t 3627286625Smavarc_reclaim_needed(void) 3628286625Smav{ 3629286625Smav return (arc_available_memory() < 0); 3630286625Smav} 3631286625Smav 3632208454Spjdextern kmem_cache_t *zio_buf_cache[]; 3633208454Spjdextern kmem_cache_t *zio_data_buf_cache[]; 3634272527Sdelphijextern kmem_cache_t *range_seg_cache; 3635208454Spjd 3636278040Ssmhstatic __noinline void 3637286625Smavarc_kmem_reap_now(void) 3638168404Spjd{ 3639168404Spjd size_t i; 3640168404Spjd kmem_cache_t *prev_cache = NULL; 3641168404Spjd kmem_cache_t *prev_data_cache = NULL; 3642168404Spjd 3643272483Ssmh DTRACE_PROBE(arc__kmem_reap_start); 3644168404Spjd#ifdef _KERNEL 3645185029Spjd if (arc_meta_used >= arc_meta_limit) { 3646185029Spjd /* 3647185029Spjd * We are exceeding our meta-data cache limit. 3648185029Spjd * Purge some DNLC entries to release holds on meta-data. 
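		 *
		 * (On FreeBSD, dnlc_reduce_cache() is implemented later
		 * in this file: it hands the request off to the
		 * arc_dnlc_evicts thread, which calls vnlru_free().)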
3649185029Spjd */ 3650185029Spjd dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 3651185029Spjd } 3652168404Spjd#if defined(__i386) 3653168404Spjd /* 3654168404Spjd * Reclaim unused memory from all kmem caches. 3655168404Spjd */ 3656168404Spjd kmem_reap(); 3657168404Spjd#endif 3658168404Spjd#endif 3659168404Spjd 3660168404Spjd for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 3661168404Spjd if (zio_buf_cache[i] != prev_cache) { 3662168404Spjd prev_cache = zio_buf_cache[i]; 3663168404Spjd kmem_cache_reap_now(zio_buf_cache[i]); 3664168404Spjd } 3665168404Spjd if (zio_data_buf_cache[i] != prev_data_cache) { 3666168404Spjd prev_data_cache = zio_data_buf_cache[i]; 3667168404Spjd kmem_cache_reap_now(zio_data_buf_cache[i]); 3668168404Spjd } 3669168404Spjd } 3670168404Spjd kmem_cache_reap_now(buf_cache); 3671286570Smav kmem_cache_reap_now(hdr_full_cache); 3672286570Smav kmem_cache_reap_now(hdr_l2only_cache); 3673272506Sdelphij kmem_cache_reap_now(range_seg_cache); 3674272483Ssmh 3675277300Ssmh#ifdef illumos 3676286625Smav if (zio_arena != NULL) { 3677286625Smav /* 3678286625Smav * Ask the vmem arena to reclaim unused memory from its 3679286625Smav * quantum caches. 3680286625Smav */ 3681272483Ssmh vmem_qcache_reap(zio_arena); 3682286625Smav } 3683272483Ssmh#endif 3684272483Ssmh DTRACE_PROBE(arc__kmem_reap_end); 3685168404Spjd} 3686168404Spjd 3687286763Smav/* 3688286763Smav * Threads can block in arc_get_data_buf() waiting for this thread to evict 3689286763Smav * enough data and signal them to proceed. When this happens, the threads in 3690286763Smav * arc_get_data_buf() are sleeping while holding the hash lock for their 3691286763Smav * particular arc header. Thus, we must be careful to never sleep on a 3692286763Smav * hash lock in this thread. This is to prevent the following deadlock: 3693286763Smav * 3694286763Smav * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L", 3695286763Smav * waiting for the reclaim thread to signal it. 3696286763Smav * 3697286763Smav * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, 3698286763Smav * fails, and goes to sleep forever. 3699286763Smav * 3700286763Smav * This possible deadlock is avoided by always acquiring a hash lock 3701286763Smav * using mutex_tryenter() from arc_reclaim_thread(). 3702286763Smav */ 3703168404Spjdstatic void 3704168404Spjdarc_reclaim_thread(void *dummy __unused) 3705168404Spjd{ 3706296530Smav hrtime_t growtime = 0; 3707168404Spjd callb_cpr_t cpr; 3708168404Spjd 3709286763Smav CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); 3710168404Spjd 3711286763Smav mutex_enter(&arc_reclaim_lock); 3712286763Smav while (!arc_reclaim_thread_exit) { 3713286625Smav int64_t free_memory = arc_available_memory(); 3714286763Smav uint64_t evicted = 0; 3715286763Smav 3716286763Smav mutex_exit(&arc_reclaim_lock); 3717286763Smav 3718286625Smav if (free_memory < 0) { 3719168404Spjd 3720286625Smav arc_no_grow = B_TRUE; 3721286625Smav arc_warm = B_TRUE; 3722168404Spjd 3723286625Smav /* 3724286625Smav * Wait at least zfs_grow_retry (default 60) seconds 3725286625Smav * before considering growing. 3726286625Smav */ 3727296530Smav growtime = gethrtime() + SEC2NSEC(arc_grow_retry); 3728168404Spjd 3729286625Smav arc_kmem_reap_now(); 3730286625Smav 3731286625Smav /* 3732286625Smav * If we are still low on memory, shrink the ARC 3733286625Smav * so that we have arc_shrink_min free space. 
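			 *
			 * E.g. (illustrative numbers only): with an
			 * arc_c of 4GB and an arc_shrink_shift of 7,
			 * to_free starts at 32MB, plus however far
			 * free_memory has fallen below zero.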
3734286625Smav */ 3735286625Smav free_memory = arc_available_memory(); 3736286625Smav 3737286625Smav int64_t to_free = 3738286625Smav (arc_c >> arc_shrink_shift) - free_memory; 3739286625Smav if (to_free > 0) { 3740286625Smav#ifdef _KERNEL 3741286625Smav to_free = MAX(to_free, ptob(needfree)); 3742286625Smav#endif 3743286625Smav arc_shrink(to_free); 3744168404Spjd } 3745286625Smav } else if (free_memory < arc_c >> arc_no_grow_shift) { 3746286625Smav arc_no_grow = B_TRUE; 3747296530Smav } else if (gethrtime() >= growtime) { 3748286625Smav arc_no_grow = B_FALSE; 3749168404Spjd } 3750168404Spjd 3751286763Smav evicted = arc_adjust(); 3752168404Spjd 3753286763Smav mutex_enter(&arc_reclaim_lock); 3754168404Spjd 3755286763Smav /* 3756286763Smav * If evicted is zero, we couldn't evict anything via 3757286763Smav * arc_adjust(). This could be due to hash lock 3758286763Smav * collisions, but more likely due to the majority of 3759286763Smav * arc buffers being unevictable. Therefore, even if 3760286763Smav * arc_size is above arc_c, another pass is unlikely to 3761286763Smav * be helpful and could potentially cause us to enter an 3762286763Smav * infinite loop. 3763286763Smav */ 3764286763Smav if (arc_size <= arc_c || evicted == 0) { 3765211762Savg#ifdef _KERNEL 3766185029Spjd needfree = 0; 3767168404Spjd#endif 3768286763Smav /* 3769286763Smav * We're either no longer overflowing, or we 3770286763Smav * can't evict anything more, so we should wake 3771286763Smav * up any threads before we go to sleep. 3772286763Smav */ 3773286763Smav cv_broadcast(&arc_reclaim_waiters_cv); 3774168404Spjd 3775286763Smav /* 3776286763Smav * Block until signaled, or after one second (we 3777286763Smav * might need to perform arc_kmem_reap_now() 3778286763Smav * even if we aren't being signalled) 3779286763Smav */ 3780286763Smav CALLB_CPR_SAFE_BEGIN(&cpr); 3781296530Smav (void) cv_timedwait_hires(&arc_reclaim_thread_cv, 3782296530Smav &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); 3783286763Smav CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); 3784286763Smav } 3785286763Smav } 3786286763Smav 3787286763Smav arc_reclaim_thread_exit = FALSE; 3788286763Smav cv_broadcast(&arc_reclaim_thread_cv); 3789286763Smav CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ 3790286763Smav thread_exit(); 3791286763Smav} 3792286763Smav 3793286763Smavstatic void 3794286763Smavarc_user_evicts_thread(void *dummy __unused) 3795286763Smav{ 3796286763Smav callb_cpr_t cpr; 3797286763Smav 3798286763Smav CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG); 3799286763Smav 3800286763Smav mutex_enter(&arc_user_evicts_lock); 3801286763Smav while (!arc_user_evicts_thread_exit) { 3802286763Smav mutex_exit(&arc_user_evicts_lock); 3803286763Smav 3804286763Smav arc_do_user_evicts(); 3805286763Smav 3806286574Smav /* 3807286574Smav * This is necessary in order for the mdb ::arc dcmd to 3808286574Smav * show up to date information. Since the ::arc command 3809286574Smav * does not call the kstat's update function, without 3810286574Smav * this call, the command may show stale stats for the 3811286574Smav * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even 3812286574Smav * with this change, the data might be up to 1 second 3813286574Smav * out of date; but that should suffice. The arc_state_t 3814286574Smav * structures can be queried directly if more accurate 3815286574Smav * information is needed. 
3816286574Smav		 */
3817286574Smav		if (arc_ksp != NULL)
3818286574Smav			arc_ksp->ks_update(arc_ksp, KSTAT_READ);
3819286574Smav
3820286763Smav		mutex_enter(&arc_user_evicts_lock);
3821286763Smav
3822286763Smav		/*
3823286763Smav		 * Block until signaled, or after one second (we need to
3824286763Smav		 * call the arc's kstat update function regularly).
3825286763Smav		 */
3826168404Spjd		CALLB_CPR_SAFE_BEGIN(&cpr);
3827286763Smav		(void) cv_timedwait(&arc_user_evicts_cv,
3828286763Smav		    &arc_user_evicts_lock, hz);
3829286763Smav		CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock);
3830168404Spjd	}
3831168404Spjd
3832286763Smav	arc_user_evicts_thread_exit = FALSE;
3833286763Smav	cv_broadcast(&arc_user_evicts_cv);
3834286763Smav	CALLB_CPR_EXIT(&cpr);		/* drops arc_user_evicts_lock */
3835168404Spjd	thread_exit();
3836168404Spjd}
3837168404Spjd
3838301997Skibstatic u_int arc_dnlc_evicts_arg;
3839301997Skibextern struct vfsops zfs_vfsops;
3840301997Skib
3841301997Skibstatic void
3842301997Skibarc_dnlc_evicts_thread(void *dummy __unused)
3843301997Skib{
3844301997Skib	callb_cpr_t cpr;
3845301997Skib	u_int percent;
3846301997Skib
3847301997Skib	CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG);
3848301997Skib
3849301997Skib	mutex_enter(&arc_dnlc_evicts_lock);
3850301997Skib	while (!arc_dnlc_evicts_thread_exit) {
3851301997Skib		CALLB_CPR_SAFE_BEGIN(&cpr);
3852301997Skib		(void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
3853301997Skib		CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock);
3854301997Skib		if (arc_dnlc_evicts_arg != 0) {
3855301997Skib			percent = arc_dnlc_evicts_arg;
3856301997Skib			mutex_exit(&arc_dnlc_evicts_lock);
3857301997Skib#ifdef _KERNEL
3858301997Skib			vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops);
3859301997Skib#endif
3860301997Skib			mutex_enter(&arc_dnlc_evicts_lock);
3861301997Skib			/*
3862301997Skib			 * Clear our token only after the vnlru_free()
3863301997Skib			 * pass is done, to avoid false queueing of
3864301997Skib			 * requests.
3865301997Skib			 */
3866301997Skib			arc_dnlc_evicts_arg = 0;
3867301997Skib		}
3868301997Skib	}
3869301997Skib	arc_dnlc_evicts_thread_exit = FALSE;
3870301997Skib	cv_broadcast(&arc_dnlc_evicts_cv);
3871301997Skib	CALLB_CPR_EXIT(&cpr);
3872301997Skib	thread_exit();
3873301997Skib}
3874301997Skib
3875301997Skibvoid
3876301997Skibdnlc_reduce_cache(void *arg)
3877301997Skib{
3878301997Skib	u_int percent;
3879301997Skib
3880302012Skib	percent = (u_int)(uintptr_t)arg;
3881301997Skib	mutex_enter(&arc_dnlc_evicts_lock);
3882301997Skib	if (arc_dnlc_evicts_arg == 0) {
3883301997Skib		arc_dnlc_evicts_arg = percent;
3884301997Skib		cv_broadcast(&arc_dnlc_evicts_cv);
3885301997Skib	}
3886301997Skib	mutex_exit(&arc_dnlc_evicts_lock);
3887301997Skib}
3888301997Skib
3889168404Spjd/*
3890168404Spjd * Adapt arc info given the number of bytes we are trying to add and
3891168404Spjd * the state that we are coming from. This function is only called
3892168404Spjd * when we are adding new content to the cache.
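 *
 * A worked example of the ghost-hit adjustment performed below
 * (illustrative numbers only): on a hit in arc_mru_ghost with
 * mrug_size = 100MB, mfug_size = 300MB and bytes = 8K, mult is
 * 300 / 100 = 3, so arc_p grows by 24K (clamped to arc_c - arc_p_min).
 * A hit in arc_mfu_ghost shrinks arc_p symmetrically, clamped below
 * at arc_p_min.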
3893168404Spjd */ 3894168404Spjdstatic void 3895168404Spjdarc_adapt(int bytes, arc_state_t *state) 3896168404Spjd{ 3897168404Spjd int mult; 3898208373Smm uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 3899286766Smav int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size); 3900286766Smav int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size); 3901168404Spjd 3902185029Spjd if (state == arc_l2c_only) 3903185029Spjd return; 3904185029Spjd 3905168404Spjd ASSERT(bytes > 0); 3906168404Spjd /* 3907168404Spjd * Adapt the target size of the MRU list: 3908168404Spjd * - if we just hit in the MRU ghost list, then increase 3909168404Spjd * the target size of the MRU list. 3910168404Spjd * - if we just hit in the MFU ghost list, then increase 3911168404Spjd * the target size of the MFU list by decreasing the 3912168404Spjd * target size of the MRU list. 3913168404Spjd */ 3914168404Spjd if (state == arc_mru_ghost) { 3915286766Smav mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); 3916209275Smm mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 3917168404Spjd 3918208373Smm arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 3919168404Spjd } else if (state == arc_mfu_ghost) { 3920208373Smm uint64_t delta; 3921208373Smm 3922286766Smav mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size); 3923209275Smm mult = MIN(mult, 10); 3924168404Spjd 3925208373Smm delta = MIN(bytes * mult, arc_p); 3926208373Smm arc_p = MAX(arc_p_min, arc_p - delta); 3927168404Spjd } 3928168404Spjd ASSERT((int64_t)arc_p >= 0); 3929168404Spjd 3930168404Spjd if (arc_reclaim_needed()) { 3931286763Smav cv_signal(&arc_reclaim_thread_cv); 3932168404Spjd return; 3933168404Spjd } 3934168404Spjd 3935168404Spjd if (arc_no_grow) 3936168404Spjd return; 3937168404Spjd 3938168404Spjd if (arc_c >= arc_c_max) 3939168404Spjd return; 3940168404Spjd 3941168404Spjd /* 3942168404Spjd * If we're within (2 * maxblocksize) bytes of the target 3943168404Spjd * cache size, increment the target cache size 3944168404Spjd */ 3945168404Spjd if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 3946272483Ssmh DTRACE_PROBE1(arc__inc_adapt, int, bytes); 3947168404Spjd atomic_add_64(&arc_c, (int64_t)bytes); 3948168404Spjd if (arc_c > arc_c_max) 3949168404Spjd arc_c = arc_c_max; 3950168404Spjd else if (state == arc_anon) 3951168404Spjd atomic_add_64(&arc_p, (int64_t)bytes); 3952168404Spjd if (arc_p > arc_c) 3953168404Spjd arc_p = arc_c; 3954168404Spjd } 3955168404Spjd ASSERT((int64_t)arc_p >= 0); 3956168404Spjd} 3957168404Spjd 3958168404Spjd/* 3959286763Smav * Check if arc_size has grown past our upper threshold, determined by 3960286763Smav * zfs_arc_overflow_shift. 3961168404Spjd */ 3962286763Smavstatic boolean_t 3963286763Smavarc_is_overflowing(void) 3964168404Spjd{ 3965286763Smav /* Always allow at least one block of overflow */ 3966286763Smav uint64_t overflow = MAX(SPA_MAXBLOCKSIZE, 3967286763Smav arc_c >> zfs_arc_overflow_shift); 3968185029Spjd 3969286763Smav return (arc_size >= arc_c + overflow); 3970168404Spjd} 3971168404Spjd 3972168404Spjd/* 3973286763Smav * The buffer, supplied as the first argument, needs a data block. If we 3974286763Smav * are hitting the hard limit for the cache size, we must sleep, waiting 3975286763Smav * for the eviction thread to catch up. If we're past the target size 3976286763Smav * but below the hard limit, we'll only signal the reclaim thread and 3977286763Smav * continue on. 
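 *
 * The "hard limit" here is the arc_is_overflowing() threshold of
 * arc_c + MAX(SPA_MAXBLOCKSIZE, arc_c >> zfs_arc_overflow_shift).
 * For example (illustrative numbers, assuming a zfs_arc_overflow_shift
 * of 8 and an arc_c of 8GB, so that arc_c >> 8 = 32MB exceeds
 * SPA_MAXBLOCKSIZE): writers begin to sleep here once arc_size has
 * passed arc_c by 32MB.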
3978168404Spjd */
3979168404Spjdstatic void
3980168404Spjdarc_get_data_buf(arc_buf_t *buf)
3981168404Spjd{
3982286570Smav	arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
3983168404Spjd	uint64_t size = buf->b_hdr->b_size;
3984286570Smav	arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
3985168404Spjd
3986168404Spjd	arc_adapt(size, state);
3987168404Spjd
3988168404Spjd	/*
3989286763Smav	 * If arc_size is currently overflowing, and has grown past our
3990286763Smav	 * upper limit, we must be adding data faster than the evict
3991286763Smav	 * thread can evict. Thus, to ensure we don't compound the
3992286763Smav	 * problem by adding more data and forcing arc_size to grow even
3993286763Smav	 * further past its target size, we halt and wait for the
3994286763Smav	 * eviction thread to catch up.
3995286763Smav	 *
3996286763Smav	 * It's also possible that the reclaim thread is unable to evict
3997286763Smav	 * enough buffers to get arc_size below the overflow limit (e.g.
3998286763Smav	 * due to buffers being un-evictable, or hash lock collisions).
3999286763Smav	 * In this case, we want to proceed regardless of whether we're
4000286763Smav	 * overflowing; thus we don't use a while loop here.
4001168404Spjd	 */
4002286763Smav	if (arc_is_overflowing()) {
4003286763Smav		mutex_enter(&arc_reclaim_lock);
4004286763Smav
4005286763Smav		/*
4006286763Smav		 * Now that we've acquired the lock, we may no longer be
4007286763Smav		 * over the overflow limit; let's check.
4008286763Smav		 *
4009286763Smav		 * We're ignoring the case of spurious wakeups. If that
4010286763Smav		 * were to happen, it'd let this thread consume an ARC
4011286763Smav		 * buffer before it should have (i.e. before we're under
4012286763Smav		 * the overflow limit and were signalled by the reclaim
4013286763Smav		 * thread). As long as that is a rare occurrence, it
4014286763Smav		 * shouldn't cause any harm.
4015286763Smav		 */
4016286763Smav		if (arc_is_overflowing()) {
4017286763Smav			cv_signal(&arc_reclaim_thread_cv);
4018286763Smav			cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
4019168404Spjd		}
4020286763Smav
4021286763Smav		mutex_exit(&arc_reclaim_lock);
4022168404Spjd	}
4023168404Spjd
4024286763Smav	if (type == ARC_BUFC_METADATA) {
4025286763Smav		buf->b_data = zio_buf_alloc(size);
4026286763Smav		arc_space_consume(size, ARC_SPACE_META);
4027168404Spjd	} else {
4028286763Smav		ASSERT(type == ARC_BUFC_DATA);
4029286763Smav		buf->b_data = zio_data_buf_alloc(size);
4030286763Smav		arc_space_consume(size, ARC_SPACE_DATA);
4031168404Spjd	}
4032286763Smav
4033168404Spjd	/*
4034168404Spjd	 * Update the state size. Note that ghost states have a
4035168404Spjd	 * "ghost size" and so don't need to be updated.
4036168404Spjd	 */
4037286570Smav	if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
4038168404Spjd		arc_buf_hdr_t *hdr = buf->b_hdr;
4039286766Smav		arc_state_t *state = hdr->b_l1hdr.b_state;
4040168404Spjd
4041286766Smav		(void) refcount_add_many(&state->arcs_size, size, buf);
4042286763Smav
4043286763Smav		/*
4044286763Smav		 * If this is reached via arc_read, the link is
4045286763Smav		 * protected by the hash lock. If reached via
4046286763Smav		 * arc_buf_alloc, the header should not be accessed by
4047286763Smav		 * any other thread. And, if reached via arc_read_done,
4048286763Smav		 * the hash lock will protect it if it's found in the
4049286763Smav		 * hash table; otherwise no other thread should be
4050286763Smav		 * trying to [add|remove]_reference it.
4051286763Smav */ 4052286763Smav if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 4053286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4054286570Smav atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], 4055286570Smav size); 4056168404Spjd } 4057168404Spjd /* 4058168404Spjd * If we are growing the cache, and we are adding anonymous 4059168404Spjd * data, and we have outgrown arc_p, update arc_p 4060168404Spjd */ 4061286570Smav if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && 4062286766Smav (refcount_count(&arc_anon->arcs_size) + 4063286766Smav refcount_count(&arc_mru->arcs_size) > arc_p)) 4064168404Spjd arc_p = MIN(arc_c, arc_p + size); 4065168404Spjd } 4066205231Skmacy ARCSTAT_BUMP(arcstat_allocated); 4067168404Spjd} 4068168404Spjd 4069168404Spjd/* 4070168404Spjd * This routine is called whenever a buffer is accessed. 4071168404Spjd * NOTE: the hash lock is dropped in this function. 4072168404Spjd */ 4073168404Spjdstatic void 4074275811Sdelphijarc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 4075168404Spjd{ 4076219089Spjd clock_t now; 4077219089Spjd 4078168404Spjd ASSERT(MUTEX_HELD(hash_lock)); 4079286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 4080168404Spjd 4081286570Smav if (hdr->b_l1hdr.b_state == arc_anon) { 4082168404Spjd /* 4083168404Spjd * This buffer is not in the cache, and does not 4084168404Spjd * appear in our "ghost" list. Add the new buffer 4085168404Spjd * to the MRU state. 4086168404Spjd */ 4087168404Spjd 4088286570Smav ASSERT0(hdr->b_l1hdr.b_arc_access); 4089286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4090275811Sdelphij DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 4091275811Sdelphij arc_change_state(arc_mru, hdr, hash_lock); 4092168404Spjd 4093286570Smav } else if (hdr->b_l1hdr.b_state == arc_mru) { 4094219089Spjd now = ddi_get_lbolt(); 4095219089Spjd 4096168404Spjd /* 4097168404Spjd * If this buffer is here because of a prefetch, then either: 4098168404Spjd * - clear the flag if this is a "referencing" read 4099168404Spjd * (any subsequent access will bump this into the MFU state). 4100168404Spjd * or 4101168404Spjd * - move the buffer to the head of the list if this is 4102168404Spjd * another prefetch (to make it less likely to be evicted). 4103168404Spjd */ 4104286570Smav if (HDR_PREFETCH(hdr)) { 4105286570Smav if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 4106286763Smav /* link protected by hash lock */ 4107286763Smav ASSERT(multilist_link_active( 4108286570Smav &hdr->b_l1hdr.b_arc_node)); 4109168404Spjd } else { 4110275811Sdelphij hdr->b_flags &= ~ARC_FLAG_PREFETCH; 4111168404Spjd ARCSTAT_BUMP(arcstat_mru_hits); 4112168404Spjd } 4113286570Smav hdr->b_l1hdr.b_arc_access = now; 4114168404Spjd return; 4115168404Spjd } 4116168404Spjd 4117168404Spjd /* 4118168404Spjd * This buffer has been "accessed" only once so far, 4119168404Spjd * but it is still in the cache. Move it to the MFU 4120168404Spjd * state. 4121168404Spjd */ 4122286570Smav if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { 4123168404Spjd /* 4124168404Spjd * More than 125ms have passed since we 4125168404Spjd * instantiated this buffer. Move it to the 4126168404Spjd * most frequently used state. 
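 *
 * For example (hypothetical timings): a second read 10ms after
 * the first still falls inside the ARC_MINTIME window and leaves
 * the buffer in the MRU state, while a read 200ms later promotes
 * it to MFU. This treats a quick burst of accesses to one block
 * as a single logical use.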
4127168404Spjd */ 4128286570Smav hdr->b_l1hdr.b_arc_access = now; 4129275811Sdelphij DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4130275811Sdelphij arc_change_state(arc_mfu, hdr, hash_lock); 4131168404Spjd } 4132168404Spjd ARCSTAT_BUMP(arcstat_mru_hits); 4133286570Smav } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { 4134168404Spjd arc_state_t *new_state; 4135168404Spjd /* 4136168404Spjd * This buffer has been "accessed" recently, but 4137168404Spjd * was evicted from the cache. Move it to the 4138168404Spjd * MFU state. 4139168404Spjd */ 4140168404Spjd 4141286570Smav if (HDR_PREFETCH(hdr)) { 4142168404Spjd new_state = arc_mru; 4143286570Smav if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) 4144275811Sdelphij hdr->b_flags &= ~ARC_FLAG_PREFETCH; 4145275811Sdelphij DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 4146168404Spjd } else { 4147168404Spjd new_state = arc_mfu; 4148275811Sdelphij DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4149168404Spjd } 4150168404Spjd 4151286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4152275811Sdelphij arc_change_state(new_state, hdr, hash_lock); 4153168404Spjd 4154168404Spjd ARCSTAT_BUMP(arcstat_mru_ghost_hits); 4155286570Smav } else if (hdr->b_l1hdr.b_state == arc_mfu) { 4156168404Spjd /* 4157168404Spjd * This buffer has been accessed more than once and is 4158168404Spjd * still in the cache. Keep it in the MFU state. 4159168404Spjd * 4160168404Spjd * NOTE: an add_reference() that occurred when we did 4161168404Spjd * the arc_read() will have kicked this off the list. 4162168404Spjd * If it was a prefetch, we will explicitly move it to 4163168404Spjd * the head of the list now. 4164168404Spjd */ 4165286570Smav if ((HDR_PREFETCH(hdr)) != 0) { 4166286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4167286763Smav /* link protected by hash_lock */ 4168286763Smav ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 4169168404Spjd } 4170168404Spjd ARCSTAT_BUMP(arcstat_mfu_hits); 4171286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4172286570Smav } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { 4173168404Spjd arc_state_t *new_state = arc_mfu; 4174168404Spjd /* 4175168404Spjd * This buffer has been accessed more than once but has 4176168404Spjd * been evicted from the cache. Move it back to the 4177168404Spjd * MFU state. 4178168404Spjd */ 4179168404Spjd 4180286570Smav if (HDR_PREFETCH(hdr)) { 4181168404Spjd /* 4182168404Spjd * This is a prefetch access... 4183168404Spjd * move this block back to the MRU state. 4184168404Spjd */ 4185286570Smav ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 4186168404Spjd new_state = arc_mru; 4187168404Spjd } 4188168404Spjd 4189286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4190275811Sdelphij DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4191275811Sdelphij arc_change_state(new_state, hdr, hash_lock); 4192168404Spjd 4193168404Spjd ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 4194286570Smav } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { 4195185029Spjd /* 4196185029Spjd * This buffer is on the 2nd Level ARC. 
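 * Accessing it promotes it straight to the MFU state, as with the
 * ghost hits above. Summarizing, arc_access() moves a header
 * between states roughly as follows (a sketch of the cases handled
 * above, not additional logic):
 *
 *	anon      -> mru
 *	mru       -> mru or mfu	(mru if within ARC_MINTIME or prefetch)
 *	mru_ghost -> mru or mfu	(mru if prefetch)
 *	mfu       -> mfu
 *	mfu_ghost -> mru or mfu	(mru if prefetch)
 *	l2c_only  -> mfu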
4197185029Spjd */ 4198185029Spjd 4199286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4200275811Sdelphij DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4201275811Sdelphij arc_change_state(arc_mfu, hdr, hash_lock); 4202168404Spjd } else { 4203168404Spjd ASSERT(!"invalid arc state"); 4204168404Spjd } 4205168404Spjd} 4206168404Spjd 4207168404Spjd/* a generic arc_done_func_t which you can use */ 4208168404Spjd/* ARGSUSED */ 4209168404Spjdvoid 4210168404Spjdarc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 4211168404Spjd{ 4212219089Spjd if (zio == NULL || zio->io_error == 0) 4213219089Spjd bcopy(buf->b_data, arg, buf->b_hdr->b_size); 4214248571Smm VERIFY(arc_buf_remove_ref(buf, arg)); 4215168404Spjd} 4216168404Spjd 4217185029Spjd/* a generic arc_done_func_t */ 4218168404Spjdvoid 4219168404Spjdarc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 4220168404Spjd{ 4221168404Spjd arc_buf_t **bufp = arg; 4222168404Spjd if (zio && zio->io_error) { 4223248571Smm VERIFY(arc_buf_remove_ref(buf, arg)); 4224168404Spjd *bufp = NULL; 4225168404Spjd } else { 4226168404Spjd *bufp = buf; 4227219089Spjd ASSERT(buf->b_data); 4228168404Spjd } 4229168404Spjd} 4230168404Spjd 4231168404Spjdstatic void 4232168404Spjdarc_read_done(zio_t *zio) 4233168404Spjd{ 4234268075Sdelphij arc_buf_hdr_t *hdr; 4235168404Spjd arc_buf_t *buf; 4236168404Spjd arc_buf_t *abuf; /* buffer we're assigning to callback */ 4237268075Sdelphij kmutex_t *hash_lock = NULL; 4238168404Spjd arc_callback_t *callback_list, *acb; 4239168404Spjd int freeable = FALSE; 4240168404Spjd 4241168404Spjd buf = zio->io_private; 4242168404Spjd hdr = buf->b_hdr; 4243168404Spjd 4244168404Spjd /* 4245168404Spjd * The hdr was inserted into hash-table and removed from lists 4246168404Spjd * prior to starting I/O. We should find this header, since 4247168404Spjd * it's in the hash table, and it should be legit since it's 4248168404Spjd * not possible to evict it during the I/O. The only possible 4249168404Spjd * reason for it not to be found is if we were freed during the 4250168404Spjd * read. 4251168404Spjd */ 4252268075Sdelphij if (HDR_IN_HASH_TABLE(hdr)) { 4253268075Sdelphij ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); 4254268075Sdelphij ASSERT3U(hdr->b_dva.dva_word[0], ==, 4255268075Sdelphij BP_IDENTITY(zio->io_bp)->dva_word[0]); 4256268075Sdelphij ASSERT3U(hdr->b_dva.dva_word[1], ==, 4257268075Sdelphij BP_IDENTITY(zio->io_bp)->dva_word[1]); 4258168404Spjd 4259268075Sdelphij arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, 4260268075Sdelphij &hash_lock); 4261168404Spjd 4262268075Sdelphij ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && 4263268075Sdelphij hash_lock == NULL) || 4264268075Sdelphij (found == hdr && 4265268075Sdelphij DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 4266268075Sdelphij (found == hdr && HDR_L2_READING(hdr))); 4267268075Sdelphij } 4268268075Sdelphij 4269275811Sdelphij hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; 4270286570Smav if (l2arc_noprefetch && HDR_PREFETCH(hdr)) 4271275811Sdelphij hdr->b_flags &= ~ARC_FLAG_L2CACHE; 4272206796Spjd 4273168404Spjd /* byteswap if necessary */ 4274286570Smav callback_list = hdr->b_l1hdr.b_acb; 4275168404Spjd ASSERT(callback_list != NULL); 4276209101Smm if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 4277236884Smm dmu_object_byteswap_t bswap = 4278236884Smm DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 4279185029Spjd arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 
4280185029Spjd byteswap_uint64_array : 4281236884Smm dmu_ot_byteswap[bswap].ob_func; 4282185029Spjd func(buf->b_data, hdr->b_size); 4283185029Spjd } 4284168404Spjd 4285185029Spjd arc_cksum_compute(buf, B_FALSE); 4286240133Smm#ifdef illumos 4287240133Smm arc_buf_watch(buf); 4288277300Ssmh#endif 4289168404Spjd 4290286570Smav if (hash_lock && zio->io_error == 0 && 4291286570Smav hdr->b_l1hdr.b_state == arc_anon) { 4292219089Spjd /* 4293219089Spjd * Only call arc_access on anonymous buffers. This is because 4294219089Spjd * if we've issued an I/O for an evicted buffer, we've already 4295219089Spjd * called arc_access (to prevent any simultaneous readers from 4296219089Spjd * getting confused). 4297219089Spjd */ 4298219089Spjd arc_access(hdr, hash_lock); 4299219089Spjd } 4300219089Spjd 4301168404Spjd /* create copies of the data buffer for the callers */ 4302168404Spjd abuf = buf; 4303168404Spjd for (acb = callback_list; acb; acb = acb->acb_next) { 4304168404Spjd if (acb->acb_done) { 4305242845Sdelphij if (abuf == NULL) { 4306242845Sdelphij ARCSTAT_BUMP(arcstat_duplicate_reads); 4307168404Spjd abuf = arc_buf_clone(buf); 4308242845Sdelphij } 4309168404Spjd acb->acb_buf = abuf; 4310168404Spjd abuf = NULL; 4311168404Spjd } 4312168404Spjd } 4313286570Smav hdr->b_l1hdr.b_acb = NULL; 4314275811Sdelphij hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4315168404Spjd ASSERT(!HDR_BUF_AVAILABLE(hdr)); 4316219089Spjd if (abuf == buf) { 4317219089Spjd ASSERT(buf->b_efunc == NULL); 4318286570Smav ASSERT(hdr->b_l1hdr.b_datacnt == 1); 4319275811Sdelphij hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 4320219089Spjd } 4321168404Spjd 4322286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || 4323286570Smav callback_list != NULL); 4324168404Spjd 4325168404Spjd if (zio->io_error != 0) { 4326275811Sdelphij hdr->b_flags |= ARC_FLAG_IO_ERROR; 4327286570Smav if (hdr->b_l1hdr.b_state != arc_anon) 4328168404Spjd arc_change_state(arc_anon, hdr, hash_lock); 4329168404Spjd if (HDR_IN_HASH_TABLE(hdr)) 4330168404Spjd buf_hash_remove(hdr); 4331286570Smav freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 4332168404Spjd } 4333168404Spjd 4334168404Spjd /* 4335168404Spjd * Broadcast before we drop the hash_lock to avoid the possibility 4336168404Spjd * that the hdr (and hence the cv) might be freed before we get to 4337168404Spjd * the cv_broadcast(). 4338168404Spjd */ 4339286570Smav cv_broadcast(&hdr->b_l1hdr.b_cv); 4340168404Spjd 4341286570Smav if (hash_lock != NULL) { 4342168404Spjd mutex_exit(hash_lock); 4343168404Spjd } else { 4344168404Spjd /* 4345168404Spjd * This block was freed while we waited for the read to 4346168404Spjd * complete. It has been removed from the hash table and 4347168404Spjd * moved to the anonymous state (so that it won't show up 4348168404Spjd * in the cache). 
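 *
 * A sketch of the two exit paths from this point (illustrative
 * only):
 *
 *	hash_lock != NULL: normal completion; the hdr stays in the
 *	    cache and the lock is dropped below.
 *	hash_lock == NULL:  a free raced with the read; the hdr is
 *	    anonymous, and if ours was the last reference it is
 *	    torn down via arc_hdr_destroy() at the end.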
4349168404Spjd */ 4350286570Smav ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 4351286570Smav freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 4352168404Spjd } 4353168404Spjd 4354168404Spjd /* execute each callback and free its structure */ 4355168404Spjd while ((acb = callback_list) != NULL) { 4356168404Spjd if (acb->acb_done) 4357168404Spjd acb->acb_done(zio, acb->acb_buf, acb->acb_private); 4358168404Spjd 4359168404Spjd if (acb->acb_zio_dummy != NULL) { 4360168404Spjd acb->acb_zio_dummy->io_error = zio->io_error; 4361168404Spjd zio_nowait(acb->acb_zio_dummy); 4362168404Spjd } 4363168404Spjd 4364168404Spjd callback_list = acb->acb_next; 4365168404Spjd kmem_free(acb, sizeof (arc_callback_t)); 4366168404Spjd } 4367168404Spjd 4368168404Spjd if (freeable) 4369168404Spjd arc_hdr_destroy(hdr); 4370168404Spjd} 4371168404Spjd 4372168404Spjd/* 4373286762Smav * "Read" the block at the specified DVA (in bp) via the 4374168404Spjd * cache. If the block is found in the cache, invoke the provided 4375168404Spjd * callback immediately and return. Note that the `zio' parameter 4376168404Spjd * in the callback will be NULL in this case, since no IO was 4377168404Spjd * required. If the block is not in the cache pass the read request 4378168404Spjd * on to the spa with a substitute callback function, so that the 4379168404Spjd * requested block will be added to the cache. 4380168404Spjd * 4381168404Spjd * If a read request arrives for a block that has a read in-progress, 4382168404Spjd * either wait for the in-progress read to complete (and return the 4383168404Spjd * results); or, if this is a read with a "done" func, add a record 4384168404Spjd * to the read to invoke the "done" func when the read completes, 4385168404Spjd * and return; or just return. 4386168404Spjd * 4387168404Spjd * arc_read_done() will invoke all the requested "done" functions 4388168404Spjd * for readers of this block. 4389168404Spjd */ 4390168404Spjdint 4391246666Smmarc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, 4392275811Sdelphij void *private, zio_priority_t priority, int zio_flags, 4393275811Sdelphij arc_flags_t *arc_flags, const zbookmark_phys_t *zb) 4394168404Spjd{ 4395268075Sdelphij arc_buf_hdr_t *hdr = NULL; 4396247187Smm arc_buf_t *buf = NULL; 4397268075Sdelphij kmutex_t *hash_lock = NULL; 4398185029Spjd zio_t *rzio; 4399228103Smm uint64_t guid = spa_load_guid(spa); 4400168404Spjd 4401268075Sdelphij ASSERT(!BP_IS_EMBEDDED(bp) || 4402268075Sdelphij BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 4403268075Sdelphij 4404168404Spjdtop: 4405268075Sdelphij if (!BP_IS_EMBEDDED(bp)) { 4406268075Sdelphij /* 4407268075Sdelphij * Embedded BP's have no DVA and require no I/O to "read". 4408268075Sdelphij * Create an anonymous arc buf to back it. 4409268075Sdelphij */ 4410268075Sdelphij hdr = buf_hash_find(guid, bp, &hash_lock); 4411268075Sdelphij } 4412168404Spjd 4413286570Smav if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { 4414268075Sdelphij 4415275811Sdelphij *arc_flags |= ARC_FLAG_CACHED; 4416168404Spjd 4417168404Spjd if (HDR_IO_IN_PROGRESS(hdr)) { 4418168404Spjd 4419287702Sdelphij if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && 4420287702Sdelphij priority == ZIO_PRIORITY_SYNC_READ) { 4421287702Sdelphij /* 4422287702Sdelphij * This sync read must wait for an 4423287702Sdelphij * in-progress async read (e.g. a predictive 4424287702Sdelphij * prefetch). 
Async reads are queued 4425287702Sdelphij * separately at the vdev_queue layer, so 4426287702Sdelphij * this is a form of priority inversion. 4427287702Sdelphij * Ideally, we would "inherit" the demand 4428287702Sdelphij * i/o's priority by moving the i/o from 4429287702Sdelphij * the async queue to the synchronous queue, 4430287702Sdelphij * but there is currently no mechanism to do 4431287702Sdelphij * so. Track this so that we can evaluate 4432287702Sdelphij * the magnitude of this potential performance 4433287702Sdelphij * problem. 4434287702Sdelphij * 4435287702Sdelphij * Note that if the prefetch i/o is already 4436287702Sdelphij * active (has been issued to the device), 4437287702Sdelphij * the prefetch improved performance, because 4438287702Sdelphij * we issued it sooner than we would have 4439287702Sdelphij * without the prefetch. 4440287702Sdelphij */ 4441287702Sdelphij DTRACE_PROBE1(arc__sync__wait__for__async, 4442287702Sdelphij arc_buf_hdr_t *, hdr); 4443287702Sdelphij ARCSTAT_BUMP(arcstat_sync_wait_for_async); 4444287702Sdelphij } 4445287702Sdelphij if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 4446287702Sdelphij hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; 4447287702Sdelphij } 4448287702Sdelphij 4449275811Sdelphij if (*arc_flags & ARC_FLAG_WAIT) { 4450286570Smav cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); 4451168404Spjd mutex_exit(hash_lock); 4452168404Spjd goto top; 4453168404Spjd } 4454275811Sdelphij ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 4455168404Spjd 4456168404Spjd if (done) { 4457287702Sdelphij arc_callback_t *acb = NULL; 4458168404Spjd 4459168404Spjd acb = kmem_zalloc(sizeof (arc_callback_t), 4460168404Spjd KM_SLEEP); 4461168404Spjd acb->acb_done = done; 4462168404Spjd acb->acb_private = private; 4463168404Spjd if (pio != NULL) 4464168404Spjd acb->acb_zio_dummy = zio_null(pio, 4465209962Smm spa, NULL, NULL, NULL, zio_flags); 4466168404Spjd 4467168404Spjd ASSERT(acb->acb_done != NULL); 4468286570Smav acb->acb_next = hdr->b_l1hdr.b_acb; 4469286570Smav hdr->b_l1hdr.b_acb = acb; 4470168404Spjd add_reference(hdr, hash_lock, private); 4471168404Spjd mutex_exit(hash_lock); 4472168404Spjd return (0); 4473168404Spjd } 4474168404Spjd mutex_exit(hash_lock); 4475168404Spjd return (0); 4476168404Spjd } 4477168404Spjd 4478286570Smav ASSERT(hdr->b_l1hdr.b_state == arc_mru || 4479286570Smav hdr->b_l1hdr.b_state == arc_mfu); 4480168404Spjd 4481168404Spjd if (done) { 4482287702Sdelphij if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 4483287702Sdelphij /* 4484287702Sdelphij * This is a demand read which does not have to 4485287702Sdelphij * wait for i/o because we did a predictive 4486287702Sdelphij * prefetch i/o for it, which has completed. 4487287702Sdelphij */ 4488287702Sdelphij DTRACE_PROBE1( 4489287702Sdelphij arc__demand__hit__predictive__prefetch, 4490287702Sdelphij arc_buf_hdr_t *, hdr); 4491287702Sdelphij ARCSTAT_BUMP( 4492287702Sdelphij arcstat_demand_hit_predictive_prefetch); 4493287702Sdelphij hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; 4494287702Sdelphij } 4495168404Spjd add_reference(hdr, hash_lock, private); 4496168404Spjd /* 4497168404Spjd * If this block is already in use, create a new 4498168404Spjd * copy of the data so that we will be guaranteed 4499168404Spjd * that arc_release() will always succeed. 
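 *
 * For example (hypothetical scenario): if two readers hold the
 * same cached block and one of them later calls arc_release() to
 * modify it, each must own its own arc_buf; cloning here, when
 * the hdr's buf is already spoken for, is what keeps that
 * guarantee cheap.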
4500168404Spjd */ 4501286570Smav buf = hdr->b_l1hdr.b_buf; 4502168404Spjd ASSERT(buf); 4503168404Spjd ASSERT(buf->b_data); 4504168404Spjd if (HDR_BUF_AVAILABLE(hdr)) { 4505168404Spjd ASSERT(buf->b_efunc == NULL); 4506275811Sdelphij hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 4507168404Spjd } else { 4508168404Spjd buf = arc_buf_clone(buf); 4509168404Spjd } 4510219089Spjd 4511275811Sdelphij } else if (*arc_flags & ARC_FLAG_PREFETCH && 4512286570Smav refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 4513275811Sdelphij hdr->b_flags |= ARC_FLAG_PREFETCH; 4514168404Spjd } 4515168404Spjd DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 4516168404Spjd arc_access(hdr, hash_lock); 4517275811Sdelphij if (*arc_flags & ARC_FLAG_L2CACHE) 4518275811Sdelphij hdr->b_flags |= ARC_FLAG_L2CACHE; 4519275811Sdelphij if (*arc_flags & ARC_FLAG_L2COMPRESS) 4520275811Sdelphij hdr->b_flags |= ARC_FLAG_L2COMPRESS; 4521168404Spjd mutex_exit(hash_lock); 4522168404Spjd ARCSTAT_BUMP(arcstat_hits); 4523286570Smav ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 4524286570Smav demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 4525168404Spjd data, metadata, hits); 4526168404Spjd 4527168404Spjd if (done) 4528168404Spjd done(NULL, buf, private); 4529168404Spjd } else { 4530168404Spjd uint64_t size = BP_GET_LSIZE(bp); 4531268075Sdelphij arc_callback_t *acb; 4532185029Spjd vdev_t *vd = NULL; 4533247187Smm uint64_t addr = 0; 4534208373Smm boolean_t devw = B_FALSE; 4535258389Savg enum zio_compress b_compress = ZIO_COMPRESS_OFF; 4536286570Smav int32_t b_asize = 0; 4537168404Spjd 4538168404Spjd if (hdr == NULL) { 4539168404Spjd /* this block is not in the cache */ 4540268075Sdelphij arc_buf_hdr_t *exists = NULL; 4541168404Spjd arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 4542168404Spjd buf = arc_buf_alloc(spa, size, private, type); 4543168404Spjd hdr = buf->b_hdr; 4544268075Sdelphij if (!BP_IS_EMBEDDED(bp)) { 4545268075Sdelphij hdr->b_dva = *BP_IDENTITY(bp); 4546268075Sdelphij hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 4547268075Sdelphij exists = buf_hash_insert(hdr, &hash_lock); 4548268075Sdelphij } 4549268075Sdelphij if (exists != NULL) { 4550168404Spjd /* somebody beat us to the hash insert */ 4551168404Spjd mutex_exit(hash_lock); 4552219089Spjd buf_discard_identity(hdr); 4553168404Spjd (void) arc_buf_remove_ref(buf, private); 4554168404Spjd goto top; /* restart the IO request */ 4555168404Spjd } 4556275811Sdelphij 4557287702Sdelphij /* 4558287702Sdelphij * If there is a callback, we pass our reference to 4559287702Sdelphij * it; otherwise we remove our reference. 4560287702Sdelphij */ 4561287702Sdelphij if (done == NULL) { 4562168404Spjd (void) remove_reference(hdr, hash_lock, 4563168404Spjd private); 4564287702Sdelphij } 4565287702Sdelphij if (*arc_flags & ARC_FLAG_PREFETCH) 4566275811Sdelphij hdr->b_flags |= ARC_FLAG_PREFETCH; 4567275811Sdelphij if (*arc_flags & ARC_FLAG_L2CACHE) 4568275811Sdelphij hdr->b_flags |= ARC_FLAG_L2CACHE; 4569275811Sdelphij if (*arc_flags & ARC_FLAG_L2COMPRESS) 4570275811Sdelphij hdr->b_flags |= ARC_FLAG_L2COMPRESS; 4571168404Spjd if (BP_GET_LEVEL(bp) > 0) 4572275811Sdelphij hdr->b_flags |= ARC_FLAG_INDIRECT; 4573168404Spjd } else { 4574286570Smav /* 4575286570Smav * This block is in the ghost cache. If it was L2-only 4576286570Smav * (and thus didn't have an L1 hdr), we realloc the 4577286570Smav * header to add an L1 hdr. 
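 *
 * (The split header design is a memory optimization: blocks that
 * exist only on an L2ARC device keep a small L2-only header, and
 * the full header with the b_l1hdr fields used below is only
 * materialized when the block comes back into memory.)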
4578286570Smav */ 4579286570Smav if (!HDR_HAS_L1HDR(hdr)) { 4580286570Smav hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, 4581286570Smav hdr_full_cache); 4582286570Smav } 4583286570Smav 4584286570Smav ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); 4585168404Spjd ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4586286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4587286763Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 4588168404Spjd 4589287702Sdelphij /* 4590287702Sdelphij * If there is a callback, we pass a reference to it. 4591287702Sdelphij */ 4592287702Sdelphij if (done != NULL) 4593287702Sdelphij add_reference(hdr, hash_lock, private); 4594275811Sdelphij if (*arc_flags & ARC_FLAG_PREFETCH) 4595275811Sdelphij hdr->b_flags |= ARC_FLAG_PREFETCH; 4596275811Sdelphij if (*arc_flags & ARC_FLAG_L2CACHE) 4597275811Sdelphij hdr->b_flags |= ARC_FLAG_L2CACHE; 4598275811Sdelphij if (*arc_flags & ARC_FLAG_L2COMPRESS) 4599275811Sdelphij hdr->b_flags |= ARC_FLAG_L2COMPRESS; 4600185029Spjd buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 4601168404Spjd buf->b_hdr = hdr; 4602168404Spjd buf->b_data = NULL; 4603168404Spjd buf->b_efunc = NULL; 4604168404Spjd buf->b_private = NULL; 4605168404Spjd buf->b_next = NULL; 4606286570Smav hdr->b_l1hdr.b_buf = buf; 4607286570Smav ASSERT0(hdr->b_l1hdr.b_datacnt); 4608286570Smav hdr->b_l1hdr.b_datacnt = 1; 4609219089Spjd arc_get_data_buf(buf); 4610219089Spjd arc_access(hdr, hash_lock); 4611168404Spjd } 4612168404Spjd 4613287702Sdelphij if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) 4614287702Sdelphij hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH; 4615286570Smav ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); 4616219089Spjd 4617168404Spjd acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 4618168404Spjd acb->acb_done = done; 4619168404Spjd acb->acb_private = private; 4620168404Spjd 4621286570Smav ASSERT(hdr->b_l1hdr.b_acb == NULL); 4622286570Smav hdr->b_l1hdr.b_acb = acb; 4623275811Sdelphij hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 4624168404Spjd 4625286570Smav if (HDR_HAS_L2HDR(hdr) && 4626286570Smav (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { 4627286570Smav devw = hdr->b_l2hdr.b_dev->l2ad_writing; 4628286570Smav addr = hdr->b_l2hdr.b_daddr; 4629287706Sdelphij b_compress = hdr->b_l2hdr.b_compress; 4630286570Smav b_asize = hdr->b_l2hdr.b_asize; 4631185029Spjd /* 4632185029Spjd * Lock out device removal. 4633185029Spjd */ 4634185029Spjd if (vdev_is_dead(vd) || 4635185029Spjd !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 4636185029Spjd vd = NULL; 4637185029Spjd } 4638185029Spjd 4639268075Sdelphij if (hash_lock != NULL) 4640268075Sdelphij mutex_exit(hash_lock); 4641168404Spjd 4642251629Sdelphij /* 4643251629Sdelphij * At this point, we have a level 1 cache miss. Try again in 4644251629Sdelphij * L2ARC if possible. 
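 *
 * In outline, the miss path below is (an illustrative sketch):
 *
 *	if (the hdr has a usable L2ARC entry)
 *		zio_read_phys() from the cache device, with
 *		l2arc_read_done() falling back to the main pool
 *		on error;
 *	else
 *		zio_read() from the main pool.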
4645251629Sdelphij */
4646168404Spjd ASSERT3U(hdr->b_size, ==, size);
4647219089Spjd DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
4648268123Sdelphij uint64_t, size, zbookmark_phys_t *, zb);
4649168404Spjd ARCSTAT_BUMP(arcstat_misses);
4650286570Smav ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
4651286570Smav demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
4652168404Spjd data, metadata, misses);
4653228392Spjd#ifdef _KERNEL
4654297633Strasz#ifdef RACCT
4655297633Strasz if (racct_enable) {
4656297633Strasz PROC_LOCK(curproc);
4657297633Strasz racct_add_force(curproc, RACCT_READBPS, size);
4658297633Strasz racct_add_force(curproc, RACCT_READIOPS, 1);
4659297633Strasz PROC_UNLOCK(curproc);
4660297633Strasz }
4661297633Strasz#endif /* RACCT */
4662228392Spjd curthread->td_ru.ru_inblock++;
4663228392Spjd#endif
4664168404Spjd
4665287702Sdelphij if (priority == ZIO_PRIORITY_ASYNC_READ)
4666287702Sdelphij hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ;
4667287702Sdelphij else
4668287702Sdelphij hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ;
4669287702Sdelphij
4670208373Smm if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
4671185029Spjd /*
4672185029Spjd * Read from the L2ARC if the following are true:
4673185029Spjd * 1. The L2ARC vdev was previously cached.
4674185029Spjd * 2. This buffer still has L2ARC metadata.
4675185029Spjd * 3. This buffer isn't currently writing to the L2ARC.
4676185029Spjd * 4. The L2ARC entry wasn't evicted, which may
4677185029Spjd * also have invalidated the vdev.
4678208373Smm * 5. This isn't a prefetch with l2arc_noprefetch set.
4679185029Spjd */
4680286570Smav if (HDR_HAS_L2HDR(hdr) &&
4681208373Smm !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
4682208373Smm !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
4683185029Spjd l2arc_read_callback_t *cb;
4684297848Savg void *b_data;
4685185029Spjd
4686185029Spjd DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
4687185029Spjd ARCSTAT_BUMP(arcstat_l2_hits);
4688185029Spjd
4689185029Spjd cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
4690185029Spjd KM_SLEEP);
4691185029Spjd cb->l2rcb_buf = buf;
4692185029Spjd cb->l2rcb_spa = spa;
4693185029Spjd cb->l2rcb_bp = *bp;
4694185029Spjd cb->l2rcb_zb = *zb;
4695185029Spjd cb->l2rcb_flags = zio_flags;
4696258389Savg cb->l2rcb_compress = b_compress;
4697297848Savg if (b_asize > hdr->b_size) {
4698297848Savg ASSERT3U(b_compress, ==,
4699297848Savg ZIO_COMPRESS_OFF);
4700297848Savg b_data = zio_data_buf_alloc(b_asize);
4701297848Savg cb->l2rcb_data = b_data;
4702297848Savg } else {
4703297848Savg b_data = buf->b_data;
4704297848Savg }
4705185029Spjd
4706247187Smm ASSERT(addr >= VDEV_LABEL_START_SIZE &&
4707247187Smm addr + size < vd->vdev_psize -
4708247187Smm VDEV_LABEL_END_SIZE);
4709247187Smm
4710185029Spjd /*
4711185029Spjd * l2arc read. The SCL_L2ARC lock will be
4712185029Spjd * released by l2arc_read_done().
4713251478Sdelphij * Issue a null zio if the underlying buffer
4714251478Sdelphij * was squashed to zero size by compression.
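 *
 * (ZIO_COMPRESS_EMPTY indicates the block compressed away
 * entirely, e.g. a run of zeros: nothing was stored on the
 * device, so a null zio lets l2arc_read_done() reconstruct the
 * data without touching the disk.)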
4715185029Spjd */ 4716258389Savg if (b_compress == ZIO_COMPRESS_EMPTY) { 4717297848Savg ASSERT3U(b_asize, ==, 0); 4718251478Sdelphij rzio = zio_null(pio, spa, vd, 4719251478Sdelphij l2arc_read_done, cb, 4720251478Sdelphij zio_flags | ZIO_FLAG_DONT_CACHE | 4721251478Sdelphij ZIO_FLAG_CANFAIL | 4722251478Sdelphij ZIO_FLAG_DONT_PROPAGATE | 4723251478Sdelphij ZIO_FLAG_DONT_RETRY); 4724251478Sdelphij } else { 4725251478Sdelphij rzio = zio_read_phys(pio, vd, addr, 4726297848Savg b_asize, b_data, 4727258389Savg ZIO_CHECKSUM_OFF, 4728251478Sdelphij l2arc_read_done, cb, priority, 4729251478Sdelphij zio_flags | ZIO_FLAG_DONT_CACHE | 4730251478Sdelphij ZIO_FLAG_CANFAIL | 4731251478Sdelphij ZIO_FLAG_DONT_PROPAGATE | 4732251478Sdelphij ZIO_FLAG_DONT_RETRY, B_FALSE); 4733251478Sdelphij } 4734185029Spjd DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 4735185029Spjd zio_t *, rzio); 4736258389Savg ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); 4737185029Spjd 4738275811Sdelphij if (*arc_flags & ARC_FLAG_NOWAIT) { 4739185029Spjd zio_nowait(rzio); 4740185029Spjd return (0); 4741185029Spjd } 4742185029Spjd 4743275811Sdelphij ASSERT(*arc_flags & ARC_FLAG_WAIT); 4744185029Spjd if (zio_wait(rzio) == 0) 4745185029Spjd return (0); 4746185029Spjd 4747185029Spjd /* l2arc read error; goto zio_read() */ 4748185029Spjd } else { 4749185029Spjd DTRACE_PROBE1(l2arc__miss, 4750185029Spjd arc_buf_hdr_t *, hdr); 4751185029Spjd ARCSTAT_BUMP(arcstat_l2_misses); 4752185029Spjd if (HDR_L2_WRITING(hdr)) 4753185029Spjd ARCSTAT_BUMP(arcstat_l2_rw_clash); 4754185029Spjd spa_config_exit(spa, SCL_L2ARC, vd); 4755185029Spjd } 4756208373Smm } else { 4757208373Smm if (vd != NULL) 4758208373Smm spa_config_exit(spa, SCL_L2ARC, vd); 4759208373Smm if (l2arc_ndev != 0) { 4760208373Smm DTRACE_PROBE1(l2arc__miss, 4761208373Smm arc_buf_hdr_t *, hdr); 4762208373Smm ARCSTAT_BUMP(arcstat_l2_misses); 4763208373Smm } 4764185029Spjd } 4765185029Spjd 4766168404Spjd rzio = zio_read(pio, spa, bp, buf->b_data, size, 4767185029Spjd arc_read_done, buf, priority, zio_flags, zb); 4768168404Spjd 4769275811Sdelphij if (*arc_flags & ARC_FLAG_WAIT) 4770168404Spjd return (zio_wait(rzio)); 4771168404Spjd 4772275811Sdelphij ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 4773168404Spjd zio_nowait(rzio); 4774168404Spjd } 4775168404Spjd return (0); 4776168404Spjd} 4777168404Spjd 4778168404Spjdvoid 4779168404Spjdarc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 4780168404Spjd{ 4781168404Spjd ASSERT(buf->b_hdr != NULL); 4782286570Smav ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); 4783286570Smav ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || 4784286570Smav func == NULL); 4785219089Spjd ASSERT(buf->b_efunc == NULL); 4786219089Spjd ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 4787219089Spjd 4788168404Spjd buf->b_efunc = func; 4789168404Spjd buf->b_private = private; 4790168404Spjd} 4791168404Spjd 4792168404Spjd/* 4793251520Sdelphij * Notify the arc that a block was freed, and thus will never be used again. 
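 *
 * A hypothetical call site, e.g. somewhere in the zio free path:
 *
 *	arc_freed(spa, bp);
 *
 * If the block is still cached and unreferenced, we release it to
 * the anonymous state below so its space can be reclaimed at once.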
4794251520Sdelphij */ 4795251520Sdelphijvoid 4796251520Sdelphijarc_freed(spa_t *spa, const blkptr_t *bp) 4797251520Sdelphij{ 4798251520Sdelphij arc_buf_hdr_t *hdr; 4799251520Sdelphij kmutex_t *hash_lock; 4800251520Sdelphij uint64_t guid = spa_load_guid(spa); 4801251520Sdelphij 4802268075Sdelphij ASSERT(!BP_IS_EMBEDDED(bp)); 4803268075Sdelphij 4804268075Sdelphij hdr = buf_hash_find(guid, bp, &hash_lock); 4805251520Sdelphij if (hdr == NULL) 4806251520Sdelphij return; 4807251520Sdelphij if (HDR_BUF_AVAILABLE(hdr)) { 4808286570Smav arc_buf_t *buf = hdr->b_l1hdr.b_buf; 4809251520Sdelphij add_reference(hdr, hash_lock, FTAG); 4810275811Sdelphij hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 4811251520Sdelphij mutex_exit(hash_lock); 4812251520Sdelphij 4813251520Sdelphij arc_release(buf, FTAG); 4814251520Sdelphij (void) arc_buf_remove_ref(buf, FTAG); 4815251520Sdelphij } else { 4816251520Sdelphij mutex_exit(hash_lock); 4817251520Sdelphij } 4818251520Sdelphij 4819251520Sdelphij} 4820251520Sdelphij 4821251520Sdelphij/* 4822268858Sdelphij * Clear the user eviction callback set by arc_set_callback(), first calling 4823268858Sdelphij * it if it exists. Because the presence of a callback keeps an arc_buf cached 4824268858Sdelphij * clearing the callback may result in the arc_buf being destroyed. However, 4825268858Sdelphij * it will not result in the *last* arc_buf being destroyed, hence the data 4826268858Sdelphij * will remain cached in the ARC. We make a copy of the arc buffer here so 4827268858Sdelphij * that we can process the callback without holding any locks. 4828268858Sdelphij * 4829268858Sdelphij * It's possible that the callback is already in the process of being cleared 4830268858Sdelphij * by another thread. In this case we can not clear the callback. 4831268858Sdelphij * 4832268858Sdelphij * Returns B_TRUE if the callback was successfully called and cleared. 4833168404Spjd */ 4834268858Sdelphijboolean_t 4835268858Sdelphijarc_clear_callback(arc_buf_t *buf) 4836168404Spjd{ 4837168404Spjd arc_buf_hdr_t *hdr; 4838168404Spjd kmutex_t *hash_lock; 4839268858Sdelphij arc_evict_func_t *efunc = buf->b_efunc; 4840268858Sdelphij void *private = buf->b_private; 4841206796Spjd 4842219089Spjd mutex_enter(&buf->b_evict_lock); 4843168404Spjd hdr = buf->b_hdr; 4844168404Spjd if (hdr == NULL) { 4845168404Spjd /* 4846168404Spjd * We are in arc_do_user_evicts(). 4847168404Spjd */ 4848168404Spjd ASSERT(buf->b_data == NULL); 4849219089Spjd mutex_exit(&buf->b_evict_lock); 4850268858Sdelphij return (B_FALSE); 4851185029Spjd } else if (buf->b_data == NULL) { 4852185029Spjd /* 4853185029Spjd * We are on the eviction list; process this buffer now 4854185029Spjd * but let arc_do_user_evicts() do the reaping. 
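 *
 * (That is: we invoke the user's efunc ourselves below, but the
 * arc_buf_t itself is left for arc_do_user_evicts() to free.)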
4855185029Spjd */ 4856185029Spjd buf->b_efunc = NULL; 4857219089Spjd mutex_exit(&buf->b_evict_lock); 4858268858Sdelphij VERIFY0(efunc(private)); 4859268858Sdelphij return (B_TRUE); 4860168404Spjd } 4861168404Spjd hash_lock = HDR_LOCK(hdr); 4862168404Spjd mutex_enter(hash_lock); 4863219089Spjd hdr = buf->b_hdr; 4864219089Spjd ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4865168404Spjd 4866286570Smav ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, 4867286570Smav hdr->b_l1hdr.b_datacnt); 4868286570Smav ASSERT(hdr->b_l1hdr.b_state == arc_mru || 4869286570Smav hdr->b_l1hdr.b_state == arc_mfu); 4870168404Spjd 4871268858Sdelphij buf->b_efunc = NULL; 4872268858Sdelphij buf->b_private = NULL; 4873168404Spjd 4874286570Smav if (hdr->b_l1hdr.b_datacnt > 1) { 4875268858Sdelphij mutex_exit(&buf->b_evict_lock); 4876286763Smav arc_buf_destroy(buf, TRUE); 4877268858Sdelphij } else { 4878286570Smav ASSERT(buf == hdr->b_l1hdr.b_buf); 4879275811Sdelphij hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 4880268858Sdelphij mutex_exit(&buf->b_evict_lock); 4881268858Sdelphij } 4882168404Spjd 4883168404Spjd mutex_exit(hash_lock); 4884268858Sdelphij VERIFY0(efunc(private)); 4885268858Sdelphij return (B_TRUE); 4886168404Spjd} 4887168404Spjd 4888168404Spjd/* 4889251629Sdelphij * Release this buffer from the cache, making it an anonymous buffer. This 4890251629Sdelphij * must be done after a read and prior to modifying the buffer contents. 4891168404Spjd * If the buffer has more than one reference, we must make 4892185029Spjd * a new hdr for the buffer. 4893168404Spjd */ 4894168404Spjdvoid 4895168404Spjdarc_release(arc_buf_t *buf, void *tag) 4896168404Spjd{ 4897286570Smav arc_buf_hdr_t *hdr = buf->b_hdr; 4898168404Spjd 4899219089Spjd /* 4900219089Spjd * It would be nice to assert that if it's DMU metadata (level > 4901219089Spjd * 0 || it's the dnode file), then it must be syncing context. 4902219089Spjd * But we don't know that information at this level. 4903219089Spjd */ 4904219089Spjd 4905219089Spjd mutex_enter(&buf->b_evict_lock); 4906286776Smav 4907286776Smav ASSERT(HDR_HAS_L1HDR(hdr)); 4908286776Smav 4909286570Smav /* 4910286570Smav * We don't grab the hash lock prior to this check, because if 4911286570Smav * the buffer's header is in the arc_anon state, it won't be 4912286570Smav * linked into the hash table. 4913286570Smav */ 4914286570Smav if (hdr->b_l1hdr.b_state == arc_anon) { 4915286570Smav mutex_exit(&buf->b_evict_lock); 4916286570Smav ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4917286570Smav ASSERT(!HDR_IN_HASH_TABLE(hdr)); 4918286570Smav ASSERT(!HDR_HAS_L2HDR(hdr)); 4919286570Smav ASSERT(BUF_EMPTY(hdr)); 4920286570Smav ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); 4921286570Smav ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); 4922286570Smav ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4923185029Spjd 4924286570Smav ASSERT3P(buf->b_efunc, ==, NULL); 4925286570Smav ASSERT3P(buf->b_private, ==, NULL); 4926168404Spjd 4927286570Smav hdr->b_l1hdr.b_arc_access = 0; 4928286570Smav arc_buf_thaw(buf); 4929286570Smav 4930286570Smav return; 4931168404Spjd } 4932168404Spjd 4933286570Smav kmutex_t *hash_lock = HDR_LOCK(hdr); 4934286570Smav mutex_enter(hash_lock); 4935286570Smav 4936286570Smav /* 4937286570Smav * This assignment is only valid as long as the hash_lock is 4938286570Smav * held, we must be careful not to reference state or the 4939286570Smav * b_state field after dropping the lock. 
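 *
 * (The pattern here: snapshot b_state into a local while locked
 * and use it only under that lock; once hash_lock is dropped,
 * another thread may arc_change_state() the header and both the
 * local and hdr->b_l1hdr.b_state become stale.)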
4940286570Smav */ 4941286570Smav arc_state_t *state = hdr->b_l1hdr.b_state; 4942286570Smav ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4943286570Smav ASSERT3P(state, !=, arc_anon); 4944286570Smav 4945286570Smav /* this buffer is not on any list */ 4946286570Smav ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); 4947286570Smav 4948286570Smav if (HDR_HAS_L2HDR(hdr)) { 4949286570Smav mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4950286570Smav 4951286570Smav /* 4952286598Smav * We have to recheck this conditional again now that 4953286598Smav * we're holding the l2ad_mtx to prevent a race with 4954286598Smav * another thread which might be concurrently calling 4955286598Smav * l2arc_evict(). In that case, l2arc_evict() might have 4956286598Smav * destroyed the header's L2 portion as we were waiting 4957286598Smav * to acquire the l2ad_mtx. 4958286570Smav */ 4959286598Smav if (HDR_HAS_L2HDR(hdr)) { 4960290191Savg l2arc_trim(hdr); 4961286598Smav arc_hdr_l2hdr_destroy(hdr); 4962286598Smav } 4963286570Smav 4964286570Smav mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4965185029Spjd } 4966185029Spjd 4967168404Spjd /* 4968168404Spjd * Do we have more than one buf? 4969168404Spjd */ 4970286570Smav if (hdr->b_l1hdr.b_datacnt > 1) { 4971168404Spjd arc_buf_hdr_t *nhdr; 4972168404Spjd arc_buf_t **bufp; 4973168404Spjd uint64_t blksz = hdr->b_size; 4974209962Smm uint64_t spa = hdr->b_spa; 4975286570Smav arc_buf_contents_t type = arc_buf_type(hdr); 4976185029Spjd uint32_t flags = hdr->b_flags; 4977168404Spjd 4978286570Smav ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); 4979168404Spjd /* 4980219089Spjd * Pull the data off of this hdr and attach it to 4981219089Spjd * a new anonymous hdr. 4982168404Spjd */ 4983168404Spjd (void) remove_reference(hdr, hash_lock, tag); 4984286570Smav bufp = &hdr->b_l1hdr.b_buf; 4985168404Spjd while (*bufp != buf) 4986168404Spjd bufp = &(*bufp)->b_next; 4987219089Spjd *bufp = buf->b_next; 4988168404Spjd buf->b_next = NULL; 4989168404Spjd 4990286570Smav ASSERT3P(state, !=, arc_l2c_only); 4991286766Smav 4992286766Smav (void) refcount_remove_many( 4993286766Smav &state->arcs_size, hdr->b_size, buf); 4994286766Smav 4995286570Smav if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { 4996286570Smav ASSERT3P(state, !=, arc_l2c_only); 4997286570Smav uint64_t *size = &state->arcs_lsize[type]; 4998185029Spjd ASSERT3U(*size, >=, hdr->b_size); 4999185029Spjd atomic_add_64(size, -hdr->b_size); 5000168404Spjd } 5001242845Sdelphij 5002242845Sdelphij /* 5003242845Sdelphij * We're releasing a duplicate user data buffer, update 5004242845Sdelphij * our statistics accordingly. 
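 *
 * Example (hypothetical): a 128K data block held by three readers
 * has b_datacnt == 3; the two clones beyond the first count as
 * duplicates, so releasing one of them subtracts one buffer and
 * 128K from the duplicate statistics below.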
5005242845Sdelphij */ 5006286570Smav if (HDR_ISTYPE_DATA(hdr)) { 5007242845Sdelphij ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 5008242845Sdelphij ARCSTAT_INCR(arcstat_duplicate_buffers_size, 5009242845Sdelphij -hdr->b_size); 5010242845Sdelphij } 5011286570Smav hdr->b_l1hdr.b_datacnt -= 1; 5012168404Spjd arc_cksum_verify(buf); 5013240133Smm#ifdef illumos 5014240133Smm arc_buf_unwatch(buf); 5015277300Ssmh#endif 5016168404Spjd 5017168404Spjd mutex_exit(hash_lock); 5018168404Spjd 5019286570Smav nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 5020168404Spjd nhdr->b_size = blksz; 5021168404Spjd nhdr->b_spa = spa; 5022286570Smav 5023275811Sdelphij nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; 5024286570Smav nhdr->b_flags |= arc_bufc_to_flags(type); 5025286570Smav nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 5026286570Smav 5027286570Smav nhdr->b_l1hdr.b_buf = buf; 5028286570Smav nhdr->b_l1hdr.b_datacnt = 1; 5029286570Smav nhdr->b_l1hdr.b_state = arc_anon; 5030286570Smav nhdr->b_l1hdr.b_arc_access = 0; 5031286763Smav nhdr->b_l1hdr.b_tmp_cdata = NULL; 5032168404Spjd nhdr->b_freeze_cksum = NULL; 5033286570Smav 5034286570Smav (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); 5035168404Spjd buf->b_hdr = nhdr; 5036219089Spjd mutex_exit(&buf->b_evict_lock); 5037286766Smav (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf); 5038168404Spjd } else { 5039219089Spjd mutex_exit(&buf->b_evict_lock); 5040286570Smav ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); 5041286763Smav /* protected by hash lock, or hdr is on arc_anon */ 5042286763Smav ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 5043168404Spjd ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 5044286570Smav arc_change_state(arc_anon, hdr, hash_lock); 5045286570Smav hdr->b_l1hdr.b_arc_access = 0; 5046286570Smav mutex_exit(hash_lock); 5047185029Spjd 5048219089Spjd buf_discard_identity(hdr); 5049168404Spjd arc_buf_thaw(buf); 5050168404Spjd } 5051168404Spjd buf->b_efunc = NULL; 5052168404Spjd buf->b_private = NULL; 5053168404Spjd} 5054168404Spjd 5055168404Spjdint 5056168404Spjdarc_released(arc_buf_t *buf) 5057168404Spjd{ 5058185029Spjd int released; 5059185029Spjd 5060219089Spjd mutex_enter(&buf->b_evict_lock); 5061286570Smav released = (buf->b_data != NULL && 5062286570Smav buf->b_hdr->b_l1hdr.b_state == arc_anon); 5063219089Spjd mutex_exit(&buf->b_evict_lock); 5064185029Spjd return (released); 5065168404Spjd} 5066168404Spjd 5067168404Spjd#ifdef ZFS_DEBUG 5068168404Spjdint 5069168404Spjdarc_referenced(arc_buf_t *buf) 5070168404Spjd{ 5071185029Spjd int referenced; 5072185029Spjd 5073219089Spjd mutex_enter(&buf->b_evict_lock); 5074286570Smav referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); 5075219089Spjd mutex_exit(&buf->b_evict_lock); 5076185029Spjd return (referenced); 5077168404Spjd} 5078168404Spjd#endif 5079168404Spjd 5080168404Spjdstatic void 5081168404Spjdarc_write_ready(zio_t *zio) 5082168404Spjd{ 5083168404Spjd arc_write_callback_t *callback = zio->io_private; 5084168404Spjd arc_buf_t *buf = callback->awcb_buf; 5085185029Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 5086168404Spjd 5087286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 5088286570Smav ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); 5089286570Smav ASSERT(hdr->b_l1hdr.b_datacnt > 0); 5090185029Spjd callback->awcb_ready(zio, buf, callback->awcb_private); 5091185029Spjd 5092185029Spjd /* 5093185029Spjd * If the IO is already in progress, then this is a re-write 5094185029Spjd * attempt, so we need to thaw and re-compute the cksum. 
5095185029Spjd * It is the responsibility of the callback to handle the 5096185029Spjd * accounting for any re-write attempt. 5097185029Spjd */ 5098185029Spjd if (HDR_IO_IN_PROGRESS(hdr)) { 5099286570Smav mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 5100185029Spjd if (hdr->b_freeze_cksum != NULL) { 5101185029Spjd kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 5102185029Spjd hdr->b_freeze_cksum = NULL; 5103185029Spjd } 5104286570Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 5105168404Spjd } 5106185029Spjd arc_cksum_compute(buf, B_FALSE); 5107275811Sdelphij hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 5108168404Spjd} 5109168404Spjd 5110304138Savgstatic void 5111304138Savgarc_write_children_ready(zio_t *zio) 5112304138Savg{ 5113304138Savg arc_write_callback_t *callback = zio->io_private; 5114304138Savg arc_buf_t *buf = callback->awcb_buf; 5115304138Savg 5116304138Savg callback->awcb_children_ready(zio, buf, callback->awcb_private); 5117304138Savg} 5118304138Savg 5119258632Savg/* 5120258632Savg * The SPA calls this callback for each physical write that happens on behalf 5121258632Savg * of a logical write. See the comment in dbuf_write_physdone() for details. 5122258632Savg */ 5123168404Spjdstatic void 5124258632Savgarc_write_physdone(zio_t *zio) 5125258632Savg{ 5126258632Savg arc_write_callback_t *cb = zio->io_private; 5127258632Savg if (cb->awcb_physdone != NULL) 5128258632Savg cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 5129258632Savg} 5130258632Savg 5131258632Savgstatic void 5132168404Spjdarc_write_done(zio_t *zio) 5133168404Spjd{ 5134168404Spjd arc_write_callback_t *callback = zio->io_private; 5135168404Spjd arc_buf_t *buf = callback->awcb_buf; 5136168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 5137168404Spjd 5138286570Smav ASSERT(hdr->b_l1hdr.b_acb == NULL); 5139168404Spjd 5140219089Spjd if (zio->io_error == 0) { 5141268075Sdelphij if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 5142260150Sdelphij buf_discard_identity(hdr); 5143260150Sdelphij } else { 5144260150Sdelphij hdr->b_dva = *BP_IDENTITY(zio->io_bp); 5145260150Sdelphij hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 5146260150Sdelphij } 5147219089Spjd } else { 5148219089Spjd ASSERT(BUF_EMPTY(hdr)); 5149219089Spjd } 5150219089Spjd 5151168404Spjd /* 5152268075Sdelphij * If the block to be written was all-zero or compressed enough to be 5153268075Sdelphij * embedded in the BP, no write was performed so there will be no 5154268075Sdelphij * dva/birth/checksum. The buffer must therefore remain anonymous 5155268075Sdelphij * (and uncached). 5156168404Spjd */ 5157168404Spjd if (!BUF_EMPTY(hdr)) { 5158168404Spjd arc_buf_hdr_t *exists; 5159168404Spjd kmutex_t *hash_lock; 5160168404Spjd 5161219089Spjd ASSERT(zio->io_error == 0); 5162219089Spjd 5163168404Spjd arc_cksum_verify(buf); 5164168404Spjd 5165168404Spjd exists = buf_hash_insert(hdr, &hash_lock); 5166286570Smav if (exists != NULL) { 5167168404Spjd /* 5168168404Spjd * This can only happen if we overwrite for 5169168404Spjd * sync-to-convergence, because we remove 5170168404Spjd * buffers from the hash table when we arc_free(). 
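 *
 * The three legitimate collisions are distinguished below (a
 * sketch of the cases, not additional logic):
 *
 *	IO_REWRITE: sync-to-convergence rewrote the same location;
 *	    the stale hdr is destroyed and ours re-inserted.
 *	NOPWRITE:   the new data matched the old, so the write was
 *	    elided and both hdrs describe identical data.
 *	dedup:      an anonymous dedup write matched an existing
 *	    on-disk block.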
5171168404Spjd */ 5172219089Spjd if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 5173219089Spjd if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 5174219089Spjd panic("bad overwrite, hdr=%p exists=%p", 5175219089Spjd (void *)hdr, (void *)exists); 5176286570Smav ASSERT(refcount_is_zero( 5177286570Smav &exists->b_l1hdr.b_refcnt)); 5178219089Spjd arc_change_state(arc_anon, exists, hash_lock); 5179219089Spjd mutex_exit(hash_lock); 5180219089Spjd arc_hdr_destroy(exists); 5181219089Spjd exists = buf_hash_insert(hdr, &hash_lock); 5182219089Spjd ASSERT3P(exists, ==, NULL); 5183243524Smm } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 5184243524Smm /* nopwrite */ 5185243524Smm ASSERT(zio->io_prop.zp_nopwrite); 5186243524Smm if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 5187243524Smm panic("bad nopwrite, hdr=%p exists=%p", 5188243524Smm (void *)hdr, (void *)exists); 5189219089Spjd } else { 5190219089Spjd /* Dedup */ 5191286570Smav ASSERT(hdr->b_l1hdr.b_datacnt == 1); 5192286570Smav ASSERT(hdr->b_l1hdr.b_state == arc_anon); 5193219089Spjd ASSERT(BP_GET_DEDUP(zio->io_bp)); 5194219089Spjd ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 5195219089Spjd } 5196168404Spjd } 5197275811Sdelphij hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 5198185029Spjd /* if it's not anon, we are doing a scrub */ 5199286570Smav if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) 5200185029Spjd arc_access(hdr, hash_lock); 5201168404Spjd mutex_exit(hash_lock); 5202168404Spjd } else { 5203275811Sdelphij hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 5204168404Spjd } 5205168404Spjd 5206286570Smav ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 5207219089Spjd callback->awcb_done(zio, buf, callback->awcb_private); 5208168404Spjd 5209168404Spjd kmem_free(callback, sizeof (arc_write_callback_t)); 5210168404Spjd} 5211168404Spjd 5212168404Spjdzio_t * 5213219089Spjdarc_write(zio_t *pio, spa_t *spa, uint64_t txg, 5214251478Sdelphij blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, 5215304138Savg const zio_prop_t *zp, arc_done_func_t *ready, 5216304138Savg arc_done_func_t *children_ready, arc_done_func_t *physdone, 5217258632Savg arc_done_func_t *done, void *private, zio_priority_t priority, 5218268123Sdelphij int zio_flags, const zbookmark_phys_t *zb) 5219168404Spjd{ 5220168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 5221168404Spjd arc_write_callback_t *callback; 5222185029Spjd zio_t *zio; 5223168404Spjd 5224185029Spjd ASSERT(ready != NULL); 5225219089Spjd ASSERT(done != NULL); 5226168404Spjd ASSERT(!HDR_IO_ERROR(hdr)); 5227286570Smav ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 5228286570Smav ASSERT(hdr->b_l1hdr.b_acb == NULL); 5229286570Smav ASSERT(hdr->b_l1hdr.b_datacnt > 0); 5230185029Spjd if (l2arc) 5231275811Sdelphij hdr->b_flags |= ARC_FLAG_L2CACHE; 5232251478Sdelphij if (l2arc_compress) 5233275811Sdelphij hdr->b_flags |= ARC_FLAG_L2COMPRESS; 5234168404Spjd callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 5235168404Spjd callback->awcb_ready = ready; 5236304138Savg callback->awcb_children_ready = children_ready; 5237258632Savg callback->awcb_physdone = physdone; 5238168404Spjd callback->awcb_done = done; 5239168404Spjd callback->awcb_private = private; 5240168404Spjd callback->awcb_buf = buf; 5241168404Spjd 5242219089Spjd zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 5243304138Savg arc_write_ready, 5244304138Savg (children_ready != NULL) ? 
arc_write_children_ready : NULL, 5245304138Savg arc_write_physdone, arc_write_done, callback, 5246258632Savg priority, zio_flags, zb); 5247185029Spjd 5248168404Spjd return (zio); 5249168404Spjd} 5250168404Spjd 5251185029Spjdstatic int 5252258632Savgarc_memory_throttle(uint64_t reserve, uint64_t txg) 5253185029Spjd{ 5254185029Spjd#ifdef _KERNEL 5255272483Ssmh uint64_t available_memory = ptob(freemem); 5256185029Spjd static uint64_t page_load = 0; 5257185029Spjd static uint64_t last_txg = 0; 5258185029Spjd 5259272483Ssmh#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 5260185029Spjd available_memory = 5261272483Ssmh MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); 5262185029Spjd#endif 5263258632Savg 5264272483Ssmh if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) 5265185029Spjd return (0); 5266185029Spjd 5267185029Spjd if (txg > last_txg) { 5268185029Spjd last_txg = txg; 5269185029Spjd page_load = 0; 5270185029Spjd } 5271185029Spjd /* 5272185029Spjd * If we are in pageout, we know that memory is already tight, 5273185029Spjd * the arc is already going to be evicting, so we just want to 5274185029Spjd * continue to let page writes occur as quickly as possible. 5275185029Spjd */ 5276185029Spjd if (curproc == pageproc) { 5277272483Ssmh if (page_load > MAX(ptob(minfree), available_memory) / 4) 5278249195Smm return (SET_ERROR(ERESTART)); 5279185029Spjd /* Note: reserve is inflated, so we deflate */ 5280185029Spjd page_load += reserve / 8; 5281185029Spjd return (0); 5282185029Spjd } else if (page_load > 0 && arc_reclaim_needed()) { 5283185029Spjd /* memory is low, delay before restarting */ 5284185029Spjd ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 5285249195Smm return (SET_ERROR(EAGAIN)); 5286185029Spjd } 5287185029Spjd page_load = 0; 5288185029Spjd#endif 5289185029Spjd return (0); 5290185029Spjd} 5291185029Spjd 5292168404Spjdvoid 5293185029Spjdarc_tempreserve_clear(uint64_t reserve) 5294168404Spjd{ 5295185029Spjd atomic_add_64(&arc_tempreserve, -reserve); 5296168404Spjd ASSERT((int64_t)arc_tempreserve >= 0); 5297168404Spjd} 5298168404Spjd 5299168404Spjdint 5300185029Spjdarc_tempreserve_space(uint64_t reserve, uint64_t txg) 5301168404Spjd{ 5302185029Spjd int error; 5303209962Smm uint64_t anon_size; 5304185029Spjd 5305272483Ssmh if (reserve > arc_c/4 && !arc_no_grow) { 5306185029Spjd arc_c = MIN(arc_c_max, reserve * 4); 5307272483Ssmh DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); 5308272483Ssmh } 5309185029Spjd if (reserve > arc_c) 5310249195Smm return (SET_ERROR(ENOMEM)); 5311168404Spjd 5312168404Spjd /* 5313209962Smm * Don't count loaned bufs as in flight dirty data to prevent long 5314209962Smm * network delays from blocking transactions that are ready to be 5315209962Smm * assigned to a txg. 5316209962Smm */ 5317286766Smav anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) - 5318286766Smav arc_loaned_bytes), 0); 5319209962Smm 5320209962Smm /* 5321185029Spjd * Writes will, almost always, require additional memory allocations 5322251631Sdelphij * in order to compress/encrypt/etc the data. We therefore need to 5323185029Spjd * make sure that there is sufficient available memory for this. 5324185029Spjd */ 5325258632Savg error = arc_memory_throttle(reserve, txg); 5326258632Savg if (error != 0) 5327185029Spjd return (error); 5328185029Spjd 5329185029Spjd /* 5330168404Spjd * Throttle writes when the amount of dirty data in the cache 5331168404Spjd * gets too large. 
We try to keep the cache less than half full 5332168404Spjd * of dirty blocks so that our sync times don't grow too large. 5333168404Spjd * Note: if two requests come in concurrently, we might let them 5334168404Spjd * both succeed, when one of them should fail. Not a huge deal. 5335168404Spjd */ 5336209962Smm 5337209962Smm if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 5338209962Smm anon_size > arc_c / 4) { 5339185029Spjd dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 5340185029Spjd "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 5341185029Spjd arc_tempreserve>>10, 5342185029Spjd arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 5343185029Spjd arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 5344185029Spjd reserve>>10, arc_c>>10); 5345249195Smm return (SET_ERROR(ERESTART)); 5346168404Spjd } 5347185029Spjd atomic_add_64(&arc_tempreserve, reserve); 5348168404Spjd return (0); 5349168404Spjd} 5350168404Spjd 5351286626Smavstatic void 5352286626Smavarc_kstat_update_state(arc_state_t *state, kstat_named_t *size, 5353286626Smav kstat_named_t *evict_data, kstat_named_t *evict_metadata) 5354286626Smav{ 5355286766Smav size->value.ui64 = refcount_count(&state->arcs_size); 5356286626Smav evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; 5357286626Smav evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; 5358286626Smav} 5359286626Smav 5360286626Smavstatic int 5361286626Smavarc_kstat_update(kstat_t *ksp, int rw) 5362286626Smav{ 5363286626Smav arc_stats_t *as = ksp->ks_data; 5364286626Smav 5365286626Smav if (rw == KSTAT_WRITE) { 5366286626Smav return (EACCES); 5367286626Smav } else { 5368286626Smav arc_kstat_update_state(arc_anon, 5369286626Smav &as->arcstat_anon_size, 5370286626Smav &as->arcstat_anon_evictable_data, 5371286626Smav &as->arcstat_anon_evictable_metadata); 5372286626Smav arc_kstat_update_state(arc_mru, 5373286626Smav &as->arcstat_mru_size, 5374286626Smav &as->arcstat_mru_evictable_data, 5375286626Smav &as->arcstat_mru_evictable_metadata); 5376286626Smav arc_kstat_update_state(arc_mru_ghost, 5377286626Smav &as->arcstat_mru_ghost_size, 5378286626Smav &as->arcstat_mru_ghost_evictable_data, 5379286626Smav &as->arcstat_mru_ghost_evictable_metadata); 5380286626Smav arc_kstat_update_state(arc_mfu, 5381286626Smav &as->arcstat_mfu_size, 5382286626Smav &as->arcstat_mfu_evictable_data, 5383286626Smav &as->arcstat_mfu_evictable_metadata); 5384286626Smav arc_kstat_update_state(arc_mfu_ghost, 5385286626Smav &as->arcstat_mfu_ghost_size, 5386286626Smav &as->arcstat_mfu_ghost_evictable_data, 5387286626Smav &as->arcstat_mfu_ghost_evictable_metadata); 5388286626Smav } 5389286626Smav 5390286626Smav return (0); 5391286626Smav} 5392286626Smav 5393286763Smav/* 5394286763Smav * This function *must* return indices evenly distributed between all 5395286763Smav * sublists of the multilist. This is needed due to how the ARC eviction 5396286763Smav * code is laid out; arc_evict_state() assumes ARC buffers are evenly 5397286763Smav * distributed between all sublists and uses this assumption when 5398286763Smav * deciding which sublist to evict from and how much to evict from it. 5399286763Smav */ 5400286763Smavunsigned int 5401286763Smavarc_state_multilist_index_func(multilist_t *ml, void *obj) 5402286763Smav{ 5403286763Smav arc_buf_hdr_t *hdr = obj; 5404286763Smav 5405286763Smav /* 5406286763Smav * We rely on b_dva to generate evenly distributed index 5407286763Smav * numbers using buf_hash below. 
So, as an added precaution,
5408286763Smav	 * let's make sure we never add empty buffers to the arc lists.
5409286763Smav	 */
5410286763Smav	ASSERT(!BUF_EMPTY(hdr));
5411286763Smav
5412286763Smav	/*
5413286763Smav	 * The assumption here is that the hash value for a given
5414286763Smav	 * arc_buf_hdr_t will remain constant throughout its lifetime
5415286763Smav	 * (i.e. its b_spa, b_dva, and b_birth fields don't change).
5416286763Smav	 * Thus, we don't need to store the header's sublist index
5417286763Smav	 * on insertion, as this index can be recalculated on removal.
5418286763Smav	 *
5419286763Smav	 * Also, the low order bits of the hash value are thought to be
5420286763Smav	 * distributed evenly. Otherwise, in the case that the multilist
5421286763Smav	 * has a power of two number of sublists, each sublist's usage
5422286763Smav	 * would not be evenly distributed.
5423286763Smav	 */
5424286763Smav	return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
5425286763Smav	    multilist_get_num_sublists(ml));
5426286763Smav}
5427286763Smav
5428168404Spjd#ifdef _KERNEL
5429168566Spjdstatic eventhandler_tag arc_event_lowmem = NULL;
5430168404Spjd
5431168404Spjdstatic void
5432168566Spjdarc_lowmem(void *arg __unused, int howto __unused)
5433168404Spjd{
5434168404Spjd
5435286763Smav	mutex_enter(&arc_reclaim_lock);
5436286625Smav	/* XXX: Memory deficit should be passed as argument. */
5437286625Smav	needfree = btoc(arc_c >> arc_shrink_shift);
5438272483Ssmh	DTRACE_PROBE(arc__needfree);
5439286763Smav	cv_signal(&arc_reclaim_thread_cv);
5440241773Savg
5441241773Savg	/*
5442241773Savg	 * It is unsafe to block here in arbitrary threads, because we can come
5443241773Savg	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
5444241773Savg	 * with the ARC reclaim thread.
5445241773Savg	 */
5446286623Smav	if (curproc == pageproc)
5447286763Smav		(void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
5448286763Smav	mutex_exit(&arc_reclaim_lock);
5449168404Spjd}
5450168404Spjd#endif
5451168404Spjd
5452168404Spjdvoid
5453168404Spjdarc_init(void)
5454168404Spjd{
5455219089Spjd	int i, prefetch_tunable_set = 0;
5456205231Skmacy
5457286763Smav	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
5458286763Smav	cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
5459286763Smav	cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
5460168404Spjd
5461286763Smav	mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
5462286763Smav	cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL);
5463286763Smav
5464301997Skib	mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
5465301997Skib	cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL);
5466301997Skib
5467168404Spjd	/* Convert seconds to clock ticks */
5468168404Spjd	arc_min_prefetch_lifespan = 1 * hz;
5469168404Spjd
5470168404Spjd	/* Start out with 1/8 of all memory */
5471168566Spjd	arc_c = kmem_size() / 8;
5472219089Spjd
5473277300Ssmh#ifdef illumos
5474192360Skmacy#ifdef _KERNEL
5475192360Skmacy	/*
5476192360Skmacy	 * On architectures where the physical memory can be larger
5477192360Skmacy	 * than the addressable space (Intel in 32-bit mode), we may
5478192360Skmacy	 * need to limit the cache to 1/8 of VM size.
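	 * (Hypothetical illustration: 16 GB of physical memory behind
	 * a 2 GB kernel virtual address space would have arc_c clamped
	 * to 2 GB / 8 = 256 MB by the MIN() below.)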
5479192360Skmacy */ 5480192360Skmacy arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 5481192360Skmacy#endif 5482277300Ssmh#endif /* illumos */ 5483302265Ssmh /* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */ 5484302265Ssmh arc_c_min = MAX(arc_c / 4, arc_abs_min); 5485168566Spjd /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ 5486280822Smav if (arc_c * 8 >= 1 << 30) 5487280822Smav arc_c_max = (arc_c * 8) - (1 << 30); 5488168404Spjd else 5489168404Spjd arc_c_max = arc_c_min; 5490175633Spjd arc_c_max = MAX(arc_c * 5, arc_c_max); 5491219089Spjd 5492289305Smav /* 5493289305Smav * In userland, there's only the memory pressure that we artificially 5494289305Smav * create (see arc_available_memory()). Don't let arc_c get too 5495289305Smav * small, because it can cause transactions to be larger than 5496289305Smav * arc_c, causing arc_tempreserve_space() to fail. 5497289305Smav */ 5498289305Smav#ifndef _KERNEL 5499289305Smav arc_c_min = arc_c_max / 2; 5500289305Smav#endif 5501289305Smav 5502168481Spjd#ifdef _KERNEL 5503168404Spjd /* 5504168404Spjd * Allow the tunables to override our calculations if they are 5505302265Ssmh * reasonable. 5506168404Spjd */ 5507302265Ssmh if (zfs_arc_max > arc_abs_min && zfs_arc_max < kmem_size()) 5508168404Spjd arc_c_max = zfs_arc_max; 5509302265Ssmh if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max) 5510168404Spjd arc_c_min = zfs_arc_min; 5511168481Spjd#endif 5512219089Spjd 5513168404Spjd arc_c = arc_c_max; 5514168404Spjd arc_p = (arc_c >> 1); 5515168404Spjd 5516185029Spjd /* limit meta-data to 1/4 of the arc capacity */ 5517185029Spjd arc_meta_limit = arc_c_max / 4; 5518185029Spjd 5519185029Spjd /* Allow the tunable to override if it is reasonable */ 5520185029Spjd if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 5521185029Spjd arc_meta_limit = zfs_arc_meta_limit; 5522185029Spjd 5523185029Spjd if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 5524185029Spjd arc_c_min = arc_meta_limit / 2; 5525185029Spjd 5526275780Sdelphij if (zfs_arc_meta_min > 0) { 5527275780Sdelphij arc_meta_min = zfs_arc_meta_min; 5528275780Sdelphij } else { 5529275780Sdelphij arc_meta_min = arc_c_min / 2; 5530275780Sdelphij } 5531275780Sdelphij 5532208373Smm if (zfs_arc_grow_retry > 0) 5533208373Smm arc_grow_retry = zfs_arc_grow_retry; 5534208373Smm 5535208373Smm if (zfs_arc_shrink_shift > 0) 5536208373Smm arc_shrink_shift = zfs_arc_shrink_shift; 5537208373Smm 5538286625Smav /* 5539286625Smav * Ensure that arc_no_grow_shift is less than arc_shrink_shift. 
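	 * (e.g. if arc_shrink_shift is 7, an arc_no_grow_shift of 7
	 * or more is pulled back to 6 by the check below.)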
5540286625Smav */ 5541286625Smav if (arc_no_grow_shift >= arc_shrink_shift) 5542286625Smav arc_no_grow_shift = arc_shrink_shift - 1; 5543286625Smav 5544208373Smm if (zfs_arc_p_min_shift > 0) 5545208373Smm arc_p_min_shift = zfs_arc_p_min_shift; 5546208373Smm 5547286763Smav if (zfs_arc_num_sublists_per_state < 1) 5548286763Smav zfs_arc_num_sublists_per_state = MAX(max_ncpus, 1); 5549286763Smav 5550168404Spjd /* if kmem_flags are set, lets try to use less memory */ 5551168404Spjd if (kmem_debugging()) 5552168404Spjd arc_c = arc_c / 2; 5553168404Spjd if (arc_c < arc_c_min) 5554168404Spjd arc_c = arc_c_min; 5555168404Spjd 5556168473Spjd zfs_arc_min = arc_c_min; 5557168473Spjd zfs_arc_max = arc_c_max; 5558168473Spjd 5559168404Spjd arc_anon = &ARC_anon; 5560168404Spjd arc_mru = &ARC_mru; 5561168404Spjd arc_mru_ghost = &ARC_mru_ghost; 5562168404Spjd arc_mfu = &ARC_mfu; 5563168404Spjd arc_mfu_ghost = &ARC_mfu_ghost; 5564185029Spjd arc_l2c_only = &ARC_l2c_only; 5565168404Spjd arc_size = 0; 5566168404Spjd 5567286763Smav multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], 5568286762Smav sizeof (arc_buf_hdr_t), 5569286763Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5570286763Smav zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5571286763Smav multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], 5572286762Smav sizeof (arc_buf_hdr_t), 5573286763Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5574286763Smav zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5575286763Smav multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], 5576286762Smav sizeof (arc_buf_hdr_t), 5577286763Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5578286763Smav zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5579286763Smav multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], 5580286762Smav sizeof (arc_buf_hdr_t), 5581286763Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5582286763Smav zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5583286763Smav multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], 5584286762Smav sizeof (arc_buf_hdr_t), 5585286763Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5586286763Smav zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5587286763Smav multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], 5588286762Smav sizeof (arc_buf_hdr_t), 5589286763Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5590286763Smav zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5591286763Smav multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], 5592286762Smav sizeof (arc_buf_hdr_t), 5593286763Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5594286763Smav zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5595286763Smav multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], 5596286762Smav sizeof (arc_buf_hdr_t), 5597286763Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5598286763Smav zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5599286763Smav multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], 5600286762Smav sizeof (arc_buf_hdr_t), 5601286763Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5602286763Smav zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5603286763Smav multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], 5604286762Smav sizeof (arc_buf_hdr_t), 5605286763Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5606286763Smav zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5607168404Spjd 
5608286766Smav refcount_create(&arc_anon->arcs_size); 5609286766Smav refcount_create(&arc_mru->arcs_size); 5610286766Smav refcount_create(&arc_mru_ghost->arcs_size); 5611286766Smav refcount_create(&arc_mfu->arcs_size); 5612286766Smav refcount_create(&arc_mfu_ghost->arcs_size); 5613286766Smav refcount_create(&arc_l2c_only->arcs_size); 5614286766Smav 5615168404Spjd buf_init(); 5616168404Spjd 5617286763Smav arc_reclaim_thread_exit = FALSE; 5618286763Smav arc_user_evicts_thread_exit = FALSE; 5619301997Skib arc_dnlc_evicts_thread_exit = FALSE; 5620168404Spjd arc_eviction_list = NULL; 5621168404Spjd bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 5622168404Spjd 5623168404Spjd arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 5624168404Spjd sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 5625168404Spjd 5626168404Spjd if (arc_ksp != NULL) { 5627168404Spjd arc_ksp->ks_data = &arc_stats; 5628286574Smav arc_ksp->ks_update = arc_kstat_update; 5629168404Spjd kstat_install(arc_ksp); 5630168404Spjd } 5631168404Spjd 5632168404Spjd (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 5633168404Spjd TS_RUN, minclsyspri); 5634168404Spjd 5635168404Spjd#ifdef _KERNEL 5636168566Spjd arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, 5637168404Spjd EVENTHANDLER_PRI_FIRST); 5638168404Spjd#endif 5639168404Spjd 5640286763Smav (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0, 5641286763Smav TS_RUN, minclsyspri); 5642286763Smav 5643301997Skib (void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0, 5644301997Skib TS_RUN, minclsyspri); 5645301997Skib 5646168404Spjd arc_dead = FALSE; 5647185029Spjd arc_warm = B_FALSE; 5648168566Spjd 5649258632Savg /* 5650258632Savg * Calculate maximum amount of dirty data per pool. 5651258632Savg * 5652258632Savg * If it has been set by /etc/system, take that. 5653258632Savg * Otherwise, use a percentage of physical memory defined by 5654258632Savg * zfs_dirty_data_max_percent (default 10%) with a cap at 5655258632Savg * zfs_dirty_data_max_max (default 4GB). 5656258632Savg */ 5657258632Savg if (zfs_dirty_data_max == 0) { 5658258632Savg zfs_dirty_data_max = ptob(physmem) * 5659258632Savg zfs_dirty_data_max_percent / 100; 5660258632Savg zfs_dirty_data_max = MIN(zfs_dirty_data_max, 5661258632Savg zfs_dirty_data_max_max); 5662258632Savg } 5663185029Spjd 5664168566Spjd#ifdef _KERNEL 5665194043Skmacy if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 5666193953Skmacy prefetch_tunable_set = 1; 5667206796Spjd 5668193878Skmacy#ifdef __i386__ 5669193953Skmacy if (prefetch_tunable_set == 0) { 5670196863Strasz printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 5671196863Strasz "-- to enable,\n"); 5672196863Strasz printf(" add \"vfs.zfs.prefetch_disable=0\" " 5673196863Strasz "to /boot/loader.conf.\n"); 5674219089Spjd zfs_prefetch_disable = 1; 5675193878Skmacy } 5676206796Spjd#else 5677193878Skmacy if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 5678193953Skmacy prefetch_tunable_set == 0) { 5679196863Strasz printf("ZFS NOTICE: Prefetch is disabled by default if less " 5680196941Strasz "than 4GB of RAM is present;\n" 5681196863Strasz " to enable, add \"vfs.zfs.prefetch_disable=0\" " 5682196863Strasz "to /boot/loader.conf.\n"); 5683219089Spjd zfs_prefetch_disable = 1; 5684193878Skmacy } 5685206796Spjd#endif 5686175633Spjd /* Warn about ZFS memory and address space requirements. 
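 * A hypothetical /boot/loader.conf for such a machine might set,
 * say, vm.kmem_size="512M" and vm.kmem_size_max="512M"; the values
 * here are illustrative only.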
 */
5687168696Spjd	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
5688168987Sbmah		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
5689168987Sbmah		    "expect unstable behavior.\n");
5690175633Spjd	}
5691175633Spjd	if (kmem_size() < 512 * (1 << 20)) {
5692173419Spjd		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
5693168987Sbmah		    "expect unstable behavior.\n");
5694185029Spjd		printf("             Consider tuning vm.kmem_size and "
5695173419Spjd		    "vm.kmem_size_max\n");
5696185029Spjd		printf("             in /boot/loader.conf.\n");
5697168566Spjd	}
5698168566Spjd#endif
5699168404Spjd}
5700168404Spjd
5701168404Spjdvoid
5702168404Spjdarc_fini(void)
5703168404Spjd{
5704286763Smav	mutex_enter(&arc_reclaim_lock);
5705286763Smav	arc_reclaim_thread_exit = TRUE;
5706286763Smav	/*
5707286763Smav	 * The reclaim thread will set arc_reclaim_thread_exit back to
5708286763Smav	 * FALSE when it is finished exiting; we're waiting for that.
5709286763Smav	 */
5710286763Smav	while (arc_reclaim_thread_exit) {
5711286763Smav		cv_signal(&arc_reclaim_thread_cv);
5712286763Smav		cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
5713286763Smav	}
5714286763Smav	mutex_exit(&arc_reclaim_lock);
5715168404Spjd
5716286763Smav	mutex_enter(&arc_user_evicts_lock);
5717286763Smav	arc_user_evicts_thread_exit = TRUE;
5718286763Smav	/*
5719286763Smav	 * The user evicts thread will set arc_user_evicts_thread_exit
5720286763Smav	 * to FALSE when it is finished exiting; we're waiting for that.
5721286763Smav	 */
5722286763Smav	while (arc_user_evicts_thread_exit) {
5723286763Smav		cv_signal(&arc_user_evicts_cv);
5724286763Smav		cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock);
5725286763Smav	}
5726286763Smav	mutex_exit(&arc_user_evicts_lock);
5727168404Spjd
5728301997Skib	mutex_enter(&arc_dnlc_evicts_lock);
5729301997Skib	arc_dnlc_evicts_thread_exit = TRUE;
5730301997Skib	/*
5731301997Skib	 * The dnlc evicts thread will set arc_dnlc_evicts_thread_exit
5732301997Skib	 * to FALSE when it is finished exiting; we're waiting for that.
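	 *
	 * All three worker shutdowns above follow the same handshake:
	 * set the exit flag, cv_signal() the worker, and cv_wait()
	 * until the worker clears the flag on its way out.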
5733301997Skib */ 5734301997Skib while (arc_dnlc_evicts_thread_exit) { 5735301997Skib cv_signal(&arc_dnlc_evicts_cv); 5736301997Skib cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); 5737301997Skib } 5738301997Skib mutex_exit(&arc_dnlc_evicts_lock); 5739301997Skib 5740286763Smav /* Use TRUE to ensure *all* buffers are evicted */ 5741286763Smav arc_flush(NULL, TRUE); 5742286763Smav 5743168404Spjd arc_dead = TRUE; 5744168404Spjd 5745168404Spjd if (arc_ksp != NULL) { 5746168404Spjd kstat_delete(arc_ksp); 5747168404Spjd arc_ksp = NULL; 5748168404Spjd } 5749168404Spjd 5750286763Smav mutex_destroy(&arc_reclaim_lock); 5751286763Smav cv_destroy(&arc_reclaim_thread_cv); 5752286763Smav cv_destroy(&arc_reclaim_waiters_cv); 5753168404Spjd 5754286763Smav mutex_destroy(&arc_user_evicts_lock); 5755286763Smav cv_destroy(&arc_user_evicts_cv); 5756168404Spjd 5757301997Skib mutex_destroy(&arc_dnlc_evicts_lock); 5758301997Skib cv_destroy(&arc_dnlc_evicts_cv); 5759301997Skib 5760286766Smav refcount_destroy(&arc_anon->arcs_size); 5761286766Smav refcount_destroy(&arc_mru->arcs_size); 5762286766Smav refcount_destroy(&arc_mru_ghost->arcs_size); 5763286766Smav refcount_destroy(&arc_mfu->arcs_size); 5764286766Smav refcount_destroy(&arc_mfu_ghost->arcs_size); 5765286766Smav refcount_destroy(&arc_l2c_only->arcs_size); 5766286766Smav 5767286763Smav multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); 5768286763Smav multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 5769286763Smav multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); 5770286763Smav multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 5771294809Smav multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); 5772286763Smav multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); 5773286763Smav multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 5774286763Smav multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); 5775286763Smav multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 5776294809Smav multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); 5777206796Spjd 5778168404Spjd buf_fini(); 5779168404Spjd 5780286570Smav ASSERT0(arc_loaned_bytes); 5781209962Smm 5782168404Spjd#ifdef _KERNEL 5783168566Spjd if (arc_event_lowmem != NULL) 5784168566Spjd EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); 5785168404Spjd#endif 5786168404Spjd} 5787185029Spjd 5788185029Spjd/* 5789185029Spjd * Level 2 ARC 5790185029Spjd * 5791185029Spjd * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 5792185029Spjd * It uses dedicated storage devices to hold cached data, which are populated 5793185029Spjd * using large infrequent writes. The main role of this cache is to boost 5794185029Spjd * the performance of random read workloads. The intended L2ARC devices 5795185029Spjd * include short-stroked disks, solid state disks, and other media with 5796185029Spjd * substantially faster read latency than disk. 
5797185029Spjd * 5798185029Spjd * +-----------------------+ 5799185029Spjd * | ARC | 5800185029Spjd * +-----------------------+ 5801185029Spjd * | ^ ^ 5802185029Spjd * | | | 5803185029Spjd * l2arc_feed_thread() arc_read() 5804185029Spjd * | | | 5805185029Spjd * | l2arc read | 5806185029Spjd * V | | 5807185029Spjd * +---------------+ | 5808185029Spjd * | L2ARC | | 5809185029Spjd * +---------------+ | 5810185029Spjd * | ^ | 5811185029Spjd * l2arc_write() | | 5812185029Spjd * | | | 5813185029Spjd * V | | 5814185029Spjd * +-------+ +-------+ 5815185029Spjd * | vdev | | vdev | 5816185029Spjd * | cache | | cache | 5817185029Spjd * +-------+ +-------+ 5818185029Spjd * +=========+ .-----. 5819185029Spjd * : L2ARC : |-_____-| 5820185029Spjd * : devices : | Disks | 5821185029Spjd * +=========+ `-_____-' 5822185029Spjd * 5823185029Spjd * Read requests are satisfied from the following sources, in order: 5824185029Spjd * 5825185029Spjd * 1) ARC 5826185029Spjd * 2) vdev cache of L2ARC devices 5827185029Spjd * 3) L2ARC devices 5828185029Spjd * 4) vdev cache of disks 5829185029Spjd * 5) disks 5830185029Spjd * 5831185029Spjd * Some L2ARC device types exhibit extremely slow write performance. 5832185029Spjd * To accommodate for this there are some significant differences between 5833185029Spjd * the L2ARC and traditional cache design: 5834185029Spjd * 5835185029Spjd * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 5836185029Spjd * the ARC behave as usual, freeing buffers and placing headers on ghost 5837185029Spjd * lists. The ARC does not send buffers to the L2ARC during eviction as 5838185029Spjd * this would add inflated write latencies for all ARC memory pressure. 5839185029Spjd * 5840185029Spjd * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 5841185029Spjd * It does this by periodically scanning buffers from the eviction-end of 5842185029Spjd * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 5843251478Sdelphij * not already there. It scans until a headroom of buffers is satisfied, 5844251478Sdelphij * which itself is a buffer for ARC eviction. If a compressible buffer is 5845251478Sdelphij * found during scanning and selected for writing to an L2ARC device, we 5846251478Sdelphij * temporarily boost scanning headroom during the next scan cycle to make 5847251478Sdelphij * sure we adapt to compression effects (which might significantly reduce 5848251478Sdelphij * the data volume we write to L2ARC). The thread that does this is 5849185029Spjd * l2arc_feed_thread(), illustrated below; example sizes are included to 5850185029Spjd * provide a better sense of ratio than this diagram: 5851185029Spjd * 5852185029Spjd * head --> tail 5853185029Spjd * +---------------------+----------+ 5854185029Spjd * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 5855185029Spjd * +---------------------+----------+ | o L2ARC eligible 5856185029Spjd * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 5857185029Spjd * +---------------------+----------+ | 5858185029Spjd * 15.9 Gbytes ^ 32 Mbytes | 5859185029Spjd * headroom | 5860185029Spjd * l2arc_feed_thread() 5861185029Spjd * | 5862185029Spjd * l2arc write hand <--[oooo]--' 5863185029Spjd * | 8 Mbyte 5864185029Spjd * | write max 5865185029Spjd * V 5866185029Spjd * +==============================+ 5867185029Spjd * L2ARC dev |####|#|###|###| |####| ... | 5868185029Spjd * +==============================+ 5869185029Spjd * 32 Gbytes 5870185029Spjd * 5871185029Spjd * 3. 
If an ARC buffer is copied to the L2ARC but then hit instead of 5872185029Spjd * evicted, then the L2ARC has cached a buffer much sooner than it probably 5873185029Spjd * needed to, potentially wasting L2ARC device bandwidth and storage. It is 5874185029Spjd * safe to say that this is an uncommon case, since buffers at the end of 5875185029Spjd * the ARC lists have moved there due to inactivity. 5876185029Spjd * 5877185029Spjd * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 5878185029Spjd * then the L2ARC simply misses copying some buffers. This serves as a 5879185029Spjd * pressure valve to prevent heavy read workloads from both stalling the ARC 5880185029Spjd * with waits and clogging the L2ARC with writes. This also helps prevent 5881185029Spjd * the potential for the L2ARC to churn if it attempts to cache content too 5882185029Spjd * quickly, such as during backups of the entire pool. 5883185029Spjd * 5884185029Spjd * 5. After system boot and before the ARC has filled main memory, there are 5885185029Spjd * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 5886185029Spjd * lists can remain mostly static. Instead of searching from tail of these 5887185029Spjd * lists as pictured, the l2arc_feed_thread() will search from the list heads 5888185029Spjd * for eligible buffers, greatly increasing its chance of finding them. 5889185029Spjd * 5890185029Spjd * The L2ARC device write speed is also boosted during this time so that 5891185029Spjd * the L2ARC warms up faster. Since there have been no ARC evictions yet, 5892185029Spjd * there are no L2ARC reads, and no fear of degrading read performance 5893185029Spjd * through increased writes. 5894185029Spjd * 5895185029Spjd * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 5896185029Spjd * the vdev queue can aggregate them into larger and fewer writes. Each 5897185029Spjd * device is written to in a rotor fashion, sweeping writes through 5898185029Spjd * available space then repeating. 5899185029Spjd * 5900185029Spjd * 7. The L2ARC does not store dirty content. It never needs to flush 5901185029Spjd * write buffers back to disk based storage. 5902185029Spjd * 5903185029Spjd * 8. If an ARC buffer is written (and dirtied) which also exists in the 5904185029Spjd * L2ARC, the now stale L2ARC buffer is immediately dropped. 5905185029Spjd * 5906185029Spjd * The performance of the L2ARC can be tweaked by a number of tunables, which 5907185029Spjd * may be necessary for different workloads: 5908185029Spjd * 5909185029Spjd * l2arc_write_max max write bytes per interval 5910185029Spjd * l2arc_write_boost extra write bytes during device warmup 5911185029Spjd * l2arc_noprefetch skip caching prefetched buffers 5912185029Spjd * l2arc_headroom number of max device writes to precache 5913251478Sdelphij * l2arc_headroom_boost when we find compressed buffers during ARC 5914251478Sdelphij * scanning, we multiply headroom by this 5915251478Sdelphij * percentage factor for the next scan cycle, 5916251478Sdelphij * since more compressed buffers are likely to 5917251478Sdelphij * be present 5918185029Spjd * l2arc_feed_secs seconds between L2ARC writing 5919185029Spjd * 5920185029Spjd * Tunables may be removed or added as future performance improvements are 5921185029Spjd * integrated, and also may become zpool properties. 
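 * A hypothetical example of adjusting these from FreeBSD, assuming
 * the usual vfs.zfs.* sysctl glue for these variables (values are
 * illustrative only):
 *
 *	sysctl vfs.zfs.l2arc_write_max=16777216
 *	sysctl vfs.zfs.l2arc_feed_secs=1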
5922208373Smm * 5923208373Smm * There are three key functions that control how the L2ARC warms up: 5924208373Smm * 5925208373Smm * l2arc_write_eligible() check if a buffer is eligible to cache 5926208373Smm * l2arc_write_size() calculate how much to write 5927208373Smm * l2arc_write_interval() calculate sleep delay between writes 5928208373Smm * 5929208373Smm * These three functions determine what to write, how much, and how quickly 5930208373Smm * to send writes. 5931185029Spjd */ 5932185029Spjd 5933208373Smmstatic boolean_t 5934275811Sdelphijl2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) 5935208373Smm{ 5936208373Smm /* 5937208373Smm * A buffer is *not* eligible for the L2ARC if it: 5938208373Smm * 1. belongs to a different spa. 5939208373Smm * 2. is already cached on the L2ARC. 5940208373Smm * 3. has an I/O in progress (it may be an incomplete read). 5941208373Smm * 4. is flagged not eligible (zfs property). 5942208373Smm */ 5943275811Sdelphij if (hdr->b_spa != spa_guid) { 5944208373Smm ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); 5945208373Smm return (B_FALSE); 5946208373Smm } 5947286570Smav if (HDR_HAS_L2HDR(hdr)) { 5948208373Smm ARCSTAT_BUMP(arcstat_l2_write_in_l2); 5949208373Smm return (B_FALSE); 5950208373Smm } 5951275811Sdelphij if (HDR_IO_IN_PROGRESS(hdr)) { 5952208373Smm ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); 5953208373Smm return (B_FALSE); 5954208373Smm } 5955275811Sdelphij if (!HDR_L2CACHE(hdr)) { 5956208373Smm ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); 5957208373Smm return (B_FALSE); 5958208373Smm } 5959208373Smm 5960208373Smm return (B_TRUE); 5961208373Smm} 5962208373Smm 5963208373Smmstatic uint64_t 5964251478Sdelphijl2arc_write_size(void) 5965208373Smm{ 5966208373Smm uint64_t size; 5967208373Smm 5968251478Sdelphij /* 5969251478Sdelphij * Make sure our globals have meaningful values in case the user 5970251478Sdelphij * altered them. 5971251478Sdelphij */ 5972251478Sdelphij size = l2arc_write_max; 5973251478Sdelphij if (size == 0) { 5974251478Sdelphij cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 5975251478Sdelphij "be greater than zero, resetting it to the default (%d)", 5976251478Sdelphij L2ARC_WRITE_SIZE); 5977251478Sdelphij size = l2arc_write_max = L2ARC_WRITE_SIZE; 5978251478Sdelphij } 5979208373Smm 5980208373Smm if (arc_warm == B_FALSE) 5981251478Sdelphij size += l2arc_write_boost; 5982208373Smm 5983208373Smm return (size); 5984208373Smm 5985208373Smm} 5986208373Smm 5987208373Smmstatic clock_t 5988208373Smml2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 5989208373Smm{ 5990219089Spjd clock_t interval, next, now; 5991208373Smm 5992208373Smm /* 5993208373Smm * If the ARC lists are busy, increase our write rate; if the 5994208373Smm * lists are stale, idle back. This is achieved by checking 5995208373Smm * how much we previously wrote - if it was more than half of 5996208373Smm * what we wanted, schedule the next write much sooner. 5997208373Smm */ 5998208373Smm if (l2arc_feed_again && wrote > (wanted / 2)) 5999208373Smm interval = (hz * l2arc_feed_min_ms) / 1000; 6000208373Smm else 6001208373Smm interval = hz * l2arc_feed_secs; 6002208373Smm 6003219089Spjd now = ddi_get_lbolt(); 6004219089Spjd next = MAX(now, MIN(now + interval, began + interval)); 6005208373Smm 6006208373Smm return (next); 6007208373Smm} 6008208373Smm 6009185029Spjd/* 6010185029Spjd * Cycle through L2ARC devices. This is how L2ARC load balances. 6011185029Spjd * If a device is returned, this also returns holding the spa config lock. 
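 * (Rotor example: with cache devices A, B, C and l2arc_dev_last
 * pointing at B, candidates are tried in the order C, A, B,
 * skipping any device whose vdev is dead.)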
6012185029Spjd */ 6013185029Spjdstatic l2arc_dev_t * 6014185029Spjdl2arc_dev_get_next(void) 6015185029Spjd{ 6016185029Spjd l2arc_dev_t *first, *next = NULL; 6017185029Spjd 6018185029Spjd /* 6019185029Spjd * Lock out the removal of spas (spa_namespace_lock), then removal 6020185029Spjd * of cache devices (l2arc_dev_mtx). Once a device has been selected, 6021185029Spjd * both locks will be dropped and a spa config lock held instead. 6022185029Spjd */ 6023185029Spjd mutex_enter(&spa_namespace_lock); 6024185029Spjd mutex_enter(&l2arc_dev_mtx); 6025185029Spjd 6026185029Spjd /* if there are no vdevs, there is nothing to do */ 6027185029Spjd if (l2arc_ndev == 0) 6028185029Spjd goto out; 6029185029Spjd 6030185029Spjd first = NULL; 6031185029Spjd next = l2arc_dev_last; 6032185029Spjd do { 6033185029Spjd /* loop around the list looking for a non-faulted vdev */ 6034185029Spjd if (next == NULL) { 6035185029Spjd next = list_head(l2arc_dev_list); 6036185029Spjd } else { 6037185029Spjd next = list_next(l2arc_dev_list, next); 6038185029Spjd if (next == NULL) 6039185029Spjd next = list_head(l2arc_dev_list); 6040185029Spjd } 6041185029Spjd 6042185029Spjd /* if we have come back to the start, bail out */ 6043185029Spjd if (first == NULL) 6044185029Spjd first = next; 6045185029Spjd else if (next == first) 6046185029Spjd break; 6047185029Spjd 6048185029Spjd } while (vdev_is_dead(next->l2ad_vdev)); 6049185029Spjd 6050185029Spjd /* if we were unable to find any usable vdevs, return NULL */ 6051185029Spjd if (vdev_is_dead(next->l2ad_vdev)) 6052185029Spjd next = NULL; 6053185029Spjd 6054185029Spjd l2arc_dev_last = next; 6055185029Spjd 6056185029Spjdout: 6057185029Spjd mutex_exit(&l2arc_dev_mtx); 6058185029Spjd 6059185029Spjd /* 6060185029Spjd * Grab the config lock to prevent the 'next' device from being 6061185029Spjd * removed while we are writing to it. 6062185029Spjd */ 6063185029Spjd if (next != NULL) 6064185029Spjd spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 6065185029Spjd mutex_exit(&spa_namespace_lock); 6066185029Spjd 6067185029Spjd return (next); 6068185029Spjd} 6069185029Spjd 6070185029Spjd/* 6071185029Spjd * Free buffers that were tagged for destruction. 6072185029Spjd */ 6073185029Spjdstatic void 6074185029Spjdl2arc_do_free_on_write() 6075185029Spjd{ 6076185029Spjd list_t *buflist; 6077185029Spjd l2arc_data_free_t *df, *df_prev; 6078185029Spjd 6079185029Spjd mutex_enter(&l2arc_free_on_write_mtx); 6080185029Spjd buflist = l2arc_free_on_write; 6081185029Spjd 6082185029Spjd for (df = list_tail(buflist); df; df = df_prev) { 6083185029Spjd df_prev = list_prev(buflist, df); 6084185029Spjd ASSERT(df->l2df_data != NULL); 6085185029Spjd ASSERT(df->l2df_func != NULL); 6086185029Spjd df->l2df_func(df->l2df_data, df->l2df_size); 6087185029Spjd list_remove(buflist, df); 6088185029Spjd kmem_free(df, sizeof (l2arc_data_free_t)); 6089185029Spjd } 6090185029Spjd 6091185029Spjd mutex_exit(&l2arc_free_on_write_mtx); 6092185029Spjd} 6093185029Spjd 6094185029Spjd/* 6095185029Spjd * A write to a cache device has completed. Update all headers to allow 6096185029Spjd * reads from these buffers to begin. 
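 * (If a hash lock is contended, the dummy head node is re-inserted
 * after the current header and the walk restarts from "top", so
 * headers already processed are not rescanned.)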
6097185029Spjd */ 6098185029Spjdstatic void 6099185029Spjdl2arc_write_done(zio_t *zio) 6100185029Spjd{ 6101185029Spjd l2arc_write_callback_t *cb; 6102185029Spjd l2arc_dev_t *dev; 6103185029Spjd list_t *buflist; 6104275811Sdelphij arc_buf_hdr_t *head, *hdr, *hdr_prev; 6105185029Spjd kmutex_t *hash_lock; 6106268085Sdelphij int64_t bytes_dropped = 0; 6107185029Spjd 6108185029Spjd cb = zio->io_private; 6109185029Spjd ASSERT(cb != NULL); 6110185029Spjd dev = cb->l2wcb_dev; 6111185029Spjd ASSERT(dev != NULL); 6112185029Spjd head = cb->l2wcb_head; 6113185029Spjd ASSERT(head != NULL); 6114286570Smav buflist = &dev->l2ad_buflist; 6115185029Spjd ASSERT(buflist != NULL); 6116185029Spjd DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 6117185029Spjd l2arc_write_callback_t *, cb); 6118185029Spjd 6119185029Spjd if (zio->io_error != 0) 6120185029Spjd ARCSTAT_BUMP(arcstat_l2_writes_error); 6121185029Spjd 6122185029Spjd /* 6123185029Spjd * All writes completed, or an error was hit. 6124185029Spjd */ 6125286763Smavtop: 6126286763Smav mutex_enter(&dev->l2ad_mtx); 6127275811Sdelphij for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { 6128275811Sdelphij hdr_prev = list_prev(buflist, hdr); 6129185029Spjd 6130275811Sdelphij hash_lock = HDR_LOCK(hdr); 6131286763Smav 6132286763Smav /* 6133286763Smav * We cannot use mutex_enter or else we can deadlock 6134286763Smav * with l2arc_write_buffers (due to swapping the order 6135286763Smav * the hash lock and l2ad_mtx are taken). 6136286763Smav */ 6137185029Spjd if (!mutex_tryenter(hash_lock)) { 6138185029Spjd /* 6139286763Smav * Missed the hash lock. We must retry so we 6140286763Smav * don't leave the ARC_FLAG_L2_WRITING bit set. 6141185029Spjd */ 6142286763Smav ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); 6143286763Smav 6144286763Smav /* 6145286763Smav * We don't want to rescan the headers we've 6146286763Smav * already marked as having been written out, so 6147286763Smav * we reinsert the head node so we can pick up 6148286763Smav * where we left off. 6149286763Smav */ 6150286763Smav list_remove(buflist, head); 6151286763Smav list_insert_after(buflist, hdr, head); 6152286763Smav 6153286763Smav mutex_exit(&dev->l2ad_mtx); 6154286763Smav 6155286763Smav /* 6156286763Smav * We wait for the hash lock to become available 6157286763Smav * to try and prevent busy waiting, and increase 6158286763Smav * the chance we'll be able to acquire the lock 6159286763Smav * the next time around. 6160286763Smav */ 6161286763Smav mutex_enter(hash_lock); 6162286763Smav mutex_exit(hash_lock); 6163286763Smav goto top; 6164185029Spjd } 6165185029Spjd 6166286570Smav /* 6167286763Smav * We could not have been moved into the arc_l2c_only 6168286763Smav * state while in-flight due to our ARC_FLAG_L2_WRITING 6169286763Smav * bit being set. Let's just ensure that's being enforced. 6170286570Smav */ 6171286763Smav ASSERT(HDR_HAS_L1HDR(hdr)); 6172286570Smav 6173286763Smav /* 6174286763Smav * We may have allocated a buffer for L2ARC compression, 6175286763Smav * we must release it to avoid leaking this data. 6176286763Smav */ 6177286763Smav l2arc_release_cdata_buf(hdr); 6178286763Smav 6179185029Spjd if (zio->io_error != 0) { 6180185029Spjd /* 6181185029Spjd * Error - drop L2ARC entry. 
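			 * The header itself stays in the ARC; only its
			 * L2 presence is rolled back below: the buflist
			 * entry, the HAS_L2HDR flag, the
			 * arcstat_l2_size/asize counters, and the
			 * l2ad_alloc reference taken at write time.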
6182185029Spjd */ 6183286776Smav list_remove(buflist, hdr); 6184290191Savg l2arc_trim(hdr); 6185286570Smav hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 6186286570Smav 6187286570Smav ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); 6188275811Sdelphij ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 6189286598Smav 6190286598Smav bytes_dropped += hdr->b_l2hdr.b_asize; 6191286598Smav (void) refcount_remove_many(&dev->l2ad_alloc, 6192286598Smav hdr->b_l2hdr.b_asize, hdr); 6193185029Spjd } 6194185029Spjd 6195185029Spjd /* 6196286763Smav * Allow ARC to begin reads and ghost list evictions to 6197286763Smav * this L2ARC entry. 6198185029Spjd */ 6199275811Sdelphij hdr->b_flags &= ~ARC_FLAG_L2_WRITING; 6200185029Spjd 6201185029Spjd mutex_exit(hash_lock); 6202185029Spjd } 6203185029Spjd 6204185029Spjd atomic_inc_64(&l2arc_writes_done); 6205185029Spjd list_remove(buflist, head); 6206286570Smav ASSERT(!HDR_HAS_L1HDR(head)); 6207286570Smav kmem_cache_free(hdr_l2only_cache, head); 6208286570Smav mutex_exit(&dev->l2ad_mtx); 6209185029Spjd 6210268085Sdelphij vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); 6211268085Sdelphij 6212185029Spjd l2arc_do_free_on_write(); 6213185029Spjd 6214185029Spjd kmem_free(cb, sizeof (l2arc_write_callback_t)); 6215185029Spjd} 6216185029Spjd 6217185029Spjd/* 6218185029Spjd * A read to a cache device completed. Validate buffer contents before 6219185029Spjd * handing over to the regular ARC routines. 6220185029Spjd */ 6221185029Spjdstatic void 6222185029Spjdl2arc_read_done(zio_t *zio) 6223185029Spjd{ 6224185029Spjd l2arc_read_callback_t *cb; 6225185029Spjd arc_buf_hdr_t *hdr; 6226185029Spjd arc_buf_t *buf; 6227185029Spjd kmutex_t *hash_lock; 6228185029Spjd int equal; 6229185029Spjd 6230185029Spjd ASSERT(zio->io_vd != NULL); 6231185029Spjd ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 6232185029Spjd 6233185029Spjd spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 6234185029Spjd 6235185029Spjd cb = zio->io_private; 6236185029Spjd ASSERT(cb != NULL); 6237185029Spjd buf = cb->l2rcb_buf; 6238185029Spjd ASSERT(buf != NULL); 6239185029Spjd 6240219089Spjd hash_lock = HDR_LOCK(buf->b_hdr); 6241185029Spjd mutex_enter(hash_lock); 6242219089Spjd hdr = buf->b_hdr; 6243219089Spjd ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 6244185029Spjd 6245185029Spjd /* 6246297848Savg * If the data was read into a temporary buffer, 6247297848Savg * move it and free the buffer. 6248297848Savg */ 6249297848Savg if (cb->l2rcb_data != NULL) { 6250297848Savg ASSERT3U(hdr->b_size, <, zio->io_size); 6251297848Savg ASSERT3U(cb->l2rcb_compress, ==, ZIO_COMPRESS_OFF); 6252297848Savg if (zio->io_error == 0) 6253297848Savg bcopy(cb->l2rcb_data, buf->b_data, hdr->b_size); 6254297848Savg 6255297848Savg /* 6256297848Savg * The following must be done regardless of whether 6257297848Savg * there was an error: 6258297848Savg * - free the temporary buffer 6259297848Savg * - point zio to the real ARC buffer 6260297848Savg * - set zio size accordingly 6261297848Savg * These are required because zio is either re-used for 6262297848Savg * an I/O of the block in the case of the error 6263297848Savg * or the zio is passed to arc_read_done() and it 6264297848Savg * needs real data. 6265297848Savg */ 6266297848Savg zio_data_buf_free(cb->l2rcb_data, zio->io_size); 6267297848Savg zio->io_size = zio->io_orig_size = hdr->b_size; 6268297848Savg zio->io_data = zio->io_orig_data = buf->b_data; 6269297848Savg } 6270297848Savg 6271297848Savg /* 6272251478Sdelphij * If the buffer was compressed, decompress it first. 
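	 * The compression type was recorded in the read callback when
	 * the I/O was issued; ZIO_COMPRESS_OFF means the on-device copy
	 * is the raw (possibly padded) data and needs no transform here.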
6273251478Sdelphij */ 6274251478Sdelphij if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) 6275251478Sdelphij l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); 6276251478Sdelphij ASSERT(zio->io_data != NULL); 6277287706Sdelphij ASSERT3U(zio->io_size, ==, hdr->b_size); 6278287706Sdelphij ASSERT3U(BP_GET_LSIZE(&cb->l2rcb_bp), ==, hdr->b_size); 6279251478Sdelphij 6280251478Sdelphij /* 6281185029Spjd * Check this survived the L2ARC journey. 6282185029Spjd */ 6283185029Spjd equal = arc_cksum_equal(buf); 6284185029Spjd if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 6285185029Spjd mutex_exit(hash_lock); 6286185029Spjd zio->io_private = buf; 6287185029Spjd zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 6288185029Spjd zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 6289185029Spjd arc_read_done(zio); 6290185029Spjd } else { 6291185029Spjd mutex_exit(hash_lock); 6292185029Spjd /* 6293185029Spjd * Buffer didn't survive caching. Increment stats and 6294185029Spjd * reissue to the original storage device. 6295185029Spjd */ 6296185029Spjd if (zio->io_error != 0) { 6297185029Spjd ARCSTAT_BUMP(arcstat_l2_io_error); 6298185029Spjd } else { 6299249195Smm zio->io_error = SET_ERROR(EIO); 6300185029Spjd } 6301185029Spjd if (!equal) 6302185029Spjd ARCSTAT_BUMP(arcstat_l2_cksum_bad); 6303185029Spjd 6304185029Spjd /* 6305185029Spjd * If there's no waiter, issue an async i/o to the primary 6306185029Spjd * storage now. If there *is* a waiter, the caller must 6307185029Spjd * issue the i/o in a context where it's OK to block. 6308185029Spjd */ 6309209962Smm if (zio->io_waiter == NULL) { 6310209962Smm zio_t *pio = zio_unique_parent(zio); 6311209962Smm 6312209962Smm ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 6313209962Smm 6314209962Smm zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, 6315287706Sdelphij buf->b_data, hdr->b_size, arc_read_done, buf, 6316185029Spjd zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); 6317209962Smm } 6318185029Spjd } 6319185029Spjd 6320185029Spjd kmem_free(cb, sizeof (l2arc_read_callback_t)); 6321185029Spjd} 6322185029Spjd 6323185029Spjd/* 6324185029Spjd * This is the list priority from which the L2ARC will search for pages to 6325185029Spjd * cache. This is used within loops (0..3) to cycle through lists in the 6326185029Spjd * desired order. This order can have a significant effect on cache 6327185029Spjd * performance. 6328185029Spjd * 6329185029Spjd * Currently the metadata lists are hit first, MFU then MRU, followed by 6330185029Spjd * the data lists. This function returns a locked list, and also returns 6331185029Spjd * the lock pointer. 6332185029Spjd */ 6333286763Smavstatic multilist_sublist_t * 6334286763Smavl2arc_sublist_lock(int list_num) 6335185029Spjd{ 6336286763Smav multilist_t *ml = NULL; 6337286763Smav unsigned int idx; 6338185029Spjd 6339286762Smav ASSERT(list_num >= 0 && list_num <= 3); 6340206796Spjd 6341286762Smav switch (list_num) { 6342286762Smav case 0: 6343286763Smav ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; 6344286762Smav break; 6345286762Smav case 1: 6346286763Smav ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; 6347286762Smav break; 6348286762Smav case 2: 6349286763Smav ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; 6350286762Smav break; 6351286762Smav case 3: 6352286763Smav ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; 6353286762Smav break; 6354185029Spjd } 6355185029Spjd 6356286763Smav /* 6357286763Smav * Return a randomly-selected sublist. 
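	 * (The index comes from multilist_get_random_index() below, so
	 * repeated calls spread the feed across sublists.)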
This is acceptable 6358286763Smav * because the caller feeds only a little bit of data for each 6359286763Smav * call (8MB). Subsequent calls will result in different 6360286763Smav * sublists being selected. 6361286763Smav */ 6362286763Smav idx = multilist_get_random_index(ml); 6363286763Smav return (multilist_sublist_lock(ml, idx)); 6364185029Spjd} 6365185029Spjd 6366185029Spjd/* 6367185029Spjd * Evict buffers from the device write hand to the distance specified in 6368185029Spjd * bytes. This distance may span populated buffers, it may span nothing. 6369185029Spjd * This is clearing a region on the L2ARC device ready for writing. 6370185029Spjd * If the 'all' boolean is set, every buffer is evicted. 6371185029Spjd */ 6372185029Spjdstatic void 6373185029Spjdl2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 6374185029Spjd{ 6375185029Spjd list_t *buflist; 6376275811Sdelphij arc_buf_hdr_t *hdr, *hdr_prev; 6377185029Spjd kmutex_t *hash_lock; 6378185029Spjd uint64_t taddr; 6379185029Spjd 6380286570Smav buflist = &dev->l2ad_buflist; 6381185029Spjd 6382185029Spjd if (!all && dev->l2ad_first) { 6383185029Spjd /* 6384185029Spjd * This is the first sweep through the device. There is 6385185029Spjd * nothing to evict. 6386185029Spjd */ 6387185029Spjd return; 6388185029Spjd } 6389185029Spjd 6390185029Spjd if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 6391185029Spjd /* 6392185029Spjd * When nearing the end of the device, evict to the end 6393185029Spjd * before the device write hand jumps to the start. 6394185029Spjd */ 6395185029Spjd taddr = dev->l2ad_end; 6396185029Spjd } else { 6397185029Spjd taddr = dev->l2ad_hand + distance; 6398185029Spjd } 6399185029Spjd DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 6400185029Spjd uint64_t, taddr, boolean_t, all); 6401185029Spjd 6402185029Spjdtop: 6403286570Smav mutex_enter(&dev->l2ad_mtx); 6404275811Sdelphij for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { 6405275811Sdelphij hdr_prev = list_prev(buflist, hdr); 6406185029Spjd 6407275811Sdelphij hash_lock = HDR_LOCK(hdr); 6408286763Smav 6409286763Smav /* 6410286763Smav * We cannot use mutex_enter or else we can deadlock 6411286763Smav * with l2arc_write_buffers (due to swapping the order 6412286763Smav * the hash lock and l2ad_mtx are taken). 6413286763Smav */ 6414185029Spjd if (!mutex_tryenter(hash_lock)) { 6415185029Spjd /* 6416185029Spjd * Missed the hash lock. Retry. 6417185029Spjd */ 6418185029Spjd ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 6419286570Smav mutex_exit(&dev->l2ad_mtx); 6420185029Spjd mutex_enter(hash_lock); 6421185029Spjd mutex_exit(hash_lock); 6422185029Spjd goto top; 6423185029Spjd } 6424185029Spjd 6425275811Sdelphij if (HDR_L2_WRITE_HEAD(hdr)) { 6426185029Spjd /* 6427185029Spjd * We hit a write head node. Leave it for 6428185029Spjd * l2arc_write_done(). 6429185029Spjd */ 6430275811Sdelphij list_remove(buflist, hdr); 6431185029Spjd mutex_exit(hash_lock); 6432185029Spjd continue; 6433185029Spjd } 6434185029Spjd 6435286570Smav if (!all && HDR_HAS_L2HDR(hdr) && 6436286570Smav (hdr->b_l2hdr.b_daddr > taddr || 6437286570Smav hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { 6438185029Spjd /* 6439185029Spjd * We've evicted to the target address, 6440185029Spjd * or the end of the device. 
6441185029Spjd */ 6442185029Spjd mutex_exit(hash_lock); 6443185029Spjd break; 6444185029Spjd } 6445185029Spjd 6446286570Smav ASSERT(HDR_HAS_L2HDR(hdr)); 6447286570Smav if (!HDR_HAS_L1HDR(hdr)) { 6448275811Sdelphij ASSERT(!HDR_L2_READING(hdr)); 6449185029Spjd /* 6450185029Spjd * This doesn't exist in the ARC. Destroy. 6451185029Spjd * arc_hdr_destroy() will call list_remove() 6452185029Spjd * and decrement arcstat_l2_size. 6453185029Spjd */ 6454275811Sdelphij arc_change_state(arc_anon, hdr, hash_lock); 6455275811Sdelphij arc_hdr_destroy(hdr); 6456185029Spjd } else { 6457286570Smav ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); 6458286570Smav ARCSTAT_BUMP(arcstat_l2_evict_l1cached); 6459185029Spjd /* 6460185029Spjd * Invalidate issued or about to be issued 6461185029Spjd * reads, since we may be about to write 6462185029Spjd * over this location. 6463185029Spjd */ 6464275811Sdelphij if (HDR_L2_READING(hdr)) { 6465185029Spjd ARCSTAT_BUMP(arcstat_l2_evict_reading); 6466275811Sdelphij hdr->b_flags |= ARC_FLAG_L2_EVICTED; 6467185029Spjd } 6468185029Spjd 6469286763Smav /* Ensure this header has finished being written */ 6470286763Smav ASSERT(!HDR_L2_WRITING(hdr)); 6471286763Smav ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); 6472286763Smav 6473286598Smav arc_hdr_l2hdr_destroy(hdr); 6474185029Spjd } 6475185029Spjd mutex_exit(hash_lock); 6476185029Spjd } 6477286570Smav mutex_exit(&dev->l2ad_mtx); 6478185029Spjd} 6479185029Spjd 6480185029Spjd/* 6481185029Spjd * Find and write ARC buffers to the L2ARC device. 6482185029Spjd * 6483275811Sdelphij * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid 6484185029Spjd * for reading until they have completed writing. 6485251478Sdelphij * The headroom_boost is an in-out parameter used to maintain headroom boost 6486251478Sdelphij * state between calls to this function. 6487251478Sdelphij * 6488251478Sdelphij * Returns the number of bytes actually written (which may be smaller than 6489251478Sdelphij * the delta by which the device hand has changed due to alignment). 6490185029Spjd */ 6491208373Smmstatic uint64_t 6492251478Sdelphijl2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, 6493251478Sdelphij boolean_t *headroom_boost) 6494185029Spjd{ 6495275811Sdelphij arc_buf_hdr_t *hdr, *hdr_prev, *head; 6496289295Smav uint64_t write_asize, write_sz, headroom, 6497289295Smav buf_compress_minsz; 6498185029Spjd void *buf_data; 6499251478Sdelphij boolean_t full; 6500185029Spjd l2arc_write_callback_t *cb; 6501185029Spjd zio_t *pio, *wzio; 6502228103Smm uint64_t guid = spa_load_guid(spa); 6503251478Sdelphij const boolean_t do_headroom_boost = *headroom_boost; 6504185029Spjd int try; 6505185029Spjd 6506185029Spjd ASSERT(dev->l2ad_vdev != NULL); 6507185029Spjd 6508251478Sdelphij /* Lower the flag now, we might want to raise it again later. */ 6509251478Sdelphij *headroom_boost = B_FALSE; 6510251478Sdelphij 6511185029Spjd pio = NULL; 6512287099Savg write_sz = write_asize = 0; 6513185029Spjd full = B_FALSE; 6514286570Smav head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); 6515275811Sdelphij head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; 6516286570Smav head->b_flags |= ARC_FLAG_HAS_L2HDR; 6517185029Spjd 6518205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); 6519185029Spjd /* 6520251478Sdelphij * We will want to try to compress buffers that are at least 2x the 6521251478Sdelphij * device sector size. 
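	 * (e.g. on a 4 KB-sector vdev, vdev_ashift is 12, giving
	 * buf_compress_minsz = 2 << 12 = 8 KB.)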
6522251478Sdelphij */ 6523251478Sdelphij buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; 6524251478Sdelphij 6525251478Sdelphij /* 6526185029Spjd * Copy buffers for L2ARC writing. 6527185029Spjd */ 6528286762Smav for (try = 0; try <= 3; try++) { 6529286763Smav multilist_sublist_t *mls = l2arc_sublist_lock(try); 6530251478Sdelphij uint64_t passed_sz = 0; 6531251478Sdelphij 6532205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); 6533185029Spjd 6534185029Spjd /* 6535185029Spjd * L2ARC fast warmup. 6536185029Spjd * 6537185029Spjd * Until the ARC is warm and starts to evict, read from the 6538185029Spjd * head of the ARC lists rather than the tail. 6539185029Spjd */ 6540185029Spjd if (arc_warm == B_FALSE) 6541286763Smav hdr = multilist_sublist_head(mls); 6542185029Spjd else 6543286763Smav hdr = multilist_sublist_tail(mls); 6544275811Sdelphij if (hdr == NULL) 6545205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); 6546185029Spjd 6547286762Smav headroom = target_sz * l2arc_headroom; 6548251478Sdelphij if (do_headroom_boost) 6549251478Sdelphij headroom = (headroom * l2arc_headroom_boost) / 100; 6550251478Sdelphij 6551275811Sdelphij for (; hdr; hdr = hdr_prev) { 6552251478Sdelphij kmutex_t *hash_lock; 6553251478Sdelphij uint64_t buf_sz; 6554287099Savg uint64_t buf_a_sz; 6555297848Savg size_t align; 6556251478Sdelphij 6557185029Spjd if (arc_warm == B_FALSE) 6558286763Smav hdr_prev = multilist_sublist_next(mls, hdr); 6559185029Spjd else 6560286763Smav hdr_prev = multilist_sublist_prev(mls, hdr); 6561275811Sdelphij ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size); 6562206796Spjd 6563275811Sdelphij hash_lock = HDR_LOCK(hdr); 6564251478Sdelphij if (!mutex_tryenter(hash_lock)) { 6565205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); 6566185029Spjd /* 6567185029Spjd * Skip this buffer rather than waiting. 6568185029Spjd */ 6569185029Spjd continue; 6570185029Spjd } 6571185029Spjd 6572275811Sdelphij passed_sz += hdr->b_size; 6573185029Spjd if (passed_sz > headroom) { 6574185029Spjd /* 6575185029Spjd * Searched too far. 6576185029Spjd */ 6577185029Spjd mutex_exit(hash_lock); 6578205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); 6579185029Spjd break; 6580185029Spjd } 6581185029Spjd 6582275811Sdelphij if (!l2arc_write_eligible(guid, hdr)) { 6583185029Spjd mutex_exit(hash_lock); 6584185029Spjd continue; 6585185029Spjd } 6586185029Spjd 6587287099Savg /* 6588287099Savg * Assume that the buffer is not going to be compressed 6589287099Savg * and could take more space on disk because of a larger 6590287099Savg * disk block size. 6591287099Savg */ 6592287099Savg buf_sz = hdr->b_size; 6593297848Savg align = (size_t)1 << dev->l2ad_vdev->vdev_ashift; 6594297848Savg buf_a_sz = P2ROUNDUP(buf_sz, align); 6595287099Savg 6596287099Savg if ((write_asize + buf_a_sz) > target_sz) { 6597185029Spjd full = B_TRUE; 6598185029Spjd mutex_exit(hash_lock); 6599205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_full); 6600185029Spjd break; 6601185029Spjd } 6602185029Spjd 6603185029Spjd if (pio == NULL) { 6604185029Spjd /* 6605185029Spjd * Insert a dummy header on the buflist so 6606185029Spjd * l2arc_write_done() can find where the 6607185029Spjd * write buffers begin without searching. 
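			 * (Each selected header is inserted at the front
			 * of l2ad_buflist, so at completion time
			 * everything between the list front and this
			 * dummy node belongs to this write;
			 * l2arc_write_done() walks exactly that span
			 * via list_prev().)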
6608185029Spjd */ 6609286763Smav mutex_enter(&dev->l2ad_mtx); 6610286570Smav list_insert_head(&dev->l2ad_buflist, head); 6611286763Smav mutex_exit(&dev->l2ad_mtx); 6612185029Spjd 6613185029Spjd cb = kmem_alloc( 6614185029Spjd sizeof (l2arc_write_callback_t), KM_SLEEP); 6615185029Spjd cb->l2wcb_dev = dev; 6616185029Spjd cb->l2wcb_head = head; 6617185029Spjd pio = zio_root(spa, l2arc_write_done, cb, 6618185029Spjd ZIO_FLAG_CANFAIL); 6619205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_pios); 6620185029Spjd } 6621185029Spjd 6622185029Spjd /* 6623185029Spjd * Create and add a new L2ARC header. 6624185029Spjd */ 6625286570Smav hdr->b_l2hdr.b_dev = dev; 6626275811Sdelphij hdr->b_flags |= ARC_FLAG_L2_WRITING; 6627251478Sdelphij /* 6628251478Sdelphij * Temporarily stash the data buffer in b_tmp_cdata. 6629251478Sdelphij * The subsequent write step will pick it up from 6630286570Smav * there. This is because can't access b_l1hdr.b_buf 6631251478Sdelphij * without holding the hash_lock, which we in turn 6632251478Sdelphij * can't access without holding the ARC list locks 6633251478Sdelphij * (which we want to avoid during compression/writing). 6634251478Sdelphij */ 6635287706Sdelphij hdr->b_l2hdr.b_compress = ZIO_COMPRESS_OFF; 6636286570Smav hdr->b_l2hdr.b_asize = hdr->b_size; 6637286570Smav hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data; 6638251478Sdelphij 6639286598Smav /* 6640286598Smav * Explicitly set the b_daddr field to a known 6641286598Smav * value which means "invalid address". This 6642286598Smav * enables us to differentiate which stage of 6643286598Smav * l2arc_write_buffers() the particular header 6644286598Smav * is in (e.g. this loop, or the one below). 6645286598Smav * ARC_FLAG_L2_WRITING is not enough to make 6646286598Smav * this distinction, and we need to know in 6647286598Smav * order to do proper l2arc vdev accounting in 6648286598Smav * arc_release() and arc_hdr_destroy(). 6649286598Smav * 6650286598Smav * Note, we can't use a new flag to distinguish 6651286598Smav * the two stages because we don't hold the 6652286598Smav * header's hash_lock below, in the second stage 6653286598Smav * of this function. Thus, we can't simply 6654286598Smav * change the b_flags field to denote that the 6655286598Smav * IO has been sent. We can change the b_daddr 6656286598Smav * field of the L2 portion, though, since we'll 6657286598Smav * be holding the l2ad_mtx; which is why we're 6658286598Smav * using it to denote the header's state change. 6659286598Smav */ 6660286598Smav hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; 6661289295Smav 6662286570Smav hdr->b_flags |= ARC_FLAG_HAS_L2HDR; 6663185029Spjd 6664286763Smav mutex_enter(&dev->l2ad_mtx); 6665286570Smav list_insert_head(&dev->l2ad_buflist, hdr); 6666286763Smav mutex_exit(&dev->l2ad_mtx); 6667251478Sdelphij 6668185029Spjd /* 6669185029Spjd * Compute and store the buffer cksum before 6670185029Spjd * writing. On debug the cksum is verified first. 6671185029Spjd */ 6672286570Smav arc_cksum_verify(hdr->b_l1hdr.b_buf); 6673286570Smav arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE); 6674185029Spjd 6675185029Spjd mutex_exit(hash_lock); 6676185029Spjd 6677251478Sdelphij write_sz += buf_sz; 6678287099Savg write_asize += buf_a_sz; 6679251478Sdelphij } 6680251478Sdelphij 6681286763Smav multilist_sublist_unlock(mls); 6682251478Sdelphij 6683251478Sdelphij if (full == B_TRUE) 6684251478Sdelphij break; 6685251478Sdelphij } 6686251478Sdelphij 6687251478Sdelphij /* No buffers selected for writing? 
6688251478Sdelphij	if (pio == NULL) {
6689251478Sdelphij		ASSERT0(write_sz);
6690286570Smav		ASSERT(!HDR_HAS_L1HDR(head));
6691286570Smav		kmem_cache_free(hdr_l2only_cache, head);
6692251478Sdelphij		return (0);
6693251478Sdelphij	}
6694251478Sdelphij
6695286763Smav	mutex_enter(&dev->l2ad_mtx);
6696286763Smav
6697251478Sdelphij	/*
6698251478Sdelphij	 * Now start writing the buffers. We start at the write head
6699251478Sdelphij	 * and work backwards, retracing the course of the buffer
6700251478Sdelphij	 * selector loop above.
6701251478Sdelphij	 */
6702297848Savg	write_asize = 0;
6703286570Smav	for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
6704286570Smav	    hdr = list_prev(&dev->l2ad_buflist, hdr)) {
6705251478Sdelphij		uint64_t buf_sz;
6706297848Savg		boolean_t compress;
6707251478Sdelphij
6708251478Sdelphij		/*
6709286763Smav		 * We rely on the L1 portion of the header below, so
6710286763Smav		 * it's invalid for this header to have been evicted out
6711286763Smav		 * of the ghost cache, prior to being written out. The
6712286763Smav		 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
6713286763Smav		 */
6714286763Smav		ASSERT(HDR_HAS_L1HDR(hdr));
6715286763Smav
6716286763Smav		/*
6717251478Sdelphij		 * We shouldn't need to lock the buffer here, since we flagged
6718275811Sdelphij		 * it as ARC_FLAG_L2_WRITING in the previous step, but we must
6719275811Sdelphij		 * take care to only access its L2 cache parameters. In
6720286570Smav		 * particular, hdr->l1hdr.b_buf may be invalid by now due to
6721275811Sdelphij		 * ARC eviction.
6722251478Sdelphij		 */
6723286570Smav		hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
6724251478Sdelphij
6725297848Savg		/*
6726297848Savg		 * Save a pointer to the original buffer data we had previously
6727297848Savg		 * stashed away.
6728297848Savg		 */
6729297848Savg		buf_data = hdr->b_l1hdr.b_tmp_cdata;
6730297848Savg
6731297848Savg		compress = HDR_L2COMPRESS(hdr) &&
6732297848Savg		    hdr->b_l2hdr.b_asize >= buf_compress_minsz;
6733297848Savg		if (l2arc_transform_buf(hdr, compress)) {
6734297848Savg			/*
6735297848Savg			 * If compression succeeded, enable headroom
6736297848Savg			 * boost on the next scan cycle.
6737297848Savg			 */
6738297848Savg			*headroom_boost = B_TRUE;
6739251478Sdelphij		}
6740251478Sdelphij
6741251478Sdelphij		/*
6742297848Savg		 * Get the new buffer size that accounts for compression
6743297848Savg		 * and padding.
6744251478Sdelphij		 */
6745286570Smav		buf_sz = hdr->b_l2hdr.b_asize;
6746251478Sdelphij
6747274172Savg		/*
6748286598Smav		 * We need to do this regardless of whether buf_sz is zero;
6749286598Smav		 * otherwise, when this l2hdr is evicted we'll remove a
6750286598Smav		 * reference that was never added.
6751286598Smav		 */
6752286598Smav		(void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);
6753286598Smav
6754251478Sdelphij		/* Compression may have squashed the buffer to zero length. */
6755251478Sdelphij		if (buf_sz != 0) {
6756297848Savg			/*
6757297848Savg			 * If the data was padded or compressed, then it
6758297848Savg			 * is in a new buffer.
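			 * Otherwise l2arc_transform_buf() left b_tmp_cdata
			 * NULL and buf_data still points at the original,
			 * already-aligned ARC buffer saved above.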
		/* Compression may have squashed the buffer to zero length. */
		if (buf_sz != 0) {
			/*
			 * If the data was padded or compressed, then it
			 * is in a new buffer.
			 */
			if (hdr->b_l1hdr.b_tmp_cdata != NULL)
				buf_data = hdr->b_l1hdr.b_tmp_cdata;
			wzio = zio_write_phys(pio, dev->l2ad_vdev,
			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_CANFAIL, B_FALSE);

			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
			    zio_t *, wzio);
			(void) zio_nowait(wzio);

			write_asize += buf_sz;
			dev->l2ad_hand += buf_sz;
		}
	}

	mutex_exit(&dev->l2ad_mtx);

	ASSERT3U(write_asize, <=, target_sz);
	ARCSTAT_BUMP(arcstat_l2_writes_sent);
	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
	ARCSTAT_INCR(arcstat_l2_size, write_sz);
	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
	vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);

	/*
	 * Bump device hand to the device start if it is approaching the end.
	 * l2arc_evict() will already have evicted ahead for this case.
	 */
	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
		dev->l2ad_hand = dev->l2ad_start;
		dev->l2ad_first = B_FALSE;
	}

	dev->l2ad_writing = B_TRUE;
	(void) zio_wait(pio);
	dev->l2ad_writing = B_FALSE;

	return (write_asize);
}
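
/*
 * Illustrative sketch (not part of the original source): the
 * L2ARC_ADDR_UNSET convention set up in l2arc_write_buffers() lets code
 * that holds only l2ad_mtx tell whether a header is still in the
 * selection stage or has had its write I/O issued. A hypothetical
 * predicate could look like this:
 */
static boolean_t
l2arc_write_issued(arc_buf_hdr_t *hdr)
{
	/* Caller is assumed to hold the owning device's l2ad_mtx. */
	ASSERT(HDR_HAS_L2HDR(hdr));
	return (hdr->b_l2hdr.b_daddr != L2ARC_ADDR_UNSET);
}
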
/*
 * Transforms an L2ARC buffer, possibly compressing and padding it.
 * The data to be transformed must be prefilled in l1hdr.b_tmp_cdata and
 * its size in l2hdr->b_asize. This routine tries to compress the data
 * and, depending on the result, there are four possible outcomes:
 * *) The buffer was incompressible and its size was already ashift
 *    aligned. The original hdr contents are left untouched except for
 *    b_tmp_cdata, which is reset to NULL. The caller must keep a
 *    pointer to the original data.
 * *) The buffer was incompressible, but its size was not ashift
 *    aligned. b_tmp_cdata was replaced with a temporary data buffer
 *    which holds a padded (aligned) copy of the data. Once writing is
 *    done, invoke l2arc_release_cdata_buf on this hdr to free the
 *    temporary buffer.
 * *) The buffer was all-zeros, so there is no need to write it to an L2
 *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize
 *    is set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
 * *) Compression succeeded and b_tmp_cdata was replaced with a
 *    temporary data buffer which holds the compressed data to be
 *    written, and b_asize tells us how much data there is. b_compress
 *    is set to the appropriate compression algorithm. Once writing is
 *    done, invoke l2arc_release_cdata_buf on this hdr to free this
 *    temporary buffer.
 *
 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
 * buffer was incompressible).
 */
static boolean_t
l2arc_transform_buf(arc_buf_hdr_t *hdr, boolean_t compress)
{
	void *cdata;
	size_t align, asize, csize, len, rounded;

	ASSERT(HDR_HAS_L2HDR(hdr));
	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;

	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT3S(l2hdr->b_compress, ==, ZIO_COMPRESS_OFF);
	ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);

	len = l2hdr->b_asize;
	align = (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift;
	asize = P2ROUNDUP(len, align);
	cdata = zio_data_buf_alloc(asize);
	ASSERT3P(cdata, !=, NULL);
	if (compress)
		csize = zio_compress_data(ZIO_COMPRESS_LZ4,
		    hdr->b_l1hdr.b_tmp_cdata, cdata, len);
	else
		csize = len;

	if (csize == 0) {
		/* zero block, indicate that there's nothing to write */
		zio_data_buf_free(cdata, asize);
		l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
		l2hdr->b_asize = 0;
		hdr->b_l1hdr.b_tmp_cdata = NULL;
		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
		return (B_TRUE);
	}

	rounded = P2ROUNDUP(csize, align);
	ASSERT3U(rounded, <=, asize);
	if (rounded < len) {
		/*
		 * Compression succeeded, we'll keep the cdata around for
		 * writing and release it afterwards.
		 */
		if (rounded > csize) {
			bzero((char *)cdata + csize, rounded - csize);
			csize = rounded;
		}
		l2hdr->b_compress = ZIO_COMPRESS_LZ4;
		l2hdr->b_asize = csize;
		hdr->b_l1hdr.b_tmp_cdata = cdata;
		ARCSTAT_BUMP(arcstat_l2_compress_successes);
		return (B_TRUE);
	} else {
		/*
		 * Compression did not save space.
		 */
		if (P2PHASE(len, align) != 0) {
			/*
			 * Use the compression buffer for a copy of the
			 * data padded to the proper size. The compression
			 * algorithm remains set to ZIO_COMPRESS_OFF.
			 */
			ASSERT3U(len, <, asize);
			bcopy(hdr->b_l1hdr.b_tmp_cdata, cdata, len);
			bzero((char *)cdata + len, asize - len);
			l2hdr->b_asize = asize;
			hdr->b_l1hdr.b_tmp_cdata = cdata;
			ARCSTAT_BUMP(arcstat_l2_padding_needed);
		} else {
			ASSERT3U(len, ==, asize);
			/*
			 * The original buffer is good as is; release the
			 * compressed buffer. The header is left unmodified
			 * except for b_tmp_cdata, which is reset to NULL.
			 */
			zio_data_buf_free(cdata, asize);
			hdr->b_l1hdr.b_tmp_cdata = NULL;
		}
		if (compress)
			ARCSTAT_BUMP(arcstat_l2_compress_failures);
		return (B_FALSE);
	}
}
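
/*
 * Worked example for the alignment math above (illustrative, with
 * assumed numbers): on a vdev with vdev_ashift == 12 (4KB sectors),
 * align == 4096. A len of 5000 bytes allocates asize =
 * P2ROUNDUP(5000, 4096) == 8192. If LZ4 shrinks the data to
 * csize == 3000, then rounded = P2ROUNDUP(3000, 4096) == 4096 < len,
 * so the zero-padded compressed copy is kept and b_asize becomes 4096.
 * Had csize been 4500, rounded == 8192 would not beat len and the
 * uncompressed (padded) path would be taken instead.
 */
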
/*
 * Decompresses a zio read back from an l2arc device. On success, the
 * underlying zio's io_data buffer is overwritten by the uncompressed
 * version. On decompression error (corrupt compressed stream), the
 * zio->io_error value is set to signal an I/O error.
 *
 * Please note that the compressed data stream is not checksummed, so
 * if the underlying device is experiencing data corruption, we may feed
 * corrupt data to the decompressor, so the decompressor needs to be
 * able to handle this situation (LZ4 does).
 */
static void
l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
{
	ASSERT(L2ARC_IS_VALID_COMPRESS(c));

	if (zio->io_error != 0) {
		/*
		 * An I/O error has occurred, just restore the original
		 * I/O size in preparation for a main pool read.
		 */
		zio->io_orig_size = zio->io_size = hdr->b_size;
		return;
	}

	if (c == ZIO_COMPRESS_EMPTY) {
		/*
		 * An empty buffer results in a null zio, which means we
		 * need to fill its io_data after we're done restoring the
		 * buffer's contents.
		 */
		ASSERT(hdr->b_l1hdr.b_buf != NULL);
		bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
		zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
	} else {
		ASSERT(zio->io_data != NULL);
		/*
		 * We copy the compressed data from the start of the arc
		 * buffer (the zio_read will have pulled in only what we
		 * need, the rest is garbage which we will overwrite at
		 * decompression) and then decompress back to the ARC
		 * data buffer. This way we can minimize copying by simply
		 * decompressing back over the original compressed data
		 * (rather than decompressing to an aux buffer and then
		 * copying back the uncompressed buffer, which is likely
		 * to be much larger).
		 */
		uint64_t csize;
		void *cdata;

		csize = zio->io_size;
		cdata = zio_data_buf_alloc(csize);
		bcopy(zio->io_data, cdata, csize);
		if (zio_decompress_data(c, cdata, zio->io_data, csize,
		    hdr->b_size) != 0)
			zio->io_error = EIO;
		zio_data_buf_free(cdata, csize);
	}

	/* Restore the expected uncompressed IO size. */
	zio->io_orig_size = zio->io_size = hdr->b_size;
}
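
/*
 * Descriptive note on the in-place trick above: zio_decompress_data()
 * reads only from the private cdata copy while writing up to
 * hdr->b_size bytes over zio->io_data, so the decompressor never
 * consumes bytes it has already overwritten.
 */
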
/*
 * Releases the temporary b_tmp_cdata buffer in an l2arc header
 * structure. This buffer holds compressed or padded data while the
 * entry is being written to an L2ARC device; once the write completes,
 * the buffer can be freed.
 */
static void
l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
{
	size_t align, asize, len;
	enum zio_compress comp = hdr->b_l2hdr.b_compress;

	ASSERT(HDR_HAS_L2HDR(hdr));
	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp));

	if (hdr->b_l1hdr.b_tmp_cdata != NULL) {
		ASSERT(comp != ZIO_COMPRESS_EMPTY);
		len = hdr->b_size;
		align = (size_t)1 <<
		    hdr->b_l2hdr.b_dev->l2ad_vdev->vdev_ashift;
		asize = P2ROUNDUP(len, align);
		zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata, asize);
		hdr->b_l1hdr.b_tmp_cdata = NULL;
	} else {
		ASSERT(comp == ZIO_COMPRESS_OFF || comp == ZIO_COMPRESS_EMPTY);
	}
}
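
/*
 * Illustrative sketch (not part of the original source): the caches
 * behind zio_data_buf_alloc()/zio_data_buf_free() are keyed by size,
 * so the free above must recompute exactly the rounded size that
 * l2arc_transform_buf() allocated. A hypothetical helper that both
 * call sites could share:
 */
static size_t
l2arc_tmp_cdata_asize(arc_buf_hdr_t *hdr)
{
	/* b_asize started out equal to hdr->b_size when it was stashed. */
	size_t align =
	    (size_t)1 << hdr->b_l2hdr.b_dev->l2ad_vdev->vdev_ashift;

	return (P2ROUNDUP(hdr->b_size, align));
}
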
/*
 * This thread feeds the L2ARC at regular intervals. This is the beating
 * heart of the L2ARC.
 */
static void
l2arc_feed_thread(void *dummy __unused)
{
	callb_cpr_t cpr;
	l2arc_dev_t *dev;
	spa_t *spa;
	uint64_t size, wrote;
	clock_t begin, next = ddi_get_lbolt();
	boolean_t headroom_boost = B_FALSE;

	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&l2arc_feed_thr_lock);

	while (l2arc_thread_exit == 0) {
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
		    next - ddi_get_lbolt());
		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
		next = ddi_get_lbolt() + hz;

		/*
		 * Quick check for L2ARC devices.
		 */
		mutex_enter(&l2arc_dev_mtx);
		if (l2arc_ndev == 0) {
			mutex_exit(&l2arc_dev_mtx);
			continue;
		}
		mutex_exit(&l2arc_dev_mtx);
		begin = ddi_get_lbolt();

		/*
		 * This selects the next l2arc device to write to, and in
		 * doing so the next spa to feed from: dev->l2ad_spa. This
		 * will return NULL if there are now no l2arc devices or if
		 * they are all faulted.
		 *
		 * If a device is returned, its spa's config lock is also
		 * held to prevent device removal. l2arc_dev_get_next()
		 * will grab and release l2arc_dev_mtx.
		 */
		if ((dev = l2arc_dev_get_next()) == NULL)
			continue;

		spa = dev->l2ad_spa;
		ASSERT(spa != NULL);

		/*
		 * If the pool is read-only then force the feed thread to
		 * sleep a little longer.
		 */
		if (!spa_writeable(spa)) {
			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		/*
		 * Avoid contributing to memory pressure.
		 */
		if (arc_reclaim_needed()) {
			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		ARCSTAT_BUMP(arcstat_l2_feeds);

		size = l2arc_write_size();

		/*
		 * Evict L2ARC buffers that will be overwritten.
		 */
		l2arc_evict(dev, size, B_FALSE);

		/*
		 * Write ARC buffers.
		 */
		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);

		/*
		 * Calculate interval between writes.
		 */
		next = l2arc_write_interval(begin, size, wrote);
		spa_config_exit(spa, SCL_L2ARC, dev);
	}

	l2arc_thread_exit = 0;
	cv_broadcast(&l2arc_feed_thr_cv);
	CALLB_CPR_EXIT(&cpr);	/* drops l2arc_feed_thr_lock */
	thread_exit();
}
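
/*
 * Descriptive note: each pass of the feed thread sleeps until "next",
 * evicts ahead of the write hand, writes at most l2arc_write_size()
 * bytes, and then lets l2arc_write_interval() pick the next deadline
 * from how long the pass took and how much was actually written. A
 * read-only pool is explicitly backed off to a 5 * l2arc_feed_secs
 * sleep, and low-memory conditions skip the pass entirely.
 */
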
boolean_t
l2arc_vdev_present(vdev_t *vd)
{
	l2arc_dev_t *dev;

	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev != NULL;
	    dev = list_next(l2arc_dev_list, dev)) {
		if (dev->l2ad_vdev == vd)
			break;
	}
	mutex_exit(&l2arc_dev_mtx);

	return (dev != NULL);
}

/*
 * Add a vdev for use by the L2ARC. By this point the spa has already
 * validated the vdev and opened it.
 */
void
l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{
	l2arc_dev_t *adddev;

	ASSERT(!l2arc_vdev_present(vd));

	vdev_ashift_optimize(vd);

	/*
	 * Create a new l2arc device entry.
	 */
	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
	adddev->l2ad_spa = spa;
	adddev->l2ad_vdev = vd;
	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
	adddev->l2ad_hand = adddev->l2ad_start;
	adddev->l2ad_first = B_TRUE;
	adddev->l2ad_writing = B_FALSE;

	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
	/*
	 * This is a list of all ARC buffers that are still valid on the
	 * device.
	 */
	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));

	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
	refcount_create(&adddev->l2ad_alloc);

	/*
	 * Add device to global list.
	 */
	mutex_enter(&l2arc_dev_mtx);
	list_insert_head(l2arc_dev_list, adddev);
	atomic_inc_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);
}
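
/*
 * Descriptive note: l2ad_start is offset by VDEV_LABEL_START_SIZE so
 * the write hand never overwrites the vdev labels at the front of the
 * device. The usable region [l2ad_start, l2ad_end) is sized from
 * vdev_get_min_asize(vd), and that same span is what the
 * vdev_space_update() call above reports as available.
 */
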
/*
 * Remove a vdev from the L2ARC.
 */
void
l2arc_remove_vdev(vdev_t *vd)
{
	l2arc_dev_t *dev, *nextdev, *remdev = NULL;

	/*
	 * Find the device by vdev.
	 */
	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
		nextdev = list_next(l2arc_dev_list, dev);
		if (vd == dev->l2ad_vdev) {
			remdev = dev;
			break;
		}
	}
	ASSERT(remdev != NULL);

	/*
	 * Remove device from global list.
	 */
	list_remove(l2arc_dev_list, remdev);
	l2arc_dev_last = NULL;		/* may have been invalidated */
	atomic_dec_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);

	/*
	 * Clear all buflists and ARC references. L2ARC device flush.
	 */
	l2arc_evict(remdev, 0, B_TRUE);
	list_destroy(&remdev->l2ad_buflist);
	mutex_destroy(&remdev->l2ad_mtx);
	refcount_destroy(&remdev->l2ad_alloc);
	kmem_free(remdev, sizeof (l2arc_dev_t));
}

void
l2arc_init(void)
{
	l2arc_thread_exit = 0;
	l2arc_ndev = 0;
	l2arc_writes_sent = 0;
	l2arc_writes_done = 0;

	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);

	l2arc_dev_list = &L2ARC_dev_list;
	l2arc_free_on_write = &L2ARC_free_on_write;
	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
	    offsetof(l2arc_dev_t, l2ad_node));
	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
	    offsetof(l2arc_data_free_t, l2df_list_node));
}

void
l2arc_fini(void)
{
	/*
	 * This is called from dmu_fini(), which is called from spa_fini().
	 * Because of this, we can assume that all l2arc devices have
	 * already been removed when the pools themselves were removed.
	 */

	l2arc_do_free_on_write();

	mutex_destroy(&l2arc_feed_thr_lock);
	cv_destroy(&l2arc_feed_thr_cv);
	mutex_destroy(&l2arc_dev_mtx);
	mutex_destroy(&l2arc_free_on_write_mtx);

	list_destroy(l2arc_dev_list);
	list_destroy(l2arc_free_on_write);
}

void
l2arc_start(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);
}

void
l2arc_stop(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	mutex_enter(&l2arc_feed_thr_lock);
	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
	l2arc_thread_exit = 1;
	while (l2arc_thread_exit != 0)
		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
	mutex_exit(&l2arc_feed_thr_lock);
}
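
/*
 * Descriptive note on lifecycle ordering, as implied by the routines
 * above: l2arc_init() creates the global locks and lists, l2arc_start()
 * spawns the feed thread (only when the pool mode allows writes),
 * l2arc_stop() performs the signal-and-wait handshake to join that
 * thread, and l2arc_fini() drains free-on-write and tears the globals
 * down again.
 */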