arc.c revision 286762
1168404Spjd/* 2168404Spjd * CDDL HEADER START 3168404Spjd * 4168404Spjd * The contents of this file are subject to the terms of the 5168404Spjd * Common Development and Distribution License (the "License"). 6168404Spjd * You may not use this file except in compliance with the License. 7168404Spjd * 8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9168404Spjd * or http://www.opensolaris.org/os/licensing. 10168404Spjd * See the License for the specific language governing permissions 11168404Spjd * and limitations under the License. 12168404Spjd * 13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each 14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15168404Spjd * If applicable, add the following below this CDDL HEADER, with the 16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying 17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner] 18168404Spjd * 19168404Spjd * CDDL HEADER END 20168404Spjd */ 21168404Spjd/* 22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23277826Sdelphij * Copyright (c) 2012, Joyent, Inc. All rights reserved. 24268123Sdelphij * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 25260835Sdelphij * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. 26268085Sdelphij * Copyright 2014 Nexenta Systems, Inc. All rights reserved. 27168404Spjd */ 28168404Spjd 29168404Spjd/* 30168404Spjd * DVA-based Adjustable Replacement Cache 31168404Spjd * 32168404Spjd * While much of the theory of operation used here is 33168404Spjd * based on the self-tuning, low overhead replacement cache 34168404Spjd * presented by Megiddo and Modha at FAST 2003, there are some 35168404Spjd * significant differences: 36168404Spjd * 37168404Spjd * 1. The Megiddo and Modha model assumes any page is evictable. 38168404Spjd * Pages in its cache cannot be "locked" into memory. 
This makes 39168404Spjd * the eviction algorithm simple: evict the last page in the list. 40168404Spjd * This also make the performance characteristics easy to reason 41168404Spjd * about. Our cache is not so simple. At any given moment, some 42168404Spjd * subset of the blocks in the cache are un-evictable because we 43168404Spjd * have handed out a reference to them. Blocks are only evictable 44168404Spjd * when there are no external references active. This makes 45168404Spjd * eviction far more problematic: we choose to evict the evictable 46168404Spjd * blocks that are the "lowest" in the list. 47168404Spjd * 48168404Spjd * There are times when it is not possible to evict the requested 49168404Spjd * space. In these circumstances we are unable to adjust the cache 50168404Spjd * size. To prevent the cache growing unbounded at these times we 51185029Spjd * implement a "cache throttle" that slows the flow of new data 52185029Spjd * into the cache until we can make space available. 53168404Spjd * 54168404Spjd * 2. The Megiddo and Modha model assumes a fixed cache size. 55168404Spjd * Pages are evicted when the cache is full and there is a cache 56168404Spjd * miss. Our model has a variable sized cache. It grows with 57185029Spjd * high use, but also tries to react to memory pressure from the 58168404Spjd * operating system: decreasing its size when system memory is 59168404Spjd * tight. 60168404Spjd * 61168404Spjd * 3. The Megiddo and Modha model assumes a fixed page size. All 62251631Sdelphij * elements of the cache are therefore exactly the same size. So 63168404Spjd * when adjusting the cache size following a cache miss, its simply 64168404Spjd * a matter of choosing a single page to evict. In our model, we 65168404Spjd * have variable sized cache blocks (rangeing from 512 bytes to 66251631Sdelphij * 128K bytes). 
We therefore choose a set of blocks to evict to make 67168404Spjd * space for a cache miss that approximates as closely as possible 68168404Spjd * the space used by the new block. 69168404Spjd * 70168404Spjd * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" 71168404Spjd * by N. Megiddo & D. Modha, FAST 2003 72168404Spjd */ 73168404Spjd 74168404Spjd/* 75168404Spjd * The locking model: 76168404Spjd * 77168404Spjd * A new reference to a cache buffer can be obtained in two 78168404Spjd * ways: 1) via a hash table lookup using the DVA as a key, 79185029Spjd * or 2) via one of the ARC lists. The arc_read() interface 80168404Spjd * uses method 1, while the internal arc algorithms for 81251631Sdelphij * adjusting the cache use method 2. We therefore provide two 82168404Spjd * types of locks: 1) the hash table lock array, and 2) the 83168404Spjd * arc list locks. 84168404Spjd * 85168404Spjd * Buffers do not have their own mutexs, rather they rely on the 86168404Spjd * hash table mutexs for the bulk of their protection (i.e. most 87168404Spjd * fields in the arc_buf_hdr_t are protected by these mutexs). 88168404Spjd * 89168404Spjd * buf_hash_find() returns the appropriate mutex (held) when it 90168404Spjd * locates the requested buffer in the hash table. It returns 91168404Spjd * NULL for the mutex if the buffer was not in the table. 92168404Spjd * 93168404Spjd * buf_hash_remove() expects the appropriate hash mutex to be 94168404Spjd * already held before it is invoked. 95168404Spjd * 96168404Spjd * Each arc state also has a mutex which is used to protect the 97168404Spjd * buffer list associated with the state. When attempting to 98168404Spjd * obtain a hash table lock while holding an arc list lock you 99168404Spjd * must use: mutex_tryenter() to avoid deadlock. Also note that 100168404Spjd * the active state mutex must be held before the ghost state mutex. 101168404Spjd * 102168404Spjd * Arc buffers may have an associated eviction callback function. 
103168404Spjd * This function will be invoked prior to removing the buffer (e.g. 104168404Spjd * in arc_do_user_evicts()). Note however that the data associated 105168404Spjd * with the buffer may be evicted prior to the callback. The callback 106168404Spjd * must be made with *no locks held* (to prevent deadlock). Additionally, 107168404Spjd * the users of callbacks must ensure that their private data is 108268858Sdelphij * protected from simultaneous callbacks from arc_clear_callback() 109168404Spjd * and arc_do_user_evicts(). 110168404Spjd * 111168404Spjd * Note that the majority of the performance stats are manipulated 112168404Spjd * with atomic operations. 113185029Spjd * 114286570Smav * The L2ARC uses the l2ad_mtx on each vdev for the following: 115185029Spjd * 116185029Spjd * - L2ARC buflist creation 117185029Spjd * - L2ARC buflist eviction 118185029Spjd * - L2ARC write completion, which walks L2ARC buflists 119185029Spjd * - ARC header destruction, as it removes from L2ARC buflists 120185029Spjd * - ARC header release, as it removes from L2ARC buflists 121168404Spjd */ 122168404Spjd 123168404Spjd#include <sys/spa.h> 124168404Spjd#include <sys/zio.h> 125251478Sdelphij#include <sys/zio_compress.h> 126168404Spjd#include <sys/zfs_context.h> 127168404Spjd#include <sys/arc.h> 128168404Spjd#include <sys/refcount.h> 129185029Spjd#include <sys/vdev.h> 130219089Spjd#include <sys/vdev_impl.h> 131258632Savg#include <sys/dsl_pool.h> 132168404Spjd#ifdef _KERNEL 133168404Spjd#include <sys/dnlc.h> 134168404Spjd#endif 135168404Spjd#include <sys/callb.h> 136168404Spjd#include <sys/kstat.h> 137248572Ssmh#include <sys/trim_map.h> 138219089Spjd#include <zfs_fletcher.h> 139168404Spjd#include <sys/sdt.h> 140168404Spjd 141191902Skmacy#include <vm/vm_pageout.h> 142272483Ssmh#include <machine/vmparam.h> 143191902Skmacy 144240133Smm#ifdef illumos 145240133Smm#ifndef _KERNEL 146240133Smm/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ 147240133Smmboolean_t 
arc_watch = B_FALSE; 148240133Smmint arc_procfd; 149240133Smm#endif 150240133Smm#endif /* illumos */ 151240133Smm 152168404Spjdstatic kmutex_t arc_reclaim_thr_lock; 153168404Spjdstatic kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ 154168404Spjdstatic uint8_t arc_thread_exit; 155168404Spjd 156286625Smavuint_t arc_reduce_dnlc_percent = 3; 157168404Spjd 158258632Savg/* 159258632Savg * The number of iterations through arc_evict_*() before we 160258632Savg * drop & reacquire the lock. 161258632Savg */ 162258632Savgint arc_evict_iterations = 100; 163258632Savg 164168404Spjd/* number of seconds before growing cache again */ 165168404Spjdstatic int arc_grow_retry = 60; 166168404Spjd 167208373Smm/* shift of arc_c for calculating both min and max arc_p */ 168208373Smmstatic int arc_p_min_shift = 4; 169208373Smm 170208373Smm/* log2(fraction of arc to reclaim) */ 171286625Smavstatic int arc_shrink_shift = 7; 172208373Smm 173168404Spjd/* 174286625Smav * log2(fraction of ARC which must be free to allow growing). 175286625Smav * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, 176286625Smav * when reading a new block into the ARC, we will evict an equal-sized block 177286625Smav * from the ARC. 178286625Smav * 179286625Smav * This must be less than arc_shrink_shift, so that when we shrink the ARC, 180286625Smav * we will still not allow it to grow. 181286625Smav */ 182286625Smavint arc_no_grow_shift = 5; 183286625Smav 184286625Smav 185286625Smav/* 186168404Spjd * minimum lifespan of a prefetch block in clock ticks 187168404Spjd * (initialized in arc_init()) 188168404Spjd */ 189168404Spjdstatic int arc_min_prefetch_lifespan; 190168404Spjd 191258632Savg/* 192258632Savg * If this percent of memory is free, don't throttle. 
193258632Savg */ 194258632Savgint arc_lotsfree_percent = 10; 195258632Savg 196208373Smmstatic int arc_dead; 197194043Skmacyextern int zfs_prefetch_disable; 198168404Spjd 199168404Spjd/* 200185029Spjd * The arc has filled available memory and has now warmed up. 201185029Spjd */ 202185029Spjdstatic boolean_t arc_warm; 203185029Spjd 204286762Smav/* 205286762Smav * These tunables are for performance analysis. 206286762Smav */ 207185029Spjduint64_t zfs_arc_max; 208185029Spjduint64_t zfs_arc_min; 209185029Spjduint64_t zfs_arc_meta_limit = 0; 210275780Sdelphijuint64_t zfs_arc_meta_min = 0; 211208373Smmint zfs_arc_grow_retry = 0; 212208373Smmint zfs_arc_shrink_shift = 0; 213208373Smmint zfs_arc_p_min_shift = 0; 214242845Sdelphijint zfs_disable_dup_eviction = 0; 215269230Sdelphijuint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ 216272483Ssmhu_int zfs_arc_free_target = 0; 217185029Spjd 218270759Ssmhstatic int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS); 219275748Sdelphijstatic int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS); 220270759Ssmh 221270759Ssmh#ifdef _KERNEL 222270759Ssmhstatic void 223270759Ssmharc_free_target_init(void *unused __unused) 224270759Ssmh{ 225270759Ssmh 226272483Ssmh zfs_arc_free_target = vm_pageout_wakeup_thresh; 227270759Ssmh} 228270759SsmhSYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, 229270759Ssmh arc_free_target_init, NULL); 230270759Ssmh 231185029SpjdTUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); 232275780SdelphijTUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min); 233273026SdelphijTUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift); 234168473SpjdSYSCTL_DECL(_vfs_zfs); 235217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, 236168473Spjd "Maximum ARC size"); 237217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0, 238168473Spjd "Minimum ARC size"); 239269230SdelphijSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, 
CTLFLAG_RDTUN, 240269230Sdelphij &zfs_arc_average_blocksize, 0, 241269230Sdelphij "ARC average blocksize"); 242273026SdelphijSYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW, 243273026Sdelphij &arc_shrink_shift, 0, 244273026Sdelphij "log2(fraction of arc to reclaim)"); 245273026Sdelphij 246270759Ssmh/* 247270759Ssmh * We don't have a tunable for arc_free_target due to the dependency on 248270759Ssmh * pagedaemon initialisation. 249270759Ssmh */ 250270759SsmhSYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, 251270759Ssmh CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int), 252270759Ssmh sysctl_vfs_zfs_arc_free_target, "IU", 253270759Ssmh "Desired number of free pages below which ARC triggers reclaim"); 254168404Spjd 255270759Ssmhstatic int 256270759Ssmhsysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS) 257270759Ssmh{ 258270759Ssmh u_int val; 259270759Ssmh int err; 260270759Ssmh 261270759Ssmh val = zfs_arc_free_target; 262270759Ssmh err = sysctl_handle_int(oidp, &val, 0, req); 263270759Ssmh if (err != 0 || req->newptr == NULL) 264270759Ssmh return (err); 265270759Ssmh 266272483Ssmh if (val < minfree) 267270759Ssmh return (EINVAL); 268272483Ssmh if (val > vm_cnt.v_page_count) 269270759Ssmh return (EINVAL); 270270759Ssmh 271270759Ssmh zfs_arc_free_target = val; 272270759Ssmh 273270759Ssmh return (0); 274270759Ssmh} 275275748Sdelphij 276275748Sdelphij/* 277275748Sdelphij * Must be declared here, before the definition of corresponding kstat 278275748Sdelphij * macro which uses the same names will confuse the compiler. 
279275748Sdelphij */ 280275748SdelphijSYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit, 281275748Sdelphij CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), 282275748Sdelphij sysctl_vfs_zfs_arc_meta_limit, "QU", 283275748Sdelphij "ARC metadata limit"); 284272483Ssmh#endif 285270759Ssmh 286168404Spjd/* 287185029Spjd * Note that buffers can be in one of 6 states: 288168404Spjd * ARC_anon - anonymous (discussed below) 289168404Spjd * ARC_mru - recently used, currently cached 290168404Spjd * ARC_mru_ghost - recentely used, no longer in cache 291168404Spjd * ARC_mfu - frequently used, currently cached 292168404Spjd * ARC_mfu_ghost - frequently used, no longer in cache 293185029Spjd * ARC_l2c_only - exists in L2ARC but not other states 294185029Spjd * When there are no active references to the buffer, they are 295185029Spjd * are linked onto a list in one of these arc states. These are 296185029Spjd * the only buffers that can be evicted or deleted. Within each 297185029Spjd * state there are multiple lists, one for meta-data and one for 298185029Spjd * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, 299185029Spjd * etc.) is tracked separately so that it can be managed more 300185029Spjd * explicitly: favored over data, limited explicitly. 301168404Spjd * 302168404Spjd * Anonymous buffers are buffers that are not associated with 303168404Spjd * a DVA. These are buffers that hold dirty block copies 304168404Spjd * before they are written to stable storage. By definition, 305168404Spjd * they are "ref'd" and are considered part of arc_mru 306168404Spjd * that cannot be freed. Generally, they will aquire a DVA 307168404Spjd * as they are written and migrate onto the arc_mru list. 308185029Spjd * 309185029Spjd * The ARC_l2c_only state is for buffers that are in the second 310185029Spjd * level ARC but no longer in any of the ARC_m* lists. 
The second 311185029Spjd * level ARC itself may also contain buffers that are in any of 312185029Spjd * the ARC_m* states - meaning that a buffer can exist in two 313185029Spjd * places. The reason for the ARC_l2c_only state is to keep the 314185029Spjd * buffer header in the hash table, so that reads that hit the 315185029Spjd * second level ARC benefit from these fast lookups. 316168404Spjd */ 317168404Spjd 318168404Spjdtypedef struct arc_state { 319286762Smav list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ 320185029Spjd uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ 321185029Spjd uint64_t arcs_size; /* total amount of data in this state */ 322286762Smav kmutex_t arcs_mtx; 323168404Spjd} arc_state_t; 324168404Spjd 325185029Spjd/* The 6 states: */ 326168404Spjdstatic arc_state_t ARC_anon; 327168404Spjdstatic arc_state_t ARC_mru; 328168404Spjdstatic arc_state_t ARC_mru_ghost; 329168404Spjdstatic arc_state_t ARC_mfu; 330168404Spjdstatic arc_state_t ARC_mfu_ghost; 331185029Spjdstatic arc_state_t ARC_l2c_only; 332168404Spjd 333168404Spjdtypedef struct arc_stats { 334168404Spjd kstat_named_t arcstat_hits; 335168404Spjd kstat_named_t arcstat_misses; 336168404Spjd kstat_named_t arcstat_demand_data_hits; 337168404Spjd kstat_named_t arcstat_demand_data_misses; 338168404Spjd kstat_named_t arcstat_demand_metadata_hits; 339168404Spjd kstat_named_t arcstat_demand_metadata_misses; 340168404Spjd kstat_named_t arcstat_prefetch_data_hits; 341168404Spjd kstat_named_t arcstat_prefetch_data_misses; 342168404Spjd kstat_named_t arcstat_prefetch_metadata_hits; 343168404Spjd kstat_named_t arcstat_prefetch_metadata_misses; 344168404Spjd kstat_named_t arcstat_mru_hits; 345168404Spjd kstat_named_t arcstat_mru_ghost_hits; 346168404Spjd kstat_named_t arcstat_mfu_hits; 347168404Spjd kstat_named_t arcstat_mfu_ghost_hits; 348205231Skmacy kstat_named_t arcstat_allocated; 349168404Spjd kstat_named_t arcstat_deleted; 350168404Spjd kstat_named_t 
arcstat_recycle_miss; 351251629Sdelphij /* 352251629Sdelphij * Number of buffers that could not be evicted because the hash lock 353251629Sdelphij * was held by another thread. The lock may not necessarily be held 354251629Sdelphij * by something using the same buffer, since hash locks are shared 355251629Sdelphij * by multiple buffers. 356251629Sdelphij */ 357168404Spjd kstat_named_t arcstat_mutex_miss; 358251629Sdelphij /* 359251629Sdelphij * Number of buffers skipped because they have I/O in progress, are 360251629Sdelphij * indrect prefetch buffers that have not lived long enough, or are 361251629Sdelphij * not from the spa we're trying to evict from. 362251629Sdelphij */ 363168404Spjd kstat_named_t arcstat_evict_skip; 364208373Smm kstat_named_t arcstat_evict_l2_cached; 365208373Smm kstat_named_t arcstat_evict_l2_eligible; 366208373Smm kstat_named_t arcstat_evict_l2_ineligible; 367168404Spjd kstat_named_t arcstat_hash_elements; 368168404Spjd kstat_named_t arcstat_hash_elements_max; 369168404Spjd kstat_named_t arcstat_hash_collisions; 370168404Spjd kstat_named_t arcstat_hash_chains; 371168404Spjd kstat_named_t arcstat_hash_chain_max; 372168404Spjd kstat_named_t arcstat_p; 373168404Spjd kstat_named_t arcstat_c; 374168404Spjd kstat_named_t arcstat_c_min; 375168404Spjd kstat_named_t arcstat_c_max; 376168404Spjd kstat_named_t arcstat_size; 377286574Smav /* 378286574Smav * Number of bytes consumed by internal ARC structures necessary 379286574Smav * for tracking purposes; these structures are not actually 380286574Smav * backed by ARC buffers. This includes arc_buf_hdr_t structures 381286574Smav * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only 382286574Smav * caches), and arc_buf_t structures (allocated via arc_buf_t 383286574Smav * cache). 384286574Smav */ 385185029Spjd kstat_named_t arcstat_hdr_size; 386286574Smav /* 387286574Smav * Number of bytes consumed by ARC buffers of type equal to 388286574Smav * ARC_BUFC_DATA. 
This is generally consumed by buffers backing 389286574Smav * on disk user data (e.g. plain file contents). 390286574Smav */ 391208373Smm kstat_named_t arcstat_data_size; 392286574Smav /* 393286574Smav * Number of bytes consumed by ARC buffers of type equal to 394286574Smav * ARC_BUFC_METADATA. This is generally consumed by buffers 395286574Smav * backing on disk data that is used for internal ZFS 396286574Smav * structures (e.g. ZAP, dnode, indirect blocks, etc). 397286574Smav */ 398286574Smav kstat_named_t arcstat_metadata_size; 399286574Smav /* 400286574Smav * Number of bytes consumed by various buffers and structures 401286574Smav * not actually backed with ARC buffers. This includes bonus 402286574Smav * buffers (allocated directly via zio_buf_* functions), 403286574Smav * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t 404286574Smav * cache), and dnode_t structures (allocated via dnode_t cache). 405286574Smav */ 406208373Smm kstat_named_t arcstat_other_size; 407286574Smav /* 408286574Smav * Total number of bytes consumed by ARC buffers residing in the 409286574Smav * arc_anon state. This includes *all* buffers in the arc_anon 410286574Smav * state; e.g. data, metadata, evictable, and unevictable buffers 411286574Smav * are all included in this value. 412286574Smav */ 413286574Smav kstat_named_t arcstat_anon_size; 414286574Smav /* 415286574Smav * Number of bytes consumed by ARC buffers that meet the 416286574Smav * following criteria: backing buffers of type ARC_BUFC_DATA, 417286574Smav * residing in the arc_anon state, and are eligible for eviction 418286574Smav * (e.g. have no outstanding holds on the buffer). 419286574Smav */ 420286574Smav kstat_named_t arcstat_anon_evictable_data; 421286574Smav /* 422286574Smav * Number of bytes consumed by ARC buffers that meet the 423286574Smav * following criteria: backing buffers of type ARC_BUFC_METADATA, 424286574Smav * residing in the arc_anon state, and are eligible for eviction 425286574Smav * (e.g. 
have no outstanding holds on the buffer). 426286574Smav */ 427286574Smav kstat_named_t arcstat_anon_evictable_metadata; 428286574Smav /* 429286574Smav * Total number of bytes consumed by ARC buffers residing in the 430286574Smav * arc_mru state. This includes *all* buffers in the arc_mru 431286574Smav * state; e.g. data, metadata, evictable, and unevictable buffers 432286574Smav * are all included in this value. 433286574Smav */ 434286574Smav kstat_named_t arcstat_mru_size; 435286574Smav /* 436286574Smav * Number of bytes consumed by ARC buffers that meet the 437286574Smav * following criteria: backing buffers of type ARC_BUFC_DATA, 438286574Smav * residing in the arc_mru state, and are eligible for eviction 439286574Smav * (e.g. have no outstanding holds on the buffer). 440286574Smav */ 441286574Smav kstat_named_t arcstat_mru_evictable_data; 442286574Smav /* 443286574Smav * Number of bytes consumed by ARC buffers that meet the 444286574Smav * following criteria: backing buffers of type ARC_BUFC_METADATA, 445286574Smav * residing in the arc_mru state, and are eligible for eviction 446286574Smav * (e.g. have no outstanding holds on the buffer). 447286574Smav */ 448286574Smav kstat_named_t arcstat_mru_evictable_metadata; 449286574Smav /* 450286574Smav * Total number of bytes that *would have been* consumed by ARC 451286574Smav * buffers in the arc_mru_ghost state. The key thing to note 452286574Smav * here, is the fact that this size doesn't actually indicate 453286574Smav * RAM consumption. The ghost lists only consist of headers and 454286574Smav * don't actually have ARC buffers linked off of these headers. 455286574Smav * Thus, *if* the headers had associated ARC buffers, these 456286574Smav * buffers *would have* consumed this number of bytes. 
457286574Smav */ 458286574Smav kstat_named_t arcstat_mru_ghost_size; 459286574Smav /* 460286574Smav * Number of bytes that *would have been* consumed by ARC 461286574Smav * buffers that are eligible for eviction, of type 462286574Smav * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. 463286574Smav */ 464286574Smav kstat_named_t arcstat_mru_ghost_evictable_data; 465286574Smav /* 466286574Smav * Number of bytes that *would have been* consumed by ARC 467286574Smav * buffers that are eligible for eviction, of type 468286574Smav * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 469286574Smav */ 470286574Smav kstat_named_t arcstat_mru_ghost_evictable_metadata; 471286574Smav /* 472286574Smav * Total number of bytes consumed by ARC buffers residing in the 473286574Smav * arc_mfu state. This includes *all* buffers in the arc_mfu 474286574Smav * state; e.g. data, metadata, evictable, and unevictable buffers 475286574Smav * are all included in this value. 476286574Smav */ 477286574Smav kstat_named_t arcstat_mfu_size; 478286574Smav /* 479286574Smav * Number of bytes consumed by ARC buffers that are eligible for 480286574Smav * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu 481286574Smav * state. 482286574Smav */ 483286574Smav kstat_named_t arcstat_mfu_evictable_data; 484286574Smav /* 485286574Smav * Number of bytes consumed by ARC buffers that are eligible for 486286574Smav * eviction, of type ARC_BUFC_METADATA, and reside in the 487286574Smav * arc_mfu state. 488286574Smav */ 489286574Smav kstat_named_t arcstat_mfu_evictable_metadata; 490286574Smav /* 491286574Smav * Total number of bytes that *would have been* consumed by ARC 492286574Smav * buffers in the arc_mfu_ghost state. See the comment above 493286574Smav * arcstat_mru_ghost_size for more details. 
494286574Smav */ 495286574Smav kstat_named_t arcstat_mfu_ghost_size; 496286574Smav /* 497286574Smav * Number of bytes that *would have been* consumed by ARC 498286574Smav * buffers that are eligible for eviction, of type 499286574Smav * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. 500286574Smav */ 501286574Smav kstat_named_t arcstat_mfu_ghost_evictable_data; 502286574Smav /* 503286574Smav * Number of bytes that *would have been* consumed by ARC 504286574Smav * buffers that are eligible for eviction, of type 505286574Smav * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 506286574Smav */ 507286574Smav kstat_named_t arcstat_mfu_ghost_evictable_metadata; 508185029Spjd kstat_named_t arcstat_l2_hits; 509185029Spjd kstat_named_t arcstat_l2_misses; 510185029Spjd kstat_named_t arcstat_l2_feeds; 511185029Spjd kstat_named_t arcstat_l2_rw_clash; 512208373Smm kstat_named_t arcstat_l2_read_bytes; 513208373Smm kstat_named_t arcstat_l2_write_bytes; 514185029Spjd kstat_named_t arcstat_l2_writes_sent; 515185029Spjd kstat_named_t arcstat_l2_writes_done; 516185029Spjd kstat_named_t arcstat_l2_writes_error; 517185029Spjd kstat_named_t arcstat_l2_writes_hdr_miss; 518185029Spjd kstat_named_t arcstat_l2_evict_lock_retry; 519185029Spjd kstat_named_t arcstat_l2_evict_reading; 520286570Smav kstat_named_t arcstat_l2_evict_l1cached; 521185029Spjd kstat_named_t arcstat_l2_free_on_write; 522274172Savg kstat_named_t arcstat_l2_cdata_free_on_write; 523185029Spjd kstat_named_t arcstat_l2_abort_lowmem; 524185029Spjd kstat_named_t arcstat_l2_cksum_bad; 525185029Spjd kstat_named_t arcstat_l2_io_error; 526185029Spjd kstat_named_t arcstat_l2_size; 527251478Sdelphij kstat_named_t arcstat_l2_asize; 528185029Spjd kstat_named_t arcstat_l2_hdr_size; 529251478Sdelphij kstat_named_t arcstat_l2_compress_successes; 530251478Sdelphij kstat_named_t arcstat_l2_compress_zeros; 531251478Sdelphij kstat_named_t arcstat_l2_compress_failures; 532205231Skmacy kstat_named_t 
arcstat_l2_write_trylock_fail; 533205231Skmacy kstat_named_t arcstat_l2_write_passed_headroom; 534205231Skmacy kstat_named_t arcstat_l2_write_spa_mismatch; 535206796Spjd kstat_named_t arcstat_l2_write_in_l2; 536205231Skmacy kstat_named_t arcstat_l2_write_hdr_io_in_progress; 537205231Skmacy kstat_named_t arcstat_l2_write_not_cacheable; 538205231Skmacy kstat_named_t arcstat_l2_write_full; 539205231Skmacy kstat_named_t arcstat_l2_write_buffer_iter; 540205231Skmacy kstat_named_t arcstat_l2_write_pios; 541205231Skmacy kstat_named_t arcstat_l2_write_buffer_bytes_scanned; 542205231Skmacy kstat_named_t arcstat_l2_write_buffer_list_iter; 543205231Skmacy kstat_named_t arcstat_l2_write_buffer_list_null_iter; 544242845Sdelphij kstat_named_t arcstat_memory_throttle_count; 545242845Sdelphij kstat_named_t arcstat_duplicate_buffers; 546242845Sdelphij kstat_named_t arcstat_duplicate_buffers_size; 547242845Sdelphij kstat_named_t arcstat_duplicate_reads; 548275748Sdelphij kstat_named_t arcstat_meta_used; 549275748Sdelphij kstat_named_t arcstat_meta_limit; 550275748Sdelphij kstat_named_t arcstat_meta_max; 551275780Sdelphij kstat_named_t arcstat_meta_min; 552168404Spjd} arc_stats_t; 553168404Spjd 554168404Spjdstatic arc_stats_t arc_stats = { 555168404Spjd { "hits", KSTAT_DATA_UINT64 }, 556168404Spjd { "misses", KSTAT_DATA_UINT64 }, 557168404Spjd { "demand_data_hits", KSTAT_DATA_UINT64 }, 558168404Spjd { "demand_data_misses", KSTAT_DATA_UINT64 }, 559168404Spjd { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 560168404Spjd { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 561168404Spjd { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 562168404Spjd { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 563168404Spjd { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 564168404Spjd { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 565168404Spjd { "mru_hits", KSTAT_DATA_UINT64 }, 566168404Spjd { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 567168404Spjd { "mfu_hits", KSTAT_DATA_UINT64 }, 568168404Spjd { 
"mfu_ghost_hits", KSTAT_DATA_UINT64 }, 569205231Skmacy { "allocated", KSTAT_DATA_UINT64 }, 570168404Spjd { "deleted", KSTAT_DATA_UINT64 }, 571168404Spjd { "recycle_miss", KSTAT_DATA_UINT64 }, 572168404Spjd { "mutex_miss", KSTAT_DATA_UINT64 }, 573168404Spjd { "evict_skip", KSTAT_DATA_UINT64 }, 574208373Smm { "evict_l2_cached", KSTAT_DATA_UINT64 }, 575208373Smm { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 576208373Smm { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 577168404Spjd { "hash_elements", KSTAT_DATA_UINT64 }, 578168404Spjd { "hash_elements_max", KSTAT_DATA_UINT64 }, 579168404Spjd { "hash_collisions", KSTAT_DATA_UINT64 }, 580168404Spjd { "hash_chains", KSTAT_DATA_UINT64 }, 581168404Spjd { "hash_chain_max", KSTAT_DATA_UINT64 }, 582168404Spjd { "p", KSTAT_DATA_UINT64 }, 583168404Spjd { "c", KSTAT_DATA_UINT64 }, 584168404Spjd { "c_min", KSTAT_DATA_UINT64 }, 585168404Spjd { "c_max", KSTAT_DATA_UINT64 }, 586185029Spjd { "size", KSTAT_DATA_UINT64 }, 587185029Spjd { "hdr_size", KSTAT_DATA_UINT64 }, 588208373Smm { "data_size", KSTAT_DATA_UINT64 }, 589286574Smav { "metadata_size", KSTAT_DATA_UINT64 }, 590208373Smm { "other_size", KSTAT_DATA_UINT64 }, 591286574Smav { "anon_size", KSTAT_DATA_UINT64 }, 592286574Smav { "anon_evictable_data", KSTAT_DATA_UINT64 }, 593286574Smav { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, 594286574Smav { "mru_size", KSTAT_DATA_UINT64 }, 595286574Smav { "mru_evictable_data", KSTAT_DATA_UINT64 }, 596286574Smav { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, 597286574Smav { "mru_ghost_size", KSTAT_DATA_UINT64 }, 598286574Smav { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, 599286574Smav { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 600286574Smav { "mfu_size", KSTAT_DATA_UINT64 }, 601286574Smav { "mfu_evictable_data", KSTAT_DATA_UINT64 }, 602286574Smav { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, 603286574Smav { "mfu_ghost_size", KSTAT_DATA_UINT64 }, 604286574Smav { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, 
605286574Smav { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 606185029Spjd { "l2_hits", KSTAT_DATA_UINT64 }, 607185029Spjd { "l2_misses", KSTAT_DATA_UINT64 }, 608185029Spjd { "l2_feeds", KSTAT_DATA_UINT64 }, 609185029Spjd { "l2_rw_clash", KSTAT_DATA_UINT64 }, 610208373Smm { "l2_read_bytes", KSTAT_DATA_UINT64 }, 611208373Smm { "l2_write_bytes", KSTAT_DATA_UINT64 }, 612185029Spjd { "l2_writes_sent", KSTAT_DATA_UINT64 }, 613185029Spjd { "l2_writes_done", KSTAT_DATA_UINT64 }, 614185029Spjd { "l2_writes_error", KSTAT_DATA_UINT64 }, 615185029Spjd { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, 616185029Spjd { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 617185029Spjd { "l2_evict_reading", KSTAT_DATA_UINT64 }, 618286570Smav { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, 619185029Spjd { "l2_free_on_write", KSTAT_DATA_UINT64 }, 620274172Savg { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, 621185029Spjd { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 622185029Spjd { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 623185029Spjd { "l2_io_error", KSTAT_DATA_UINT64 }, 624185029Spjd { "l2_size", KSTAT_DATA_UINT64 }, 625251478Sdelphij { "l2_asize", KSTAT_DATA_UINT64 }, 626185029Spjd { "l2_hdr_size", KSTAT_DATA_UINT64 }, 627251478Sdelphij { "l2_compress_successes", KSTAT_DATA_UINT64 }, 628251478Sdelphij { "l2_compress_zeros", KSTAT_DATA_UINT64 }, 629251478Sdelphij { "l2_compress_failures", KSTAT_DATA_UINT64 }, 630206796Spjd { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, 631206796Spjd { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, 632206796Spjd { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, 633206796Spjd { "l2_write_in_l2", KSTAT_DATA_UINT64 }, 634206796Spjd { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, 635206796Spjd { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, 636206796Spjd { "l2_write_full", KSTAT_DATA_UINT64 }, 637206796Spjd { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, 638206796Spjd { "l2_write_pios", KSTAT_DATA_UINT64 }, 639206796Spjd { "l2_write_buffer_bytes_scanned", 
KSTAT_DATA_UINT64 }, 640206796Spjd { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, 641242845Sdelphij { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, 642242845Sdelphij { "memory_throttle_count", KSTAT_DATA_UINT64 }, 643242845Sdelphij { "duplicate_buffers", KSTAT_DATA_UINT64 }, 644242845Sdelphij { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, 645275748Sdelphij { "duplicate_reads", KSTAT_DATA_UINT64 }, 646275748Sdelphij { "arc_meta_used", KSTAT_DATA_UINT64 }, 647275748Sdelphij { "arc_meta_limit", KSTAT_DATA_UINT64 }, 648275780Sdelphij { "arc_meta_max", KSTAT_DATA_UINT64 }, 649275780Sdelphij { "arc_meta_min", KSTAT_DATA_UINT64 } 650168404Spjd}; 651168404Spjd 652168404Spjd#define ARCSTAT(stat) (arc_stats.stat.value.ui64) 653168404Spjd 654168404Spjd#define ARCSTAT_INCR(stat, val) \ 655251631Sdelphij atomic_add_64(&arc_stats.stat.value.ui64, (val)) 656168404Spjd 657206796Spjd#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 658168404Spjd#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 659168404Spjd 660168404Spjd#define ARCSTAT_MAX(stat, val) { \ 661168404Spjd uint64_t m; \ 662168404Spjd while ((val) > (m = arc_stats.stat.value.ui64) && \ 663168404Spjd (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 664168404Spjd continue; \ 665168404Spjd} 666168404Spjd 667168404Spjd#define ARCSTAT_MAXSTAT(stat) \ 668168404Spjd ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 669168404Spjd 670168404Spjd/* 671168404Spjd * We define a macro to allow ARC hits/misses to be easily broken down by 672168404Spjd * two separate conditions, giving a total of four different subtypes for 673168404Spjd * each of hits and misses (so eight statistics total). 
674168404Spjd */ 675168404Spjd#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 676168404Spjd if (cond1) { \ 677168404Spjd if (cond2) { \ 678168404Spjd ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 679168404Spjd } else { \ 680168404Spjd ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 681168404Spjd } \ 682168404Spjd } else { \ 683168404Spjd if (cond2) { \ 684168404Spjd ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 685168404Spjd } else { \ 686168404Spjd ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 687168404Spjd } \ 688168404Spjd } 689168404Spjd 690168404Spjdkstat_t *arc_ksp; 691206796Spjdstatic arc_state_t *arc_anon; 692168404Spjdstatic arc_state_t *arc_mru; 693168404Spjdstatic arc_state_t *arc_mru_ghost; 694168404Spjdstatic arc_state_t *arc_mfu; 695168404Spjdstatic arc_state_t *arc_mfu_ghost; 696185029Spjdstatic arc_state_t *arc_l2c_only; 697168404Spjd 698168404Spjd/* 699168404Spjd * There are several ARC variables that are critical to export as kstats -- 700168404Spjd * but we don't want to have to grovel around in the kstat whenever we wish to 701168404Spjd * manipulate them. For these variables, we therefore define them to be in 702168404Spjd * terms of the statistic variable. This assures that we are not introducing 703168404Spjd * the possibility of inconsistency by having shadow copies of the variables, 704168404Spjd * while still allowing the code to be readable. 
705168404Spjd */ 706168404Spjd#define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 707168404Spjd#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 708168404Spjd#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 709168404Spjd#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 710168404Spjd#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 711275748Sdelphij#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ 712275780Sdelphij#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ 713275748Sdelphij#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ 714275748Sdelphij#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ 715168404Spjd 716251478Sdelphij#define L2ARC_IS_VALID_COMPRESS(_c_) \ 717251478Sdelphij ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY) 718251478Sdelphij 719168404Spjdstatic int arc_no_grow; /* Don't try to grow cache size */ 720168404Spjdstatic uint64_t arc_tempreserve; 721209962Smmstatic uint64_t arc_loaned_bytes; 722168404Spjd 723168404Spjdtypedef struct arc_callback arc_callback_t; 724168404Spjd 725168404Spjdstruct arc_callback { 726168404Spjd void *acb_private; 727168404Spjd arc_done_func_t *acb_done; 728168404Spjd arc_buf_t *acb_buf; 729168404Spjd zio_t *acb_zio_dummy; 730168404Spjd arc_callback_t *acb_next; 731168404Spjd}; 732168404Spjd 733168404Spjdtypedef struct arc_write_callback arc_write_callback_t; 734168404Spjd 735168404Spjdstruct arc_write_callback { 736168404Spjd void *awcb_private; 737168404Spjd arc_done_func_t *awcb_ready; 738258632Savg arc_done_func_t *awcb_physdone; 739168404Spjd arc_done_func_t *awcb_done; 740168404Spjd arc_buf_t *awcb_buf; 741168404Spjd}; 742168404Spjd 743286570Smav/* 744286570Smav * ARC buffers are separated into multiple structs as a memory saving measure: 745286570Smav * - Common fields struct, always defined, and embedded within it: 746286570Smav * - 
L2-only fields, always allocated but undefined when not in L2ARC
 *       - L1-only fields, only allocated when in L1ARC
 *
 *           Buffer in L1                     Buffer only in L2
 *    +------------------------+          +------------------------+
 *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    |                        |          |                        |
 *    +------------------------+          +------------------------+
 *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
 *    | (undefined if L1-only) |          |                        |
 *    +------------------------+          +------------------------+
 *    | l1arc_buf_hdr_t        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    +------------------------+
 *
 * Because it's possible for the L2ARC to become extremely large, we can wind
 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 * is minimized by only allocating the fields necessary for an L1-cached buffer
 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
 * words in pointers. arc_hdr_realloc() is used to switch a header between
 * these two allocation states.
 */
typedef struct l1arc_buf_hdr {
	kmutex_t		b_freeze_lock;
#ifdef ZFS_DEBUG
	/*
	 * used for debugging with kmem_flags - by allocating and freeing
	 * b_thawed when the buffer is thawed, we get a record of the stack
	 * trace that thawed it.
	 */
	void			*b_thawed;
#endif

	/* arc_buf_t(s) backed by this header; NULL when none are attached */
	arc_buf_t		*b_buf;
	/* count of data buffers attached (0 when the header holds no data) */
	uint32_t		b_datacnt;
	/* for waiting on writes to complete */
	kcondvar_t		b_cv;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	list_node_t		b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;

	/* self protecting */
	refcount_t		b_refcnt;

	/* in-flight read callbacks chained to this header */
	arc_callback_t		*b_acb;
	/* temporary buffer holder for in-flight compressed data */
	void			*b_tmp_cdata;
} l1arc_buf_hdr_t;

typedef struct l2arc_dev l2arc_dev_t;

typedef struct l2arc_buf_hdr {
	/* protected by arc_buf_hdr mutex */
	l2arc_dev_t		*b_dev;		/* L2ARC device */
	uint64_t		b_daddr;	/* disk address, offset byte */
	/* real alloc'd buffer size depending on b_compress applied */
	int32_t			b_asize;

	list_node_t		b_l2node;
} l2arc_buf_hdr_t;

struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;
	/*
	 * Even though this checksum is only set/verified when a buffer is in
	 * the L1 cache, it needs to be in the set of common fields because it
	 * must be preserved from the time before a buffer is written out to
	 * L2ARC until after it is read back in.
	 */
	zio_cksum_t		*b_freeze_cksum;

	arc_buf_hdr_t		*b_hash_next;
	arc_flags_t		b_flags;

	/* immutable */
	int32_t			b_size;
	uint64_t		b_spa;

	/* L2ARC fields. 
Undefined when not in L2ARC. */ 837286570Smav l2arc_buf_hdr_t b_l2hdr; 838286570Smav /* L1ARC fields. Undefined when in l2arc_only state */ 839286570Smav l1arc_buf_hdr_t b_l1hdr; 840168404Spjd}; 841168404Spjd 842275748Sdelphij#ifdef _KERNEL 843275748Sdelphijstatic int 844275748Sdelphijsysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) 845275748Sdelphij{ 846275748Sdelphij uint64_t val; 847275748Sdelphij int err; 848275748Sdelphij 849275748Sdelphij val = arc_meta_limit; 850275748Sdelphij err = sysctl_handle_64(oidp, &val, 0, req); 851275748Sdelphij if (err != 0 || req->newptr == NULL) 852275748Sdelphij return (err); 853275748Sdelphij 854275748Sdelphij if (val <= 0 || val > arc_c_max) 855275748Sdelphij return (EINVAL); 856275748Sdelphij 857275748Sdelphij arc_meta_limit = val; 858275748Sdelphij return (0); 859275748Sdelphij} 860275748Sdelphij#endif 861275748Sdelphij 862168404Spjdstatic arc_buf_t *arc_eviction_list; 863168404Spjdstatic kmutex_t arc_eviction_mtx; 864168404Spjdstatic arc_buf_hdr_t arc_eviction_hdr; 865168404Spjd 866168404Spjd#define GHOST_STATE(state) \ 867185029Spjd ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 868185029Spjd (state) == arc_l2c_only) 869168404Spjd 870275811Sdelphij#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) 871275811Sdelphij#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) 872275811Sdelphij#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) 873275811Sdelphij#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) 874275811Sdelphij#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) 875275811Sdelphij#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) 876286570Smav 877275811Sdelphij#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) 878286570Smav#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) 879275811Sdelphij#define HDR_L2_READING(hdr) \ 880286570Smav (((hdr)->b_flags & 
ARC_FLAG_IO_IN_PROGRESS) &&	\
	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)

#define	HDR_ISTYPE_METADATA(hdr)	\
	((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
#define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))

#define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
#define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)

/* For storing compression mode in b_flags */
#define	HDR_COMPRESS_OFFSET	24
#define	HDR_COMPRESS_NBITS	7

#define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET(hdr->b_flags, \
	HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS))
#define	HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \
	HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp))

/*
 * Other sizes
 */

/* Size of a header carrying both L1 and L2 sub-headers. */
#define	HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
/* Size of the truncated, L2-only header (everything before b_l1hdr). */
#define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))

/*
 * Hash table routines
 */

/* Pad each chain lock to a cache line to avoid false sharing. */
#define	HT_LOCK_PAD	CACHE_LINE_SIZE

struct ht_lock {
	kmutex_t	ht_lock;
#ifdef _KERNEL
	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

#define	BUF_LOCKS 256
typedef struct buf_hash_table {
	uint64_t ht_mask;
	arc_buf_hdr_t **ht_table;
	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;

#define	BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define	HDR_LOCK(hdr) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))

uint64_t zfs_crc64_table[256];

/*
 * Level 2 ARC
 */

#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
#define	L2ARC_HEADROOM		2			/* num of writes */
/*
 * If we discover during ARC scan any buffers to be compressed, we boost
 * our headroom for the next scanning cycle by this percentage multiple.
 */
#define	L2ARC_HEADROOM_BOOST	200
#define	L2ARC_FEED_SECS		1		/* caching interval secs */
#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */

/*
 * Used to distinguish headers that are being processed by
 * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk
 * address.  This can happen when the header is added to the l2arc's list
 * of buffers to write in the first stage of l2arc_write_buffers(), but
 * has not yet been written out which happens in the second stage of
 * l2arc_write_buffers().
 */
#define	L2ARC_ADDR_UNSET	((uint64_t)(-1))

#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)

/* L2ARC Performance Tunables */
uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
    &l2arc_write_max, 0, "max write size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
    &l2arc_write_boost, 0, "extra write during warmup");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
    &l2arc_headroom, 0, "number of dev writes");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
    &l2arc_feed_secs, 0, "interval seconds");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
    &l2arc_feed_min_ms, 0, "min interval milliseconds");

SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
    &l2arc_feed_again, 0, "turbo warmup");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
    &l2arc_norw, 0, "no reads during writes");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
    &ARC_anon.arcs_size, 0, "size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
    &ARC_mru.arcs_size, 0, "size of mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
    "size of metadata in mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
    "size of data in mru ghost state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
    &ARC_mfu.arcs_size, 0, "size of mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
1023217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD, 1024205231Skmacy &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state"); 1025205231Skmacy 1026217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, 1027205231Skmacy &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state"); 1028217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD, 1029205231Skmacy &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, 1030205231Skmacy "size of metadata in mfu ghost state"); 1031217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD, 1032205231Skmacy &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0, 1033205231Skmacy "size of data in mfu ghost state"); 1034205231Skmacy 1035217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, 1036205231Skmacy &ARC_l2c_only.arcs_size, 0, "size of mru state"); 1037205231Skmacy 1038185029Spjd/* 1039185029Spjd * L2ARC Internals 1040185029Spjd */ 1041286570Smavstruct l2arc_dev { 1042185029Spjd vdev_t *l2ad_vdev; /* vdev */ 1043185029Spjd spa_t *l2ad_spa; /* spa */ 1044185029Spjd uint64_t l2ad_hand; /* next write location */ 1045185029Spjd uint64_t l2ad_start; /* first addr on device */ 1046185029Spjd uint64_t l2ad_end; /* last addr on device */ 1047185029Spjd boolean_t l2ad_first; /* first sweep through */ 1048208373Smm boolean_t l2ad_writing; /* currently writing */ 1049286570Smav kmutex_t l2ad_mtx; /* lock for buffer list */ 1050286570Smav list_t l2ad_buflist; /* buffer list */ 1051185029Spjd list_node_t l2ad_node; /* device list node */ 1052286598Smav refcount_t l2ad_alloc; /* allocated bytes */ 1053286570Smav}; 1054185029Spjd 1055185029Spjdstatic list_t L2ARC_dev_list; /* device list */ 1056185029Spjdstatic list_t *l2arc_dev_list; /* device list pointer */ 1057185029Spjdstatic kmutex_t l2arc_dev_mtx; /* device list mutex */ 1058185029Spjdstatic l2arc_dev_t *l2arc_dev_last; /* last device used */ 1059185029Spjdstatic list_t L2ARC_free_on_write; /* 
free after write buf list */ 1060185029Spjdstatic list_t *l2arc_free_on_write; /* free after write list ptr */ 1061185029Spjdstatic kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 1062185029Spjdstatic uint64_t l2arc_ndev; /* number of devices */ 1063185029Spjd 1064185029Spjdtypedef struct l2arc_read_callback { 1065251478Sdelphij arc_buf_t *l2rcb_buf; /* read buffer */ 1066251478Sdelphij spa_t *l2rcb_spa; /* spa */ 1067251478Sdelphij blkptr_t l2rcb_bp; /* original blkptr */ 1068268123Sdelphij zbookmark_phys_t l2rcb_zb; /* original bookmark */ 1069251478Sdelphij int l2rcb_flags; /* original flags */ 1070251478Sdelphij enum zio_compress l2rcb_compress; /* applied compress */ 1071185029Spjd} l2arc_read_callback_t; 1072185029Spjd 1073185029Spjdtypedef struct l2arc_write_callback { 1074185029Spjd l2arc_dev_t *l2wcb_dev; /* device info */ 1075185029Spjd arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 1076185029Spjd} l2arc_write_callback_t; 1077185029Spjd 1078185029Spjdtypedef struct l2arc_data_free { 1079185029Spjd /* protected by l2arc_free_on_write_mtx */ 1080185029Spjd void *l2df_data; 1081185029Spjd size_t l2df_size; 1082185029Spjd void (*l2df_func)(void *, size_t); 1083185029Spjd list_node_t l2df_list_node; 1084185029Spjd} l2arc_data_free_t; 1085185029Spjd 1086185029Spjdstatic kmutex_t l2arc_feed_thr_lock; 1087185029Spjdstatic kcondvar_t l2arc_feed_thr_cv; 1088185029Spjdstatic uint8_t l2arc_thread_exit; 1089185029Spjd 1090275811Sdelphijstatic void arc_get_data_buf(arc_buf_t *); 1091275811Sdelphijstatic void arc_access(arc_buf_hdr_t *, kmutex_t *); 1092275811Sdelphijstatic int arc_evict_needed(arc_buf_contents_t); 1093275811Sdelphijstatic void arc_evict_ghost(arc_state_t *, uint64_t, int64_t); 1094275811Sdelphijstatic void arc_buf_watch(arc_buf_t *); 1095275811Sdelphij 1096286570Smavstatic arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); 1097286570Smavstatic uint32_t arc_bufc_to_flags(arc_buf_contents_t); 1098286570Smav 1099275811Sdelphijstatic 
boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); 1100275811Sdelphijstatic void l2arc_read_done(zio_t *); 1101185029Spjd 1102286570Smavstatic boolean_t l2arc_compress_buf(arc_buf_hdr_t *); 1103275811Sdelphijstatic void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); 1104275811Sdelphijstatic void l2arc_release_cdata_buf(arc_buf_hdr_t *); 1105251478Sdelphij 1106168404Spjdstatic uint64_t 1107209962Smmbuf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 1108168404Spjd{ 1109168404Spjd uint8_t *vdva = (uint8_t *)dva; 1110168404Spjd uint64_t crc = -1ULL; 1111168404Spjd int i; 1112168404Spjd 1113168404Spjd ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 1114168404Spjd 1115168404Spjd for (i = 0; i < sizeof (dva_t); i++) 1116168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 1117168404Spjd 1118209962Smm crc ^= (spa>>8) ^ birth; 1119168404Spjd 1120168404Spjd return (crc); 1121168404Spjd} 1122168404Spjd 1123168404Spjd#define BUF_EMPTY(buf) \ 1124168404Spjd ((buf)->b_dva.dva_word[0] == 0 && \ 1125286570Smav (buf)->b_dva.dva_word[1] == 0) 1126168404Spjd 1127168404Spjd#define BUF_EQUAL(spa, dva, birth, buf) \ 1128168404Spjd ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 1129168404Spjd ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 1130168404Spjd ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 1131168404Spjd 1132219089Spjdstatic void 1133219089Spjdbuf_discard_identity(arc_buf_hdr_t *hdr) 1134219089Spjd{ 1135219089Spjd hdr->b_dva.dva_word[0] = 0; 1136219089Spjd hdr->b_dva.dva_word[1] = 0; 1137219089Spjd hdr->b_birth = 0; 1138219089Spjd} 1139219089Spjd 1140168404Spjdstatic arc_buf_hdr_t * 1141268075Sdelphijbuf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) 1142168404Spjd{ 1143268075Sdelphij const dva_t *dva = BP_IDENTITY(bp); 1144268075Sdelphij uint64_t birth = BP_PHYSICAL_BIRTH(bp); 1145168404Spjd uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 1146168404Spjd kmutex_t *hash_lock = 
BUF_HASH_LOCK(idx); 1147275811Sdelphij arc_buf_hdr_t *hdr; 1148168404Spjd 1149168404Spjd mutex_enter(hash_lock); 1150275811Sdelphij for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; 1151275811Sdelphij hdr = hdr->b_hash_next) { 1152275811Sdelphij if (BUF_EQUAL(spa, dva, birth, hdr)) { 1153168404Spjd *lockp = hash_lock; 1154275811Sdelphij return (hdr); 1155168404Spjd } 1156168404Spjd } 1157168404Spjd mutex_exit(hash_lock); 1158168404Spjd *lockp = NULL; 1159168404Spjd return (NULL); 1160168404Spjd} 1161168404Spjd 1162168404Spjd/* 1163168404Spjd * Insert an entry into the hash table. If there is already an element 1164168404Spjd * equal to elem in the hash table, then the already existing element 1165168404Spjd * will be returned and the new element will not be inserted. 1166168404Spjd * Otherwise returns NULL. 1167286570Smav * If lockp == NULL, the caller is assumed to already hold the hash lock. 1168168404Spjd */ 1169168404Spjdstatic arc_buf_hdr_t * 1170275811Sdelphijbuf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) 1171168404Spjd{ 1172275811Sdelphij uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1173168404Spjd kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1174275811Sdelphij arc_buf_hdr_t *fhdr; 1175168404Spjd uint32_t i; 1176168404Spjd 1177275811Sdelphij ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); 1178275811Sdelphij ASSERT(hdr->b_birth != 0); 1179275811Sdelphij ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1180286570Smav 1181286570Smav if (lockp != NULL) { 1182286570Smav *lockp = hash_lock; 1183286570Smav mutex_enter(hash_lock); 1184286570Smav } else { 1185286570Smav ASSERT(MUTEX_HELD(hash_lock)); 1186286570Smav } 1187286570Smav 1188275811Sdelphij for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; 1189275811Sdelphij fhdr = fhdr->b_hash_next, i++) { 1190275811Sdelphij if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) 1191275811Sdelphij return (fhdr); 1192168404Spjd } 1193168404Spjd 1194275811Sdelphij hdr->b_hash_next = 
buf_hash_table.ht_table[idx]; 1195275811Sdelphij buf_hash_table.ht_table[idx] = hdr; 1196275811Sdelphij hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 1197168404Spjd 1198168404Spjd /* collect some hash table performance data */ 1199168404Spjd if (i > 0) { 1200168404Spjd ARCSTAT_BUMP(arcstat_hash_collisions); 1201168404Spjd if (i == 1) 1202168404Spjd ARCSTAT_BUMP(arcstat_hash_chains); 1203168404Spjd 1204168404Spjd ARCSTAT_MAX(arcstat_hash_chain_max, i); 1205168404Spjd } 1206168404Spjd 1207168404Spjd ARCSTAT_BUMP(arcstat_hash_elements); 1208168404Spjd ARCSTAT_MAXSTAT(arcstat_hash_elements); 1209168404Spjd 1210168404Spjd return (NULL); 1211168404Spjd} 1212168404Spjd 1213168404Spjdstatic void 1214275811Sdelphijbuf_hash_remove(arc_buf_hdr_t *hdr) 1215168404Spjd{ 1216275811Sdelphij arc_buf_hdr_t *fhdr, **hdrp; 1217275811Sdelphij uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1218168404Spjd 1219168404Spjd ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 1220275811Sdelphij ASSERT(HDR_IN_HASH_TABLE(hdr)); 1221168404Spjd 1222275811Sdelphij hdrp = &buf_hash_table.ht_table[idx]; 1223275811Sdelphij while ((fhdr = *hdrp) != hdr) { 1224275811Sdelphij ASSERT(fhdr != NULL); 1225275811Sdelphij hdrp = &fhdr->b_hash_next; 1226168404Spjd } 1227275811Sdelphij *hdrp = hdr->b_hash_next; 1228275811Sdelphij hdr->b_hash_next = NULL; 1229275811Sdelphij hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; 1230168404Spjd 1231168404Spjd /* collect some hash table performance data */ 1232168404Spjd ARCSTAT_BUMPDOWN(arcstat_hash_elements); 1233168404Spjd 1234168404Spjd if (buf_hash_table.ht_table[idx] && 1235168404Spjd buf_hash_table.ht_table[idx]->b_hash_next == NULL) 1236168404Spjd ARCSTAT_BUMPDOWN(arcstat_hash_chains); 1237168404Spjd} 1238168404Spjd 1239168404Spjd/* 1240168404Spjd * Global data structures and functions for the buf kmem cache. 
 */
static kmem_cache_t *hdr_full_cache;
static kmem_cache_t *hdr_l2only_cache;
static kmem_cache_t *buf_cache;

/* Free the hash table and destroy the header/buffer kmem caches. */
static void
buf_fini(void)
{
	int i;

	kmem_free(buf_hash_table.ht_table,
	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
	for (i = 0; i < BUF_LOCKS; i++)
		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
	kmem_cache_destroy(hdr_full_cache);
	kmem_cache_destroy(hdr_l2only_cache);
	kmem_cache_destroy(buf_cache);
}

/*
 * Constructor callback - called when the cache is empty
 * and a new buf is requested.
 */
/* ARGSUSED */
static int
hdr_full_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *hdr = vbuf;

	bzero(hdr, HDR_FULL_SIZE);
	cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
	refcount_create(&hdr->b_l1hdr.b_refcnt);
	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);

	return (0);
}

/* Constructor for the truncated, L2-only header: no L1 fields to set up. */
/* ARGSUSED */
static int
hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *hdr = vbuf;

	bzero(hdr, HDR_L2ONLY_SIZE);
	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);

	return (0);
}

/* Constructor for arc_buf_t instances. */
/* ARGSUSED */
static int
buf_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_t *buf = vbuf;

	bzero(buf, sizeof (arc_buf_t));
	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);

	return (0);
}

/*
 * Destructor callback - called when a cached buf is
 * no longer required.
 */
/* ARGSUSED */
static void
hdr_full_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *hdr = vbuf;

	ASSERT(BUF_EMPTY(hdr));
	cv_destroy(&hdr->b_l1hdr.b_cv);
	refcount_destroy(&hdr->b_l1hdr.b_refcnt);
	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
}

/* Destructor for the L2-only header: nothing beyond the space accounting. */
/* ARGSUSED */
static void
hdr_l2only_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *hdr = vbuf;

	ASSERT(BUF_EMPTY(hdr));
	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
}

/* Destructor for arc_buf_t instances. */
/* ARGSUSED */
static void
buf_dest(void *vbuf, void *unused)
{
	arc_buf_t *buf = vbuf;

	mutex_destroy(&buf->b_evict_lock);
	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
}

/*
 * Reclaim callback -- invoked when memory is low.
 */
/* ARGSUSED */
static void
hdr_recl(void *unused)
{
	dprintf("hdr_recl called\n");
	/*
	 * umem calls the reclaim func when we destroy the buf cache,
	 * which is after we do arc_fini().
	 */
	if (!arc_dead)
		cv_signal(&arc_reclaim_thr_cv);
}

/*
 * Allocate the buf hash table, create the three kmem caches, and fill in
 * the CRC-64 table used by buf_hash().
 */
static void
buf_init(void)
{
	uint64_t *ct;
	uint64_t hsize = 1ULL << 12;
	int i, j;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average block size of zfs_arc_average_blocksize (default 8K).
	 * By default, the table will take up
	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
	 */
	while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
		hsize <<= 1;
retry:
	buf_hash_table.ht_mask = hsize - 1;
	buf_hash_table.ht_table =
	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
	if (buf_hash_table.ht_table == NULL) {
		/* Allocation failed: halve the table size and retry. */
		ASSERT(hsize > (1ULL << 8));
		hsize >>= 1;
		goto retry;
	}

	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
	    0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
	    NULL, NULL, 0);
	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);

	/* Generate the CRC-64 lookup table consumed by buf_hash(). */
	for (i = 0; i < 256; i++)
		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);

	for (i = 0; i < BUF_LOCKS; i++) {
		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
		    NULL, MUTEX_DEFAULT, NULL);
1397168404Spjd } 1398168404Spjd} 1399168404Spjd 1400286570Smav/* 1401286570Smav * Transition between the two allocation states for the arc_buf_hdr struct. 1402286570Smav * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without 1403286570Smav * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller 1404286570Smav * version is used when a cache buffer is only in the L2ARC in order to reduce 1405286570Smav * memory usage. 1406286570Smav */ 1407286570Smavstatic arc_buf_hdr_t * 1408286570Smavarc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) 1409286570Smav{ 1410286570Smav ASSERT(HDR_HAS_L2HDR(hdr)); 1411286570Smav 1412286570Smav arc_buf_hdr_t *nhdr; 1413286570Smav l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 1414286570Smav 1415286570Smav ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || 1416286570Smav (old == hdr_l2only_cache && new == hdr_full_cache)); 1417286570Smav 1418286570Smav nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); 1419286570Smav 1420286570Smav ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); 1421286570Smav buf_hash_remove(hdr); 1422286570Smav 1423286570Smav bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); 1424286598Smav 1425286570Smav if (new == hdr_full_cache) { 1426286570Smav nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 1427286570Smav /* 1428286570Smav * arc_access and arc_change_state need to be aware that a 1429286570Smav * header has just come out of L2ARC, so we set its state to 1430286570Smav * l2c_only even though it's about to change. 1431286570Smav */ 1432286570Smav nhdr->b_l1hdr.b_state = arc_l2c_only; 1433286570Smav } else { 1434286570Smav ASSERT(hdr->b_l1hdr.b_buf == NULL); 1435286570Smav ASSERT0(hdr->b_l1hdr.b_datacnt); 1436286570Smav ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 1437286570Smav /* 1438286570Smav * We might be removing the L1hdr of a buffer which was just 1439286570Smav * written out to L2ARC. 
If such a buffer is compressed then we 1440286570Smav * need to free its b_tmp_cdata before destroying the header. 1441286570Smav */ 1442286570Smav if (hdr->b_l1hdr.b_tmp_cdata != NULL && 1443286570Smav HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) 1444286570Smav l2arc_release_cdata_buf(hdr); 1445286570Smav nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; 1446286570Smav } 1447286570Smav /* 1448286570Smav * The header has been reallocated so we need to re-insert it into any 1449286570Smav * lists it was on. 1450286570Smav */ 1451286570Smav (void) buf_hash_insert(nhdr, NULL); 1452286570Smav 1453286570Smav ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); 1454286570Smav 1455286570Smav mutex_enter(&dev->l2ad_mtx); 1456286570Smav 1457286570Smav /* 1458286570Smav * We must place the realloc'ed header back into the list at 1459286570Smav * the same spot. Otherwise, if it's placed earlier in the list, 1460286570Smav * l2arc_write_buffers() could find it during the function's 1461286570Smav * write phase, and try to write it out to the l2arc. 1462286570Smav */ 1463286570Smav list_insert_after(&dev->l2ad_buflist, hdr, nhdr); 1464286570Smav list_remove(&dev->l2ad_buflist, hdr); 1465286570Smav 1466286570Smav mutex_exit(&dev->l2ad_mtx); 1467286570Smav 1468286598Smav /* 1469286598Smav * Since we're using the pointer address as the tag when 1470286598Smav * incrementing and decrementing the l2ad_alloc refcount, we 1471286598Smav * must remove the old pointer (that we're about to destroy) and 1472286598Smav * add the new pointer to the refcount. Otherwise we'd remove 1473286598Smav * the wrong pointer address when calling arc_hdr_destroy() later. 
1474286598Smav */ 1475286598Smav 1476286598Smav (void) refcount_remove_many(&dev->l2ad_alloc, 1477286598Smav hdr->b_l2hdr.b_asize, hdr); 1478286598Smav 1479286598Smav (void) refcount_add_many(&dev->l2ad_alloc, 1480286598Smav nhdr->b_l2hdr.b_asize, nhdr); 1481286598Smav 1482286570Smav buf_discard_identity(hdr); 1483286570Smav hdr->b_freeze_cksum = NULL; 1484286570Smav kmem_cache_free(old, hdr); 1485286570Smav 1486286570Smav return (nhdr); 1487286570Smav} 1488286570Smav 1489286570Smav 1490168404Spjd#define ARC_MINTIME (hz>>4) /* 62 ms */ 1491168404Spjd 1492168404Spjdstatic void 1493168404Spjdarc_cksum_verify(arc_buf_t *buf) 1494168404Spjd{ 1495168404Spjd zio_cksum_t zc; 1496168404Spjd 1497168404Spjd if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1498168404Spjd return; 1499168404Spjd 1500286570Smav mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1501286570Smav if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { 1502286570Smav mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1503168404Spjd return; 1504168404Spjd } 1505168404Spjd fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1506168404Spjd if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 1507168404Spjd panic("buffer modified while frozen!"); 1508286570Smav mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1509168404Spjd} 1510168404Spjd 1511185029Spjdstatic int 1512185029Spjdarc_cksum_equal(arc_buf_t *buf) 1513185029Spjd{ 1514185029Spjd zio_cksum_t zc; 1515185029Spjd int equal; 1516185029Spjd 1517286570Smav mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1518185029Spjd fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 1519185029Spjd equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 1520286570Smav mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1521185029Spjd 1522185029Spjd return (equal); 1523185029Spjd} 1524185029Spjd 1525168404Spjdstatic void 1526185029Spjdarc_cksum_compute(arc_buf_t *buf, boolean_t force) 1527168404Spjd{ 1528185029Spjd if (!force && !(zfs_flags & 
ZFS_DEBUG_MODIFY)) 1529168404Spjd return; 1530168404Spjd 1531286570Smav mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1532168404Spjd if (buf->b_hdr->b_freeze_cksum != NULL) { 1533286570Smav mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1534168404Spjd return; 1535168404Spjd } 1536168404Spjd buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 1537168404Spjd fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 1538168404Spjd buf->b_hdr->b_freeze_cksum); 1539286570Smav mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1540240133Smm#ifdef illumos 1541240133Smm arc_buf_watch(buf); 1542277300Ssmh#endif 1543168404Spjd} 1544168404Spjd 1545240133Smm#ifdef illumos 1546240133Smm#ifndef _KERNEL 1547240133Smmtypedef struct procctl { 1548240133Smm long cmd; 1549240133Smm prwatch_t prwatch; 1550240133Smm} procctl_t; 1551240133Smm#endif 1552240133Smm 1553240133Smm/* ARGSUSED */ 1554240133Smmstatic void 1555240133Smmarc_buf_unwatch(arc_buf_t *buf) 1556240133Smm{ 1557240133Smm#ifndef _KERNEL 1558240133Smm if (arc_watch) { 1559240133Smm int result; 1560240133Smm procctl_t ctl; 1561240133Smm ctl.cmd = PCWATCH; 1562240133Smm ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1563240133Smm ctl.prwatch.pr_size = 0; 1564240133Smm ctl.prwatch.pr_wflags = 0; 1565240133Smm result = write(arc_procfd, &ctl, sizeof (ctl)); 1566240133Smm ASSERT3U(result, ==, sizeof (ctl)); 1567240133Smm } 1568240133Smm#endif 1569240133Smm} 1570240133Smm 1571240133Smm/* ARGSUSED */ 1572240133Smmstatic void 1573240133Smmarc_buf_watch(arc_buf_t *buf) 1574240133Smm{ 1575240133Smm#ifndef _KERNEL 1576240133Smm if (arc_watch) { 1577240133Smm int result; 1578240133Smm procctl_t ctl; 1579240133Smm ctl.cmd = PCWATCH; 1580240133Smm ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1581240133Smm ctl.prwatch.pr_size = buf->b_hdr->b_size; 1582240133Smm ctl.prwatch.pr_wflags = WA_WRITE; 1583240133Smm result = write(arc_procfd, &ctl, sizeof (ctl)); 1584240133Smm ASSERT3U(result, ==, sizeof (ctl)); 1585240133Smm } 
1586240133Smm#endif 1587240133Smm} 1588240133Smm#endif /* illumos */ 1589240133Smm 1590286570Smavstatic arc_buf_contents_t 1591286570Smavarc_buf_type(arc_buf_hdr_t *hdr) 1592286570Smav{ 1593286570Smav if (HDR_ISTYPE_METADATA(hdr)) { 1594286570Smav return (ARC_BUFC_METADATA); 1595286570Smav } else { 1596286570Smav return (ARC_BUFC_DATA); 1597286570Smav } 1598286570Smav} 1599286570Smav 1600286570Smavstatic uint32_t 1601286570Smavarc_bufc_to_flags(arc_buf_contents_t type) 1602286570Smav{ 1603286570Smav switch (type) { 1604286570Smav case ARC_BUFC_DATA: 1605286570Smav /* metadata field is 0 if buffer contains normal data */ 1606286570Smav return (0); 1607286570Smav case ARC_BUFC_METADATA: 1608286570Smav return (ARC_FLAG_BUFC_METADATA); 1609286570Smav default: 1610286570Smav break; 1611286570Smav } 1612286570Smav panic("undefined ARC buffer type!"); 1613286570Smav return ((uint32_t)-1); 1614286570Smav} 1615286570Smav 1616168404Spjdvoid 1617168404Spjdarc_buf_thaw(arc_buf_t *buf) 1618168404Spjd{ 1619185029Spjd if (zfs_flags & ZFS_DEBUG_MODIFY) { 1620286570Smav if (buf->b_hdr->b_l1hdr.b_state != arc_anon) 1621185029Spjd panic("modifying non-anon buffer!"); 1622286570Smav if (HDR_IO_IN_PROGRESS(buf->b_hdr)) 1623185029Spjd panic("modifying buffer while i/o in progress!"); 1624185029Spjd arc_cksum_verify(buf); 1625185029Spjd } 1626168404Spjd 1627286570Smav mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1628168404Spjd if (buf->b_hdr->b_freeze_cksum != NULL) { 1629168404Spjd kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1630168404Spjd buf->b_hdr->b_freeze_cksum = NULL; 1631168404Spjd } 1632219089Spjd 1633286570Smav#ifdef ZFS_DEBUG 1634219089Spjd if (zfs_flags & ZFS_DEBUG_MODIFY) { 1635286570Smav if (buf->b_hdr->b_l1hdr.b_thawed != NULL) 1636286570Smav kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1); 1637286570Smav buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); 1638219089Spjd } 1639286570Smav#endif 1640219089Spjd 1641286570Smav 
mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1642240133Smm 1643240133Smm#ifdef illumos 1644240133Smm arc_buf_unwatch(buf); 1645277300Ssmh#endif 1646168404Spjd} 1647168404Spjd 1648168404Spjdvoid 1649168404Spjdarc_buf_freeze(arc_buf_t *buf) 1650168404Spjd{ 1651219089Spjd kmutex_t *hash_lock; 1652219089Spjd 1653168404Spjd if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1654168404Spjd return; 1655168404Spjd 1656219089Spjd hash_lock = HDR_LOCK(buf->b_hdr); 1657219089Spjd mutex_enter(hash_lock); 1658219089Spjd 1659168404Spjd ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 1660286570Smav buf->b_hdr->b_l1hdr.b_state == arc_anon); 1661185029Spjd arc_cksum_compute(buf, B_FALSE); 1662219089Spjd mutex_exit(hash_lock); 1663240133Smm 1664168404Spjd} 1665168404Spjd 1666168404Spjdstatic void 1667275811Sdelphijadd_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1668168404Spjd{ 1669286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1670168404Spjd ASSERT(MUTEX_HELD(hash_lock)); 1671286570Smav arc_state_t *state = hdr->b_l1hdr.b_state; 1672168404Spjd 1673286570Smav if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && 1674286570Smav (state != arc_anon)) { 1675286570Smav /* We don't use the L2-only state list. 
*/ 1676286570Smav if (state != arc_l2c_only) { 1677286570Smav uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; 1678286762Smav list_t *list = &state->arcs_list[arc_buf_type(hdr)]; 1679286570Smav uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; 1680168404Spjd 1681286762Smav ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 1682286762Smav mutex_enter(&state->arcs_mtx); 1683286570Smav ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 1684286570Smav list_remove(list, hdr); 1685286570Smav if (GHOST_STATE(state)) { 1686286570Smav ASSERT0(hdr->b_l1hdr.b_datacnt); 1687286570Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 1688286570Smav delta = hdr->b_size; 1689286570Smav } 1690286570Smav ASSERT(delta > 0); 1691286570Smav ASSERT3U(*size, >=, delta); 1692286570Smav atomic_add_64(size, -delta); 1693286762Smav mutex_exit(&state->arcs_mtx); 1694168404Spjd } 1695185029Spjd /* remove the prefetch flag if we get a reference */ 1696286570Smav hdr->b_flags &= ~ARC_FLAG_PREFETCH; 1697168404Spjd } 1698168404Spjd} 1699168404Spjd 1700168404Spjdstatic int 1701275811Sdelphijremove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1702168404Spjd{ 1703168404Spjd int cnt; 1704286570Smav arc_state_t *state = hdr->b_l1hdr.b_state; 1705168404Spjd 1706286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1707168404Spjd ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1708168404Spjd ASSERT(!GHOST_STATE(state)); 1709168404Spjd 1710286570Smav /* 1711286570Smav * arc_l2c_only counts as a ghost state so we don't need to explicitly 1712286570Smav * check to prevent usage of the arc_l2c_only list. 
1713286570Smav */ 1714286570Smav if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && 1715168404Spjd (state != arc_anon)) { 1716286570Smav uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; 1717185029Spjd 1718286762Smav ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 1719286762Smav mutex_enter(&state->arcs_mtx); 1720286570Smav ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 1721286762Smav list_insert_head(&state->arcs_list[arc_buf_type(hdr)], hdr); 1722286570Smav ASSERT(hdr->b_l1hdr.b_datacnt > 0); 1723286570Smav atomic_add_64(size, hdr->b_size * 1724286570Smav hdr->b_l1hdr.b_datacnt); 1725286762Smav mutex_exit(&state->arcs_mtx); 1726168404Spjd } 1727168404Spjd return (cnt); 1728168404Spjd} 1729168404Spjd 1730168404Spjd/* 1731168404Spjd * Move the supplied buffer to the indicated state. The mutex 1732168404Spjd * for the buffer must be held by the caller. 1733168404Spjd */ 1734168404Spjdstatic void 1735275811Sdelphijarc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, 1736275811Sdelphij kmutex_t *hash_lock) 1737168404Spjd{ 1738286570Smav arc_state_t *old_state; 1739286570Smav int64_t refcnt; 1740286570Smav uint32_t datacnt; 1741168404Spjd uint64_t from_delta, to_delta; 1742286570Smav arc_buf_contents_t buftype = arc_buf_type(hdr); 1743168404Spjd 1744286570Smav /* 1745286570Smav * We almost always have an L1 hdr here, since we call arc_hdr_realloc() 1746286570Smav * in arc_read() when bringing a buffer out of the L2ARC. However, the 1747286570Smav * L1 hdr doesn't always exist when we change state to arc_anon before 1748286570Smav * destroying a header, in which case reallocating to add the L1 hdr is 1749286570Smav * pointless. 
1750286570Smav */ 1751286570Smav if (HDR_HAS_L1HDR(hdr)) { 1752286570Smav old_state = hdr->b_l1hdr.b_state; 1753286570Smav refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); 1754286570Smav datacnt = hdr->b_l1hdr.b_datacnt; 1755286570Smav } else { 1756286570Smav old_state = arc_l2c_only; 1757286570Smav refcnt = 0; 1758286570Smav datacnt = 0; 1759286570Smav } 1760286570Smav 1761168404Spjd ASSERT(MUTEX_HELD(hash_lock)); 1762258632Savg ASSERT3P(new_state, !=, old_state); 1763286570Smav ASSERT(refcnt == 0 || datacnt > 0); 1764286570Smav ASSERT(!GHOST_STATE(new_state) || datacnt == 0); 1765286570Smav ASSERT(old_state != arc_anon || datacnt <= 1); 1766168404Spjd 1767286570Smav from_delta = to_delta = datacnt * hdr->b_size; 1768168404Spjd 1769168404Spjd /* 1770168404Spjd * If this buffer is evictable, transfer it from the 1771168404Spjd * old state list to the new state list. 1772168404Spjd */ 1773168404Spjd if (refcnt == 0) { 1774286570Smav if (old_state != arc_anon && old_state != arc_l2c_only) { 1775286762Smav int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); 1776286570Smav uint64_t *size = &old_state->arcs_lsize[buftype]; 1777168404Spjd 1778168404Spjd if (use_mutex) 1779286762Smav mutex_enter(&old_state->arcs_mtx); 1780168404Spjd 1781286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1782286570Smav ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); 1783286762Smav list_remove(&old_state->arcs_list[buftype], hdr); 1784168404Spjd 1785168404Spjd /* 1786168404Spjd * If prefetching out of the ghost cache, 1787219089Spjd * we will have a non-zero datacnt. 
1788168404Spjd */ 1789286570Smav if (GHOST_STATE(old_state) && datacnt == 0) { 1790168404Spjd /* ghost elements have a ghost size */ 1791286570Smav ASSERT(hdr->b_l1hdr.b_buf == NULL); 1792275811Sdelphij from_delta = hdr->b_size; 1793168404Spjd } 1794185029Spjd ASSERT3U(*size, >=, from_delta); 1795185029Spjd atomic_add_64(size, -from_delta); 1796168404Spjd 1797168404Spjd if (use_mutex) 1798286762Smav mutex_exit(&old_state->arcs_mtx); 1799168404Spjd } 1800286570Smav if (new_state != arc_anon && new_state != arc_l2c_only) { 1801286762Smav int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); 1802286570Smav uint64_t *size = &new_state->arcs_lsize[buftype]; 1803168404Spjd 1804286570Smav /* 1805286570Smav * An L1 header always exists here, since if we're 1806286570Smav * moving to some L1-cached state (i.e. not l2c_only or 1807286570Smav * anonymous), we realloc the header to add an L1hdr 1808286570Smav * beforehand. 1809286570Smav */ 1810286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1811168404Spjd if (use_mutex) 1812286762Smav mutex_enter(&new_state->arcs_mtx); 1813168404Spjd 1814286762Smav list_insert_head(&new_state->arcs_list[buftype], hdr); 1815168404Spjd 1816168404Spjd /* ghost elements have a ghost size */ 1817168404Spjd if (GHOST_STATE(new_state)) { 1818286762Smav ASSERT0(datacnt); 1819286570Smav ASSERT(hdr->b_l1hdr.b_buf == NULL); 1820275811Sdelphij to_delta = hdr->b_size; 1821168404Spjd } 1822185029Spjd atomic_add_64(size, to_delta); 1823168404Spjd 1824168404Spjd if (use_mutex) 1825286762Smav mutex_exit(&new_state->arcs_mtx); 1826168404Spjd } 1827168404Spjd } 1828168404Spjd 1829275811Sdelphij ASSERT(!BUF_EMPTY(hdr)); 1830275811Sdelphij if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) 1831275811Sdelphij buf_hash_remove(hdr); 1832168404Spjd 1833286570Smav /* adjust state sizes (ignore arc_l2c_only) */ 1834286570Smav if (to_delta && new_state != arc_l2c_only) 1835168404Spjd atomic_add_64(&new_state->arcs_size, to_delta); 1836286570Smav if (from_delta && old_state != 
arc_l2c_only) { 1837168404Spjd ASSERT3U(old_state->arcs_size, >=, from_delta); 1838168404Spjd atomic_add_64(&old_state->arcs_size, -from_delta); 1839168404Spjd } 1840286570Smav if (HDR_HAS_L1HDR(hdr)) 1841286570Smav hdr->b_l1hdr.b_state = new_state; 1842185029Spjd 1843286570Smav /* 1844286570Smav * L2 headers should never be on the L2 state list since they don't 1845286570Smav * have L1 headers allocated. 1846286570Smav */ 1847286570Smav ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && 1848286570Smav list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); 1849168404Spjd} 1850168404Spjd 1851185029Spjdvoid 1852208373Smmarc_space_consume(uint64_t space, arc_space_type_t type) 1853185029Spjd{ 1854208373Smm ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1855208373Smm 1856208373Smm switch (type) { 1857208373Smm case ARC_SPACE_DATA: 1858208373Smm ARCSTAT_INCR(arcstat_data_size, space); 1859208373Smm break; 1860286574Smav case ARC_SPACE_META: 1861286574Smav ARCSTAT_INCR(arcstat_metadata_size, space); 1862286574Smav break; 1863208373Smm case ARC_SPACE_OTHER: 1864208373Smm ARCSTAT_INCR(arcstat_other_size, space); 1865208373Smm break; 1866208373Smm case ARC_SPACE_HDRS: 1867208373Smm ARCSTAT_INCR(arcstat_hdr_size, space); 1868208373Smm break; 1869208373Smm case ARC_SPACE_L2HDRS: 1870208373Smm ARCSTAT_INCR(arcstat_l2_hdr_size, space); 1871208373Smm break; 1872208373Smm } 1873208373Smm 1874286574Smav if (type != ARC_SPACE_DATA) 1875286574Smav ARCSTAT_INCR(arcstat_meta_used, space); 1876286574Smav 1877185029Spjd atomic_add_64(&arc_size, space); 1878185029Spjd} 1879185029Spjd 1880185029Spjdvoid 1881208373Smmarc_space_return(uint64_t space, arc_space_type_t type) 1882185029Spjd{ 1883208373Smm ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1884208373Smm 1885208373Smm switch (type) { 1886208373Smm case ARC_SPACE_DATA: 1887208373Smm ARCSTAT_INCR(arcstat_data_size, -space); 1888208373Smm break; 1889286574Smav case ARC_SPACE_META: 1890286574Smav 
ARCSTAT_INCR(arcstat_metadata_size, -space); 1891286574Smav break; 1892208373Smm case ARC_SPACE_OTHER: 1893208373Smm ARCSTAT_INCR(arcstat_other_size, -space); 1894208373Smm break; 1895208373Smm case ARC_SPACE_HDRS: 1896208373Smm ARCSTAT_INCR(arcstat_hdr_size, -space); 1897208373Smm break; 1898208373Smm case ARC_SPACE_L2HDRS: 1899208373Smm ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 1900208373Smm break; 1901208373Smm } 1902208373Smm 1903286574Smav if (type != ARC_SPACE_DATA) { 1904286574Smav ASSERT(arc_meta_used >= space); 1905286574Smav if (arc_meta_max < arc_meta_used) 1906286574Smav arc_meta_max = arc_meta_used; 1907286574Smav ARCSTAT_INCR(arcstat_meta_used, -space); 1908286574Smav } 1909286574Smav 1910185029Spjd ASSERT(arc_size >= space); 1911185029Spjd atomic_add_64(&arc_size, -space); 1912185029Spjd} 1913185029Spjd 1914168404Spjdarc_buf_t * 1915286570Smavarc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) 1916168404Spjd{ 1917168404Spjd arc_buf_hdr_t *hdr; 1918168404Spjd arc_buf_t *buf; 1919168404Spjd 1920168404Spjd ASSERT3U(size, >, 0); 1921286570Smav hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 1922168404Spjd ASSERT(BUF_EMPTY(hdr)); 1923286570Smav ASSERT3P(hdr->b_freeze_cksum, ==, NULL); 1924168404Spjd hdr->b_size = size; 1925228103Smm hdr->b_spa = spa_load_guid(spa); 1926286570Smav 1927185029Spjd buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1928168404Spjd buf->b_hdr = hdr; 1929168404Spjd buf->b_data = NULL; 1930168404Spjd buf->b_efunc = NULL; 1931168404Spjd buf->b_private = NULL; 1932168404Spjd buf->b_next = NULL; 1933286570Smav 1934286570Smav hdr->b_flags = arc_bufc_to_flags(type); 1935286570Smav hdr->b_flags |= ARC_FLAG_HAS_L1HDR; 1936286570Smav 1937286570Smav hdr->b_l1hdr.b_buf = buf; 1938286570Smav hdr->b_l1hdr.b_state = arc_anon; 1939286570Smav hdr->b_l1hdr.b_arc_access = 0; 1940286570Smav hdr->b_l1hdr.b_datacnt = 1; 1941286570Smav 1942168404Spjd arc_get_data_buf(buf); 1943286570Smav 
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 1944286570Smav (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 1945168404Spjd 1946168404Spjd return (buf); 1947168404Spjd} 1948168404Spjd 1949209962Smmstatic char *arc_onloan_tag = "onloan"; 1950209962Smm 1951209962Smm/* 1952209962Smm * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 1953209962Smm * flight data by arc_tempreserve_space() until they are "returned". Loaned 1954209962Smm * buffers must be returned to the arc before they can be used by the DMU or 1955209962Smm * freed. 1956209962Smm */ 1957209962Smmarc_buf_t * 1958209962Smmarc_loan_buf(spa_t *spa, int size) 1959209962Smm{ 1960209962Smm arc_buf_t *buf; 1961209962Smm 1962209962Smm buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 1963209962Smm 1964209962Smm atomic_add_64(&arc_loaned_bytes, size); 1965209962Smm return (buf); 1966209962Smm} 1967209962Smm 1968209962Smm/* 1969209962Smm * Return a loaned arc buffer to the arc. 1970209962Smm */ 1971209962Smmvoid 1972209962Smmarc_return_buf(arc_buf_t *buf, void *tag) 1973209962Smm{ 1974209962Smm arc_buf_hdr_t *hdr = buf->b_hdr; 1975209962Smm 1976209962Smm ASSERT(buf->b_data != NULL); 1977286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1978286570Smav (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 1979286570Smav (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 1980209962Smm 1981209962Smm atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 1982209962Smm} 1983209962Smm 1984219089Spjd/* Detach an arc_buf from a dbuf (tag) */ 1985219089Spjdvoid 1986219089Spjdarc_loan_inuse_buf(arc_buf_t *buf, void *tag) 1987219089Spjd{ 1988286570Smav arc_buf_hdr_t *hdr = buf->b_hdr; 1989219089Spjd 1990219089Spjd ASSERT(buf->b_data != NULL); 1991286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1992286570Smav (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 1993286570Smav (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); 1994219089Spjd buf->b_efunc = NULL; 1995219089Spjd buf->b_private = 
NULL; 1996219089Spjd 1997219089Spjd atomic_add_64(&arc_loaned_bytes, hdr->b_size); 1998219089Spjd} 1999219089Spjd 2000168404Spjdstatic arc_buf_t * 2001168404Spjdarc_buf_clone(arc_buf_t *from) 2002168404Spjd{ 2003168404Spjd arc_buf_t *buf; 2004168404Spjd arc_buf_hdr_t *hdr = from->b_hdr; 2005168404Spjd uint64_t size = hdr->b_size; 2006168404Spjd 2007286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2008286570Smav ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2009219089Spjd 2010185029Spjd buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2011168404Spjd buf->b_hdr = hdr; 2012168404Spjd buf->b_data = NULL; 2013168404Spjd buf->b_efunc = NULL; 2014168404Spjd buf->b_private = NULL; 2015286570Smav buf->b_next = hdr->b_l1hdr.b_buf; 2016286570Smav hdr->b_l1hdr.b_buf = buf; 2017168404Spjd arc_get_data_buf(buf); 2018168404Spjd bcopy(from->b_data, buf->b_data, size); 2019242845Sdelphij 2020242845Sdelphij /* 2021242845Sdelphij * This buffer already exists in the arc so create a duplicate 2022242845Sdelphij * copy for the caller. If the buffer is associated with user data 2023242845Sdelphij * then track the size and number of duplicates. These stats will be 2024242845Sdelphij * updated as duplicate buffers are created and destroyed. 2025242845Sdelphij */ 2026286570Smav if (HDR_ISTYPE_DATA(hdr)) { 2027242845Sdelphij ARCSTAT_BUMP(arcstat_duplicate_buffers); 2028242845Sdelphij ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 2029242845Sdelphij } 2030286570Smav hdr->b_l1hdr.b_datacnt += 1; 2031168404Spjd return (buf); 2032168404Spjd} 2033168404Spjd 2034168404Spjdvoid 2035168404Spjdarc_buf_add_ref(arc_buf_t *buf, void* tag) 2036168404Spjd{ 2037168404Spjd arc_buf_hdr_t *hdr; 2038168404Spjd kmutex_t *hash_lock; 2039168404Spjd 2040168404Spjd /* 2041185029Spjd * Check to see if this buffer is evicted. Callers 2042185029Spjd * must verify b_data != NULL to know if the add_ref 2043185029Spjd * was successful. 
2044168404Spjd */ 2045219089Spjd mutex_enter(&buf->b_evict_lock); 2046185029Spjd if (buf->b_data == NULL) { 2047219089Spjd mutex_exit(&buf->b_evict_lock); 2048168404Spjd return; 2049168404Spjd } 2050219089Spjd hash_lock = HDR_LOCK(buf->b_hdr); 2051219089Spjd mutex_enter(hash_lock); 2052185029Spjd hdr = buf->b_hdr; 2053286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2054219089Spjd ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2055219089Spjd mutex_exit(&buf->b_evict_lock); 2056168404Spjd 2057286570Smav ASSERT(hdr->b_l1hdr.b_state == arc_mru || 2058286570Smav hdr->b_l1hdr.b_state == arc_mfu); 2059286570Smav 2060168404Spjd add_reference(hdr, hash_lock, tag); 2061208373Smm DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 2062168404Spjd arc_access(hdr, hash_lock); 2063168404Spjd mutex_exit(hash_lock); 2064168404Spjd ARCSTAT_BUMP(arcstat_hits); 2065286570Smav ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 2066286570Smav demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 2067168404Spjd data, metadata, hits); 2068168404Spjd} 2069168404Spjd 2070274172Savgstatic void 2071274172Savgarc_buf_free_on_write(void *data, size_t size, 2072274172Savg void (*free_func)(void *, size_t)) 2073274172Savg{ 2074274172Savg l2arc_data_free_t *df; 2075274172Savg 2076274172Savg df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); 2077274172Savg df->l2df_data = data; 2078274172Savg df->l2df_size = size; 2079274172Savg df->l2df_func = free_func; 2080274172Savg mutex_enter(&l2arc_free_on_write_mtx); 2081274172Savg list_insert_head(l2arc_free_on_write, df); 2082274172Savg mutex_exit(&l2arc_free_on_write_mtx); 2083274172Savg} 2084274172Savg 2085185029Spjd/* 2086185029Spjd * Free the arc data buffer. If it is an l2arc write in progress, 2087185029Spjd * the buffer is placed on l2arc_free_on_write to be freed later. 
 */
/*
 * Free buf->b_data through free_func.  If this header is currently being
 * written to the L2ARC we must not free the data out from under the write;
 * instead the free is deferred until the L2 write completes (see
 * arc_buf_free_on_write()).
 */
static void
arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	if (HDR_L2_WRITING(hdr)) {
		/* L2 write in flight: defer the free to write completion. */
		arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
		ARCSTAT_BUMP(arcstat_l2_free_on_write);
	} else {
		free_func(buf->b_data, hdr->b_size);
	}
}

/*
 * Free the temporary compressed-data buffer (b_tmp_cdata) attached to this
 * header, deferring the actual free until any in-flight L2ARC write has
 * completed.  The caller must hold the owning L2ARC device's l2ad_mtx.
 */
static void
arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
{
	ASSERT(HDR_HAS_L2HDR(hdr));
	ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));

	/*
	 * The b_tmp_cdata field is linked off of the b_l1hdr, so if
	 * that doesn't exist, the header is in the arc_l2c_only state,
	 * and there isn't anything to free (it's already been freed).
	 */
	if (!HDR_HAS_L1HDR(hdr))
		return;

	if (hdr->b_l1hdr.b_tmp_cdata == NULL)
		return;

	ASSERT(HDR_L2_WRITING(hdr));
	arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size,
	    zio_data_buf_free);

	ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
	hdr->b_l1hdr.b_tmp_cdata = NULL;
}

/*
 * Free up buf->b_data and, if 'remove' is set, pull the arc_buf_t off of
 * the arc_buf_hdr_t's list and free it.  If 'recycle' is set, the data
 * buffer itself is being stolen by the caller and is not freed here.
 */
static void
arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
{
	arc_buf_t **bufp;

	/* free up data associated with the buf */
	if (buf->b_data != NULL) {
		arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
		uint64_t size = buf->b_hdr->b_size;
		arc_buf_contents_t type = arc_buf_type(buf->b_hdr);

		arc_cksum_verify(buf);
#ifdef illumos
		arc_buf_unwatch(buf);
#endif

		if (!recycle) {
			if (type == ARC_BUFC_METADATA) {
				arc_buf_data_free(buf, zio_buf_free);
				arc_space_return(size, ARC_SPACE_META);
			} else {
				ASSERT(type == ARC_BUFC_DATA);
				arc_buf_data_free(buf, zio_data_buf_free);
				arc_space_return(size, ARC_SPACE_DATA);
			}
		}
		if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
			uint64_t *cnt = &state->arcs_lsize[type];

			ASSERT(refcount_is_zero(
			    &buf->b_hdr->b_l1hdr.b_refcnt));
			ASSERT(state != arc_anon && state != arc_l2c_only);

			ASSERT3U(*cnt, >=, size);
			atomic_add_64(cnt, -size);
		}
		ASSERT3U(state->arcs_size, >=, size);
		atomic_add_64(&state->arcs_size, -size);
		buf->b_data = NULL;

		/*
		 * If we're destroying a duplicate buffer make sure
		 * that the appropriate statistics are updated.
		 */
		if (buf->b_hdr->b_l1hdr.b_datacnt > 1 &&
		    HDR_ISTYPE_DATA(buf->b_hdr)) {
			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
		}
		ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0);
		buf->b_hdr->b_l1hdr.b_datacnt -= 1;
	}

	/* only remove the buf if requested */
	if (!remove)
		return;

	/* remove the buf from the hdr list */
	for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf;
	    bufp = &(*bufp)->b_next)
		continue;
	*bufp = buf->b_next;
	buf->b_next = NULL;

	ASSERT(buf->b_efunc == NULL);

	/* clean up the buf */
	buf->b_hdr = NULL;
	kmem_cache_free(buf_cache, buf);
}

/*
 * Tear down the L2ARC portion of 'hdr': unlink it from the device's buffer
 * list, free any deferred compressed data, and undo the L2ARC space and
 * statistics accounting.  The caller must hold the device's l2ad_mtx.
 */
static void
arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
{
	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
	l2arc_dev_t *dev = l2hdr->b_dev;

	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
	ASSERT(HDR_HAS_L2HDR(hdr));

	list_remove(&dev->l2ad_buflist, hdr);

	/*
	 * We don't want to leak the b_tmp_cdata buffer that was
	 * allocated in l2arc_write_buffers()
	 */
	arc_buf_l2_cdata_free(hdr);

	/*
	 * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then
	 * this header is being processed by l2arc_write_buffers() (i.e.
	 * it's in the first stage of l2arc_write_buffers()).
	 * Re-affirming that truth here, just to serve as a reminder. If
	 * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or
	 * may not have its HDR_L2_WRITING flag set. (the write may have
	 * completed, in which case HDR_L2_WRITING will be false and the
	 * b_daddr field will point to the address of the buffer on disk).
	 */
	IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr));

	/*
	 * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with
	 * l2arc_write_buffers(). Since we've just removed this header
	 * from the l2arc buffer list, this header will never reach the
	 * second stage of l2arc_write_buffers(), which increments the
	 * accounting stats for this header. Thus, we must be careful
	 * not to decrement them for this header either.
	 */
	if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) {
		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
		ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);

		vdev_space_update(dev->l2ad_vdev,
		    -l2hdr->b_asize, 0, 0);

		(void) refcount_remove_many(&dev->l2ad_alloc,
		    l2hdr->b_asize, hdr);
	}

	hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
}

/*
 * Destroy an arc_buf_hdr_t and everything still attached to it.  The
 * header must be anonymous, unreferenced, out of the hash table, and have
 * no I/O in progress.  Bufs that still carry an eviction callback are
 * parked on the global eviction list (for arc_do_user_evicts()) rather
 * than freed outright.
 */
static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
	if (HDR_HAS_L1HDR(hdr)) {
		ASSERT(hdr->b_l1hdr.b_buf == NULL ||
		    hdr->b_l1hdr.b_datacnt > 0);
		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
	}
	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
	ASSERT(!HDR_IN_HASH_TABLE(hdr));

	if (HDR_HAS_L2HDR(hdr)) {
		l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
		boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);

		if (!buflist_held)
			mutex_enter(&dev->l2ad_mtx);

		/*
		 * Even though we checked this conditional above, we
		 * need to check this again now that we have the
		 * l2ad_mtx. This is because we could be racing with
		 * another thread calling l2arc_evict() which might have
		 * destroyed this header's L2 portion as we were waiting
		 * to acquire the l2ad_mtx. If that happens, we don't
		 * want to re-destroy the header's L2 portion.
		 */
		if (HDR_HAS_L2HDR(hdr)) {
			/* TRIM the freed range on the cache device. */
			if (hdr->b_l2hdr.b_daddr != L2ARC_ADDR_UNSET)
				trim_map_free(dev->l2ad_vdev,
				    hdr->b_l2hdr.b_daddr,
				    hdr->b_l2hdr.b_asize, 0);
			arc_hdr_l2hdr_destroy(hdr);
		}

		if (!buflist_held)
			mutex_exit(&dev->l2ad_mtx);
	}

	if (!BUF_EMPTY(hdr))
		buf_discard_identity(hdr);
	if (hdr->b_freeze_cksum != NULL) {
		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
		hdr->b_freeze_cksum = NULL;
	}

	if (HDR_HAS_L1HDR(hdr)) {
		while (hdr->b_l1hdr.b_buf) {
			arc_buf_t *buf = hdr->b_l1hdr.b_buf;

			if (buf->b_efunc != NULL) {
				/*
				 * Buf has an eviction callback; hand it to
				 * the eviction list so the callback runs
				 * from arc_do_user_evicts().
				 */
				mutex_enter(&arc_eviction_mtx);
				mutex_enter(&buf->b_evict_lock);
				ASSERT(buf->b_hdr != NULL);
				arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
				    FALSE);
				hdr->b_l1hdr.b_buf = buf->b_next;
				buf->b_hdr = &arc_eviction_hdr;
				buf->b_next = arc_eviction_list;
				arc_eviction_list = buf;
				mutex_exit(&buf->b_evict_lock);
				mutex_exit(&arc_eviction_mtx);
			} else {
				arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
				    TRUE);
			}
		}
#ifdef ZFS_DEBUG
		if (hdr->b_l1hdr.b_thawed != NULL) {
			kmem_free(hdr->b_l1hdr.b_thawed, 1);
			hdr->b_l1hdr.b_thawed = NULL;
		}
#endif
	}

	ASSERT3P(hdr->b_hash_next, ==, NULL);
	/* Return the header to the cache matching its remaining contents. */
	if (HDR_HAS_L1HDR(hdr)) {
		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
		kmem_cache_free(hdr_full_cache, hdr);
	} else {
		kmem_cache_free(hdr_l2only_cache, hdr);
	}
}

/*
 * Release the caller's hold (identified by 'tag') on 'buf' and free the
 * buffer, or mark it available for reuse, as appropriate to the header's
 * state.
 */
void
arc_buf_free(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	int hashed = hdr->b_l1hdr.b_state != arc_anon;

	ASSERT(buf->b_efunc == NULL);
	ASSERT(buf->b_data != NULL);

	if (hashed) {
		kmutex_t *hash_lock = HDR_LOCK(hdr);

		mutex_enter(hash_lock);
		/* reload: hdr may have changed before we got the lock */
		hdr = buf->b_hdr;
		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));

		(void) remove_reference(hdr, hash_lock, tag);
		if (hdr->b_l1hdr.b_datacnt > 1) {
			arc_buf_destroy(buf, FALSE, TRUE);
		} else {
			ASSERT(buf == hdr->b_l1hdr.b_buf);
			ASSERT(buf->b_efunc == NULL);
			hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
		}
		mutex_exit(hash_lock);
	} else if (HDR_IO_IN_PROGRESS(hdr)) {
		int destroy_hdr;
		/*
		 * We are in the middle of an async write. Don't destroy
		 * this buffer unless the write completes before we finish
		 * decrementing the reference count.
		 */
		mutex_enter(&arc_eviction_mtx);
		(void) remove_reference(hdr, NULL, tag);
		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
		mutex_exit(&arc_eviction_mtx);
		if (destroy_hdr)
			arc_hdr_destroy(hdr);
	} else {
		if (remove_reference(hdr, NULL, tag) > 0)
			arc_buf_destroy(buf, FALSE, TRUE);
		else
			arc_hdr_destroy(hdr);
	}
}

/*
 * Drop the caller's reference on 'buf'.  Returns B_TRUE if the buffer had
 * no eviction callback registered (i.e. the caller's hold was the only
 * thing keeping it from being evicted/freed).
 */
boolean_t
arc_buf_remove_ref(arc_buf_t *buf, void* tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);
	boolean_t no_callback = (buf->b_efunc == NULL);

	if (hdr->b_l1hdr.b_state == arc_anon) {
		/* anonymous bufs are freed directly */
		ASSERT(hdr->b_l1hdr.b_datacnt == 1);
		arc_buf_free(buf, tag);
		return (no_callback);
	}

	mutex_enter(hash_lock);
	/* reload: hdr may have changed before we got the lock */
	hdr = buf->b_hdr;
	ASSERT(hdr->b_l1hdr.b_datacnt > 0);
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
	ASSERT(hdr->b_l1hdr.b_state != arc_anon);
	ASSERT(buf->b_data != NULL);

	(void) remove_reference(hdr, hash_lock, tag);
	if (hdr->b_l1hdr.b_datacnt > 1) {
		if (no_callback)
			arc_buf_destroy(buf, FALSE, TRUE);
	} else if (no_callback) {
		ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
		ASSERT(buf->b_efunc == NULL);
		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
	}
	ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 ||
	    refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
	mutex_exit(hash_lock);
	return
	    (no_callback);
}

/* Return the size in bytes of this buffer's data. */
int32_t
arc_buf_size(arc_buf_t *buf)
{
	return (buf->b_hdr->b_size);
}

/*
 * Called from the DMU to determine if the current buffer should be
 * evicted. In order to ensure proper locking, the eviction must be initiated
 * from the DMU. Return true if the buffer is associated with user data and
 * duplicate buffers still exist.
 */
boolean_t
arc_buf_eviction_needed(arc_buf_t *buf)
{
	arc_buf_hdr_t *hdr;
	boolean_t evict_needed = B_FALSE;

	if (zfs_disable_dup_eviction)
		return (B_FALSE);

	mutex_enter(&buf->b_evict_lock);
	hdr = buf->b_hdr;
	if (hdr == NULL) {
		/*
		 * We are in arc_do_user_evicts(); let that function
		 * perform the eviction.
		 */
		ASSERT(buf->b_data == NULL);
		mutex_exit(&buf->b_evict_lock);
		return (B_FALSE);
	} else if (buf->b_data == NULL) {
		/*
		 * We have already been added to the arc eviction list;
		 * recommend eviction.
		 */
		ASSERT3P(hdr, ==, &arc_eviction_hdr);
		mutex_exit(&buf->b_evict_lock);
		return (B_TRUE);
	}

	if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
		evict_needed = B_TRUE;

	mutex_exit(&buf->b_evict_lock);
	return (evict_needed);
}

/*
 * Evict buffers from list until we've removed the specified number of
 * bytes. Move the removed buffers to the appropriate evict state.
 * If the recycle flag is set, then attempt to "recycle" a buffer:
 * - look for a buffer to evict that is `bytes' long.
 * - return the data block from this buffer rather than freeing it.
 * This flag is used by callers that are trying to make space for a
 * new buffer in a full arc cache.
 *
 * This function makes a "best effort". It skips over any buffers
 * it can't get a hash_lock on, and so may not catch all candidates.
 * It may also return without evicting as much space as requested.
 */
static void *
arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
    arc_buf_contents_t type)
{
	arc_state_t *evicted_state;
	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
	arc_buf_hdr_t *hdr, *hdr_prev = NULL;
	kmutex_t *hash_lock;
	boolean_t have_lock;
	void *stolen = NULL;
	arc_buf_hdr_t marker = { 0 };	/* b_spa == 0 marks list markers */
	int count = 0;

	ASSERT(state == arc_mru || state == arc_mfu);

	/* evicted headers move to the corresponding ghost state */
	evicted_state = (state == arc_mru) ?
	    arc_mru_ghost : arc_mfu_ghost;

	/*
	 * The ghost list lock must be acquired first in order to prevent
	 * a 3 party deadlock:
	 *
	 * - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by
	 *   l2ad_mtx in arc_hdr_realloc
	 * - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx
	 * - arc_evict acquires arc_*_ghost->arcs_mtx, followed by
	 *   arc_*_ghost->arcs_mtx and forms a deadlock cycle.
	 *
	 * This situation is avoided by acquiring the ghost list lock first.
	 */
	mutex_enter(&evicted_state->arcs_mtx);
	mutex_enter(&state->arcs_mtx);

	/*
	 * Decide which "type" (data vs metadata) to recycle from.
	 *
	 * If we are over the metadata limit, recycle from metadata.
	 * If we are under the metadata minimum, recycle from data.
	 * Otherwise, recycle from whichever type has the oldest (least
	 * recently accessed) header.
	 */
	if (recycle) {
		arc_buf_hdr_t *data_hdr =
		    list_tail(&state->arcs_list[ARC_BUFC_DATA]);
		arc_buf_hdr_t *metadata_hdr =
		    list_tail(&state->arcs_list[ARC_BUFC_METADATA]);
		arc_buf_contents_t realtype;

		if (data_hdr == NULL) {
			realtype = ARC_BUFC_METADATA;
		} else if (metadata_hdr == NULL) {
			realtype = ARC_BUFC_DATA;
		} else if (arc_meta_used >= arc_meta_limit) {
			realtype = ARC_BUFC_METADATA;
		} else if (arc_meta_used <= arc_meta_min) {
			realtype = ARC_BUFC_DATA;
		} else if (HDR_HAS_L1HDR(data_hdr) &&
		    HDR_HAS_L1HDR(metadata_hdr) &&
		    data_hdr->b_l1hdr.b_arc_access <
		    metadata_hdr->b_l1hdr.b_arc_access) {
			realtype = ARC_BUFC_DATA;
		} else {
			realtype = ARC_BUFC_METADATA;
		}
		if (realtype != type) {
			/*
			 * If we want to evict from a different list,
			 * we can not recycle, because DATA vs METADATA
			 * buffers are segregated into different kmem
			 * caches (and vmem arenas).
			 */
			type = realtype;
			recycle = B_FALSE;
		}
	}

	list_t *list = &state->arcs_list[type];

	/* walk from the list tail (LRU end) toward the head */
	for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
		hdr_prev = list_prev(list, hdr);
		/* prefetch buffers have a minimum lifespan */
		if (HDR_IO_IN_PROGRESS(hdr) ||
		    (spa && hdr->b_spa != spa) ||
		    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
		    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
		    arc_min_prefetch_lifespan)) {
			skipped++;
			continue;
		}
		/* "lookahead" for better eviction candidate */
		if (recycle && hdr->b_size != bytes &&
		    hdr_prev && hdr_prev->b_size == bytes)
			continue;

		/* ignore markers */
		if (hdr->b_spa == 0)
			continue;

		/*
		 * It may take a long time to evict all the bufs requested.
		 * To avoid blocking all arc activity, periodically drop
		 * the arcs_mtx and give other threads a chance to run
		 * before reacquiring the lock.
		 *
		 * If we are looking for a buffer to recycle, we are in
		 * the hot code path, so don't sleep.
		 */
		if (!recycle && count++ > arc_evict_iterations) {
			/* marker keeps our place while the locks are dropped */
			list_insert_after(list, hdr, &marker);
			mutex_exit(&state->arcs_mtx);
			mutex_exit(&evicted_state->arcs_mtx);
			kpreempt(KPREEMPT_SYNC);
			mutex_enter(&evicted_state->arcs_mtx);
			mutex_enter(&state->arcs_mtx);
			hdr_prev = list_prev(list, &marker);
			list_remove(list, &marker);
			count = 0;
			continue;
		}

		hash_lock = HDR_LOCK(hdr);
		have_lock = MUTEX_HELD(hash_lock);
		if (have_lock || mutex_tryenter(hash_lock)) {
			ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
			ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
			while (hdr->b_l1hdr.b_buf) {
				arc_buf_t *buf = hdr->b_l1hdr.b_buf;
				if (!mutex_tryenter(&buf->b_evict_lock)) {
					missed += 1;
					break;
				}
				if (buf->b_data != NULL) {
					bytes_evicted += hdr->b_size;
					if (recycle &&
					    arc_buf_type(hdr) == type &&
					    hdr->b_size == bytes &&
					    !HDR_L2_WRITING(hdr)) {
						/* steal this data buffer */
						stolen = buf->b_data;
						recycle = FALSE;
					}
				}
				if (buf->b_efunc != NULL) {
					/* defer callback to user evict list */
					mutex_enter(&arc_eviction_mtx);
					arc_buf_destroy(buf,
					    buf->b_data == stolen, FALSE);
					hdr->b_l1hdr.b_buf = buf->b_next;
					buf->b_hdr = &arc_eviction_hdr;
					buf->b_next = arc_eviction_list;
					arc_eviction_list = buf;
					mutex_exit(&arc_eviction_mtx);
					mutex_exit(&buf->b_evict_lock);
				} else {
					mutex_exit(&buf->b_evict_lock);
					arc_buf_destroy(buf,
					    buf->b_data == stolen, TRUE);
				}
			}

			/* account for where this eviction leaves the data */
			if (HDR_HAS_L2HDR(hdr)) {
				ARCSTAT_INCR(arcstat_evict_l2_cached,
				    hdr->b_size);
			} else {
				if (l2arc_write_eligible(hdr->b_spa, hdr)) {
					ARCSTAT_INCR(arcstat_evict_l2_eligible,
					    hdr->b_size);
				} else {
					ARCSTAT_INCR(
					    arcstat_evict_l2_ineligible,
					    hdr->b_size);
				}
			}

			if (hdr->b_l1hdr.b_datacnt == 0) {
				/* all bufs gone: move hdr to ghost state */
				arc_change_state(evicted_state, hdr, hash_lock);
				ASSERT(HDR_IN_HASH_TABLE(hdr));
				hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
				hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
			}
			if (!have_lock)
				mutex_exit(hash_lock);
			if (bytes >= 0 && bytes_evicted >= bytes)
				break;
		} else {
			missed += 1;
		}
	}

	mutex_exit(&state->arcs_mtx);
	mutex_exit(&evicted_state->arcs_mtx);

	if (bytes_evicted < bytes)
		dprintf("only evicted %lld bytes from %x",
		    (longlong_t)bytes_evicted, state);

	if (skipped)
		ARCSTAT_INCR(arcstat_evict_skip, skipped);

	if (missed)
		ARCSTAT_INCR(arcstat_mutex_miss, missed);

	/*
	 * Note: we have just evicted some data into the ghost state,
	 * potentially putting the ghost size over the desired size. Rather
	 * that evicting from the ghost list in this hot code path, leave
	 * this chore to the arc_reclaim_thread().
	 */

	return (stolen);
}

/*
 * Remove buffers from list until we've removed the specified number of
 * bytes. Destroy the buffers that are removed.
 */
static void
arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
{
	arc_buf_hdr_t *hdr, *hdr_prev;
	arc_buf_hdr_t marker = { 0 };	/* b_spa == 0 marks list markers */
	list_t *list = &state->arcs_list[ARC_BUFC_DATA];
	kmutex_t *hash_lock;
	uint64_t bytes_deleted = 0;
	uint64_t bufs_skipped = 0;
	int count = 0;

	ASSERT(GHOST_STATE(state));
top:
	mutex_enter(&state->arcs_mtx);
	for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
		hdr_prev = list_prev(list, hdr);
		if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES)
			panic("invalid hdr=%p", (void *)hdr);
		if (spa && hdr->b_spa != spa)
			continue;

		/* ignore markers */
		if (hdr->b_spa == 0)
			continue;

		hash_lock = HDR_LOCK(hdr);
		/* caller may be trying to modify this buffer, skip it */
		if (MUTEX_HELD(hash_lock))
			continue;

		/*
		 * It may take a long time to evict all the bufs requested.
		 * To avoid blocking all arc activity, periodically drop
		 * the arcs_mtx and give other threads a chance to run
		 * before reacquiring the lock.
		 */
		if (count++ > arc_evict_iterations) {
			/* marker keeps our place while the lock is dropped */
			list_insert_after(list, hdr, &marker);
			mutex_exit(&state->arcs_mtx);
			kpreempt(KPREEMPT_SYNC);
			mutex_enter(&state->arcs_mtx);
			hdr_prev = list_prev(list, &marker);
			list_remove(list, &marker);
			count = 0;
			continue;
		}
		if (mutex_tryenter(hash_lock)) {
			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
			ASSERT(!HDR_HAS_L1HDR(hdr) ||
			    hdr->b_l1hdr.b_buf == NULL);
			ARCSTAT_BUMP(arcstat_deleted);
			bytes_deleted += hdr->b_size;

			if (HDR_HAS_L2HDR(hdr)) {
				/*
				 * This buffer is cached on the 2nd Level ARC;
				 * don't destroy the header.
				 */
				arc_change_state(arc_l2c_only, hdr, hash_lock);
				/*
				 * dropping from L1+L2 cached to L2-only,
				 * realloc to remove the L1 header.
				 */
				hdr = arc_hdr_realloc(hdr, hdr_full_cache,
				    hdr_l2only_cache);
				mutex_exit(hash_lock);
			} else {
				arc_change_state(arc_anon, hdr, hash_lock);
				mutex_exit(hash_lock);
				arc_hdr_destroy(hdr);
			}

			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
			if (bytes >= 0 && bytes_deleted >= bytes)
				break;
		} else if (bytes < 0) {
			/*
			 * Insert a list marker and then wait for the
			 * hash lock to become available. Once its
			 * available, restart from where we left off.
			 */
			list_insert_after(list, hdr, &marker);
			mutex_exit(&state->arcs_mtx);
			mutex_enter(hash_lock);
			mutex_exit(hash_lock);
			mutex_enter(&state->arcs_mtx);
			hdr_prev = list_prev(list, &marker);
			list_remove(list, &marker);
		} else {
			bufs_skipped += 1;
		}

	}
	mutex_exit(&state->arcs_mtx);

	/* after the data list, make a second pass over the metadata list */
	if (list == &state->arcs_list[ARC_BUFC_DATA] &&
	    (bytes < 0 || bytes_deleted < bytes)) {
		list = &state->arcs_list[ARC_BUFC_METADATA];
		goto top;
	}

	if (bufs_skipped) {
		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
		ASSERT(bytes >= 0);
	}

	if (bytes_deleted < bytes)
		dprintf("only deleted %lld bytes from %p",
		    (longlong_t)bytes_deleted, state);
}

/*
 * Adjust the sizes of the ARC lists (MRU, MFU and their ghost lists) back
 * toward their targets (arc_p / arc_c), evicting as needed.
 */
static void
arc_adjust(void)
{
	int64_t adjustment, delta;

	/*
	 * Adjust MRU size
	 */

	adjustment = MIN((int64_t)(arc_size - arc_c),
	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
	    arc_p));

	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
		adjustment -= delta;
	}

	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
		(void) arc_evict(arc_mru, 0, delta, FALSE,
		    ARC_BUFC_METADATA);
	}

	/*
	 * Adjust MFU size
	 */

	adjustment = arc_size - arc_c;

	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
		adjustment -= delta;
	}

	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
		int64_t delta = MIN(adjustment,
		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
		(void) arc_evict(arc_mfu, 0, delta, FALSE,
		    ARC_BUFC_METADATA);
	}

	/*
	 * Adjust ghost lists
	 */

	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;

	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
		arc_evict_ghost(arc_mru_ghost, 0, delta);
	}

	adjustment =
	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;

	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
		arc_evict_ghost(arc_mfu_ghost, 0, delta);
	}
}

/*
 * Run the pending user eviction callbacks queued on arc_eviction_list.
 * The eviction mutex is dropped around each callback invocation.
 */
static void
arc_do_user_evicts(void)
{
	mutex_enter(&arc_eviction_mtx);
	while (arc_eviction_list != NULL) {
		arc_buf_t *buf = arc_eviction_list;
		arc_eviction_list = buf->b_next;
		mutex_enter(&buf->b_evict_lock);
		buf->b_hdr = NULL;
		mutex_exit(&buf->b_evict_lock);
		/* drop the list lock while running the callback */
		mutex_exit(&arc_eviction_mtx);

		if (buf->b_efunc != NULL)
			VERIFY0(buf->b_efunc(buf->b_private));

		buf->b_efunc = NULL;
		buf->b_private = NULL;
		kmem_cache_free(buf_cache, buf);
		mutex_enter(&arc_eviction_mtx);
	}
	mutex_exit(&arc_eviction_mtx);
}

/*
 * Flush all *evictable* data from the cache for the given spa.
 * NOTE: this will not touch "active" (i.e. referenced) data.
 */
void
arc_flush(spa_t *spa)
{
	uint64_t guid = 0;

	if (spa != NULL)
		guid = spa_load_guid(spa);

	/*
	 * With a NULL spa we loop until each list is drained; with a
	 * specific spa a single best-effort pass per list suffices.
	 */
	while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
		if (spa != NULL)
			break;
	}
	while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
		if (spa != NULL)
			break;
	}
	while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
		if (spa != NULL)
			break;
	}
	while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
		if (spa != NULL)
			break;
	}

	arc_evict_ghost(arc_mru_ghost, guid, -1);
	arc_evict_ghost(arc_mfu_ghost, guid, -1);

	mutex_enter(&arc_reclaim_thr_lock);
	arc_do_user_evicts();
	mutex_exit(&arc_reclaim_thr_lock);
	ASSERT(spa || arc_eviction_list == NULL);
}

/*
 * Reduce the ARC target size (arc_c) by 'to_free' bytes, never below
 * arc_c_min, and evict down to the new target if the cache is now over it.
 */
void
arc_shrink(int64_t to_free)
{
	if (arc_c > arc_c_min) {
		DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
			arc_c_min, uint64_t, arc_p, uint64_t, to_free);
		if (arc_c > arc_c_min + to_free)
			atomic_add_64(&arc_c, -to_free);
		else
			arc_c = arc_c_min;

		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
		if (arc_c > arc_size)
			arc_c = MAX(arc_size, arc_c_min);
		if (arc_p > arc_c)
			arc_p = (arc_c >> 1);

		DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
			arc_p);

		ASSERT(arc_c >= arc_c_min);
		ASSERT((int64_t)arc_p >= 0);
	}

	if (arc_size > arc_c) {
		DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
		    uint64_t, arc_c);
		arc_adjust();
	}
}

/* Pages the pagedaemon has asked us to free (see arc_available_memory()). */
static long needfree = 0;

/*
 * Reason codes recorded by arc_available_memory() identifying which
 * resource is currently the limiting factor on free memory.
 */
typedef enum free_memory_reason_t {
	FMR_UNKNOWN,
	FMR_NEEDFREE,
	FMR_LOTSFREE,
	FMR_SWAPFS_MINFREE,
	FMR_PAGES_PP_MAXIMUM,
	FMR_HEAP_ARENA,
	FMR_ZIO_ARENA,
	FMR_ZIO_FRAG,
} free_memory_reason_t;

/* Most recent value and reason computed by arc_available_memory(). */
int64_t last_free_memory;
free_memory_reason_t last_free_reason;

/*
 * Additional reserve of pages for pp_reserve.
 */
int64_t arc_pages_pp_reserve = 64;

/*
 * Additional reserve of pages for swapfs.
 */
int64_t arc_swapfs_reserve = 64;

/*
 * Return the amount of memory that can be consumed before reclaim will be
 * needed.  Positive if there is sufficient free memory, negative indicates
 * the amount of memory that needs to be freed up.  The lowest (most
 * constrained) of the per-limit estimates wins; the winning reason is
 * recorded in last_free_reason.
 */
static int64_t
arc_available_memory(void)
{
	int64_t lowest = INT64_MAX;
	int64_t n;
	free_memory_reason_t r = FMR_UNKNOWN;

#ifdef _KERNEL
	/* Honor an explicit request from the VM to free pages. */
	if (needfree > 0) {
		n = PAGESIZE * (-needfree);
		if (n < lowest) {
			lowest = n;
			r = FMR_NEEDFREE;
		}
	}

	/*
	 * Cooperate with pagedaemon when it's time for it to scan
	 * and reclaim some pages.
	 */
	n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
	if (n < lowest) {
		lowest = n;
		r = FMR_LOTSFREE;
	}

#ifdef illumos
	/*
	 * check that we're out of range of the pageout scanner.  It starts to
	 * schedule paging if freemem is less than lotsfree and needfree.
	 * lotsfree is the high-water mark for pageout, and needfree is the
	 * number of needed free pages.  We add extra pages here to make sure
	 * the scanner doesn't start up while we're freeing memory.
	 */
	n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
	if (n < lowest) {
		lowest = n;
		r = FMR_LOTSFREE;
	}

	/*
	 * check to make sure that swapfs has enough space so that anon
	 * reservations can still succeed. anon_resvmem() checks that the
	 * availrmem is greater than swapfs_minfree, and the number of reserved
	 * swap pages.  We also add a bit of extra here just to prevent
	 * circumstances from getting really dire.
	 */
	n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
	    desfree - arc_swapfs_reserve);
	if (n < lowest) {
		lowest = n;
		r = FMR_SWAPFS_MINFREE;
	}


	/*
	 * Check that we have enough availrmem that memory locking (e.g., via
	 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
	 * stores the number of pages that cannot be locked; when availrmem
	 * drops below pages_pp_maximum, page locking mechanisms such as
	 * page_pp_lock() will fail.)
	 */
	n = PAGESIZE * (availrmem - pages_pp_maximum -
	    arc_pages_pp_reserve);
	if (n < lowest) {
		lowest = n;
		r = FMR_PAGES_PP_MAXIMUM;
	}

#endif	/* illumos */
#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
	/*
	 * If we're on an i386 platform, it's possible that we'll exhaust the
	 * kernel heap space before we ever run out of available physical
	 * memory.  Most checks of the size of the heap_area compare against
	 * tune.t_minarmem, which is the minimum available real memory that we
	 * can have in the system.  However, this is generally fixed at 25 pages
	 * which is so low that it's useless.  In this comparison, we seek to
	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
	 * free)
	 */
	n = (int64_t)vmem_size(heap_arena, VMEM_FREE) -
	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
	if (n < lowest) {
		lowest = n;
		r = FMR_HEAP_ARENA;
	}
	/*
	 * Alias zio_arena for the shared checks below: no separate zio
	 * arena when the heap check above applies, otherwise use the
	 * heap arena itself.
	 */
#define	zio_arena	NULL
#else
#define	zio_arena	heap_arena
#endif

	/*
	 * If zio data pages are being allocated out of a separate heap segment,
	 * then enforce that the size of available vmem for this arena remains
	 * above about 1/16th free.
	 *
	 * Note: The 1/16th arena free requirement was put in place
	 * to aggressively evict memory from the arc in order to avoid
	 * memory fragmentation issues.
	 */
	if (zio_arena != NULL) {
		n = (int64_t)vmem_size(zio_arena, VMEM_FREE) -
		    (vmem_size(zio_arena, VMEM_ALLOC) >> 4);
		if (n < lowest) {
			lowest = n;
			r = FMR_ZIO_ARENA;
		}
	}

	/*
	 * Above limits know nothing about real level of KVA fragmentation.
	 * Start aggressive reclamation if too little sequential KVA left.
	 * (Only checked when no other limit has already gone negative.)
	 */
	if (lowest > 0) {
		n = (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) ?
		    -((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) :
		    INT64_MAX;
		if (n < lowest) {
			lowest = n;
			r = FMR_ZIO_FRAG;
		}
	}

#else	/* _KERNEL */
	/* Every 100 calls, free a small amount */
	if (spa_get_random(100) == 0)
		lowest = -1024;
#endif	/* _KERNEL */

	last_free_memory = lowest;
	last_free_reason = r;
	DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
	return (lowest);
}


/*
 * Determine if the system is under memory pressure and is asking
 * to reclaim memory. A return value of TRUE indicates that the system
 * is under memory pressure and that the arc should adjust accordingly.
 */
static boolean_t
arc_reclaim_needed(void)
{
	return (arc_available_memory() < 0);
}

extern kmem_cache_t	*zio_buf_cache[];
extern kmem_cache_t	*zio_data_buf_cache[];
extern kmem_cache_t	*range_seg_cache;

/*
 * Reap unused memory from every kmem cache the ARC and zio layers use.
 */
static __noinline void
arc_kmem_reap_now(void)
{
	size_t			i;
	kmem_cache_t		*prev_cache = NULL;
	kmem_cache_t		*prev_data_cache = NULL;

	DTRACE_PROBE(arc__kmem_reap_start);
#ifdef _KERNEL
	if (arc_meta_used >= arc_meta_limit) {
		/*
		 * We are exceeding our meta-data cache limit.
		 * Purge some DNLC entries to release holds on meta-data.
		 */
		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
	}
#if defined(__i386)
	/*
	 * Reclaim unused memory from all kmem caches.
	 */
	kmem_reap();
#endif
#endif

	/*
	 * Multiple block sizes may share one cache, so only reap each
	 * distinct cache once (hence the prev_cache/prev_data_cache
	 * deduplication).
	 */
	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
		if (zio_buf_cache[i] != prev_cache) {
			prev_cache = zio_buf_cache[i];
			kmem_cache_reap_now(zio_buf_cache[i]);
		}
		if (zio_data_buf_cache[i] != prev_data_cache) {
			prev_data_cache = zio_data_buf_cache[i];
			kmem_cache_reap_now(zio_data_buf_cache[i]);
		}
	}
	kmem_cache_reap_now(buf_cache);
	kmem_cache_reap_now(hdr_full_cache);
	kmem_cache_reap_now(hdr_l2only_cache);
	kmem_cache_reap_now(range_seg_cache);

#ifdef illumos
	if (zio_arena != NULL) {
		/*
		 * Ask the vmem arena to reclaim unused memory from its
		 * quantum caches.
		 */
		vmem_qcache_reap(zio_arena);
	}
#endif
	DTRACE_PROBE(arc__kmem_reap_end);
}

/*
 * ARC reclaim thread.  Once a second (or when signalled via
 * arc_reclaim_thr_cv) it evaluates memory pressure; under pressure it
 * disables ARC growth, reaps the kmem caches and shrinks the ARC, then
 * runs arc_adjust() and any pending user evictions.
 */
static void
arc_reclaim_thread(void *dummy __unused)
{
	clock_t			growtime = 0;
	callb_cpr_t		cpr;

	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&arc_reclaim_thr_lock);
	while (arc_thread_exit == 0) {
		int64_t free_memory = arc_available_memory();
		if (free_memory < 0) {

			arc_no_grow = B_TRUE;
			arc_warm = B_TRUE;

			/*
			 * Wait at least zfs_grow_retry (default 60) seconds
			 * before considering growing.
			 */
			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);

			arc_kmem_reap_now();

			/*
			 * If we are still low on memory, shrink the ARC
			 * so that we have arc_shrink_min free space.
			 */
			free_memory = arc_available_memory();

			int64_t to_free =
			    (arc_c >> arc_shrink_shift) - free_memory;
			if (to_free > 0) {
#ifdef _KERNEL
				/* Free at least what the VM asked for. */
				to_free = MAX(to_free, ptob(needfree));
#endif
				arc_shrink(to_free);
			}
		} else if (free_memory < arc_c >> arc_no_grow_shift) {
			arc_no_grow = B_TRUE;
		} else if (ddi_get_lbolt() >= growtime) {
			arc_no_grow = B_FALSE;
		}

		arc_adjust();

		if (arc_eviction_list != NULL)
			arc_do_user_evicts();

#ifdef _KERNEL
		/* Acknowledge the VM's free request and wake any waiters. */
		if (needfree) {
			needfree = 0;
			wakeup(&needfree);
		}
#endif

		/*
		 * This is necessary in order for the mdb ::arc dcmd to
		 * show up to date information. Since the ::arc command
		 * does not call the kstat's update function, without
		 * this call, the command may show stale stats for the
		 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
		 * with this change, the data might be up to 1 second
		 * out of date; but that should suffice. The arc_state_t
		 * structures can be queried directly if more accurate
		 * information is needed.
		 */
		if (arc_ksp != NULL)
			arc_ksp->ks_update(arc_ksp, KSTAT_READ);

		/* block until needed, or one second, whichever is shorter */
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&arc_reclaim_thr_cv,
		    &arc_reclaim_thr_lock, hz);
		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
	}

	arc_thread_exit = 0;
	cv_broadcast(&arc_reclaim_thr_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
	thread_exit();
}

/*
 * Adapt arc info given the number of bytes we are trying to add and
 * the state that we are comming from. This function is only called
 * when we are adding new content to the cache.
 */
static void
arc_adapt(int bytes, arc_state_t *state)
{
	int mult;
	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);

	if (state == arc_l2c_only)
		return;

	ASSERT(bytes > 0);
	/*
	 * Adapt the target size of the MRU list:
	 * - if we just hit in the MRU ghost list, then increase
	 *   the target size of the MRU list.
	 * - if we just hit in the MFU ghost list, then increase
	 *   the target size of the MFU list by decreasing the
	 *   target size of the MRU list.
	 */
	if (state == arc_mru_ghost) {
		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */

		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
	} else if (state == arc_mfu_ghost) {
		uint64_t delta;

		/* Ghost-list size ratio scales the shift, capped at 10x. */
		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
		mult = MIN(mult, 10);

		delta = MIN(bytes * mult, arc_p);
		arc_p = MAX(arc_p_min, arc_p - delta);
	}
	ASSERT((int64_t)arc_p >= 0);

	/* Under memory pressure, kick the reclaim thread instead of growing. */
	if (arc_reclaim_needed()) {
		cv_signal(&arc_reclaim_thr_cv);
		return;
	}

	if (arc_no_grow)
		return;

	if (arc_c >= arc_c_max)
		return;

	/*
	 * If we're within (2 * maxblocksize) bytes of the target
	 * cache size, increment the target cache size
	 */
	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
		DTRACE_PROBE1(arc__inc_adapt, int, bytes);
		atomic_add_64(&arc_c, (int64_t)bytes);
		if (arc_c > arc_c_max)
			arc_c = arc_c_max;
		else if (state == arc_anon)
			atomic_add_64(&arc_p, (int64_t)bytes);
		if (arc_p > arc_c)
			arc_p = arc_c;
	}
	ASSERT((int64_t)arc_p >= 0);
}

/*
 * Check if the cache has reached its limits and eviction is required
 * prior to insert.  Returns nonzero when eviction is needed.
 */
static int
arc_evict_needed(arc_buf_contents_t type)
{
	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
		return (1);

	if (arc_reclaim_needed())
		return (1);

	return (arc_size > arc_c);
}

/*
 * The buffer, supplied as the first argument, needs a data block.
 * So, if we are at cache max, determine which cache should be victimized.
 * We have the following cases:
 *
 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
 * In this situation if we're out of space, but the resident size of the MFU is
 * under the limit, victimize the MFU cache to satisfy this insertion request.
 *
 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
 * Here, we've used up all of the available space for the MRU, so we need to
 * evict from our own cache instead.  Evict from the set of resident MRU
 * entries.
 *
 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
 * c minus p represents the MFU space in the cache, since p is the size of the
 * cache that is dedicated to the MRU.  In this situation there's still space on
 * the MFU side, so the MRU side needs to be victimized.
 *
 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
 * MFU's resident set is consuming more space than it has been allotted.  In
 * this situation, we must victimize our own cache, the MFU, for this insertion.
 */
static void
arc_get_data_buf(arc_buf_t *buf)
{
	arc_state_t		*state = buf->b_hdr->b_l1hdr.b_state;
	uint64_t		size = buf->b_hdr->b_size;
	arc_buf_contents_t	type = arc_buf_type(buf->b_hdr);

	arc_adapt(size, state);

	/*
	 * We have not yet reached cache maximum size,
	 * just allocate a new buffer.
	 */
	if (!arc_evict_needed(type)) {
		if (type == ARC_BUFC_METADATA) {
			buf->b_data = zio_buf_alloc(size);
			arc_space_consume(size, ARC_SPACE_META);
		} else {
			ASSERT(type == ARC_BUFC_DATA);
			buf->b_data = zio_data_buf_alloc(size);
			arc_space_consume(size, ARC_SPACE_DATA);
		}
		goto out;
	}

	/*
	 * If we are prefetching from the mfu ghost list, this buffer
	 * will end up on the mru list; so steal space from there.
	 */
	if (state == arc_mfu_ghost)
		state = HDR_PREFETCH(buf->b_hdr) ? arc_mru : arc_mfu;
	else if (state == arc_mru_ghost)
		state = arc_mru;

	if (state == arc_mru || state == arc_anon) {
		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
		state = (arc_mfu->arcs_lsize[type] >= size &&
		    arc_p > mru_used) ? arc_mfu : arc_mru;
	} else {
		/* MFU cases */
		uint64_t mfu_space = arc_c - arc_p;
		state = (arc_mru->arcs_lsize[type] >= size &&
		    mfu_space > arc_mfu->arcs_size) ?
		    arc_mru : arc_mfu;
	}
	/*
	 * Try to recycle an evicted buffer of the right size; on failure
	 * (arc_evict() returned NULL) fall back to a fresh allocation.
	 */
	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
		if (type == ARC_BUFC_METADATA) {
			buf->b_data = zio_buf_alloc(size);
			arc_space_consume(size, ARC_SPACE_META);
		} else {
			ASSERT(type == ARC_BUFC_DATA);
			buf->b_data = zio_data_buf_alloc(size);
			arc_space_consume(size, ARC_SPACE_DATA);
		}
		ARCSTAT_BUMP(arcstat_recycle_miss);
	}
	ASSERT(buf->b_data != NULL);
out:
	/*
	 * Update the state size.  Note that ghost states have a
	 * "ghost size" and so don't need to be updated.
	 */
	if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
		arc_buf_hdr_t *hdr = buf->b_hdr;

		atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size);
		if (list_link_active(&hdr->b_l1hdr.b_arc_node)) {
			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
			atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type],
			    size);
		}
		/*
		 * If we are growing the cache, and we are adding anonymous
		 * data, and we have outgrown arc_p, update arc_p
		 */
		if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
			arc_p = MIN(arc_c, arc_p + size);
	}
	ARCSTAT_BUMP(arcstat_allocated);
}

/*
 * This routine is called whenever a buffer is accessed.
 * NOTE: the hash lock is dropped in this function.
 */
static void
arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
{
	clock_t now;

	ASSERT(MUTEX_HELD(hash_lock));
	ASSERT(HDR_HAS_L1HDR(hdr));

	if (hdr->b_l1hdr.b_state == arc_anon) {
		/*
		 * This buffer is not in the cache, and does not
		 * appear in our "ghost" list.  Add the new buffer
		 * to the MRU state.
		 */

		ASSERT0(hdr->b_l1hdr.b_arc_access);
		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
		arc_change_state(arc_mru, hdr, hash_lock);

	} else if (hdr->b_l1hdr.b_state == arc_mru) {
		now = ddi_get_lbolt();

		/*
		 * If this buffer is here because of a prefetch, then either:
		 * - clear the flag if this is a "referencing" read
		 *   (any subsequent access will bump this into the MFU state).
		 * or
		 * - move the buffer to the head of the list if this is
		 *   another prefetch (to make it less likely to be evicted).
		 */
		if (HDR_PREFETCH(hdr)) {
			if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
				ASSERT(list_link_active(
				    &hdr->b_l1hdr.b_arc_node));
			} else {
				hdr->b_flags &= ~ARC_FLAG_PREFETCH;
				ARCSTAT_BUMP(arcstat_mru_hits);
			}
			hdr->b_l1hdr.b_arc_access = now;
			return;
		}

		/*
		 * This buffer has been "accessed" only once so far,
		 * but it is still in the cache. Move it to the MFU
		 * state.
		 */
		if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
			/*
			 * More than 125ms have passed since we
			 * instantiated this buffer.  Move it to the
			 * most frequently used state.
			 */
			hdr->b_l1hdr.b_arc_access = now;
			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
			arc_change_state(arc_mfu, hdr, hash_lock);
		}
		ARCSTAT_BUMP(arcstat_mru_hits);
	} else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
		arc_state_t	*new_state;
		/*
		 * This buffer has been "accessed" recently, but
		 * was evicted from the cache.  Move it to the
		 * MFU state.
		 */

		if (HDR_PREFETCH(hdr)) {
			new_state = arc_mru;
			if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
				hdr->b_flags &= ~ARC_FLAG_PREFETCH;
			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
		} else {
			new_state = arc_mfu;
			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
		}

		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
		arc_change_state(new_state, hdr, hash_lock);

		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
	} else if (hdr->b_l1hdr.b_state == arc_mfu) {
		/*
		 * This buffer has been accessed more than once and is
		 * still in the cache.  Keep it in the MFU state.
		 *
		 * NOTE: an add_reference() that occurred when we did
		 * the arc_read() will have kicked this off the list.
		 * If it was a prefetch, we will explicitly move it to
		 * the head of the list now.
		 */
		if ((HDR_PREFETCH(hdr)) != 0) {
			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
			ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
		}
		ARCSTAT_BUMP(arcstat_mfu_hits);
		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
	} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
		arc_state_t	*new_state = arc_mfu;
		/*
		 * This buffer has been accessed more than once but has
		 * been evicted from the cache.  Move it back to the
		 * MFU state.
		 */

		if (HDR_PREFETCH(hdr)) {
			/*
			 * This is a prefetch access...
			 * move this block back to the MRU state.
			 */
			ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
			new_state = arc_mru;
		}

		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
		arc_change_state(new_state, hdr, hash_lock);

		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
	} else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
		/*
		 * This buffer is on the 2nd Level ARC.
		 */

		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
		arc_change_state(arc_mfu, hdr, hash_lock);
	} else {
		ASSERT(!"invalid arc state");
	}
}

/* a generic arc_done_func_t which you can use */
/* ARGSUSED */
void
arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
	/* Copy the data out to the caller's buffer (arg), then drop our ref. */
	if (zio == NULL || zio->io_error == 0)
		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
	VERIFY(arc_buf_remove_ref(buf, arg));
}

/* a generic arc_done_func_t */
void
arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
	/* Hand the arc_buf_t to the caller via *arg, or NULL on I/O error. */
	arc_buf_t **bufp = arg;
	if (zio && zio->io_error) {
		VERIFY(arc_buf_remove_ref(buf, arg));
		*bufp = NULL;
	} else {
		*bufp = buf;
		ASSERT(buf->b_data);
	}
}

/*
 * zio completion callback for an ARC read: byteswap the data if needed,
 * compute the checksum, hand (cloned) buffers to every registered
 * callback, and tear the header down if the block was freed mid-read
 * or the read failed.
 */
static void
arc_read_done(zio_t *zio)
{
	arc_buf_hdr_t	*hdr;
	arc_buf_t	*buf;
	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
	kmutex_t	*hash_lock = NULL;
	arc_callback_t	*callback_list, *acb;
	int		freeable = FALSE;

	buf = zio->io_private;
	hdr = buf->b_hdr;

	/*
	 * The hdr was inserted into hash-table and removed from lists
	 * prior to starting I/O.  We should find this header, since
	 * it's in the hash table, and it should be legit since it's
	 * not possible to evict it during the I/O.  The only possible
	 * reason for it not to be found is if we were freed during the
	 * read.
	 */
	if (HDR_IN_HASH_TABLE(hdr)) {
		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
		ASSERT3U(hdr->b_dva.dva_word[0], ==,
		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
		ASSERT3U(hdr->b_dva.dva_word[1], ==,
		    BP_IDENTITY(zio->io_bp)->dva_word[1]);

		arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
		    &hash_lock);

		ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
		    hash_lock == NULL) ||
		    (found == hdr &&
		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
		    (found == hdr && HDR_L2_READING(hdr)));
	}

	hdr->b_flags &= ~ARC_FLAG_L2_EVICTED;
	if (l2arc_noprefetch && HDR_PREFETCH(hdr))
		hdr->b_flags &= ~ARC_FLAG_L2CACHE;

	/* byteswap if necessary */
	callback_list = hdr->b_l1hdr.b_acb;
	ASSERT(callback_list != NULL);
	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
		dmu_object_byteswap_t bswap =
		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
		    byteswap_uint64_array :
		    dmu_ot_byteswap[bswap].ob_func;
		func(buf->b_data, hdr->b_size);
	}

	arc_cksum_compute(buf, B_FALSE);
#ifdef illumos
	arc_buf_watch(buf);
#endif

	if (hash_lock && zio->io_error == 0 &&
	    hdr->b_l1hdr.b_state == arc_anon) {
		/*
		 * Only call arc_access on anonymous buffers.  This is because
		 * if we've issued an I/O for an evicted buffer, we've already
		 * called arc_access (to prevent any simultaneous readers from
		 * getting confused).
		 */
		arc_access(hdr, hash_lock);
	}

	/* create copies of the data buffer for the callers */
	abuf = buf;
	for (acb = callback_list; acb; acb = acb->acb_next) {
		if (acb->acb_done) {
			if (abuf == NULL) {
				ARCSTAT_BUMP(arcstat_duplicate_reads);
				abuf = arc_buf_clone(buf);
			}
			acb->acb_buf = abuf;
			abuf = NULL;
		}
	}
	hdr->b_l1hdr.b_acb = NULL;
	hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
	ASSERT(!HDR_BUF_AVAILABLE(hdr));
	if (abuf == buf) {
		/* No callback consumed the original buffer; keep it cached. */
		ASSERT(buf->b_efunc == NULL);
		ASSERT(hdr->b_l1hdr.b_datacnt == 1);
		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
	}

	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
	    callback_list != NULL);

	if (zio->io_error != 0) {
		hdr->b_flags |= ARC_FLAG_IO_ERROR;
		if (hdr->b_l1hdr.b_state != arc_anon)
			arc_change_state(arc_anon, hdr, hash_lock);
		if (HDR_IN_HASH_TABLE(hdr))
			buf_hash_remove(hdr);
		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
	}

	/*
	 * Broadcast before we drop the hash_lock to avoid the possibility
	 * that the hdr (and hence the cv) might be freed before we get to
	 * the cv_broadcast().
	 */
	cv_broadcast(&hdr->b_l1hdr.b_cv);

	if (hash_lock != NULL) {
		mutex_exit(hash_lock);
	} else {
		/*
		 * This block was freed while we waited for the read to
		 * complete.  It has been removed from the hash table and
		 * moved to the anonymous state (so that it won't show up
		 * in the cache).
		 */
		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
	}

	/* execute each callback and free its structure */
	while ((acb = callback_list) != NULL) {
		if (acb->acb_done)
			acb->acb_done(zio, acb->acb_buf, acb->acb_private);

		if (acb->acb_zio_dummy != NULL) {
			acb->acb_zio_dummy->io_error = zio->io_error;
			zio_nowait(acb->acb_zio_dummy);
		}

		callback_list = acb->acb_next;
		kmem_free(acb, sizeof (arc_callback_t));
	}

	if (freeable)
		arc_hdr_destroy(hdr);
}

/*
 * "Read" the block at the specified DVA (in bp) via the
 * cache.  If the block is found in the cache, invoke the provided
 * callback immediately and return.  Note that the `zio' parameter
 * in the callback will be NULL in this case, since no IO was
 * required.  If the block is not in the cache pass the read request
 * on to the spa with a substitute callback function, so that the
 * requested block will be added to the cache.
 *
 * If a read request arrives for a block that has a read in-progress,
 * either wait for the in-progress read to complete (and return the
 * results); or, if this is a read with a "done" func, add a record
 * to the read to invoke the "done" func when the read completes,
 * and return; or just return.
 *
 * arc_read_done() will invoke all the requested "done" functions
 * for readers of this block.
 */
int
arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
    void *private, zio_priority_t priority, int zio_flags,
    arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
{
	arc_buf_hdr_t *hdr = NULL;
	arc_buf_t *buf = NULL;
	kmutex_t *hash_lock = NULL;
	zio_t *rzio;
	uint64_t guid = spa_load_guid(spa);

	ASSERT(!BP_IS_EMBEDDED(bp) ||
	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);

top:
	if (!BP_IS_EMBEDDED(bp)) {
		/*
		 * Embedded BP's have no DVA and require no I/O to "read".
		 * Create an anonymous arc buf to back it.
		 *
		 * NOTE(review): the comment above describes the embedded
		 * case, which is handled when this lookup is skipped; the
		 * hash lookup below only runs for non-embedded BPs.
		 */
		hdr = buf_hash_find(guid, bp, &hash_lock);
	}

	if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) {

		*arc_flags |= ARC_FLAG_CACHED;

		if (HDR_IO_IN_PROGRESS(hdr)) {

			if (*arc_flags & ARC_FLAG_WAIT) {
				cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
				mutex_exit(hash_lock);
				goto top;
			}
			ASSERT(*arc_flags & ARC_FLAG_NOWAIT);

			if (done) {
				arc_callback_t	*acb = NULL;

				acb = kmem_zalloc(sizeof (arc_callback_t),
				    KM_SLEEP);
				acb->acb_done = done;
				acb->acb_private = private;
				if (pio != NULL)
					acb->acb_zio_dummy = zio_null(pio,
					    spa, NULL, NULL, NULL, zio_flags);

				ASSERT(acb->acb_done != NULL);
				acb->acb_next = hdr->b_l1hdr.b_acb;
				hdr->b_l1hdr.b_acb = acb;
				add_reference(hdr, hash_lock, private);
				mutex_exit(hash_lock);
				return (0);
			}
			mutex_exit(hash_lock);
			return (0);
		}

		ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
		    hdr->b_l1hdr.b_state == arc_mfu);

		if (done) {
			add_reference(hdr, hash_lock, private);
			/*
			 * If this block is already in use, create a new
			 * copy of the data so that we will be guaranteed
			 * that arc_release() will always succeed.
3860168404Spjd */ 3861286570Smav buf = hdr->b_l1hdr.b_buf; 3862168404Spjd ASSERT(buf); 3863168404Spjd ASSERT(buf->b_data); 3864168404Spjd if (HDR_BUF_AVAILABLE(hdr)) { 3865168404Spjd ASSERT(buf->b_efunc == NULL); 3866275811Sdelphij hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 3867168404Spjd } else { 3868168404Spjd buf = arc_buf_clone(buf); 3869168404Spjd } 3870219089Spjd 3871275811Sdelphij } else if (*arc_flags & ARC_FLAG_PREFETCH && 3872286570Smav refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 3873275811Sdelphij hdr->b_flags |= ARC_FLAG_PREFETCH; 3874168404Spjd } 3875168404Spjd DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 3876168404Spjd arc_access(hdr, hash_lock); 3877275811Sdelphij if (*arc_flags & ARC_FLAG_L2CACHE) 3878275811Sdelphij hdr->b_flags |= ARC_FLAG_L2CACHE; 3879275811Sdelphij if (*arc_flags & ARC_FLAG_L2COMPRESS) 3880275811Sdelphij hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3881168404Spjd mutex_exit(hash_lock); 3882168404Spjd ARCSTAT_BUMP(arcstat_hits); 3883286570Smav ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 3884286570Smav demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 3885168404Spjd data, metadata, hits); 3886168404Spjd 3887168404Spjd if (done) 3888168404Spjd done(NULL, buf, private); 3889168404Spjd } else { 3890168404Spjd uint64_t size = BP_GET_LSIZE(bp); 3891268075Sdelphij arc_callback_t *acb; 3892185029Spjd vdev_t *vd = NULL; 3893247187Smm uint64_t addr = 0; 3894208373Smm boolean_t devw = B_FALSE; 3895258389Savg enum zio_compress b_compress = ZIO_COMPRESS_OFF; 3896286570Smav int32_t b_asize = 0; 3897168404Spjd 3898168404Spjd if (hdr == NULL) { 3899168404Spjd /* this block is not in the cache */ 3900268075Sdelphij arc_buf_hdr_t *exists = NULL; 3901168404Spjd arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 3902168404Spjd buf = arc_buf_alloc(spa, size, private, type); 3903168404Spjd hdr = buf->b_hdr; 3904268075Sdelphij if (!BP_IS_EMBEDDED(bp)) { 3905268075Sdelphij hdr->b_dva = *BP_IDENTITY(bp); 3906268075Sdelphij hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 
3907268075Sdelphij exists = buf_hash_insert(hdr, &hash_lock); 3908268075Sdelphij } 3909268075Sdelphij if (exists != NULL) { 3910168404Spjd /* somebody beat us to the hash insert */ 3911168404Spjd mutex_exit(hash_lock); 3912219089Spjd buf_discard_identity(hdr); 3913168404Spjd (void) arc_buf_remove_ref(buf, private); 3914168404Spjd goto top; /* restart the IO request */ 3915168404Spjd } 3916275811Sdelphij 3917168404Spjd /* if this is a prefetch, we don't have a reference */ 3918275811Sdelphij if (*arc_flags & ARC_FLAG_PREFETCH) { 3919168404Spjd (void) remove_reference(hdr, hash_lock, 3920168404Spjd private); 3921275811Sdelphij hdr->b_flags |= ARC_FLAG_PREFETCH; 3922168404Spjd } 3923275811Sdelphij if (*arc_flags & ARC_FLAG_L2CACHE) 3924275811Sdelphij hdr->b_flags |= ARC_FLAG_L2CACHE; 3925275811Sdelphij if (*arc_flags & ARC_FLAG_L2COMPRESS) 3926275811Sdelphij hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3927168404Spjd if (BP_GET_LEVEL(bp) > 0) 3928275811Sdelphij hdr->b_flags |= ARC_FLAG_INDIRECT; 3929168404Spjd } else { 3930286570Smav /* 3931286570Smav * This block is in the ghost cache. If it was L2-only 3932286570Smav * (and thus didn't have an L1 hdr), we realloc the 3933286570Smav * header to add an L1 hdr. 
3934286570Smav */ 3935286570Smav if (!HDR_HAS_L1HDR(hdr)) { 3936286570Smav hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, 3937286570Smav hdr_full_cache); 3938286570Smav } 3939286570Smav 3940286570Smav ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); 3941168404Spjd ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3942286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3943286570Smav ASSERT(hdr->b_l1hdr.b_buf == NULL); 3944168404Spjd 3945168404Spjd /* if this is a prefetch, we don't have a reference */ 3946275811Sdelphij if (*arc_flags & ARC_FLAG_PREFETCH) 3947275811Sdelphij hdr->b_flags |= ARC_FLAG_PREFETCH; 3948168404Spjd else 3949168404Spjd add_reference(hdr, hash_lock, private); 3950275811Sdelphij if (*arc_flags & ARC_FLAG_L2CACHE) 3951275811Sdelphij hdr->b_flags |= ARC_FLAG_L2CACHE; 3952275811Sdelphij if (*arc_flags & ARC_FLAG_L2COMPRESS) 3953275811Sdelphij hdr->b_flags |= ARC_FLAG_L2COMPRESS; 3954185029Spjd buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 3955168404Spjd buf->b_hdr = hdr; 3956168404Spjd buf->b_data = NULL; 3957168404Spjd buf->b_efunc = NULL; 3958168404Spjd buf->b_private = NULL; 3959168404Spjd buf->b_next = NULL; 3960286570Smav hdr->b_l1hdr.b_buf = buf; 3961286570Smav ASSERT0(hdr->b_l1hdr.b_datacnt); 3962286570Smav hdr->b_l1hdr.b_datacnt = 1; 3963219089Spjd arc_get_data_buf(buf); 3964219089Spjd arc_access(hdr, hash_lock); 3965168404Spjd } 3966168404Spjd 3967286570Smav ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); 3968219089Spjd 3969168404Spjd acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 3970168404Spjd acb->acb_done = done; 3971168404Spjd acb->acb_private = private; 3972168404Spjd 3973286570Smav ASSERT(hdr->b_l1hdr.b_acb == NULL); 3974286570Smav hdr->b_l1hdr.b_acb = acb; 3975275811Sdelphij hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 3976168404Spjd 3977286570Smav if (HDR_HAS_L2HDR(hdr) && 3978286570Smav (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { 3979286570Smav devw = hdr->b_l2hdr.b_dev->l2ad_writing; 3980286570Smav addr = hdr->b_l2hdr.b_daddr; 
3981286570Smav b_compress = HDR_GET_COMPRESS(hdr); 3982286570Smav b_asize = hdr->b_l2hdr.b_asize; 3983185029Spjd /* 3984185029Spjd * Lock out device removal. 3985185029Spjd */ 3986185029Spjd if (vdev_is_dead(vd) || 3987185029Spjd !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 3988185029Spjd vd = NULL; 3989185029Spjd } 3990185029Spjd 3991268075Sdelphij if (hash_lock != NULL) 3992268075Sdelphij mutex_exit(hash_lock); 3993168404Spjd 3994251629Sdelphij /* 3995251629Sdelphij * At this point, we have a level 1 cache miss. Try again in 3996251629Sdelphij * L2ARC if possible. 3997251629Sdelphij */ 3998168404Spjd ASSERT3U(hdr->b_size, ==, size); 3999219089Spjd DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 4000268123Sdelphij uint64_t, size, zbookmark_phys_t *, zb); 4001168404Spjd ARCSTAT_BUMP(arcstat_misses); 4002286570Smav ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 4003286570Smav demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 4004168404Spjd data, metadata, misses); 4005228392Spjd#ifdef _KERNEL 4006228392Spjd curthread->td_ru.ru_inblock++; 4007228392Spjd#endif 4008168404Spjd 4009208373Smm if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 4010185029Spjd /* 4011185029Spjd * Read from the L2ARC if the following are true: 4012185029Spjd * 1. The L2ARC vdev was previously cached. 4013185029Spjd * 2. This buffer still has L2ARC metadata. 4014185029Spjd * 3. This buffer isn't currently writing to the L2ARC. 4015185029Spjd * 4. The L2ARC entry wasn't evicted, which may 4016185029Spjd * also have invalidated the vdev. 4017208373Smm * 5. This isn't prefetch and l2arc_noprefetch is set. 
4018185029Spjd */ 4019286570Smav if (HDR_HAS_L2HDR(hdr) && 4020208373Smm !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 4021208373Smm !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 4022185029Spjd l2arc_read_callback_t *cb; 4023185029Spjd 4024185029Spjd DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 4025185029Spjd ARCSTAT_BUMP(arcstat_l2_hits); 4026185029Spjd 4027185029Spjd cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 4028185029Spjd KM_SLEEP); 4029185029Spjd cb->l2rcb_buf = buf; 4030185029Spjd cb->l2rcb_spa = spa; 4031185029Spjd cb->l2rcb_bp = *bp; 4032185029Spjd cb->l2rcb_zb = *zb; 4033185029Spjd cb->l2rcb_flags = zio_flags; 4034258389Savg cb->l2rcb_compress = b_compress; 4035185029Spjd 4036247187Smm ASSERT(addr >= VDEV_LABEL_START_SIZE && 4037247187Smm addr + size < vd->vdev_psize - 4038247187Smm VDEV_LABEL_END_SIZE); 4039247187Smm 4040185029Spjd /* 4041185029Spjd * l2arc read. The SCL_L2ARC lock will be 4042185029Spjd * released by l2arc_read_done(). 4043251478Sdelphij * Issue a null zio if the underlying buffer 4044251478Sdelphij * was squashed to zero size by compression. 
4045185029Spjd */ 4046258389Savg if (b_compress == ZIO_COMPRESS_EMPTY) { 4047251478Sdelphij rzio = zio_null(pio, spa, vd, 4048251478Sdelphij l2arc_read_done, cb, 4049251478Sdelphij zio_flags | ZIO_FLAG_DONT_CACHE | 4050251478Sdelphij ZIO_FLAG_CANFAIL | 4051251478Sdelphij ZIO_FLAG_DONT_PROPAGATE | 4052251478Sdelphij ZIO_FLAG_DONT_RETRY); 4053251478Sdelphij } else { 4054251478Sdelphij rzio = zio_read_phys(pio, vd, addr, 4055258389Savg b_asize, buf->b_data, 4056258389Savg ZIO_CHECKSUM_OFF, 4057251478Sdelphij l2arc_read_done, cb, priority, 4058251478Sdelphij zio_flags | ZIO_FLAG_DONT_CACHE | 4059251478Sdelphij ZIO_FLAG_CANFAIL | 4060251478Sdelphij ZIO_FLAG_DONT_PROPAGATE | 4061251478Sdelphij ZIO_FLAG_DONT_RETRY, B_FALSE); 4062251478Sdelphij } 4063185029Spjd DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 4064185029Spjd zio_t *, rzio); 4065258389Savg ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); 4066185029Spjd 4067275811Sdelphij if (*arc_flags & ARC_FLAG_NOWAIT) { 4068185029Spjd zio_nowait(rzio); 4069185029Spjd return (0); 4070185029Spjd } 4071185029Spjd 4072275811Sdelphij ASSERT(*arc_flags & ARC_FLAG_WAIT); 4073185029Spjd if (zio_wait(rzio) == 0) 4074185029Spjd return (0); 4075185029Spjd 4076185029Spjd /* l2arc read error; goto zio_read() */ 4077185029Spjd } else { 4078185029Spjd DTRACE_PROBE1(l2arc__miss, 4079185029Spjd arc_buf_hdr_t *, hdr); 4080185029Spjd ARCSTAT_BUMP(arcstat_l2_misses); 4081185029Spjd if (HDR_L2_WRITING(hdr)) 4082185029Spjd ARCSTAT_BUMP(arcstat_l2_rw_clash); 4083185029Spjd spa_config_exit(spa, SCL_L2ARC, vd); 4084185029Spjd } 4085208373Smm } else { 4086208373Smm if (vd != NULL) 4087208373Smm spa_config_exit(spa, SCL_L2ARC, vd); 4088208373Smm if (l2arc_ndev != 0) { 4089208373Smm DTRACE_PROBE1(l2arc__miss, 4090208373Smm arc_buf_hdr_t *, hdr); 4091208373Smm ARCSTAT_BUMP(arcstat_l2_misses); 4092208373Smm } 4093185029Spjd } 4094185029Spjd 4095168404Spjd rzio = zio_read(pio, spa, bp, buf->b_data, size, 4096185029Spjd arc_read_done, buf, priority, zio_flags, 
zb); 4097168404Spjd 4098275811Sdelphij if (*arc_flags & ARC_FLAG_WAIT) 4099168404Spjd return (zio_wait(rzio)); 4100168404Spjd 4101275811Sdelphij ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 4102168404Spjd zio_nowait(rzio); 4103168404Spjd } 4104168404Spjd return (0); 4105168404Spjd} 4106168404Spjd 4107168404Spjdvoid 4108168404Spjdarc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 4109168404Spjd{ 4110168404Spjd ASSERT(buf->b_hdr != NULL); 4111286570Smav ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); 4112286570Smav ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || 4113286570Smav func == NULL); 4114219089Spjd ASSERT(buf->b_efunc == NULL); 4115219089Spjd ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 4116219089Spjd 4117168404Spjd buf->b_efunc = func; 4118168404Spjd buf->b_private = private; 4119168404Spjd} 4120168404Spjd 4121168404Spjd/* 4122251520Sdelphij * Notify the arc that a block was freed, and thus will never be used again. 4123251520Sdelphij */ 4124251520Sdelphijvoid 4125251520Sdelphijarc_freed(spa_t *spa, const blkptr_t *bp) 4126251520Sdelphij{ 4127251520Sdelphij arc_buf_hdr_t *hdr; 4128251520Sdelphij kmutex_t *hash_lock; 4129251520Sdelphij uint64_t guid = spa_load_guid(spa); 4130251520Sdelphij 4131268075Sdelphij ASSERT(!BP_IS_EMBEDDED(bp)); 4132268075Sdelphij 4133268075Sdelphij hdr = buf_hash_find(guid, bp, &hash_lock); 4134251520Sdelphij if (hdr == NULL) 4135251520Sdelphij return; 4136251520Sdelphij if (HDR_BUF_AVAILABLE(hdr)) { 4137286570Smav arc_buf_t *buf = hdr->b_l1hdr.b_buf; 4138251520Sdelphij add_reference(hdr, hash_lock, FTAG); 4139275811Sdelphij hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 4140251520Sdelphij mutex_exit(hash_lock); 4141251520Sdelphij 4142251520Sdelphij arc_release(buf, FTAG); 4143251520Sdelphij (void) arc_buf_remove_ref(buf, FTAG); 4144251520Sdelphij } else { 4145251520Sdelphij mutex_exit(hash_lock); 4146251520Sdelphij } 4147251520Sdelphij 4148251520Sdelphij} 4149251520Sdelphij 4150251520Sdelphij/* 4151268858Sdelphij * 
Clear the user eviction callback set by arc_set_callback(), first calling 4152268858Sdelphij * it if it exists. Because the presence of a callback keeps an arc_buf cached 4153268858Sdelphij * clearing the callback may result in the arc_buf being destroyed. However, 4154268858Sdelphij * it will not result in the *last* arc_buf being destroyed, hence the data 4155268858Sdelphij * will remain cached in the ARC. We make a copy of the arc buffer here so 4156268858Sdelphij * that we can process the callback without holding any locks. 4157268858Sdelphij * 4158268858Sdelphij * It's possible that the callback is already in the process of being cleared 4159268858Sdelphij * by another thread. In this case we can not clear the callback. 4160268858Sdelphij * 4161268858Sdelphij * Returns B_TRUE if the callback was successfully called and cleared. 4162168404Spjd */ 4163268858Sdelphijboolean_t 4164268858Sdelphijarc_clear_callback(arc_buf_t *buf) 4165168404Spjd{ 4166168404Spjd arc_buf_hdr_t *hdr; 4167168404Spjd kmutex_t *hash_lock; 4168268858Sdelphij arc_evict_func_t *efunc = buf->b_efunc; 4169268858Sdelphij void *private = buf->b_private; 4170206796Spjd 4171219089Spjd mutex_enter(&buf->b_evict_lock); 4172168404Spjd hdr = buf->b_hdr; 4173168404Spjd if (hdr == NULL) { 4174168404Spjd /* 4175168404Spjd * We are in arc_do_user_evicts(). 4176168404Spjd */ 4177168404Spjd ASSERT(buf->b_data == NULL); 4178219089Spjd mutex_exit(&buf->b_evict_lock); 4179268858Sdelphij return (B_FALSE); 4180185029Spjd } else if (buf->b_data == NULL) { 4181185029Spjd /* 4182185029Spjd * We are on the eviction list; process this buffer now 4183185029Spjd * but let arc_do_user_evicts() do the reaping. 
4184185029Spjd */ 4185185029Spjd buf->b_efunc = NULL; 4186219089Spjd mutex_exit(&buf->b_evict_lock); 4187268858Sdelphij VERIFY0(efunc(private)); 4188268858Sdelphij return (B_TRUE); 4189168404Spjd } 4190168404Spjd hash_lock = HDR_LOCK(hdr); 4191168404Spjd mutex_enter(hash_lock); 4192219089Spjd hdr = buf->b_hdr; 4193219089Spjd ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4194168404Spjd 4195286570Smav ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, 4196286570Smav hdr->b_l1hdr.b_datacnt); 4197286570Smav ASSERT(hdr->b_l1hdr.b_state == arc_mru || 4198286570Smav hdr->b_l1hdr.b_state == arc_mfu); 4199168404Spjd 4200268858Sdelphij buf->b_efunc = NULL; 4201268858Sdelphij buf->b_private = NULL; 4202168404Spjd 4203286570Smav if (hdr->b_l1hdr.b_datacnt > 1) { 4204268858Sdelphij mutex_exit(&buf->b_evict_lock); 4205268858Sdelphij arc_buf_destroy(buf, FALSE, TRUE); 4206268858Sdelphij } else { 4207286570Smav ASSERT(buf == hdr->b_l1hdr.b_buf); 4208275811Sdelphij hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 4209268858Sdelphij mutex_exit(&buf->b_evict_lock); 4210268858Sdelphij } 4211168404Spjd 4212168404Spjd mutex_exit(hash_lock); 4213268858Sdelphij VERIFY0(efunc(private)); 4214268858Sdelphij return (B_TRUE); 4215168404Spjd} 4216168404Spjd 4217168404Spjd/* 4218251629Sdelphij * Release this buffer from the cache, making it an anonymous buffer. This 4219251629Sdelphij * must be done after a read and prior to modifying the buffer contents. 4220168404Spjd * If the buffer has more than one reference, we must make 4221185029Spjd * a new hdr for the buffer. 4222168404Spjd */ 4223168404Spjdvoid 4224168404Spjdarc_release(arc_buf_t *buf, void *tag) 4225168404Spjd{ 4226286570Smav arc_buf_hdr_t *hdr = buf->b_hdr; 4227168404Spjd 4228219089Spjd /* 4229219089Spjd * It would be nice to assert that if it's DMU metadata (level > 4230219089Spjd * 0 || it's the dnode file), then it must be syncing context. 4231219089Spjd * But we don't know that information at this level. 
4232219089Spjd */ 4233219089Spjd 4234219089Spjd mutex_enter(&buf->b_evict_lock); 4235286570Smav /* 4236286570Smav * We don't grab the hash lock prior to this check, because if 4237286570Smav * the buffer's header is in the arc_anon state, it won't be 4238286570Smav * linked into the hash table. 4239286570Smav */ 4240286570Smav if (hdr->b_l1hdr.b_state == arc_anon) { 4241286570Smav mutex_exit(&buf->b_evict_lock); 4242286570Smav ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4243286570Smav ASSERT(!HDR_IN_HASH_TABLE(hdr)); 4244286570Smav ASSERT(!HDR_HAS_L2HDR(hdr)); 4245286570Smav ASSERT(BUF_EMPTY(hdr)); 4246286570Smav ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); 4247286570Smav ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); 4248286570Smav ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4249185029Spjd 4250286570Smav ASSERT3P(buf->b_efunc, ==, NULL); 4251286570Smav ASSERT3P(buf->b_private, ==, NULL); 4252168404Spjd 4253286570Smav hdr->b_l1hdr.b_arc_access = 0; 4254286570Smav arc_buf_thaw(buf); 4255286570Smav 4256286570Smav return; 4257168404Spjd } 4258168404Spjd 4259286570Smav kmutex_t *hash_lock = HDR_LOCK(hdr); 4260286570Smav mutex_enter(hash_lock); 4261286570Smav 4262286570Smav /* 4263286570Smav * This assignment is only valid as long as the hash_lock is 4264286570Smav * held, we must be careful not to reference state or the 4265286570Smav * b_state field after dropping the lock. 
4266286570Smav */ 4267286570Smav arc_state_t *state = hdr->b_l1hdr.b_state; 4268286570Smav ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4269286570Smav ASSERT3P(state, !=, arc_anon); 4270286570Smav 4271286570Smav /* this buffer is not on any list */ 4272286570Smav ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); 4273286570Smav 4274286570Smav if (HDR_HAS_L2HDR(hdr)) { 4275286570Smav mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4276286570Smav 4277286570Smav /* 4278286598Smav * We have to recheck this conditional again now that 4279286598Smav * we're holding the l2ad_mtx to prevent a race with 4280286598Smav * another thread which might be concurrently calling 4281286598Smav * l2arc_evict(). In that case, l2arc_evict() might have 4282286598Smav * destroyed the header's L2 portion as we were waiting 4283286598Smav * to acquire the l2ad_mtx. 4284286570Smav */ 4285286598Smav if (HDR_HAS_L2HDR(hdr)) { 4286286647Smav if (hdr->b_l2hdr.b_daddr != L2ARC_ADDR_UNSET) 4287286647Smav trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, 4288286647Smav hdr->b_l2hdr.b_daddr, 4289286647Smav hdr->b_l2hdr.b_asize, 0); 4290286598Smav arc_hdr_l2hdr_destroy(hdr); 4291286598Smav } 4292286570Smav 4293286570Smav mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4294185029Spjd } 4295185029Spjd 4296168404Spjd /* 4297168404Spjd * Do we have more than one buf? 4298168404Spjd */ 4299286570Smav if (hdr->b_l1hdr.b_datacnt > 1) { 4300168404Spjd arc_buf_hdr_t *nhdr; 4301168404Spjd arc_buf_t **bufp; 4302168404Spjd uint64_t blksz = hdr->b_size; 4303209962Smm uint64_t spa = hdr->b_spa; 4304286570Smav arc_buf_contents_t type = arc_buf_type(hdr); 4305185029Spjd uint32_t flags = hdr->b_flags; 4306168404Spjd 4307286570Smav ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); 4308168404Spjd /* 4309219089Spjd * Pull the data off of this hdr and attach it to 4310219089Spjd * a new anonymous hdr. 
4311168404Spjd */ 4312168404Spjd (void) remove_reference(hdr, hash_lock, tag); 4313286570Smav bufp = &hdr->b_l1hdr.b_buf; 4314168404Spjd while (*bufp != buf) 4315168404Spjd bufp = &(*bufp)->b_next; 4316219089Spjd *bufp = buf->b_next; 4317168404Spjd buf->b_next = NULL; 4318168404Spjd 4319286570Smav ASSERT3P(state, !=, arc_l2c_only); 4320286570Smav ASSERT3U(state->arcs_size, >=, hdr->b_size); 4321286570Smav atomic_add_64(&state->arcs_size, -hdr->b_size); 4322286570Smav if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { 4323286570Smav ASSERT3P(state, !=, arc_l2c_only); 4324286570Smav uint64_t *size = &state->arcs_lsize[type]; 4325185029Spjd ASSERT3U(*size, >=, hdr->b_size); 4326185029Spjd atomic_add_64(size, -hdr->b_size); 4327168404Spjd } 4328242845Sdelphij 4329242845Sdelphij /* 4330242845Sdelphij * We're releasing a duplicate user data buffer, update 4331242845Sdelphij * our statistics accordingly. 4332242845Sdelphij */ 4333286570Smav if (HDR_ISTYPE_DATA(hdr)) { 4334242845Sdelphij ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 4335242845Sdelphij ARCSTAT_INCR(arcstat_duplicate_buffers_size, 4336242845Sdelphij -hdr->b_size); 4337242845Sdelphij } 4338286570Smav hdr->b_l1hdr.b_datacnt -= 1; 4339168404Spjd arc_cksum_verify(buf); 4340240133Smm#ifdef illumos 4341240133Smm arc_buf_unwatch(buf); 4342277300Ssmh#endif 4343168404Spjd 4344168404Spjd mutex_exit(hash_lock); 4345168404Spjd 4346286570Smav nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 4347168404Spjd nhdr->b_size = blksz; 4348168404Spjd nhdr->b_spa = spa; 4349286570Smav 4350275811Sdelphij nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; 4351286570Smav nhdr->b_flags |= arc_bufc_to_flags(type); 4352286570Smav nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 4353286570Smav 4354286570Smav nhdr->b_l1hdr.b_buf = buf; 4355286570Smav nhdr->b_l1hdr.b_datacnt = 1; 4356286570Smav nhdr->b_l1hdr.b_state = arc_anon; 4357286570Smav nhdr->b_l1hdr.b_arc_access = 0; 4358168404Spjd nhdr->b_freeze_cksum = NULL; 4359286570Smav 4360286570Smav (void) 
refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); 4361168404Spjd buf->b_hdr = nhdr; 4362219089Spjd mutex_exit(&buf->b_evict_lock); 4363168404Spjd atomic_add_64(&arc_anon->arcs_size, blksz); 4364168404Spjd } else { 4365219089Spjd mutex_exit(&buf->b_evict_lock); 4366286570Smav ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); 4367286570Smav /* protected by hash lock */ 4368286570Smav ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4369168404Spjd ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4370286570Smav arc_change_state(arc_anon, hdr, hash_lock); 4371286570Smav hdr->b_l1hdr.b_arc_access = 0; 4372286570Smav mutex_exit(hash_lock); 4373185029Spjd 4374219089Spjd buf_discard_identity(hdr); 4375168404Spjd arc_buf_thaw(buf); 4376168404Spjd } 4377168404Spjd buf->b_efunc = NULL; 4378168404Spjd buf->b_private = NULL; 4379168404Spjd} 4380168404Spjd 4381168404Spjdint 4382168404Spjdarc_released(arc_buf_t *buf) 4383168404Spjd{ 4384185029Spjd int released; 4385185029Spjd 4386219089Spjd mutex_enter(&buf->b_evict_lock); 4387286570Smav released = (buf->b_data != NULL && 4388286570Smav buf->b_hdr->b_l1hdr.b_state == arc_anon); 4389219089Spjd mutex_exit(&buf->b_evict_lock); 4390185029Spjd return (released); 4391168404Spjd} 4392168404Spjd 4393168404Spjd#ifdef ZFS_DEBUG 4394168404Spjdint 4395168404Spjdarc_referenced(arc_buf_t *buf) 4396168404Spjd{ 4397185029Spjd int referenced; 4398185029Spjd 4399219089Spjd mutex_enter(&buf->b_evict_lock); 4400286570Smav referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); 4401219089Spjd mutex_exit(&buf->b_evict_lock); 4402185029Spjd return (referenced); 4403168404Spjd} 4404168404Spjd#endif 4405168404Spjd 4406168404Spjdstatic void 4407168404Spjdarc_write_ready(zio_t *zio) 4408168404Spjd{ 4409168404Spjd arc_write_callback_t *callback = zio->io_private; 4410168404Spjd arc_buf_t *buf = callback->awcb_buf; 4411185029Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 4412168404Spjd 4413286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 4414286570Smav 
ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); 4415286570Smav ASSERT(hdr->b_l1hdr.b_datacnt > 0); 4416185029Spjd callback->awcb_ready(zio, buf, callback->awcb_private); 4417185029Spjd 4418185029Spjd /* 4419185029Spjd * If the IO is already in progress, then this is a re-write 4420185029Spjd * attempt, so we need to thaw and re-compute the cksum. 4421185029Spjd * It is the responsibility of the callback to handle the 4422185029Spjd * accounting for any re-write attempt. 4423185029Spjd */ 4424185029Spjd if (HDR_IO_IN_PROGRESS(hdr)) { 4425286570Smav mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 4426185029Spjd if (hdr->b_freeze_cksum != NULL) { 4427185029Spjd kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 4428185029Spjd hdr->b_freeze_cksum = NULL; 4429185029Spjd } 4430286570Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 4431168404Spjd } 4432185029Spjd arc_cksum_compute(buf, B_FALSE); 4433275811Sdelphij hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 4434168404Spjd} 4435168404Spjd 4436258632Savg/* 4437258632Savg * The SPA calls this callback for each physical write that happens on behalf 4438258632Savg * of a logical write. See the comment in dbuf_write_physdone() for details. 
4439258632Savg */ 4440168404Spjdstatic void 4441258632Savgarc_write_physdone(zio_t *zio) 4442258632Savg{ 4443258632Savg arc_write_callback_t *cb = zio->io_private; 4444258632Savg if (cb->awcb_physdone != NULL) 4445258632Savg cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 4446258632Savg} 4447258632Savg 4448258632Savgstatic void 4449168404Spjdarc_write_done(zio_t *zio) 4450168404Spjd{ 4451168404Spjd arc_write_callback_t *callback = zio->io_private; 4452168404Spjd arc_buf_t *buf = callback->awcb_buf; 4453168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 4454168404Spjd 4455286570Smav ASSERT(hdr->b_l1hdr.b_acb == NULL); 4456168404Spjd 4457219089Spjd if (zio->io_error == 0) { 4458268075Sdelphij if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 4459260150Sdelphij buf_discard_identity(hdr); 4460260150Sdelphij } else { 4461260150Sdelphij hdr->b_dva = *BP_IDENTITY(zio->io_bp); 4462260150Sdelphij hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 4463260150Sdelphij } 4464219089Spjd } else { 4465219089Spjd ASSERT(BUF_EMPTY(hdr)); 4466219089Spjd } 4467219089Spjd 4468168404Spjd /* 4469268075Sdelphij * If the block to be written was all-zero or compressed enough to be 4470268075Sdelphij * embedded in the BP, no write was performed so there will be no 4471268075Sdelphij * dva/birth/checksum. The buffer must therefore remain anonymous 4472268075Sdelphij * (and uncached). 4473168404Spjd */ 4474168404Spjd if (!BUF_EMPTY(hdr)) { 4475168404Spjd arc_buf_hdr_t *exists; 4476168404Spjd kmutex_t *hash_lock; 4477168404Spjd 4478219089Spjd ASSERT(zio->io_error == 0); 4479219089Spjd 4480168404Spjd arc_cksum_verify(buf); 4481168404Spjd 4482168404Spjd exists = buf_hash_insert(hdr, &hash_lock); 4483286570Smav if (exists != NULL) { 4484168404Spjd /* 4485168404Spjd * This can only happen if we overwrite for 4486168404Spjd * sync-to-convergence, because we remove 4487168404Spjd * buffers from the hash table when we arc_free(). 
4488168404Spjd */ 4489219089Spjd if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 4490219089Spjd if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 4491219089Spjd panic("bad overwrite, hdr=%p exists=%p", 4492219089Spjd (void *)hdr, (void *)exists); 4493286570Smav ASSERT(refcount_is_zero( 4494286570Smav &exists->b_l1hdr.b_refcnt)); 4495219089Spjd arc_change_state(arc_anon, exists, hash_lock); 4496219089Spjd mutex_exit(hash_lock); 4497219089Spjd arc_hdr_destroy(exists); 4498219089Spjd exists = buf_hash_insert(hdr, &hash_lock); 4499219089Spjd ASSERT3P(exists, ==, NULL); 4500243524Smm } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 4501243524Smm /* nopwrite */ 4502243524Smm ASSERT(zio->io_prop.zp_nopwrite); 4503243524Smm if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 4504243524Smm panic("bad nopwrite, hdr=%p exists=%p", 4505243524Smm (void *)hdr, (void *)exists); 4506219089Spjd } else { 4507219089Spjd /* Dedup */ 4508286570Smav ASSERT(hdr->b_l1hdr.b_datacnt == 1); 4509286570Smav ASSERT(hdr->b_l1hdr.b_state == arc_anon); 4510219089Spjd ASSERT(BP_GET_DEDUP(zio->io_bp)); 4511219089Spjd ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 4512219089Spjd } 4513168404Spjd } 4514275811Sdelphij hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4515185029Spjd /* if it's not anon, we are doing a scrub */ 4516286570Smav if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) 4517185029Spjd arc_access(hdr, hash_lock); 4518168404Spjd mutex_exit(hash_lock); 4519168404Spjd } else { 4520275811Sdelphij hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4521168404Spjd } 4522168404Spjd 4523286570Smav ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4524219089Spjd callback->awcb_done(zio, buf, callback->awcb_private); 4525168404Spjd 4526168404Spjd kmem_free(callback, sizeof (arc_write_callback_t)); 4527168404Spjd} 4528168404Spjd 4529168404Spjdzio_t * 4530219089Spjdarc_write(zio_t *pio, spa_t *spa, uint64_t txg, 4531251478Sdelphij blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, 4532258632Savg const 
zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, 4533258632Savg arc_done_func_t *done, void *private, zio_priority_t priority, 4534268123Sdelphij int zio_flags, const zbookmark_phys_t *zb) 4535168404Spjd{ 4536168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 4537168404Spjd arc_write_callback_t *callback; 4538185029Spjd zio_t *zio; 4539168404Spjd 4540185029Spjd ASSERT(ready != NULL); 4541219089Spjd ASSERT(done != NULL); 4542168404Spjd ASSERT(!HDR_IO_ERROR(hdr)); 4543286570Smav ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4544286570Smav ASSERT(hdr->b_l1hdr.b_acb == NULL); 4545286570Smav ASSERT(hdr->b_l1hdr.b_datacnt > 0); 4546185029Spjd if (l2arc) 4547275811Sdelphij hdr->b_flags |= ARC_FLAG_L2CACHE; 4548251478Sdelphij if (l2arc_compress) 4549275811Sdelphij hdr->b_flags |= ARC_FLAG_L2COMPRESS; 4550168404Spjd callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 4551168404Spjd callback->awcb_ready = ready; 4552258632Savg callback->awcb_physdone = physdone; 4553168404Spjd callback->awcb_done = done; 4554168404Spjd callback->awcb_private = private; 4555168404Spjd callback->awcb_buf = buf; 4556168404Spjd 4557219089Spjd zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 4558258632Savg arc_write_ready, arc_write_physdone, arc_write_done, callback, 4559258632Savg priority, zio_flags, zb); 4560185029Spjd 4561168404Spjd return (zio); 4562168404Spjd} 4563168404Spjd 4564185029Spjdstatic int 4565258632Savgarc_memory_throttle(uint64_t reserve, uint64_t txg) 4566185029Spjd{ 4567185029Spjd#ifdef _KERNEL 4568272483Ssmh uint64_t available_memory = ptob(freemem); 4569185029Spjd static uint64_t page_load = 0; 4570185029Spjd static uint64_t last_txg = 0; 4571185029Spjd 4572272483Ssmh#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 4573185029Spjd available_memory = 4574272483Ssmh MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); 4575185029Spjd#endif 4576258632Savg 4577272483Ssmh if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) 
4578185029Spjd return (0); 4579185029Spjd 4580185029Spjd if (txg > last_txg) { 4581185029Spjd last_txg = txg; 4582185029Spjd page_load = 0; 4583185029Spjd } 4584185029Spjd /* 4585185029Spjd * If we are in pageout, we know that memory is already tight, 4586185029Spjd * the arc is already going to be evicting, so we just want to 4587185029Spjd * continue to let page writes occur as quickly as possible. 4588185029Spjd */ 4589185029Spjd if (curproc == pageproc) { 4590272483Ssmh if (page_load > MAX(ptob(minfree), available_memory) / 4) 4591249195Smm return (SET_ERROR(ERESTART)); 4592185029Spjd /* Note: reserve is inflated, so we deflate */ 4593185029Spjd page_load += reserve / 8; 4594185029Spjd return (0); 4595185029Spjd } else if (page_load > 0 && arc_reclaim_needed()) { 4596185029Spjd /* memory is low, delay before restarting */ 4597185029Spjd ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 4598249195Smm return (SET_ERROR(EAGAIN)); 4599185029Spjd } 4600185029Spjd page_load = 0; 4601185029Spjd#endif 4602185029Spjd return (0); 4603185029Spjd} 4604185029Spjd 4605168404Spjdvoid 4606185029Spjdarc_tempreserve_clear(uint64_t reserve) 4607168404Spjd{ 4608185029Spjd atomic_add_64(&arc_tempreserve, -reserve); 4609168404Spjd ASSERT((int64_t)arc_tempreserve >= 0); 4610168404Spjd} 4611168404Spjd 4612168404Spjdint 4613185029Spjdarc_tempreserve_space(uint64_t reserve, uint64_t txg) 4614168404Spjd{ 4615185029Spjd int error; 4616209962Smm uint64_t anon_size; 4617185029Spjd 4618272483Ssmh if (reserve > arc_c/4 && !arc_no_grow) { 4619185029Spjd arc_c = MIN(arc_c_max, reserve * 4); 4620272483Ssmh DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); 4621272483Ssmh } 4622185029Spjd if (reserve > arc_c) 4623249195Smm return (SET_ERROR(ENOMEM)); 4624168404Spjd 4625168404Spjd /* 4626209962Smm * Don't count loaned bufs as in flight dirty data to prevent long 4627209962Smm * network delays from blocking transactions that are ready to be 4628209962Smm * assigned to a txg. 
4629209962Smm */ 4630209962Smm anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); 4631209962Smm 4632209962Smm /* 4633185029Spjd * Writes will, almost always, require additional memory allocations 4634251631Sdelphij * in order to compress/encrypt/etc the data. We therefore need to 4635185029Spjd * make sure that there is sufficient available memory for this. 4636185029Spjd */ 4637258632Savg error = arc_memory_throttle(reserve, txg); 4638258632Savg if (error != 0) 4639185029Spjd return (error); 4640185029Spjd 4641185029Spjd /* 4642168404Spjd * Throttle writes when the amount of dirty data in the cache 4643168404Spjd * gets too large. We try to keep the cache less than half full 4644168404Spjd * of dirty blocks so that our sync times don't grow too large. 4645168404Spjd * Note: if two requests come in concurrently, we might let them 4646168404Spjd * both succeed, when one of them should fail. Not a huge deal. 4647168404Spjd */ 4648209962Smm 4649209962Smm if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 4650209962Smm anon_size > arc_c / 4) { 4651185029Spjd dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 4652185029Spjd "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 4653185029Spjd arc_tempreserve>>10, 4654185029Spjd arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 4655185029Spjd arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 4656185029Spjd reserve>>10, arc_c>>10); 4657249195Smm return (SET_ERROR(ERESTART)); 4658168404Spjd } 4659185029Spjd atomic_add_64(&arc_tempreserve, reserve); 4660168404Spjd return (0); 4661168404Spjd} 4662168404Spjd 4663286626Smavstatic void 4664286626Smavarc_kstat_update_state(arc_state_t *state, kstat_named_t *size, 4665286626Smav kstat_named_t *evict_data, kstat_named_t *evict_metadata) 4666286626Smav{ 4667286626Smav size->value.ui64 = state->arcs_size; 4668286626Smav evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; 4669286626Smav evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; 
4670286626Smav} 4671286626Smav 4672286626Smavstatic int 4673286626Smavarc_kstat_update(kstat_t *ksp, int rw) 4674286626Smav{ 4675286626Smav arc_stats_t *as = ksp->ks_data; 4676286626Smav 4677286626Smav if (rw == KSTAT_WRITE) { 4678286626Smav return (EACCES); 4679286626Smav } else { 4680286626Smav arc_kstat_update_state(arc_anon, 4681286626Smav &as->arcstat_anon_size, 4682286626Smav &as->arcstat_anon_evictable_data, 4683286626Smav &as->arcstat_anon_evictable_metadata); 4684286626Smav arc_kstat_update_state(arc_mru, 4685286626Smav &as->arcstat_mru_size, 4686286626Smav &as->arcstat_mru_evictable_data, 4687286626Smav &as->arcstat_mru_evictable_metadata); 4688286626Smav arc_kstat_update_state(arc_mru_ghost, 4689286626Smav &as->arcstat_mru_ghost_size, 4690286626Smav &as->arcstat_mru_ghost_evictable_data, 4691286626Smav &as->arcstat_mru_ghost_evictable_metadata); 4692286626Smav arc_kstat_update_state(arc_mfu, 4693286626Smav &as->arcstat_mfu_size, 4694286626Smav &as->arcstat_mfu_evictable_data, 4695286626Smav &as->arcstat_mfu_evictable_metadata); 4696286626Smav arc_kstat_update_state(arc_mfu_ghost, 4697286626Smav &as->arcstat_mfu_ghost_size, 4698286626Smav &as->arcstat_mfu_ghost_evictable_data, 4699286626Smav &as->arcstat_mfu_ghost_evictable_metadata); 4700286626Smav } 4701286626Smav 4702286626Smav return (0); 4703286626Smav} 4704286626Smav 4705168404Spjd#ifdef _KERNEL 4706168566Spjdstatic eventhandler_tag arc_event_lowmem = NULL; 4707168404Spjd 4708168404Spjdstatic void 4709168566Spjdarc_lowmem(void *arg __unused, int howto __unused) 4710168404Spjd{ 4711168404Spjd 4712219089Spjd mutex_enter(&arc_reclaim_thr_lock); 4713286625Smav /* XXX: Memory deficit should be passed as argument. 
*/ 4714286625Smav needfree = btoc(arc_c >> arc_shrink_shift); 4715272483Ssmh DTRACE_PROBE(arc__needfree); 4716168404Spjd cv_signal(&arc_reclaim_thr_cv); 4717241773Savg 4718241773Savg /* 4719241773Savg * It is unsafe to block here in arbitrary threads, because we can come 4720241773Savg * here from ARC itself and may hold ARC locks and thus risk a deadlock 4721241773Savg * with ARC reclaim thread. 4722241773Savg */ 4723286623Smav if (curproc == pageproc) 4724286623Smav msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0); 4725219089Spjd mutex_exit(&arc_reclaim_thr_lock); 4726168404Spjd} 4727168404Spjd#endif 4728168404Spjd 4729168404Spjdvoid 4730168404Spjdarc_init(void) 4731168404Spjd{ 4732219089Spjd int i, prefetch_tunable_set = 0; 4733205231Skmacy 4734168404Spjd mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 4735168404Spjd cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 4736168404Spjd 4737168404Spjd /* Convert seconds to clock ticks */ 4738168404Spjd arc_min_prefetch_lifespan = 1 * hz; 4739168404Spjd 4740168404Spjd /* Start out with 1/8 of all memory */ 4741168566Spjd arc_c = kmem_size() / 8; 4742219089Spjd 4743277300Ssmh#ifdef illumos 4744192360Skmacy#ifdef _KERNEL 4745192360Skmacy /* 4746192360Skmacy * On architectures where the physical memory can be larger 4747192360Skmacy * than the addressable space (intel in 32-bit mode), we may 4748192360Skmacy * need to limit the cache to 1/8 of VM size. 
4749192360Skmacy */ 4750192360Skmacy arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 4751192360Skmacy#endif 4752277300Ssmh#endif /* illumos */ 4753168566Spjd /* set min cache to 1/32 of all memory, or 16MB, whichever is more */ 4754280822Smav arc_c_min = MAX(arc_c / 4, 16 << 20); 4755168566Spjd /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ 4756280822Smav if (arc_c * 8 >= 1 << 30) 4757280822Smav arc_c_max = (arc_c * 8) - (1 << 30); 4758168404Spjd else 4759168404Spjd arc_c_max = arc_c_min; 4760175633Spjd arc_c_max = MAX(arc_c * 5, arc_c_max); 4761219089Spjd 4762168481Spjd#ifdef _KERNEL 4763168404Spjd /* 4764168404Spjd * Allow the tunables to override our calculations if they are 4765168566Spjd * reasonable (ie. over 16MB) 4766168404Spjd */ 4767280822Smav if (zfs_arc_max > 16 << 20 && zfs_arc_max < kmem_size()) 4768168404Spjd arc_c_max = zfs_arc_max; 4769280822Smav if (zfs_arc_min > 16 << 20 && zfs_arc_min <= arc_c_max) 4770168404Spjd arc_c_min = zfs_arc_min; 4771168481Spjd#endif 4772219089Spjd 4773168404Spjd arc_c = arc_c_max; 4774168404Spjd arc_p = (arc_c >> 1); 4775168404Spjd 4776185029Spjd /* limit meta-data to 1/4 of the arc capacity */ 4777185029Spjd arc_meta_limit = arc_c_max / 4; 4778185029Spjd 4779185029Spjd /* Allow the tunable to override if it is reasonable */ 4780185029Spjd if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 4781185029Spjd arc_meta_limit = zfs_arc_meta_limit; 4782185029Spjd 4783185029Spjd if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 4784185029Spjd arc_c_min = arc_meta_limit / 2; 4785185029Spjd 4786275780Sdelphij if (zfs_arc_meta_min > 0) { 4787275780Sdelphij arc_meta_min = zfs_arc_meta_min; 4788275780Sdelphij } else { 4789275780Sdelphij arc_meta_min = arc_c_min / 2; 4790275780Sdelphij } 4791275780Sdelphij 4792208373Smm if (zfs_arc_grow_retry > 0) 4793208373Smm arc_grow_retry = zfs_arc_grow_retry; 4794208373Smm 4795208373Smm if (zfs_arc_shrink_shift > 0) 4796208373Smm 
arc_shrink_shift = zfs_arc_shrink_shift; 4797208373Smm 4798286625Smav /* 4799286625Smav * Ensure that arc_no_grow_shift is less than arc_shrink_shift. 4800286625Smav */ 4801286625Smav if (arc_no_grow_shift >= arc_shrink_shift) 4802286625Smav arc_no_grow_shift = arc_shrink_shift - 1; 4803286625Smav 4804208373Smm if (zfs_arc_p_min_shift > 0) 4805208373Smm arc_p_min_shift = zfs_arc_p_min_shift; 4806208373Smm 4807168404Spjd /* if kmem_flags are set, lets try to use less memory */ 4808168404Spjd if (kmem_debugging()) 4809168404Spjd arc_c = arc_c / 2; 4810168404Spjd if (arc_c < arc_c_min) 4811168404Spjd arc_c = arc_c_min; 4812168404Spjd 4813168473Spjd zfs_arc_min = arc_c_min; 4814168473Spjd zfs_arc_max = arc_c_max; 4815168473Spjd 4816168404Spjd arc_anon = &ARC_anon; 4817168404Spjd arc_mru = &ARC_mru; 4818168404Spjd arc_mru_ghost = &ARC_mru_ghost; 4819168404Spjd arc_mfu = &ARC_mfu; 4820168404Spjd arc_mfu_ghost = &ARC_mfu_ghost; 4821185029Spjd arc_l2c_only = &ARC_l2c_only; 4822168404Spjd arc_size = 0; 4823168404Spjd 4824286762Smav mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4825286762Smav mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4826286762Smav mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4827286762Smav mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4828286762Smav mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4829286762Smav mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 4830206796Spjd 4831286762Smav list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], 4832286762Smav sizeof (arc_buf_hdr_t), 4833286762Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4834286762Smav list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], 4835286762Smav sizeof (arc_buf_hdr_t), 4836286762Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4837286762Smav list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], 4838286762Smav sizeof (arc_buf_hdr_t), 4839286762Smav offsetof(arc_buf_hdr_t, 
b_l1hdr.b_arc_node)); 4840286762Smav list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], 4841286762Smav sizeof (arc_buf_hdr_t), 4842286762Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4843286762Smav list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], 4844286762Smav sizeof (arc_buf_hdr_t), 4845286762Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4846286762Smav list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], 4847286762Smav sizeof (arc_buf_hdr_t), 4848286762Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4849286762Smav list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], 4850286762Smav sizeof (arc_buf_hdr_t), 4851286762Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4852286762Smav list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], 4853286762Smav sizeof (arc_buf_hdr_t), 4854286762Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4855286762Smav list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], 4856286762Smav sizeof (arc_buf_hdr_t), 4857286762Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4858286762Smav list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], 4859286762Smav sizeof (arc_buf_hdr_t), 4860286762Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node)); 4861168404Spjd 4862168404Spjd buf_init(); 4863168404Spjd 4864168404Spjd arc_thread_exit = 0; 4865168404Spjd arc_eviction_list = NULL; 4866168404Spjd mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 4867168404Spjd bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 4868168404Spjd 4869168404Spjd arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 4870168404Spjd sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 4871168404Spjd 4872168404Spjd if (arc_ksp != NULL) { 4873168404Spjd arc_ksp->ks_data = &arc_stats; 4874286574Smav arc_ksp->ks_update = arc_kstat_update; 4875168404Spjd kstat_install(arc_ksp); 4876168404Spjd } 4877168404Spjd 4878168404Spjd (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 4879168404Spjd TS_RUN, minclsyspri); 
4880168404Spjd 4881168404Spjd#ifdef _KERNEL 4882168566Spjd arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, 4883168404Spjd EVENTHANDLER_PRI_FIRST); 4884168404Spjd#endif 4885168404Spjd 4886168404Spjd arc_dead = FALSE; 4887185029Spjd arc_warm = B_FALSE; 4888168566Spjd 4889258632Savg /* 4890258632Savg * Calculate maximum amount of dirty data per pool. 4891258632Savg * 4892258632Savg * If it has been set by /etc/system, take that. 4893258632Savg * Otherwise, use a percentage of physical memory defined by 4894258632Savg * zfs_dirty_data_max_percent (default 10%) with a cap at 4895258632Savg * zfs_dirty_data_max_max (default 4GB). 4896258632Savg */ 4897258632Savg if (zfs_dirty_data_max == 0) { 4898258632Savg zfs_dirty_data_max = ptob(physmem) * 4899258632Savg zfs_dirty_data_max_percent / 100; 4900258632Savg zfs_dirty_data_max = MIN(zfs_dirty_data_max, 4901258632Savg zfs_dirty_data_max_max); 4902258632Savg } 4903185029Spjd 4904168566Spjd#ifdef _KERNEL 4905194043Skmacy if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 4906193953Skmacy prefetch_tunable_set = 1; 4907206796Spjd 4908193878Skmacy#ifdef __i386__ 4909193953Skmacy if (prefetch_tunable_set == 0) { 4910196863Strasz printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 4911196863Strasz "-- to enable,\n"); 4912196863Strasz printf(" add \"vfs.zfs.prefetch_disable=0\" " 4913196863Strasz "to /boot/loader.conf.\n"); 4914219089Spjd zfs_prefetch_disable = 1; 4915193878Skmacy } 4916206796Spjd#else 4917193878Skmacy if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 4918193953Skmacy prefetch_tunable_set == 0) { 4919196863Strasz printf("ZFS NOTICE: Prefetch is disabled by default if less " 4920196941Strasz "than 4GB of RAM is present;\n" 4921196863Strasz " to enable, add \"vfs.zfs.prefetch_disable=0\" " 4922196863Strasz "to /boot/loader.conf.\n"); 4923219089Spjd zfs_prefetch_disable = 1; 4924193878Skmacy } 4925206796Spjd#endif 4926175633Spjd /* Warn about ZFS memory and 
address space requirements. */ 4927168696Spjd if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { 4928168987Sbmah printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " 4929168987Sbmah "expect unstable behavior.\n"); 4930175633Spjd } 4931175633Spjd if (kmem_size() < 512 * (1 << 20)) { 4932173419Spjd printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " 4933168987Sbmah "expect unstable behavior.\n"); 4934185029Spjd printf(" Consider tuning vm.kmem_size and " 4935173419Spjd "vm.kmem_size_max\n"); 4936185029Spjd printf(" in /boot/loader.conf.\n"); 4937168566Spjd } 4938168566Spjd#endif 4939168404Spjd} 4940168404Spjd 4941168404Spjdvoid 4942168404Spjdarc_fini(void) 4943168404Spjd{ 4944168404Spjd mutex_enter(&arc_reclaim_thr_lock); 4945168404Spjd arc_thread_exit = 1; 4946168404Spjd cv_signal(&arc_reclaim_thr_cv); 4947168404Spjd while (arc_thread_exit != 0) 4948168404Spjd cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 4949168404Spjd mutex_exit(&arc_reclaim_thr_lock); 4950168404Spjd 4951185029Spjd arc_flush(NULL); 4952168404Spjd 4953168404Spjd arc_dead = TRUE; 4954168404Spjd 4955168404Spjd if (arc_ksp != NULL) { 4956168404Spjd kstat_delete(arc_ksp); 4957168404Spjd arc_ksp = NULL; 4958168404Spjd } 4959168404Spjd 4960168404Spjd mutex_destroy(&arc_eviction_mtx); 4961168404Spjd mutex_destroy(&arc_reclaim_thr_lock); 4962168404Spjd cv_destroy(&arc_reclaim_thr_cv); 4963168404Spjd 4964286762Smav list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); 4965286762Smav list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 4966286762Smav list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); 4967286762Smav list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 4968286762Smav list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); 4969286762Smav list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 4970286762Smav list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); 4971286762Smav list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 
4972168404Spjd 4973286762Smav mutex_destroy(&arc_anon->arcs_mtx); 4974286762Smav mutex_destroy(&arc_mru->arcs_mtx); 4975286762Smav mutex_destroy(&arc_mru_ghost->arcs_mtx); 4976286762Smav mutex_destroy(&arc_mfu->arcs_mtx); 4977286762Smav mutex_destroy(&arc_mfu_ghost->arcs_mtx); 4978286762Smav mutex_destroy(&arc_l2c_only->arcs_mtx); 4979206796Spjd 4980168404Spjd buf_fini(); 4981168404Spjd 4982286570Smav ASSERT0(arc_loaned_bytes); 4983209962Smm 4984168404Spjd#ifdef _KERNEL 4985168566Spjd if (arc_event_lowmem != NULL) 4986168566Spjd EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); 4987168404Spjd#endif 4988168404Spjd} 4989185029Spjd 4990185029Spjd/* 4991185029Spjd * Level 2 ARC 4992185029Spjd * 4993185029Spjd * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 4994185029Spjd * It uses dedicated storage devices to hold cached data, which are populated 4995185029Spjd * using large infrequent writes. The main role of this cache is to boost 4996185029Spjd * the performance of random read workloads. The intended L2ARC devices 4997185029Spjd * include short-stroked disks, solid state disks, and other media with 4998185029Spjd * substantially faster read latency than disk. 4999185029Spjd * 5000185029Spjd * +-----------------------+ 5001185029Spjd * | ARC | 5002185029Spjd * +-----------------------+ 5003185029Spjd * | ^ ^ 5004185029Spjd * | | | 5005185029Spjd * l2arc_feed_thread() arc_read() 5006185029Spjd * | | | 5007185029Spjd * | l2arc read | 5008185029Spjd * V | | 5009185029Spjd * +---------------+ | 5010185029Spjd * | L2ARC | | 5011185029Spjd * +---------------+ | 5012185029Spjd * | ^ | 5013185029Spjd * l2arc_write() | | 5014185029Spjd * | | | 5015185029Spjd * V | | 5016185029Spjd * +-------+ +-------+ 5017185029Spjd * | vdev | | vdev | 5018185029Spjd * | cache | | cache | 5019185029Spjd * +-------+ +-------+ 5020185029Spjd * +=========+ .-----. 
5021185029Spjd * : L2ARC : |-_____-| 5022185029Spjd * : devices : | Disks | 5023185029Spjd * +=========+ `-_____-' 5024185029Spjd * 5025185029Spjd * Read requests are satisfied from the following sources, in order: 5026185029Spjd * 5027185029Spjd * 1) ARC 5028185029Spjd * 2) vdev cache of L2ARC devices 5029185029Spjd * 3) L2ARC devices 5030185029Spjd * 4) vdev cache of disks 5031185029Spjd * 5) disks 5032185029Spjd * 5033185029Spjd * Some L2ARC device types exhibit extremely slow write performance. 5034185029Spjd * To accommodate for this there are some significant differences between 5035185029Spjd * the L2ARC and traditional cache design: 5036185029Spjd * 5037185029Spjd * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 5038185029Spjd * the ARC behave as usual, freeing buffers and placing headers on ghost 5039185029Spjd * lists. The ARC does not send buffers to the L2ARC during eviction as 5040185029Spjd * this would add inflated write latencies for all ARC memory pressure. 5041185029Spjd * 5042185029Spjd * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 5043185029Spjd * It does this by periodically scanning buffers from the eviction-end of 5044185029Spjd * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 5045251478Sdelphij * not already there. It scans until a headroom of buffers is satisfied, 5046251478Sdelphij * which itself is a buffer for ARC eviction. If a compressible buffer is 5047251478Sdelphij * found during scanning and selected for writing to an L2ARC device, we 5048251478Sdelphij * temporarily boost scanning headroom during the next scan cycle to make 5049251478Sdelphij * sure we adapt to compression effects (which might significantly reduce 5050251478Sdelphij * the data volume we write to L2ARC). 
The thread that does this is 5051185029Spjd * l2arc_feed_thread(), illustrated below; example sizes are included to 5052185029Spjd * provide a better sense of ratio than this diagram: 5053185029Spjd * 5054185029Spjd * head --> tail 5055185029Spjd * +---------------------+----------+ 5056185029Spjd * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 5057185029Spjd * +---------------------+----------+ | o L2ARC eligible 5058185029Spjd * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 5059185029Spjd * +---------------------+----------+ | 5060185029Spjd * 15.9 Gbytes ^ 32 Mbytes | 5061185029Spjd * headroom | 5062185029Spjd * l2arc_feed_thread() 5063185029Spjd * | 5064185029Spjd * l2arc write hand <--[oooo]--' 5065185029Spjd * | 8 Mbyte 5066185029Spjd * | write max 5067185029Spjd * V 5068185029Spjd * +==============================+ 5069185029Spjd * L2ARC dev |####|#|###|###| |####| ... | 5070185029Spjd * +==============================+ 5071185029Spjd * 32 Gbytes 5072185029Spjd * 5073185029Spjd * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 5074185029Spjd * evicted, then the L2ARC has cached a buffer much sooner than it probably 5075185029Spjd * needed to, potentially wasting L2ARC device bandwidth and storage. It is 5076185029Spjd * safe to say that this is an uncommon case, since buffers at the end of 5077185029Spjd * the ARC lists have moved there due to inactivity. 5078185029Spjd * 5079185029Spjd * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 5080185029Spjd * then the L2ARC simply misses copying some buffers. This serves as a 5081185029Spjd * pressure valve to prevent heavy read workloads from both stalling the ARC 5082185029Spjd * with waits and clogging the L2ARC with writes. This also helps prevent 5083185029Spjd * the potential for the L2ARC to churn if it attempts to cache content too 5084185029Spjd * quickly, such as during backups of the entire pool. 5085185029Spjd * 5086185029Spjd * 5. 
After system boot and before the ARC has filled main memory, there are 5087185029Spjd * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 5088185029Spjd * lists can remain mostly static. Instead of searching from tail of these 5089185029Spjd * lists as pictured, the l2arc_feed_thread() will search from the list heads 5090185029Spjd * for eligible buffers, greatly increasing its chance of finding them. 5091185029Spjd * 5092185029Spjd * The L2ARC device write speed is also boosted during this time so that 5093185029Spjd * the L2ARC warms up faster. Since there have been no ARC evictions yet, 5094185029Spjd * there are no L2ARC reads, and no fear of degrading read performance 5095185029Spjd * through increased writes. 5096185029Spjd * 5097185029Spjd * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 5098185029Spjd * the vdev queue can aggregate them into larger and fewer writes. Each 5099185029Spjd * device is written to in a rotor fashion, sweeping writes through 5100185029Spjd * available space then repeating. 5101185029Spjd * 5102185029Spjd * 7. The L2ARC does not store dirty content. It never needs to flush 5103185029Spjd * write buffers back to disk based storage. 5104185029Spjd * 5105185029Spjd * 8. If an ARC buffer is written (and dirtied) which also exists in the 5106185029Spjd * L2ARC, the now stale L2ARC buffer is immediately dropped. 
5107185029Spjd * 5108185029Spjd * The performance of the L2ARC can be tweaked by a number of tunables, which 5109185029Spjd * may be necessary for different workloads: 5110185029Spjd * 5111185029Spjd * l2arc_write_max max write bytes per interval 5112185029Spjd * l2arc_write_boost extra write bytes during device warmup 5113185029Spjd * l2arc_noprefetch skip caching prefetched buffers 5114185029Spjd * l2arc_headroom number of max device writes to precache 5115251478Sdelphij * l2arc_headroom_boost when we find compressed buffers during ARC 5116251478Sdelphij * scanning, we multiply headroom by this 5117251478Sdelphij * percentage factor for the next scan cycle, 5118251478Sdelphij * since more compressed buffers are likely to 5119251478Sdelphij * be present 5120185029Spjd * l2arc_feed_secs seconds between L2ARC writing 5121185029Spjd * 5122185029Spjd * Tunables may be removed or added as future performance improvements are 5123185029Spjd * integrated, and also may become zpool properties. 5124208373Smm * 5125208373Smm * There are three key functions that control how the L2ARC warms up: 5126208373Smm * 5127208373Smm * l2arc_write_eligible() check if a buffer is eligible to cache 5128208373Smm * l2arc_write_size() calculate how much to write 5129208373Smm * l2arc_write_interval() calculate sleep delay between writes 5130208373Smm * 5131208373Smm * These three functions determine what to write, how much, and how quickly 5132208373Smm * to send writes. 5133185029Spjd */ 5134185029Spjd 5135208373Smmstatic boolean_t 5136275811Sdelphijl2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) 5137208373Smm{ 5138208373Smm /* 5139208373Smm * A buffer is *not* eligible for the L2ARC if it: 5140208373Smm * 1. belongs to a different spa. 5141208373Smm * 2. is already cached on the L2ARC. 5142208373Smm * 3. has an I/O in progress (it may be an incomplete read). 5143208373Smm * 4. is flagged not eligible (zfs property). 
5144208373Smm */ 5145275811Sdelphij if (hdr->b_spa != spa_guid) { 5146208373Smm ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); 5147208373Smm return (B_FALSE); 5148208373Smm } 5149286570Smav if (HDR_HAS_L2HDR(hdr)) { 5150208373Smm ARCSTAT_BUMP(arcstat_l2_write_in_l2); 5151208373Smm return (B_FALSE); 5152208373Smm } 5153275811Sdelphij if (HDR_IO_IN_PROGRESS(hdr)) { 5154208373Smm ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); 5155208373Smm return (B_FALSE); 5156208373Smm } 5157275811Sdelphij if (!HDR_L2CACHE(hdr)) { 5158208373Smm ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); 5159208373Smm return (B_FALSE); 5160208373Smm } 5161208373Smm 5162208373Smm return (B_TRUE); 5163208373Smm} 5164208373Smm 5165208373Smmstatic uint64_t 5166251478Sdelphijl2arc_write_size(void) 5167208373Smm{ 5168208373Smm uint64_t size; 5169208373Smm 5170251478Sdelphij /* 5171251478Sdelphij * Make sure our globals have meaningful values in case the user 5172251478Sdelphij * altered them. 5173251478Sdelphij */ 5174251478Sdelphij size = l2arc_write_max; 5175251478Sdelphij if (size == 0) { 5176251478Sdelphij cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 5177251478Sdelphij "be greater than zero, resetting it to the default (%d)", 5178251478Sdelphij L2ARC_WRITE_SIZE); 5179251478Sdelphij size = l2arc_write_max = L2ARC_WRITE_SIZE; 5180251478Sdelphij } 5181208373Smm 5182208373Smm if (arc_warm == B_FALSE) 5183251478Sdelphij size += l2arc_write_boost; 5184208373Smm 5185208373Smm return (size); 5186208373Smm 5187208373Smm} 5188208373Smm 5189208373Smmstatic clock_t 5190208373Smml2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 5191208373Smm{ 5192219089Spjd clock_t interval, next, now; 5193208373Smm 5194208373Smm /* 5195208373Smm * If the ARC lists are busy, increase our write rate; if the 5196208373Smm * lists are stale, idle back. 
This is achieved by checking 5197208373Smm * how much we previously wrote - if it was more than half of 5198208373Smm * what we wanted, schedule the next write much sooner. 5199208373Smm */ 5200208373Smm if (l2arc_feed_again && wrote > (wanted / 2)) 5201208373Smm interval = (hz * l2arc_feed_min_ms) / 1000; 5202208373Smm else 5203208373Smm interval = hz * l2arc_feed_secs; 5204208373Smm 5205219089Spjd now = ddi_get_lbolt(); 5206219089Spjd next = MAX(now, MIN(now + interval, began + interval)); 5207208373Smm 5208208373Smm return (next); 5209208373Smm} 5210208373Smm 5211185029Spjd/* 5212185029Spjd * Cycle through L2ARC devices. This is how L2ARC load balances. 5213185029Spjd * If a device is returned, this also returns holding the spa config lock. 5214185029Spjd */ 5215185029Spjdstatic l2arc_dev_t * 5216185029Spjdl2arc_dev_get_next(void) 5217185029Spjd{ 5218185029Spjd l2arc_dev_t *first, *next = NULL; 5219185029Spjd 5220185029Spjd /* 5221185029Spjd * Lock out the removal of spas (spa_namespace_lock), then removal 5222185029Spjd * of cache devices (l2arc_dev_mtx). Once a device has been selected, 5223185029Spjd * both locks will be dropped and a spa config lock held instead. 
5224185029Spjd */ 5225185029Spjd mutex_enter(&spa_namespace_lock); 5226185029Spjd mutex_enter(&l2arc_dev_mtx); 5227185029Spjd 5228185029Spjd /* if there are no vdevs, there is nothing to do */ 5229185029Spjd if (l2arc_ndev == 0) 5230185029Spjd goto out; 5231185029Spjd 5232185029Spjd first = NULL; 5233185029Spjd next = l2arc_dev_last; 5234185029Spjd do { 5235185029Spjd /* loop around the list looking for a non-faulted vdev */ 5236185029Spjd if (next == NULL) { 5237185029Spjd next = list_head(l2arc_dev_list); 5238185029Spjd } else { 5239185029Spjd next = list_next(l2arc_dev_list, next); 5240185029Spjd if (next == NULL) 5241185029Spjd next = list_head(l2arc_dev_list); 5242185029Spjd } 5243185029Spjd 5244185029Spjd /* if we have come back to the start, bail out */ 5245185029Spjd if (first == NULL) 5246185029Spjd first = next; 5247185029Spjd else if (next == first) 5248185029Spjd break; 5249185029Spjd 5250185029Spjd } while (vdev_is_dead(next->l2ad_vdev)); 5251185029Spjd 5252185029Spjd /* if we were unable to find any usable vdevs, return NULL */ 5253185029Spjd if (vdev_is_dead(next->l2ad_vdev)) 5254185029Spjd next = NULL; 5255185029Spjd 5256185029Spjd l2arc_dev_last = next; 5257185029Spjd 5258185029Spjdout: 5259185029Spjd mutex_exit(&l2arc_dev_mtx); 5260185029Spjd 5261185029Spjd /* 5262185029Spjd * Grab the config lock to prevent the 'next' device from being 5263185029Spjd * removed while we are writing to it. 5264185029Spjd */ 5265185029Spjd if (next != NULL) 5266185029Spjd spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 5267185029Spjd mutex_exit(&spa_namespace_lock); 5268185029Spjd 5269185029Spjd return (next); 5270185029Spjd} 5271185029Spjd 5272185029Spjd/* 5273185029Spjd * Free buffers that were tagged for destruction. 
5274185029Spjd */ 5275185029Spjdstatic void 5276185029Spjdl2arc_do_free_on_write() 5277185029Spjd{ 5278185029Spjd list_t *buflist; 5279185029Spjd l2arc_data_free_t *df, *df_prev; 5280185029Spjd 5281185029Spjd mutex_enter(&l2arc_free_on_write_mtx); 5282185029Spjd buflist = l2arc_free_on_write; 5283185029Spjd 5284185029Spjd for (df = list_tail(buflist); df; df = df_prev) { 5285185029Spjd df_prev = list_prev(buflist, df); 5286185029Spjd ASSERT(df->l2df_data != NULL); 5287185029Spjd ASSERT(df->l2df_func != NULL); 5288185029Spjd df->l2df_func(df->l2df_data, df->l2df_size); 5289185029Spjd list_remove(buflist, df); 5290185029Spjd kmem_free(df, sizeof (l2arc_data_free_t)); 5291185029Spjd } 5292185029Spjd 5293185029Spjd mutex_exit(&l2arc_free_on_write_mtx); 5294185029Spjd} 5295185029Spjd 5296185029Spjd/* 5297185029Spjd * A write to a cache device has completed. Update all headers to allow 5298185029Spjd * reads from these buffers to begin. 5299185029Spjd */ 5300185029Spjdstatic void 5301185029Spjdl2arc_write_done(zio_t *zio) 5302185029Spjd{ 5303185029Spjd l2arc_write_callback_t *cb; 5304185029Spjd l2arc_dev_t *dev; 5305185029Spjd list_t *buflist; 5306275811Sdelphij arc_buf_hdr_t *head, *hdr, *hdr_prev; 5307185029Spjd kmutex_t *hash_lock; 5308268085Sdelphij int64_t bytes_dropped = 0; 5309185029Spjd 5310185029Spjd cb = zio->io_private; 5311185029Spjd ASSERT(cb != NULL); 5312185029Spjd dev = cb->l2wcb_dev; 5313185029Spjd ASSERT(dev != NULL); 5314185029Spjd head = cb->l2wcb_head; 5315185029Spjd ASSERT(head != NULL); 5316286570Smav buflist = &dev->l2ad_buflist; 5317185029Spjd ASSERT(buflist != NULL); 5318185029Spjd DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 5319185029Spjd l2arc_write_callback_t *, cb); 5320185029Spjd 5321185029Spjd if (zio->io_error != 0) 5322185029Spjd ARCSTAT_BUMP(arcstat_l2_writes_error); 5323185029Spjd 5324286570Smav mutex_enter(&dev->l2ad_mtx); 5325185029Spjd 5326185029Spjd /* 5327185029Spjd * All writes completed, or an error was hit. 
5328185029Spjd */ 5329275811Sdelphij for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { 5330275811Sdelphij hdr_prev = list_prev(buflist, hdr); 5331185029Spjd 5332275811Sdelphij hash_lock = HDR_LOCK(hdr); 5333185029Spjd if (!mutex_tryenter(hash_lock)) { 5334185029Spjd /* 5335185029Spjd * This buffer misses out. It may be in a stage 5336286570Smav * of eviction. Its ARC_FLAG_L2_WRITING flag will be 5337185029Spjd * left set, denying reads to this buffer. 5338185029Spjd */ 5339185029Spjd ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); 5340185029Spjd continue; 5341185029Spjd } 5342185029Spjd 5343286570Smav /* 5344286570Smav * It's possible that this buffer got evicted from the L1 cache 5345286570Smav * before we grabbed the vdev + hash locks, in which case 5346286570Smav * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated. 5347286570Smav * Only free the buffer if we still have an L1 hdr. 5348286570Smav */ 5349286570Smav if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL && 5350286570Smav HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) 5351286570Smav l2arc_release_cdata_buf(hdr); 5352286570Smav 5353185029Spjd if (zio->io_error != 0) { 5354185029Spjd /* 5355185029Spjd * Error - drop L2ARC entry. 5356185029Spjd */ 5357286570Smav trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev, 5358286570Smav hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0); 5359286570Smav hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 5360286570Smav 5361286570Smav ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); 5362275811Sdelphij ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 5363286598Smav 5364286598Smav bytes_dropped += hdr->b_l2hdr.b_asize; 5365286598Smav (void) refcount_remove_many(&dev->l2ad_alloc, 5366286598Smav hdr->b_l2hdr.b_asize, hdr); 5367185029Spjd } 5368185029Spjd 5369185029Spjd /* 5370185029Spjd * Allow ARC to begin reads to this L2ARC entry. 
5371185029Spjd */ 5372275811Sdelphij hdr->b_flags &= ~ARC_FLAG_L2_WRITING; 5373185029Spjd 5374185029Spjd mutex_exit(hash_lock); 5375185029Spjd } 5376185029Spjd 5377185029Spjd atomic_inc_64(&l2arc_writes_done); 5378185029Spjd list_remove(buflist, head); 5379286570Smav ASSERT(!HDR_HAS_L1HDR(head)); 5380286570Smav kmem_cache_free(hdr_l2only_cache, head); 5381286570Smav mutex_exit(&dev->l2ad_mtx); 5382185029Spjd 5383268085Sdelphij vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); 5384268085Sdelphij 5385185029Spjd l2arc_do_free_on_write(); 5386185029Spjd 5387185029Spjd kmem_free(cb, sizeof (l2arc_write_callback_t)); 5388185029Spjd} 5389185029Spjd 5390185029Spjd/* 5391185029Spjd * A read to a cache device completed. Validate buffer contents before 5392185029Spjd * handing over to the regular ARC routines. 5393185029Spjd */ 5394185029Spjdstatic void 5395185029Spjdl2arc_read_done(zio_t *zio) 5396185029Spjd{ 5397185029Spjd l2arc_read_callback_t *cb; 5398185029Spjd arc_buf_hdr_t *hdr; 5399185029Spjd arc_buf_t *buf; 5400185029Spjd kmutex_t *hash_lock; 5401185029Spjd int equal; 5402185029Spjd 5403185029Spjd ASSERT(zio->io_vd != NULL); 5404185029Spjd ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 5405185029Spjd 5406185029Spjd spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 5407185029Spjd 5408185029Spjd cb = zio->io_private; 5409185029Spjd ASSERT(cb != NULL); 5410185029Spjd buf = cb->l2rcb_buf; 5411185029Spjd ASSERT(buf != NULL); 5412185029Spjd 5413219089Spjd hash_lock = HDR_LOCK(buf->b_hdr); 5414185029Spjd mutex_enter(hash_lock); 5415219089Spjd hdr = buf->b_hdr; 5416219089Spjd ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 5417185029Spjd 5418185029Spjd /* 5419251478Sdelphij * If the buffer was compressed, decompress it first. 
5420251478Sdelphij */ 5421251478Sdelphij if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) 5422251478Sdelphij l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); 5423251478Sdelphij ASSERT(zio->io_data != NULL); 5424251478Sdelphij 5425251478Sdelphij /* 5426185029Spjd * Check this survived the L2ARC journey. 5427185029Spjd */ 5428185029Spjd equal = arc_cksum_equal(buf); 5429185029Spjd if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 5430185029Spjd mutex_exit(hash_lock); 5431185029Spjd zio->io_private = buf; 5432185029Spjd zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 5433185029Spjd zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 5434185029Spjd arc_read_done(zio); 5435185029Spjd } else { 5436185029Spjd mutex_exit(hash_lock); 5437185029Spjd /* 5438185029Spjd * Buffer didn't survive caching. Increment stats and 5439185029Spjd * reissue to the original storage device. 5440185029Spjd */ 5441185029Spjd if (zio->io_error != 0) { 5442185029Spjd ARCSTAT_BUMP(arcstat_l2_io_error); 5443185029Spjd } else { 5444249195Smm zio->io_error = SET_ERROR(EIO); 5445185029Spjd } 5446185029Spjd if (!equal) 5447185029Spjd ARCSTAT_BUMP(arcstat_l2_cksum_bad); 5448185029Spjd 5449185029Spjd /* 5450185029Spjd * If there's no waiter, issue an async i/o to the primary 5451185029Spjd * storage now. If there *is* a waiter, the caller must 5452185029Spjd * issue the i/o in a context where it's OK to block. 
5453185029Spjd */ 5454209962Smm if (zio->io_waiter == NULL) { 5455209962Smm zio_t *pio = zio_unique_parent(zio); 5456209962Smm 5457209962Smm ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 5458209962Smm 5459209962Smm zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, 5460185029Spjd buf->b_data, zio->io_size, arc_read_done, buf, 5461185029Spjd zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); 5462209962Smm } 5463185029Spjd } 5464185029Spjd 5465185029Spjd kmem_free(cb, sizeof (l2arc_read_callback_t)); 5466185029Spjd} 5467185029Spjd 5468185029Spjd/* 5469185029Spjd * This is the list priority from which the L2ARC will search for pages to 5470185029Spjd * cache. This is used within loops (0..3) to cycle through lists in the 5471185029Spjd * desired order. This order can have a significant effect on cache 5472185029Spjd * performance. 5473185029Spjd * 5474185029Spjd * Currently the metadata lists are hit first, MFU then MRU, followed by 5475185029Spjd * the data lists. This function returns a locked list, and also returns 5476185029Spjd * the lock pointer. 
5477185029Spjd */ 5478185029Spjdstatic list_t * 5479185029Spjdl2arc_list_locked(int list_num, kmutex_t **lock) 5480185029Spjd{ 5481247187Smm list_t *list = NULL; 5482185029Spjd 5483286762Smav ASSERT(list_num >= 0 && list_num <= 3); 5484206796Spjd 5485286762Smav switch (list_num) { 5486286762Smav case 0: 5487286762Smav list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; 5488286762Smav *lock = &arc_mfu->arcs_mtx; 5489286762Smav break; 5490286762Smav case 1: 5491286762Smav list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; 5492286762Smav *lock = &arc_mru->arcs_mtx; 5493286762Smav break; 5494286762Smav case 2: 5495286762Smav list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; 5496286762Smav *lock = &arc_mfu->arcs_mtx; 5497286762Smav break; 5498286762Smav case 3: 5499286762Smav list = &arc_mru->arcs_list[ARC_BUFC_DATA]; 5500286762Smav *lock = &arc_mru->arcs_mtx; 5501286762Smav break; 5502185029Spjd } 5503185029Spjd 5504185029Spjd ASSERT(!(MUTEX_HELD(*lock))); 5505185029Spjd mutex_enter(*lock); 5506185029Spjd return (list); 5507185029Spjd} 5508185029Spjd 5509185029Spjd/* 5510185029Spjd * Evict buffers from the device write hand to the distance specified in 5511185029Spjd * bytes. This distance may span populated buffers, it may span nothing. 5512185029Spjd * This is clearing a region on the L2ARC device ready for writing. 5513185029Spjd * If the 'all' boolean is set, every buffer is evicted. 5514185029Spjd */ 5515185029Spjdstatic void 5516185029Spjdl2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 5517185029Spjd{ 5518185029Spjd list_t *buflist; 5519275811Sdelphij arc_buf_hdr_t *hdr, *hdr_prev; 5520185029Spjd kmutex_t *hash_lock; 5521185029Spjd uint64_t taddr; 5522185029Spjd 5523286570Smav buflist = &dev->l2ad_buflist; 5524185029Spjd 5525185029Spjd if (!all && dev->l2ad_first) { 5526185029Spjd /* 5527185029Spjd * This is the first sweep through the device. There is 5528185029Spjd * nothing to evict. 
5529185029Spjd */ 5530185029Spjd return; 5531185029Spjd } 5532185029Spjd 5533185029Spjd if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 5534185029Spjd /* 5535185029Spjd * When nearing the end of the device, evict to the end 5536185029Spjd * before the device write hand jumps to the start. 5537185029Spjd */ 5538185029Spjd taddr = dev->l2ad_end; 5539185029Spjd } else { 5540185029Spjd taddr = dev->l2ad_hand + distance; 5541185029Spjd } 5542185029Spjd DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 5543185029Spjd uint64_t, taddr, boolean_t, all); 5544185029Spjd 5545185029Spjdtop: 5546286570Smav mutex_enter(&dev->l2ad_mtx); 5547275811Sdelphij for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { 5548275811Sdelphij hdr_prev = list_prev(buflist, hdr); 5549185029Spjd 5550275811Sdelphij hash_lock = HDR_LOCK(hdr); 5551185029Spjd if (!mutex_tryenter(hash_lock)) { 5552185029Spjd /* 5553185029Spjd * Missed the hash lock. Retry. 5554185029Spjd */ 5555185029Spjd ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 5556286570Smav mutex_exit(&dev->l2ad_mtx); 5557185029Spjd mutex_enter(hash_lock); 5558185029Spjd mutex_exit(hash_lock); 5559185029Spjd goto top; 5560185029Spjd } 5561185029Spjd 5562275811Sdelphij if (HDR_L2_WRITE_HEAD(hdr)) { 5563185029Spjd /* 5564185029Spjd * We hit a write head node. Leave it for 5565185029Spjd * l2arc_write_done(). 5566185029Spjd */ 5567275811Sdelphij list_remove(buflist, hdr); 5568185029Spjd mutex_exit(hash_lock); 5569185029Spjd continue; 5570185029Spjd } 5571185029Spjd 5572286570Smav if (!all && HDR_HAS_L2HDR(hdr) && 5573286570Smav (hdr->b_l2hdr.b_daddr > taddr || 5574286570Smav hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { 5575185029Spjd /* 5576185029Spjd * We've evicted to the target address, 5577185029Spjd * or the end of the device. 
5578185029Spjd */ 5579185029Spjd mutex_exit(hash_lock); 5580185029Spjd break; 5581185029Spjd } 5582185029Spjd 5583286570Smav ASSERT(HDR_HAS_L2HDR(hdr)); 5584286570Smav if (!HDR_HAS_L1HDR(hdr)) { 5585275811Sdelphij ASSERT(!HDR_L2_READING(hdr)); 5586185029Spjd /* 5587185029Spjd * This doesn't exist in the ARC. Destroy. 5588185029Spjd * arc_hdr_destroy() will call list_remove() 5589185029Spjd * and decrement arcstat_l2_size. 5590185029Spjd */ 5591275811Sdelphij arc_change_state(arc_anon, hdr, hash_lock); 5592275811Sdelphij arc_hdr_destroy(hdr); 5593185029Spjd } else { 5594286570Smav ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); 5595286570Smav ARCSTAT_BUMP(arcstat_l2_evict_l1cached); 5596185029Spjd /* 5597185029Spjd * Invalidate issued or about to be issued 5598185029Spjd * reads, since we may be about to write 5599185029Spjd * over this location. 5600185029Spjd */ 5601275811Sdelphij if (HDR_L2_READING(hdr)) { 5602185029Spjd ARCSTAT_BUMP(arcstat_l2_evict_reading); 5603275811Sdelphij hdr->b_flags |= ARC_FLAG_L2_EVICTED; 5604185029Spjd } 5605185029Spjd 5606286598Smav arc_hdr_l2hdr_destroy(hdr); 5607185029Spjd } 5608185029Spjd mutex_exit(hash_lock); 5609185029Spjd } 5610286570Smav mutex_exit(&dev->l2ad_mtx); 5611185029Spjd} 5612185029Spjd 5613185029Spjd/* 5614185029Spjd * Find and write ARC buffers to the L2ARC device. 5615185029Spjd * 5616275811Sdelphij * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid 5617185029Spjd * for reading until they have completed writing. 5618251478Sdelphij * The headroom_boost is an in-out parameter used to maintain headroom boost 5619251478Sdelphij * state between calls to this function. 5620251478Sdelphij * 5621251478Sdelphij * Returns the number of bytes actually written (which may be smaller than 5622251478Sdelphij * the delta by which the device hand has changed due to alignment). 
5623185029Spjd */ 5624208373Smmstatic uint64_t 5625251478Sdelphijl2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, 5626251478Sdelphij boolean_t *headroom_boost) 5627185029Spjd{ 5628275811Sdelphij arc_buf_hdr_t *hdr, *hdr_prev, *head; 5629185029Spjd list_t *list; 5630251478Sdelphij uint64_t write_asize, write_psize, write_sz, headroom, 5631251478Sdelphij buf_compress_minsz; 5632185029Spjd void *buf_data; 5633251478Sdelphij kmutex_t *list_lock; 5634251478Sdelphij boolean_t full; 5635185029Spjd l2arc_write_callback_t *cb; 5636185029Spjd zio_t *pio, *wzio; 5637228103Smm uint64_t guid = spa_load_guid(spa); 5638251478Sdelphij const boolean_t do_headroom_boost = *headroom_boost; 5639185029Spjd int try; 5640185029Spjd 5641185029Spjd ASSERT(dev->l2ad_vdev != NULL); 5642185029Spjd 5643251478Sdelphij /* Lower the flag now, we might want to raise it again later. */ 5644251478Sdelphij *headroom_boost = B_FALSE; 5645251478Sdelphij 5646185029Spjd pio = NULL; 5647251478Sdelphij write_sz = write_asize = write_psize = 0; 5648185029Spjd full = B_FALSE; 5649286570Smav head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); 5650275811Sdelphij head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; 5651286570Smav head->b_flags |= ARC_FLAG_HAS_L2HDR; 5652185029Spjd 5653205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); 5654185029Spjd /* 5655251478Sdelphij * We will want to try to compress buffers that are at least 2x the 5656251478Sdelphij * device sector size. 5657251478Sdelphij */ 5658251478Sdelphij buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; 5659251478Sdelphij 5660251478Sdelphij /* 5661185029Spjd * Copy buffers for L2ARC writing. 
5662185029Spjd */ 5663286570Smav mutex_enter(&dev->l2ad_mtx); 5664286762Smav for (try = 0; try <= 3; try++) { 5665251478Sdelphij uint64_t passed_sz = 0; 5666251478Sdelphij 5667185029Spjd list = l2arc_list_locked(try, &list_lock); 5668205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); 5669185029Spjd 5670185029Spjd /* 5671185029Spjd * L2ARC fast warmup. 5672185029Spjd * 5673185029Spjd * Until the ARC is warm and starts to evict, read from the 5674185029Spjd * head of the ARC lists rather than the tail. 5675185029Spjd */ 5676185029Spjd if (arc_warm == B_FALSE) 5677275811Sdelphij hdr = list_head(list); 5678185029Spjd else 5679275811Sdelphij hdr = list_tail(list); 5680275811Sdelphij if (hdr == NULL) 5681205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); 5682185029Spjd 5683286762Smav headroom = target_sz * l2arc_headroom; 5684251478Sdelphij if (do_headroom_boost) 5685251478Sdelphij headroom = (headroom * l2arc_headroom_boost) / 100; 5686251478Sdelphij 5687275811Sdelphij for (; hdr; hdr = hdr_prev) { 5688251478Sdelphij kmutex_t *hash_lock; 5689251478Sdelphij uint64_t buf_sz; 5690251478Sdelphij 5691185029Spjd if (arc_warm == B_FALSE) 5692275811Sdelphij hdr_prev = list_next(list, hdr); 5693185029Spjd else 5694275811Sdelphij hdr_prev = list_prev(list, hdr); 5695275811Sdelphij ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size); 5696206796Spjd 5697275811Sdelphij hash_lock = HDR_LOCK(hdr); 5698251478Sdelphij if (!mutex_tryenter(hash_lock)) { 5699205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); 5700185029Spjd /* 5701185029Spjd * Skip this buffer rather than waiting. 5702185029Spjd */ 5703185029Spjd continue; 5704185029Spjd } 5705185029Spjd 5706275811Sdelphij passed_sz += hdr->b_size; 5707185029Spjd if (passed_sz > headroom) { 5708185029Spjd /* 5709185029Spjd * Searched too far. 
5710185029Spjd */ 5711185029Spjd mutex_exit(hash_lock); 5712205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); 5713185029Spjd break; 5714185029Spjd } 5715185029Spjd 5716275811Sdelphij if (!l2arc_write_eligible(guid, hdr)) { 5717185029Spjd mutex_exit(hash_lock); 5718185029Spjd continue; 5719185029Spjd } 5720185029Spjd 5721275811Sdelphij if ((write_sz + hdr->b_size) > target_sz) { 5722185029Spjd full = B_TRUE; 5723185029Spjd mutex_exit(hash_lock); 5724205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_full); 5725185029Spjd break; 5726185029Spjd } 5727185029Spjd 5728185029Spjd if (pio == NULL) { 5729185029Spjd /* 5730185029Spjd * Insert a dummy header on the buflist so 5731185029Spjd * l2arc_write_done() can find where the 5732185029Spjd * write buffers begin without searching. 5733185029Spjd */ 5734286570Smav list_insert_head(&dev->l2ad_buflist, head); 5735185029Spjd 5736185029Spjd cb = kmem_alloc( 5737185029Spjd sizeof (l2arc_write_callback_t), KM_SLEEP); 5738185029Spjd cb->l2wcb_dev = dev; 5739185029Spjd cb->l2wcb_head = head; 5740185029Spjd pio = zio_root(spa, l2arc_write_done, cb, 5741185029Spjd ZIO_FLAG_CANFAIL); 5742205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_pios); 5743185029Spjd } 5744185029Spjd 5745185029Spjd /* 5746185029Spjd * Create and add a new L2ARC header. 5747185029Spjd */ 5748286570Smav hdr->b_l2hdr.b_dev = dev; 5749275811Sdelphij hdr->b_flags |= ARC_FLAG_L2_WRITING; 5750251478Sdelphij /* 5751251478Sdelphij * Temporarily stash the data buffer in b_tmp_cdata. 5752251478Sdelphij * The subsequent write step will pick it up from 5753286570Smav * there. This is because can't access b_l1hdr.b_buf 5754251478Sdelphij * without holding the hash_lock, which we in turn 5755251478Sdelphij * can't access without holding the ARC list locks 5756251478Sdelphij * (which we want to avoid during compression/writing). 
5757251478Sdelphij */ 5758286570Smav HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); 5759286570Smav hdr->b_l2hdr.b_asize = hdr->b_size; 5760286570Smav hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data; 5761251478Sdelphij 5762286598Smav /* 5763286598Smav * Explicitly set the b_daddr field to a known 5764286598Smav * value which means "invalid address". This 5765286598Smav * enables us to differentiate which stage of 5766286598Smav * l2arc_write_buffers() the particular header 5767286598Smav * is in (e.g. this loop, or the one below). 5768286598Smav * ARC_FLAG_L2_WRITING is not enough to make 5769286598Smav * this distinction, and we need to know in 5770286598Smav * order to do proper l2arc vdev accounting in 5771286598Smav * arc_release() and arc_hdr_destroy(). 5772286598Smav * 5773286598Smav * Note, we can't use a new flag to distinguish 5774286598Smav * the two stages because we don't hold the 5775286598Smav * header's hash_lock below, in the second stage 5776286598Smav * of this function. Thus, we can't simply 5777286598Smav * change the b_flags field to denote that the 5778286598Smav * IO has been sent. We can change the b_daddr 5779286598Smav * field of the L2 portion, though, since we'll 5780286598Smav * be holding the l2ad_mtx; which is why we're 5781286598Smav * using it to denote the header's state change. 5782286598Smav */ 5783286598Smav hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; 5784286598Smav 5785275811Sdelphij buf_sz = hdr->b_size; 5786286570Smav hdr->b_flags |= ARC_FLAG_HAS_L2HDR; 5787185029Spjd 5788286570Smav list_insert_head(&dev->l2ad_buflist, hdr); 5789251478Sdelphij 5790185029Spjd /* 5791185029Spjd * Compute and store the buffer cksum before 5792185029Spjd * writing. On debug the cksum is verified first. 
5793185029Spjd */ 5794286570Smav arc_cksum_verify(hdr->b_l1hdr.b_buf); 5795286570Smav arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE); 5796185029Spjd 5797185029Spjd mutex_exit(hash_lock); 5798185029Spjd 5799251478Sdelphij write_sz += buf_sz; 5800251478Sdelphij } 5801251478Sdelphij 5802251478Sdelphij mutex_exit(list_lock); 5803251478Sdelphij 5804251478Sdelphij if (full == B_TRUE) 5805251478Sdelphij break; 5806251478Sdelphij } 5807251478Sdelphij 5808251478Sdelphij /* No buffers selected for writing? */ 5809251478Sdelphij if (pio == NULL) { 5810251478Sdelphij ASSERT0(write_sz); 5811286570Smav mutex_exit(&dev->l2ad_mtx); 5812286570Smav ASSERT(!HDR_HAS_L1HDR(head)); 5813286570Smav kmem_cache_free(hdr_l2only_cache, head); 5814251478Sdelphij return (0); 5815251478Sdelphij } 5816251478Sdelphij 5817251478Sdelphij /* 5818251478Sdelphij * Now start writing the buffers. We're starting at the write head 5819251478Sdelphij * and work backwards, retracing the course of the buffer selector 5820251478Sdelphij * loop above. 5821251478Sdelphij */ 5822286570Smav for (hdr = list_prev(&dev->l2ad_buflist, head); hdr; 5823286570Smav hdr = list_prev(&dev->l2ad_buflist, hdr)) { 5824251478Sdelphij uint64_t buf_sz; 5825251478Sdelphij 5826251478Sdelphij /* 5827251478Sdelphij * We shouldn't need to lock the buffer here, since we flagged 5828275811Sdelphij * it as ARC_FLAG_L2_WRITING in the previous step, but we must 5829275811Sdelphij * take care to only access its L2 cache parameters. In 5830286570Smav * particular, hdr->l1hdr.b_buf may be invalid by now due to 5831275811Sdelphij * ARC eviction. 5832251478Sdelphij */ 5833286570Smav hdr->b_l2hdr.b_daddr = dev->l2ad_hand; 5834251478Sdelphij 5835286570Smav if ((HDR_L2COMPRESS(hdr)) && 5836286570Smav hdr->b_l2hdr.b_asize >= buf_compress_minsz) { 5837286570Smav if (l2arc_compress_buf(hdr)) { 5838251478Sdelphij /* 5839251478Sdelphij * If compression succeeded, enable headroom 5840251478Sdelphij * boost on the next scan cycle. 
5841251478Sdelphij */ 5842251478Sdelphij *headroom_boost = B_TRUE; 5843251478Sdelphij } 5844251478Sdelphij } 5845251478Sdelphij 5846251478Sdelphij /* 5847251478Sdelphij * Pick up the buffer data we had previously stashed away 5848251478Sdelphij * (and now potentially also compressed). 5849251478Sdelphij */ 5850286570Smav buf_data = hdr->b_l1hdr.b_tmp_cdata; 5851286570Smav buf_sz = hdr->b_l2hdr.b_asize; 5852251478Sdelphij 5853274172Savg /* 5854274172Savg * If the data has not been compressed, then clear b_tmp_cdata 5855274172Savg * to make sure that it points only to a temporary compression 5856274172Savg * buffer. 5857274172Savg */ 5858286570Smav if (!L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr))) 5859286570Smav hdr->b_l1hdr.b_tmp_cdata = NULL; 5860274172Savg 5861286598Smav /* 5862286598Smav * We need to do this regardless if buf_sz is zero or 5863286598Smav * not, otherwise, when this l2hdr is evicted we'll 5864286598Smav * remove a reference that was never added. 5865286598Smav */ 5866286598Smav (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr); 5867286598Smav 5868251478Sdelphij /* Compression may have squashed the buffer to zero length. */ 5869251478Sdelphij if (buf_sz != 0) { 5870251478Sdelphij uint64_t buf_p_sz; 5871251478Sdelphij 5872185029Spjd wzio = zio_write_phys(pio, dev->l2ad_vdev, 5873185029Spjd dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, 5874185029Spjd NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, 5875185029Spjd ZIO_FLAG_CANFAIL, B_FALSE); 5876185029Spjd 5877185029Spjd DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 5878185029Spjd zio_t *, wzio); 5879185029Spjd (void) zio_nowait(wzio); 5880185029Spjd 5881251478Sdelphij write_asize += buf_sz; 5882286598Smav 5883185029Spjd /* 5884185029Spjd * Keep the clock hand suitably device-aligned. 
5885185029Spjd */ 5886251478Sdelphij buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); 5887251478Sdelphij write_psize += buf_p_sz; 5888251478Sdelphij dev->l2ad_hand += buf_p_sz; 5889185029Spjd } 5890251478Sdelphij } 5891185029Spjd 5892286570Smav mutex_exit(&dev->l2ad_mtx); 5893185029Spjd 5894251478Sdelphij ASSERT3U(write_asize, <=, target_sz); 5895185029Spjd ARCSTAT_BUMP(arcstat_l2_writes_sent); 5896251478Sdelphij ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); 5897185029Spjd ARCSTAT_INCR(arcstat_l2_size, write_sz); 5898251478Sdelphij ARCSTAT_INCR(arcstat_l2_asize, write_asize); 5899275096Sdelphij vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0); 5900185029Spjd 5901185029Spjd /* 5902185029Spjd * Bump device hand to the device start if it is approaching the end. 5903185029Spjd * l2arc_evict() will already have evicted ahead for this case. 5904185029Spjd */ 5905185029Spjd if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 5906185029Spjd dev->l2ad_hand = dev->l2ad_start; 5907185029Spjd dev->l2ad_first = B_FALSE; 5908185029Spjd } 5909185029Spjd 5910208373Smm dev->l2ad_writing = B_TRUE; 5911185029Spjd (void) zio_wait(pio); 5912208373Smm dev->l2ad_writing = B_FALSE; 5913208373Smm 5914251478Sdelphij return (write_asize); 5915185029Spjd} 5916185029Spjd 5917185029Spjd/* 5918251478Sdelphij * Compresses an L2ARC buffer. 5919286570Smav * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its 5920251478Sdelphij * size in l2hdr->b_asize. This routine tries to compress the data and 5921251478Sdelphij * depending on the compression result there are three possible outcomes: 5922251478Sdelphij * *) The buffer was incompressible. The original l2hdr contents were left 5923251478Sdelphij * untouched and are ready for writing to an L2 device. 5924251478Sdelphij * *) The buffer was all-zeros, so there is no need to write it to an L2 5925251478Sdelphij * device. 
To indicate this situation b_tmp_cdata is NULL'ed, b_asize is 5926251478Sdelphij * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. 5927251478Sdelphij * *) Compression succeeded and b_tmp_cdata was replaced with a temporary 5928251478Sdelphij * data buffer which holds the compressed data to be written, and b_asize 5929251478Sdelphij * tells us how much data there is. b_compress is set to the appropriate 5930251478Sdelphij * compression algorithm. Once writing is done, invoke 5931251478Sdelphij * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. 5932251478Sdelphij * 5933251478Sdelphij * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the 5934251478Sdelphij * buffer was incompressible). 5935251478Sdelphij */ 5936251478Sdelphijstatic boolean_t 5937286570Smavl2arc_compress_buf(arc_buf_hdr_t *hdr) 5938251478Sdelphij{ 5939251478Sdelphij void *cdata; 5940268075Sdelphij size_t csize, len, rounded; 5941286570Smav ASSERT(HDR_HAS_L2HDR(hdr)); 5942286570Smav l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 5943251478Sdelphij 5944286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 5945286570Smav ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF); 5946286570Smav ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); 5947251478Sdelphij 5948251478Sdelphij len = l2hdr->b_asize; 5949251478Sdelphij cdata = zio_data_buf_alloc(len); 5950286570Smav ASSERT3P(cdata, !=, NULL); 5951286570Smav csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata, 5952269086Sdelphij cdata, l2hdr->b_asize); 5953251478Sdelphij 5954251478Sdelphij if (csize == 0) { 5955251478Sdelphij /* zero block, indicate that there's nothing to write */ 5956251478Sdelphij zio_data_buf_free(cdata, len); 5957286570Smav HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY); 5958251478Sdelphij l2hdr->b_asize = 0; 5959286570Smav hdr->b_l1hdr.b_tmp_cdata = NULL; 5960251478Sdelphij ARCSTAT_BUMP(arcstat_l2_compress_zeros); 5961251478Sdelphij return (B_TRUE); 5962274628Savg } 5963274628Savg 5964274628Savg rounded = 
P2ROUNDUP(csize, 5965274628Savg (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift); 5966274628Savg if (rounded < len) { 5967251478Sdelphij /* 5968251478Sdelphij * Compression succeeded, we'll keep the cdata around for 5969251478Sdelphij * writing and release it afterwards. 5970251478Sdelphij */ 5971274628Savg if (rounded > csize) { 5972274628Savg bzero((char *)cdata + csize, rounded - csize); 5973274628Savg csize = rounded; 5974274628Savg } 5975286570Smav HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4); 5976251478Sdelphij l2hdr->b_asize = csize; 5977286570Smav hdr->b_l1hdr.b_tmp_cdata = cdata; 5978251478Sdelphij ARCSTAT_BUMP(arcstat_l2_compress_successes); 5979251478Sdelphij return (B_TRUE); 5980251478Sdelphij } else { 5981251478Sdelphij /* 5982251478Sdelphij * Compression failed, release the compressed buffer. 5983251478Sdelphij * l2hdr will be left unmodified. 5984251478Sdelphij */ 5985251478Sdelphij zio_data_buf_free(cdata, len); 5986251478Sdelphij ARCSTAT_BUMP(arcstat_l2_compress_failures); 5987251478Sdelphij return (B_FALSE); 5988251478Sdelphij } 5989251478Sdelphij} 5990251478Sdelphij 5991251478Sdelphij/* 5992251478Sdelphij * Decompresses a zio read back from an l2arc device. On success, the 5993251478Sdelphij * underlying zio's io_data buffer is overwritten by the uncompressed 5994251478Sdelphij * version. On decompression error (corrupt compressed stream), the 5995251478Sdelphij * zio->io_error value is set to signal an I/O error. 5996251478Sdelphij * 5997251478Sdelphij * Please note that the compressed data stream is not checksummed, so 5998251478Sdelphij * if the underlying device is experiencing data corruption, we may feed 5999251478Sdelphij * corrupt data to the decompressor, so the decompressor needs to be 6000251478Sdelphij * able to handle this situation (LZ4 does). 
6001251478Sdelphij */ 6002251478Sdelphijstatic void 6003251478Sdelphijl2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) 6004251478Sdelphij{ 6005251478Sdelphij ASSERT(L2ARC_IS_VALID_COMPRESS(c)); 6006251478Sdelphij 6007251478Sdelphij if (zio->io_error != 0) { 6008251478Sdelphij /* 6009251478Sdelphij * An io error has occured, just restore the original io 6010251478Sdelphij * size in preparation for a main pool read. 6011251478Sdelphij */ 6012251478Sdelphij zio->io_orig_size = zio->io_size = hdr->b_size; 6013251478Sdelphij return; 6014251478Sdelphij } 6015251478Sdelphij 6016251478Sdelphij if (c == ZIO_COMPRESS_EMPTY) { 6017251478Sdelphij /* 6018251478Sdelphij * An empty buffer results in a null zio, which means we 6019251478Sdelphij * need to fill its io_data after we're done restoring the 6020251478Sdelphij * buffer's contents. 6021251478Sdelphij */ 6022286570Smav ASSERT(hdr->b_l1hdr.b_buf != NULL); 6023286570Smav bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size); 6024286570Smav zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data; 6025251478Sdelphij } else { 6026251478Sdelphij ASSERT(zio->io_data != NULL); 6027251478Sdelphij /* 6028251478Sdelphij * We copy the compressed data from the start of the arc buffer 6029251478Sdelphij * (the zio_read will have pulled in only what we need, the 6030251478Sdelphij * rest is garbage which we will overwrite at decompression) 6031251478Sdelphij * and then decompress back to the ARC data buffer. This way we 6032251478Sdelphij * can minimize copying by simply decompressing back over the 6033251478Sdelphij * original compressed data (rather than decompressing to an 6034251478Sdelphij * aux buffer and then copying back the uncompressed buffer, 6035251478Sdelphij * which is likely to be much larger). 
6036251478Sdelphij */ 6037251478Sdelphij uint64_t csize; 6038251478Sdelphij void *cdata; 6039251478Sdelphij 6040251478Sdelphij csize = zio->io_size; 6041251478Sdelphij cdata = zio_data_buf_alloc(csize); 6042251478Sdelphij bcopy(zio->io_data, cdata, csize); 6043251478Sdelphij if (zio_decompress_data(c, cdata, zio->io_data, csize, 6044251478Sdelphij hdr->b_size) != 0) 6045251478Sdelphij zio->io_error = EIO; 6046251478Sdelphij zio_data_buf_free(cdata, csize); 6047251478Sdelphij } 6048251478Sdelphij 6049251478Sdelphij /* Restore the expected uncompressed IO size. */ 6050251478Sdelphij zio->io_orig_size = zio->io_size = hdr->b_size; 6051251478Sdelphij} 6052251478Sdelphij 6053251478Sdelphij/* 6054251478Sdelphij * Releases the temporary b_tmp_cdata buffer in an l2arc header structure. 6055251478Sdelphij * This buffer serves as a temporary holder of compressed data while 6056251478Sdelphij * the buffer entry is being written to an l2arc device. Once that is 6057251478Sdelphij * done, we can dispose of it. 6058251478Sdelphij */ 6059251478Sdelphijstatic void 6060275811Sdelphijl2arc_release_cdata_buf(arc_buf_hdr_t *hdr) 6061251478Sdelphij{ 6062286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 6063286570Smav if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) { 6064251478Sdelphij /* 6065251478Sdelphij * If the data was compressed, then we've allocated a 6066251478Sdelphij * temporary buffer for it, so now we need to release it. 6067251478Sdelphij */ 6068286570Smav ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); 6069286570Smav zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata, 6070286570Smav hdr->b_size); 6071286570Smav hdr->b_l1hdr.b_tmp_cdata = NULL; 6072274172Savg } else { 6073286570Smav ASSERT(hdr->b_l1hdr.b_tmp_cdata == NULL); 6074251478Sdelphij } 6075251478Sdelphij} 6076251478Sdelphij 6077251478Sdelphij/* 6078185029Spjd * This thread feeds the L2ARC at regular intervals. This is the beating 6079185029Spjd * heart of the L2ARC. 
/*
 * This thread feeds the L2ARC at regular intervals.  This is the beating
 * heart of the L2ARC.
 */
static void
l2arc_feed_thread(void *dummy __unused)
{
	callb_cpr_t cpr;
	l2arc_dev_t *dev;
	spa_t *spa;
	uint64_t size, wrote;
	clock_t begin, next = ddi_get_lbolt();
	boolean_t headroom_boost = B_FALSE;

	/* Register with the CPR (suspend/resume) framework. */
	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&l2arc_feed_thr_lock);

	while (l2arc_thread_exit == 0) {
		/*
		 * Sleep until the next feed interval (or until signalled
		 * by l2arc_stop()).  The CPR_SAFE bracket marks this wait
		 * as a safe point for system suspend.
		 */
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
		    next - ddi_get_lbolt());
		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
		next = ddi_get_lbolt() + hz;

		/*
		 * Quick check for L2ARC devices.
		 */
		mutex_enter(&l2arc_dev_mtx);
		if (l2arc_ndev == 0) {
			mutex_exit(&l2arc_dev_mtx);
			continue;
		}
		mutex_exit(&l2arc_dev_mtx);
		begin = ddi_get_lbolt();

		/*
		 * This selects the next l2arc device to write to, and in
		 * doing so the next spa to feed from: dev->l2ad_spa.   This
		 * will return NULL if there are now no l2arc devices or if
		 * they are all faulted.
		 *
		 * If a device is returned, its spa's config lock is also
		 * held to prevent device removal.  l2arc_dev_get_next()
		 * will grab and release l2arc_dev_mtx.
		 */
		if ((dev = l2arc_dev_get_next()) == NULL)
			continue;

		spa = dev->l2ad_spa;
		ASSERT(spa != NULL);

		/*
		 * If the pool is read-only then force the feed thread to
		 * sleep a little longer.
		 */
		if (!spa_writeable(spa)) {
			/* Back off: 5x the normal feed interval. */
			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		/*
		 * Avoid contributing to memory pressure.
		 */
		if (arc_reclaim_needed()) {
			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		ARCSTAT_BUMP(arcstat_l2_feeds);

		size = l2arc_write_size();

		/*
		 * Evict L2ARC buffers that will be overwritten.
		 */
		l2arc_evict(dev, size, B_FALSE);

		/*
		 * Write ARC buffers.
		 */
		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);

		/*
		 * Calculate interval between writes.
		 */
		next = l2arc_write_interval(begin, size, wrote);
		spa_config_exit(spa, SCL_L2ARC, dev);
	}

	/* Acknowledge the exit request from l2arc_stop() and terminate. */
	l2arc_thread_exit = 0;
	cv_broadcast(&l2arc_feed_thr_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
	thread_exit();
}

/*
 * Return B_TRUE if the given vdev is currently registered as an L2ARC
 * device (i.e. appears on the global l2arc_dev_list).
 */
boolean_t
l2arc_vdev_present(vdev_t *vd)
{
	l2arc_dev_t *dev;

	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev != NULL;
	    dev = list_next(l2arc_dev_list, dev)) {
		if (dev->l2ad_vdev == vd)
			break;
	}
	mutex_exit(&l2arc_dev_mtx);

	/* dev is NULL iff the loop ran off the end without a match. */
	return (dev != NULL);
}

/*
 * Add a vdev for use by the L2ARC.  By this point the spa has already
 * validated the vdev and opened it.
 */
void
l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{
	l2arc_dev_t *adddev;

	ASSERT(!l2arc_vdev_present(vd));

	vdev_ashift_optimize(vd);

	/*
	 * Create a new l2arc device entry.
	 */
	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
	adddev->l2ad_spa = spa;
	adddev->l2ad_vdev = vd;
	/* Usable region: everything past the vdev labels. */
	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
	adddev->l2ad_hand = adddev->l2ad_start;
	adddev->l2ad_first = B_TRUE;
	adddev->l2ad_writing = B_FALSE;

	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
	/*
	 * This is a list of all ARC buffers that are still valid on the
	 * device.
	 */
	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));

	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
	refcount_create(&adddev->l2ad_alloc);

	/*
	 * Add device to global list
	 */
	mutex_enter(&l2arc_dev_mtx);
	list_insert_head(l2arc_dev_list, adddev);
	atomic_inc_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);
}
6238185029Spjd */ 6239185029Spjdvoid 6240185029Spjdl2arc_remove_vdev(vdev_t *vd) 6241185029Spjd{ 6242185029Spjd l2arc_dev_t *dev, *nextdev, *remdev = NULL; 6243185029Spjd 6244185029Spjd /* 6245185029Spjd * Find the device by vdev 6246185029Spjd */ 6247185029Spjd mutex_enter(&l2arc_dev_mtx); 6248185029Spjd for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 6249185029Spjd nextdev = list_next(l2arc_dev_list, dev); 6250185029Spjd if (vd == dev->l2ad_vdev) { 6251185029Spjd remdev = dev; 6252185029Spjd break; 6253185029Spjd } 6254185029Spjd } 6255185029Spjd ASSERT(remdev != NULL); 6256185029Spjd 6257185029Spjd /* 6258185029Spjd * Remove device from global list 6259185029Spjd */ 6260185029Spjd list_remove(l2arc_dev_list, remdev); 6261185029Spjd l2arc_dev_last = NULL; /* may have been invalidated */ 6262185029Spjd atomic_dec_64(&l2arc_ndev); 6263185029Spjd mutex_exit(&l2arc_dev_mtx); 6264185029Spjd 6265185029Spjd /* 6266185029Spjd * Clear all buflists and ARC references. L2ARC device flush. 
6267185029Spjd */ 6268185029Spjd l2arc_evict(remdev, 0, B_TRUE); 6269286570Smav list_destroy(&remdev->l2ad_buflist); 6270286570Smav mutex_destroy(&remdev->l2ad_mtx); 6271286598Smav refcount_destroy(&remdev->l2ad_alloc); 6272185029Spjd kmem_free(remdev, sizeof (l2arc_dev_t)); 6273185029Spjd} 6274185029Spjd 6275185029Spjdvoid 6276185029Spjdl2arc_init(void) 6277185029Spjd{ 6278185029Spjd l2arc_thread_exit = 0; 6279185029Spjd l2arc_ndev = 0; 6280185029Spjd l2arc_writes_sent = 0; 6281185029Spjd l2arc_writes_done = 0; 6282185029Spjd 6283185029Spjd mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 6284185029Spjd cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 6285185029Spjd mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 6286185029Spjd mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 6287185029Spjd 6288185029Spjd l2arc_dev_list = &L2ARC_dev_list; 6289185029Spjd l2arc_free_on_write = &L2ARC_free_on_write; 6290185029Spjd list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 6291185029Spjd offsetof(l2arc_dev_t, l2ad_node)); 6292185029Spjd list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 6293185029Spjd offsetof(l2arc_data_free_t, l2df_list_node)); 6294185029Spjd} 6295185029Spjd 6296185029Spjdvoid 6297185029Spjdl2arc_fini(void) 6298185029Spjd{ 6299185029Spjd /* 6300185029Spjd * This is called from dmu_fini(), which is called from spa_fini(); 6301185029Spjd * Because of this, we can assume that all l2arc devices have 6302185029Spjd * already been removed when the pools themselves were removed. 
6303185029Spjd */ 6304185029Spjd 6305185029Spjd l2arc_do_free_on_write(); 6306185029Spjd 6307185029Spjd mutex_destroy(&l2arc_feed_thr_lock); 6308185029Spjd cv_destroy(&l2arc_feed_thr_cv); 6309185029Spjd mutex_destroy(&l2arc_dev_mtx); 6310185029Spjd mutex_destroy(&l2arc_free_on_write_mtx); 6311185029Spjd 6312185029Spjd list_destroy(l2arc_dev_list); 6313185029Spjd list_destroy(l2arc_free_on_write); 6314185029Spjd} 6315185029Spjd 6316185029Spjdvoid 6317185029Spjdl2arc_start(void) 6318185029Spjd{ 6319209962Smm if (!(spa_mode_global & FWRITE)) 6320185029Spjd return; 6321185029Spjd 6322185029Spjd (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 6323185029Spjd TS_RUN, minclsyspri); 6324185029Spjd} 6325185029Spjd 6326185029Spjdvoid 6327185029Spjdl2arc_stop(void) 6328185029Spjd{ 6329209962Smm if (!(spa_mode_global & FWRITE)) 6330185029Spjd return; 6331185029Spjd 6332185029Spjd mutex_enter(&l2arc_feed_thr_lock); 6333185029Spjd cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ 6334185029Spjd l2arc_thread_exit = 1; 6335185029Spjd while (l2arc_thread_exit != 0) 6336185029Spjd cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 6337185029Spjd mutex_exit(&l2arc_feed_thr_lock); 6338185029Spjd} 6339