arc.c revision 302265
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 *    Pages in its cache cannot be "locked" into memory. This makes
 *    the eviction algorithm simple: evict the last page in the list.
 *    This also makes the performance characteristics easy to reason
 *    about. Our cache is not so simple. At any given moment, some
 *    subset of the blocks in the cache are un-evictable because we
 *    have handed out a reference to them. Blocks are only evictable
 *    when there are no external references active. This makes
 *    eviction far more problematic: we choose to evict the evictable
 *    blocks that are the "lowest" in the list.
 *
 *    There are times when it is not possible to evict the requested
 *    space. In these circumstances we are unable to adjust the cache
 *    size. To prevent the cache growing unbounded at these times we
 *    implement a "cache throttle" that slows the flow of new data
 *    into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 *    Pages are evicted when the cache is full and there is a cache
 *    miss. Our model has a variable sized cache. It grows with
 *    high use, but also tries to react to memory pressure from the
 *    operating system: decreasing its size when system memory is
 *    tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 *    elements of the cache are therefore exactly the same size. So
 *    when adjusting the cache size following a cache miss, it's simply
 *    a matter of choosing a single page to evict. In our model, we
 *    have variable sized cache blocks (ranging from 512 bytes to
 *    128K bytes). We therefore choose a set of blocks to evict to make
 *    space for a cache miss that approximates as closely as possible
 *    the space used by the new block.
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */
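/*
 * Illustrative sketch (not a quote from the code below): evicting a set
 * of variable-sized blocks to cover a miss, as described in point 3
 * above, amounts to walking an evictable list until enough bytes have
 * been freed.  The helpers "evictable_head", "next_evictable" and
 * "evict_block" are hypothetical stand-ins for the multilist machinery
 * used later in this file:
 *
 *	uint64_t freed = 0;
 *	arc_buf_hdr_t *hdr = evictable_head(state);
 *	while (hdr != NULL && freed < new_block_size) {
 *		arc_buf_hdr_t *next = next_evictable(hdr);
 *		freed += evict_block(hdr);
 *		hdr = next;
 *	}
 */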
/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use: mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).  Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_clear_callback()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2ad_mtx on each vdev for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */
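/*
 * Illustrative sketch (an assumption about the usage pattern, not a
 * quote from the code below): because lock order is "list lock, then
 * hash lock" in the eviction path but the reverse in the lookup path,
 * the eviction side must use mutex_tryenter() and skip the buffer on
 * failure rather than block:
 *
 *	kmutex_t *hash_lock = HDR_LOCK(hdr);
 *	if (!mutex_tryenter(hash_lock)) {
 *		ARCSTAT_BUMP(arcstat_mutex_miss);
 *		continue;		(move on to the next buffer)
 *	}
 *	(... evict under both locks ...)
 *	mutex_exit(hash_lock);
 */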
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_compress.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
#include <sys/dsl_pool.h>
#include <sys/multilist.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#include <sys/racct.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>
#include <sys/trim_map.h>
#include <zfs_fletcher.h>
#include <sys/sdt.h>

#include <machine/vmparam.h>

#ifdef illumos
#ifndef _KERNEL
/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
boolean_t arc_watch = B_FALSE;
int arc_procfd;
#endif
#endif /* illumos */

static kmutex_t		arc_reclaim_lock;
static kcondvar_t	arc_reclaim_thread_cv;
static boolean_t	arc_reclaim_thread_exit;
static kcondvar_t	arc_reclaim_waiters_cv;

static kmutex_t		arc_user_evicts_lock;
static kcondvar_t	arc_user_evicts_cv;
static boolean_t	arc_user_evicts_thread_exit;

static kmutex_t		arc_dnlc_evicts_lock;
static kcondvar_t	arc_dnlc_evicts_cv;
static boolean_t	arc_dnlc_evicts_thread_exit;

uint_t arc_reduce_dnlc_percent = 3;

/*
 * The number of headers to evict in arc_evict_state_impl() before
 * dropping the sublist lock and evicting from another sublist. A lower
 * value means we're more likely to evict the "correct" header (i.e. the
 * oldest header in the arc state), but comes with higher overhead
 * (i.e. more invocations of arc_evict_state_impl()).
 */
int zfs_arc_evict_batch_limit = 10;

/*
 * The number of sublists used for each of the arc state lists. If this
 * is not set to a suitable value by the user, it will be configured to
 * the number of CPUs on the system in arc_init().
 */
int zfs_arc_num_sublists_per_state = 0;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
int		zfs_arc_overflow_shift = 8;

/* shift of arc_c for calculating both min and max arc_p */
static int		arc_p_min_shift = 4;

/* log2(fraction of arc to reclaim) */
static int		arc_shrink_shift = 7;

/*
 * log2(fraction of ARC which must be free to allow growing).
 * I.e. if there is less than arc_c >> arc_no_grow_shift free memory,
 * when reading a new block into the ARC, we will evict an equal-sized block
 * from the ARC.
 *
 * This must be less than arc_shrink_shift, so that when we shrink the ARC,
 * we will still not allow it to grow.
 */
int			arc_no_grow_shift = 5;
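/*
 * Worked example (illustrative only): with the default arc_no_grow_shift
 * of 5, an arc_c of 4GB yields a threshold of 4GB >> 5 = 128MB; roughly
 * speaking, once less than 128MB of memory is free, the ARC stops
 * growing and new reads displace old blocks instead.  The variable name
 * "free_bytes" below is a hypothetical stand-in:
 *
 *	if (free_bytes < (arc_c >> arc_no_grow_shift))
 *		arc_no_grow = B_TRUE;
 */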
/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int		arc_min_prefetch_lifespan;

/*
 * If this percent of memory is free, don't throttle.
 */
int arc_lotsfree_percent = 10;

static int arc_dead;
extern boolean_t zfs_prefetch_disable;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
uint64_t zfs_arc_meta_min = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
int zfs_disable_dup_eviction = 0;
uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
u_int zfs_arc_free_target = 0;

/* Absolute min for arc min / max is 16MB. */
static uint64_t arc_abs_min = 16 << 20;

static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS);
static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS);

#if defined(__FreeBSD__) && defined(_KERNEL)
static void
arc_free_target_init(void *unused __unused)
{

	zfs_arc_free_target = vm_pageout_wakeup_thresh;
}
SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
    arc_free_target_init, NULL);

TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
SYSCTL_DECL(_vfs_zfs);
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 | CTLFLAG_RWTUN,
    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size");
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 | CTLFLAG_RWTUN,
    0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
    &zfs_arc_average_blocksize, 0,
    "ARC average blocksize");
SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
    &arc_shrink_shift, 0,
    "log2(fraction of arc to reclaim)");

/*
 * We don't have a tunable for arc_free_target due to the dependency on
 * pagedaemon initialisation.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
    sysctl_vfs_zfs_arc_free_target, "IU",
    "Desired number of free pages below which ARC triggers reclaim");

static int
sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
{
	u_int val;
	int err;

	val = zfs_arc_free_target;
	err = sysctl_handle_int(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < minfree)
		return (EINVAL);
	if (val > vm_cnt.v_page_count)
		return (EINVAL);

	zfs_arc_free_target = val;

	return (0);
}
/*
 * Must be declared here, before the definition of the corresponding kstat
 * macro; reusing the same names would otherwise confuse the compiler.
 */
SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
    sysctl_vfs_zfs_arc_meta_limit, "QU",
    "ARC metadata limit");
#endif

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states.  These are
 * the only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */
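/*
 * Illustrative sketch of the state transitions described above (an
 * assumption about the high-level flow, not the exact logic of
 * arc_access() below): a block enters arc_mru on its first access and
 * is promoted to arc_mfu when accessed again while cached; a hit on a
 * ghost list signals that the corresponding portion of the cache was
 * sized too small:
 *
 *	anon --(written, gets a DVA)--> mru --(second hit)--> mfu
 *	mru --(evicted)--> mru_ghost --(hit)--> mfu (and arc_p grows)
 *	mfu --(evicted)--> mfu_ghost --(hit)--> mfu (and arc_p shrinks)
 */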
typedef struct arc_state {
	/*
	 * list of evictable buffers
	 */
	multilist_t arcs_list[ARC_BUFC_NUMTYPES];
	/*
	 * total amount of evictable data in this state
	 */
	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];
	/*
	 * total amount of data in this state; this includes: evictable,
	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
	 */
	refcount_t arcs_size;
} arc_state_t;
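/*
 * Illustrative sketch (an assumed shape of the multilist API from
 * sys/multilist.h; see arc_evict_state_impl() later in this file for
 * the authoritative usage): walking one sublist of evictable headers
 * only requires that sublist's lock, which is what lets eviction scale
 * across CPUs:
 *
 *	multilist_sublist_t *mls =
 *	    multilist_sublist_lock(&state->arcs_list[ARC_BUFC_DATA], idx);
 *	for (arc_buf_hdr_t *hdr = multilist_sublist_head(mls); hdr != NULL;
 *	    hdr = multilist_sublist_next(mls, hdr))
 *		(... consider hdr for eviction ...)
 *	multilist_sublist_unlock(mls);
 */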
/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
static arc_state_t ARC_l2c_only;

typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_allocated;
	kstat_named_t arcstat_deleted;
	/*
	 * Number of buffers that could not be evicted because the hash lock
	 * was held by another thread.  The lock may not necessarily be held
	 * by something using the same buffer, since hash locks are shared
	 * by multiple buffers.
	 */
	kstat_named_t arcstat_mutex_miss;
	/*
	 * Number of buffers skipped because they have I/O in progress, are
	 * indirect prefetch buffers that have not lived long enough, or are
	 * not from the spa we're trying to evict from.
	 */
	kstat_named_t arcstat_evict_skip;
	/*
	 * Number of times arc_evict_state() was unable to evict enough
	 * buffers to reach its target amount.
	 */
	kstat_named_t arcstat_evict_not_enough;
	kstat_named_t arcstat_evict_l2_cached;
	kstat_named_t arcstat_evict_l2_eligible;
	kstat_named_t arcstat_evict_l2_ineligible;
	kstat_named_t arcstat_evict_l2_skip;
	kstat_named_t arcstat_hash_elements;
	kstat_named_t arcstat_hash_elements_max;
	kstat_named_t arcstat_hash_collisions;
	kstat_named_t arcstat_hash_chains;
	kstat_named_t arcstat_hash_chain_max;
	kstat_named_t arcstat_p;
	kstat_named_t arcstat_c;
	kstat_named_t arcstat_c_min;
	kstat_named_t arcstat_c_max;
	kstat_named_t arcstat_size;
	/*
	 * Number of bytes consumed by internal ARC structures necessary
	 * for tracking purposes; these structures are not actually
	 * backed by ARC buffers. This includes arc_buf_hdr_t structures
	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
	 * caches), and arc_buf_t structures (allocated via arc_buf_t
	 * cache).
	 */
	kstat_named_t arcstat_hdr_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_DATA. This is generally consumed by buffers backing
	 * on disk user data (e.g. plain file contents).
	 */
	kstat_named_t arcstat_data_size;
	/*
	 * Number of bytes consumed by ARC buffers of type equal to
	 * ARC_BUFC_METADATA. This is generally consumed by buffers
	 * backing on disk data that is used for internal ZFS
	 * structures (e.g. ZAP, dnode, indirect blocks, etc).
	 */
	kstat_named_t arcstat_metadata_size;
	/*
	 * Number of bytes consumed by various buffers and structures
	 * not actually backed with ARC buffers. This includes bonus
	 * buffers (allocated directly via zio_buf_* functions),
	 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
	 * cache), and dnode_t structures (allocated via dnode_t cache).
	 */
	kstat_named_t arcstat_other_size;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_anon state. This includes *all* buffers in the arc_anon
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 */
	kstat_named_t arcstat_anon_size;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_DATA,
	 * residing in the arc_anon state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_anon_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
	 * residing in the arc_anon state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_anon_evictable_metadata;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_mru state. This includes *all* buffers in the arc_mru
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 */
	kstat_named_t arcstat_mru_size;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_DATA,
	 * residing in the arc_mru state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_mru_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that meet the
	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
	 * residing in the arc_mru state, and are eligible for eviction
	 * (e.g. have no outstanding holds on the buffer).
	 */
	kstat_named_t arcstat_mru_evictable_metadata;
	/*
	 * Total number of bytes that *would have been* consumed by ARC
	 * buffers in the arc_mru_ghost state. The key thing to note
	 * here, is the fact that this size doesn't actually indicate
	 * RAM consumption. The ghost lists only consist of headers and
	 * don't actually have ARC buffers linked off of these headers.
	 * Thus, *if* the headers had associated ARC buffers, these
	 * buffers *would have* consumed this number of bytes.
	 */
	kstat_named_t arcstat_mru_ghost_size;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
	 */
	kstat_named_t arcstat_mru_ghost_evictable_data;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
	 */
	kstat_named_t arcstat_mru_ghost_evictable_metadata;
	/*
	 * Total number of bytes consumed by ARC buffers residing in the
	 * arc_mfu state. This includes *all* buffers in the arc_mfu
	 * state; e.g. data, metadata, evictable, and unevictable buffers
	 * are all included in this value.
	 */
	kstat_named_t arcstat_mfu_size;
	/*
	 * Number of bytes consumed by ARC buffers that are eligible for
	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
	 * state.
	 */
	kstat_named_t arcstat_mfu_evictable_data;
	/*
	 * Number of bytes consumed by ARC buffers that are eligible for
	 * eviction, of type ARC_BUFC_METADATA, and reside in the
	 * arc_mfu state.
	 */
	kstat_named_t arcstat_mfu_evictable_metadata;
	/*
	 * Total number of bytes that *would have been* consumed by ARC
	 * buffers in the arc_mfu_ghost state. See the comment above
	 * arcstat_mru_ghost_size for more details.
	 */
	kstat_named_t arcstat_mfu_ghost_size;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
	 */
	kstat_named_t arcstat_mfu_ghost_evictable_data;
	/*
	 * Number of bytes that *would have been* consumed by ARC
	 * buffers that are eligible for eviction, of type
	 * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
	 */
	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
	kstat_named_t arcstat_l2_hits;
	kstat_named_t arcstat_l2_misses;
	kstat_named_t arcstat_l2_feeds;
	kstat_named_t arcstat_l2_rw_clash;
	kstat_named_t arcstat_l2_read_bytes;
	kstat_named_t arcstat_l2_write_bytes;
	kstat_named_t arcstat_l2_writes_sent;
	kstat_named_t arcstat_l2_writes_done;
	kstat_named_t arcstat_l2_writes_error;
	kstat_named_t arcstat_l2_writes_lock_retry;
	kstat_named_t arcstat_l2_evict_lock_retry;
	kstat_named_t arcstat_l2_evict_reading;
	kstat_named_t arcstat_l2_evict_l1cached;
	kstat_named_t arcstat_l2_free_on_write;
	kstat_named_t arcstat_l2_cdata_free_on_write;
	kstat_named_t arcstat_l2_abort_lowmem;
	kstat_named_t arcstat_l2_cksum_bad;
	kstat_named_t arcstat_l2_io_error;
	kstat_named_t arcstat_l2_size;
	kstat_named_t arcstat_l2_asize;
	kstat_named_t arcstat_l2_hdr_size;
	kstat_named_t arcstat_l2_compress_successes;
	kstat_named_t arcstat_l2_compress_zeros;
	kstat_named_t arcstat_l2_compress_failures;
	kstat_named_t arcstat_l2_padding_needed;
	kstat_named_t arcstat_l2_write_trylock_fail;
	kstat_named_t arcstat_l2_write_passed_headroom;
	kstat_named_t arcstat_l2_write_spa_mismatch;
	kstat_named_t arcstat_l2_write_in_l2;
	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
	kstat_named_t arcstat_l2_write_not_cacheable;
	kstat_named_t arcstat_l2_write_full;
	kstat_named_t arcstat_l2_write_buffer_iter;
	kstat_named_t arcstat_l2_write_pios;
	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
	kstat_named_t arcstat_l2_write_buffer_list_iter;
	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
	kstat_named_t arcstat_memory_throttle_count;
	kstat_named_t arcstat_duplicate_buffers;
	kstat_named_t arcstat_duplicate_buffers_size;
	kstat_named_t arcstat_duplicate_reads;
	kstat_named_t arcstat_meta_used;
	kstat_named_t arcstat_meta_limit;
	kstat_named_t arcstat_meta_max;
	kstat_named_t arcstat_meta_min;
	kstat_named_t arcstat_sync_wait_for_async;
	kstat_named_t arcstat_demand_hit_predictive_prefetch;
} arc_stats_t;
static arc_stats_t arc_stats = {
	{ "hits",			KSTAT_DATA_UINT64 },
	{ "misses",			KSTAT_DATA_UINT64 },
	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "mru_hits",			KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "mfu_hits",			KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "allocated",			KSTAT_DATA_UINT64 },
	{ "deleted",			KSTAT_DATA_UINT64 },
	{ "mutex_miss",			KSTAT_DATA_UINT64 },
	{ "evict_skip",			KSTAT_DATA_UINT64 },
	{ "evict_not_enough",		KSTAT_DATA_UINT64 },
	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
	{ "evict_l2_skip",		KSTAT_DATA_UINT64 },
	{ "hash_elements",		KSTAT_DATA_UINT64 },
	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
	{ "hash_collisions",		KSTAT_DATA_UINT64 },
	{ "hash_chains",		KSTAT_DATA_UINT64 },
	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
	{ "p",				KSTAT_DATA_UINT64 },
	{ "c",				KSTAT_DATA_UINT64 },
	{ "c_min",			KSTAT_DATA_UINT64 },
	{ "c_max",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 },
	{ "hdr_size",			KSTAT_DATA_UINT64 },
	{ "data_size",			KSTAT_DATA_UINT64 },
	{ "metadata_size",		KSTAT_DATA_UINT64 },
	{ "other_size",			KSTAT_DATA_UINT64 },
	{ "anon_size",			KSTAT_DATA_UINT64 },
	{ "anon_evictable_data",	KSTAT_DATA_UINT64 },
	{ "anon_evictable_metadata",	KSTAT_DATA_UINT64 },
	{ "mru_size",			KSTAT_DATA_UINT64 },
	{ "mru_evictable_data",		KSTAT_DATA_UINT64 },
	{ "mru_evictable_metadata",	KSTAT_DATA_UINT64 },
	{ "mru_ghost_size",		KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_data",	KSTAT_DATA_UINT64 },
	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "mfu_size",			KSTAT_DATA_UINT64 },
	{ "mfu_evictable_data",		KSTAT_DATA_UINT64 },
	{ "mfu_evictable_metadata",	KSTAT_DATA_UINT64 },
	{ "mfu_ghost_size",		KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_data",	KSTAT_DATA_UINT64 },
	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
	{ "l2_hits",			KSTAT_DATA_UINT64 },
	{ "l2_misses",			KSTAT_DATA_UINT64 },
	{ "l2_feeds",			KSTAT_DATA_UINT64 },
	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
	{ "l2_writes_lock_retry",	KSTAT_DATA_UINT64 },
	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
	{ "l2_cdata_free_on_write",	KSTAT_DATA_UINT64 },
	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
	{ "l2_io_error",		KSTAT_DATA_UINT64 },
	{ "l2_size",			KSTAT_DATA_UINT64 },
	{ "l2_asize",			KSTAT_DATA_UINT64 },
	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
	{ "l2_padding_needed",		KSTAT_DATA_UINT64 },
	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
	{ "l2_write_full",		KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
	{ "duplicate_reads",		KSTAT_DATA_UINT64 },
	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
	{ "arc_meta_min",		KSTAT_DATA_UINT64 },
	{ "sync_wait_for_async",	KSTAT_DATA_UINT64 },
	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
};
#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val))

#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

#define	ARCSTAT_MAX(stat, val) {					\
	uint64_t m;							\
	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
		continue;						\
}

#define	ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)

/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		}							\
	} else {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		}							\
	}
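/*
 * Example usage (this mirrors how the hit/miss paths in this file invoke
 * the macro): a demand hit on a data buffer increments
 * arcstat_demand_data_hits, a prefetch hit on a metadata buffer
 * increments arcstat_prefetch_metadata_hits, and so on:
 *
 *	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch,
 *	    !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
 */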
kstat_t			*arc_ksp;
static arc_state_t	*arc_anon;
static arc_state_t	*arc_mru;
static arc_state_t	*arc_mru_ghost;
static arc_state_t	*arc_mfu;
static arc_state_t	*arc_mfu_ghost;
static arc_state_t	*arc_l2c_only;

/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them. For these variables, we therefore define them to be in
 * terms of the statistic variable. This assures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
 */
#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
#define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
#define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
#define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */

#define	L2ARC_IS_VALID_COMPRESS(_c_) \
	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)

static int		arc_no_grow;	/* Don't try to grow cache size */
static uint64_t		arc_tempreserve;
static uint64_t		arc_loaned_bytes;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
	void			*acb_private;
	arc_done_func_t		*acb_done;
	arc_buf_t		*acb_buf;
	zio_t			*acb_zio_dummy;
	arc_callback_t		*acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

struct arc_write_callback {
	void		*awcb_private;
	arc_done_func_t	*awcb_ready;
	arc_done_func_t	*awcb_physdone;
	arc_done_func_t	*awcb_done;
	arc_buf_t	*awcb_buf;
};

/*
 * ARC buffers are separated into multiple structs as a memory saving measure:
 *   - Common fields struct, always defined, and embedded within it:
 *       - L2-only fields, always allocated but undefined when not in L2ARC
 *       - L1-only fields, only allocated when in L1ARC
 *
 *   Buffer in L1                     Buffer only in L2
 *    +------------------------+      +------------------------+
 *    | arc_buf_hdr_t          |      | arc_buf_hdr_t          |
 *    |                        |      |                        |
 *    |                        |      |                        |
 *    |                        |      |                        |
 *    +------------------------+      +------------------------+
 *    | l2arc_buf_hdr_t        |      | l2arc_buf_hdr_t        |
 *    | (undefined if L1-only) |      |                        |
 *    +------------------------+      +------------------------+
 *    | l1arc_buf_hdr_t        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    |                        |
 *    +------------------------+
 *
 * Because it's possible for the L2ARC to become extremely large, we can wind
 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
 * is minimized by only allocating the fields necessary for an L1-cached buffer
 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
 * words in pointers. arc_hdr_realloc() is used to switch a header between
 * these two allocation states.
 */
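/*
 * Illustrative note on the layout above: because the l1arc_buf_hdr_t is
 * the last member of arc_buf_hdr_t, the size of an L2-only header is
 * simply the offset of that member, which is exactly how HDR_FULL_SIZE
 * and HDR_L2ONLY_SIZE are defined further below:
 *
 *	sizeof (arc_buf_hdr_t)			full header, L1 + L2
 *	offsetof(arc_buf_hdr_t, b_l1hdr)	truncated, L2-only header
 */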
typedef struct l1arc_buf_hdr {
	kmutex_t		b_freeze_lock;
#ifdef ZFS_DEBUG
	/*
	 * used for debugging with kmem_flags - by allocating and freeing
	 * b_thawed when the buffer is thawed, we get a record of the stack
	 * trace that thawed it.
	 */
	void			*b_thawed;
#endif

	arc_buf_t		*b_buf;
	uint32_t		b_datacnt;
	/* for waiting on writes to complete */
	kcondvar_t		b_cv;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	multilist_node_t	b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;

	/* self protecting */
	refcount_t		b_refcnt;

	arc_callback_t		*b_acb;
	/* temporary buffer holder for in-flight compressed or padded data */
	void			*b_tmp_cdata;
} l1arc_buf_hdr_t;

typedef struct l2arc_dev l2arc_dev_t;

typedef struct l2arc_buf_hdr {
	/* protected by arc_buf_hdr mutex */
	l2arc_dev_t		*b_dev;		/* L2ARC device */
	uint64_t		b_daddr;	/* disk address, offset byte */
	/* real alloc'd buffer size depending on b_compress applied */
	int32_t			b_asize;
	uint8_t			b_compress;

	list_node_t		b_l2node;
} l2arc_buf_hdr_t;
struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;
	/*
	 * Even though this checksum is only set/verified when a buffer is in
	 * the L1 cache, it needs to be in the set of common fields because it
	 * must be preserved from the time before a buffer is written out to
	 * L2ARC until after it is read back in.
	 */
	zio_cksum_t		*b_freeze_cksum;

	arc_buf_hdr_t		*b_hash_next;
	arc_flags_t		b_flags;

	/* immutable */
	int32_t			b_size;
	uint64_t		b_spa;

	/* L2ARC fields. Undefined when not in L2ARC. */
	l2arc_buf_hdr_t		b_l2hdr;
	/* L1ARC fields. Undefined when in l2arc_only state */
	l1arc_buf_hdr_t		b_l1hdr;
};

#if defined(__FreeBSD__) && defined(_KERNEL)
static int
sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = arc_meta_limit;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val <= 0 || val > arc_c_max)
		return (EINVAL);

	arc_meta_limit = val;
	return (0);
}

static int
sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = zfs_arc_max;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < arc_abs_min || val > kmem_size())
		return (EINVAL);
	if (val < arc_c_min)
		return (EINVAL);
	if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit)
		return (EINVAL);

	arc_c_max = val;

	arc_c = arc_c_max;
	arc_p = (arc_c >> 1);

	if (zfs_arc_meta_limit == 0) {
		/* limit meta-data to 1/4 of the arc capacity */
		arc_meta_limit = arc_c_max / 4;
	}

	/* if kmem_flags are set, let's try to use less memory */
	if (kmem_debugging())
		arc_c = arc_c / 2;

	zfs_arc_max = arc_c;

	return (0);
}

static int
sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS)
{
	uint64_t val;
	int err;

	val = zfs_arc_min;
	err = sysctl_handle_64(oidp, &val, 0, req);
	if (err != 0 || req->newptr == NULL)
		return (err);

	if (val < arc_abs_min || val > arc_c_max)
		return (EINVAL);

	arc_c_min = val;

	if (zfs_arc_meta_min == 0)
		arc_meta_min = arc_c_min / 2;

	if (arc_c < arc_c_min)
		arc_c = arc_c_min;

	zfs_arc_min = arc_c_min;

	return (0);
}
#endif
static arc_buf_t *arc_eviction_list;
static arc_buf_hdr_t arc_eviction_hdr;

#define	GHOST_STATE(state)	\
	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
	(state) == arc_l2c_only)

#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)

#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
#define	HDR_L2COMPRESS(hdr)	((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
#define	HDR_L2_READING(hdr)	\
	    (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
	    ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)

#define	HDR_ISTYPE_METADATA(hdr)	\
	    ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
#define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))

#define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
#define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
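/*
 * Illustrative sketch (an assumption about the usage pattern; the real
 * checks are scattered throughout this file): any code path must verify
 * that the L1 portion of a header exists before touching b_l1hdr
 * fields, since an L2-only header is truncated at b_l1hdr:
 *
 *	if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_state == arc_mru)
 *		(... safe: full header, resident in the MRU state ...)
 */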
/*
 * Other sizes
 */

#define	HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
#define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))

/*
 * Hash table routines
 */

#define	HT_LOCK_PAD	CACHE_LINE_SIZE

struct ht_lock {
	kmutex_t	ht_lock;
#ifdef _KERNEL
	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

#define	BUF_LOCKS 256
typedef struct buf_hash_table {
	uint64_t ht_mask;
	arc_buf_hdr_t **ht_table;
	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;

#define	BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define	HDR_LOCK(hdr) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))

uint64_t zfs_crc64_table[256];
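/*
 * Illustrative sketch (assumed caller pattern, mirroring buf_hash_find()
 * below): the bucket index and its lock are both derived from the
 * (spa, dva, birth) triple, so a lookup hashes once and takes one of
 * the BUF_LOCKS shared locks:
 *
 *	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 *	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 *	mutex_enter(hash_lock);
 *	(... walk the chain at buf_hash_table.ht_table[idx] ...)
 *	mutex_exit(hash_lock);
 */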
/*
 * Level 2 ARC
 */

#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
#define	L2ARC_HEADROOM		2			/* num of writes */
/*
 * If we discover during ARC scan any buffers to be compressed, we boost
 * our headroom for the next scanning cycle by this percentage multiple.
 */
#define	L2ARC_HEADROOM_BOOST	200
#define	L2ARC_FEED_SECS		1		/* caching interval secs */
#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */

/*
 * Used to distinguish headers that are being processed by
 * l2arc_write_buffers(), but have yet to be assigned to an l2arc disk
 * address. This can happen when the header is added to the l2arc's list
 * of buffers to write in the first stage of l2arc_write_buffers(), but
 * has not yet been written out, which happens in the second stage of
 * l2arc_write_buffers().
 */
#define	L2ARC_ADDR_UNSET	((uint64_t)(-1))

#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)

/* L2ARC Performance Tunables */
uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
    &l2arc_write_max, 0, "max write size");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
    &l2arc_write_boost, 0, "extra write during warmup");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
    &l2arc_headroom, 0, "number of dev writes");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
    &l2arc_feed_secs, 0, "interval seconds");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
    &l2arc_feed_min_ms, 0, "min interval milliseconds");

SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
    &l2arc_feed_again, 0, "turbo warmup");
SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
    &l2arc_norw, 0, "no reads during writes");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
    &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0,
    "size of metadata in anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0,
    "size of data in anonymous state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
    &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0,
    "size of metadata in mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
    "size of metadata in mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
    "size of data in mru ghost state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
    &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0,
    "size of metadata in mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
    &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
    "size of metadata in mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
    "size of data in mfu ghost state");

SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
    &ARC_l2c_only.arcs_size.rc_count, 0, "size of l2c_only state");

/*
 * L2ARC Internals
 */
struct l2arc_dev {
	vdev_t			*l2ad_vdev;	/* vdev */
	spa_t			*l2ad_spa;	/* spa */
	uint64_t		l2ad_hand;	/* next write location */
	uint64_t		l2ad_start;	/* first addr on device */
	uint64_t		l2ad_end;	/* last addr on device */
	boolean_t		l2ad_first;	/* first sweep through */
	boolean_t		l2ad_writing;	/* currently writing */
	kmutex_t		l2ad_mtx;	/* lock for buffer list */
	list_t			l2ad_buflist;	/* buffer list */
	list_node_t		l2ad_node;	/* device list node */
	refcount_t		l2ad_alloc;	/* allocated bytes */
};
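/*
 * Illustrative sketch (an assumption about how the feed thread applies
 * the tunables above; the write-sizing helper further down in the file
 * is authoritative): until the ARC is warm, each feed cycle is allowed
 * to write an extra l2arc_write_boost bytes on top of l2arc_write_max:
 *
 *	uint64_t size = l2arc_write_max;
 *	if (arc_warm == B_FALSE)
 *		size += l2arc_write_boost;
 */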
static list_t L2ARC_dev_list;			/* device list */
static list_t *l2arc_dev_list;			/* device list pointer */
static kmutex_t l2arc_dev_mtx;			/* device list mutex */
static l2arc_dev_t *l2arc_dev_last;		/* last device used */
static list_t L2ARC_free_on_write;		/* free after write buf list */
static list_t *l2arc_free_on_write;		/* free after write list ptr */
static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
static uint64_t l2arc_ndev;			/* number of devices */

typedef struct l2arc_read_callback {
	arc_buf_t		*l2rcb_buf;		/* read buffer */
	spa_t			*l2rcb_spa;		/* spa */
	blkptr_t		l2rcb_bp;		/* original blkptr */
	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
	int			l2rcb_flags;		/* original flags */
	enum zio_compress	l2rcb_compress;		/* applied compress */
	void			*l2rcb_data;		/* temporary buffer */
} l2arc_read_callback_t;

typedef struct l2arc_write_callback {
	l2arc_dev_t	*l2wcb_dev;		/* device info */
	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
} l2arc_write_callback_t;

typedef struct l2arc_data_free {
	/* protected by l2arc_free_on_write_mtx */
	void		*l2df_data;
	size_t		l2df_size;
	void		(*l2df_func)(void *, size_t);
	list_node_t	l2df_list_node;
} l2arc_data_free_t;

static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;

static void arc_get_data_buf(arc_buf_t *);
static void arc_access(arc_buf_hdr_t *, kmutex_t *);
static boolean_t arc_is_overflowing();
static void arc_buf_watch(arc_buf_t *);

static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
static uint32_t arc_bufc_to_flags(arc_buf_contents_t);

static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
static void l2arc_read_done(zio_t *);

static boolean_t l2arc_transform_buf(arc_buf_hdr_t *, boolean_t);
static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
static void l2arc_release_cdata_buf(arc_buf_hdr_t *);

static void
l2arc_trim(const arc_buf_hdr_t *hdr)
{
	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;

	ASSERT(HDR_HAS_L2HDR(hdr));
	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));

	if (hdr->b_l2hdr.b_daddr == L2ARC_ADDR_UNSET)
		return;
	if (hdr->b_l2hdr.b_asize != 0) {
		trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr,
		    hdr->b_l2hdr.b_asize, 0);
	} else {
		ASSERT3U(hdr->b_l2hdr.b_compress, ==, ZIO_COMPRESS_EMPTY);
	}
}
*l2rcb_buf; /* read buffer */ 1170251478Sdelphij spa_t *l2rcb_spa; /* spa */ 1171251478Sdelphij blkptr_t l2rcb_bp; /* original blkptr */ 1172268123Sdelphij zbookmark_phys_t l2rcb_zb; /* original bookmark */ 1173251478Sdelphij int l2rcb_flags; /* original flags */ 1174251478Sdelphij enum zio_compress l2rcb_compress; /* applied compress */ 1175297848Savg void *l2rcb_data; /* temporary buffer */ 1176185029Spjd} l2arc_read_callback_t; 1177185029Spjd 1178185029Spjdtypedef struct l2arc_write_callback { 1179185029Spjd l2arc_dev_t *l2wcb_dev; /* device info */ 1180185029Spjd arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 1181185029Spjd} l2arc_write_callback_t; 1182185029Spjd 1183185029Spjdtypedef struct l2arc_data_free { 1184185029Spjd /* protected by l2arc_free_on_write_mtx */ 1185185029Spjd void *l2df_data; 1186185029Spjd size_t l2df_size; 1187185029Spjd void (*l2df_func)(void *, size_t); 1188185029Spjd list_node_t l2df_list_node; 1189185029Spjd} l2arc_data_free_t; 1190185029Spjd 1191185029Spjdstatic kmutex_t l2arc_feed_thr_lock; 1192185029Spjdstatic kcondvar_t l2arc_feed_thr_cv; 1193185029Spjdstatic uint8_t l2arc_thread_exit; 1194185029Spjd 1195275811Sdelphijstatic void arc_get_data_buf(arc_buf_t *); 1196275811Sdelphijstatic void arc_access(arc_buf_hdr_t *, kmutex_t *); 1197286763Smavstatic boolean_t arc_is_overflowing(); 1198275811Sdelphijstatic void arc_buf_watch(arc_buf_t *); 1199275811Sdelphij 1200286570Smavstatic arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); 1201286570Smavstatic uint32_t arc_bufc_to_flags(arc_buf_contents_t); 1202286570Smav 1203275811Sdelphijstatic boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); 1204275811Sdelphijstatic void l2arc_read_done(zio_t *); 1205185029Spjd 1206297848Savgstatic boolean_t l2arc_transform_buf(arc_buf_hdr_t *, boolean_t); 1207275811Sdelphijstatic void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); 1208275811Sdelphijstatic void l2arc_release_cdata_buf(arc_buf_hdr_t *); 1209251478Sdelphij 1210290191Savgstatic void 1211290191Savgl2arc_trim(const arc_buf_hdr_t *hdr) 1212290191Savg{ 1213290191Savg l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 1214290191Savg 1215290191Savg ASSERT(HDR_HAS_L2HDR(hdr)); 1216290191Savg ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); 1217290191Savg 1218290191Savg if (hdr->b_l2hdr.b_daddr == L2ARC_ADDR_UNSET) 1219290191Savg return; 1220290191Savg if (hdr->b_l2hdr.b_asize != 0) { 1221290191Savg trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, 1222290191Savg hdr->b_l2hdr.b_asize, 0); 1223290191Savg } else { 1224290191Savg ASSERT3U(hdr->b_l2hdr.b_compress, ==, ZIO_COMPRESS_EMPTY); 1225290191Savg } 1226290191Savg} 1227290191Savg 1228168404Spjdstatic uint64_t 1229209962Smmbuf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 1230168404Spjd{ 1231168404Spjd uint8_t *vdva = (uint8_t *)dva; 1232168404Spjd uint64_t crc = -1ULL; 1233168404Spjd int i; 1234168404Spjd 1235168404Spjd ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 1236168404Spjd 1237168404Spjd for (i = 0; i < sizeof (dva_t); i++) 1238168404Spjd crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 1239168404Spjd 1240209962Smm crc ^= (spa>>8) ^ birth; 1241168404Spjd 1242168404Spjd return (crc); 1243168404Spjd} 1244168404Spjd 1245168404Spjd#define BUF_EMPTY(buf) \ 1246168404Spjd ((buf)->b_dva.dva_word[0] == 0 && \ 1247286570Smav (buf)->b_dva.dva_word[1] == 0) 1248168404Spjd 1249168404Spjd#define BUF_EQUAL(spa, dva, birth, buf) \ 1250168404Spjd ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 1251168404Spjd ((buf)->b_dva.dva_word[1] == 
(dva)->dva_word[1]) && \ 1252168404Spjd ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 1253168404Spjd 1254219089Spjdstatic void 1255219089Spjdbuf_discard_identity(arc_buf_hdr_t *hdr) 1256219089Spjd{ 1257219089Spjd hdr->b_dva.dva_word[0] = 0; 1258219089Spjd hdr->b_dva.dva_word[1] = 0; 1259219089Spjd hdr->b_birth = 0; 1260219089Spjd} 1261219089Spjd 1262168404Spjdstatic arc_buf_hdr_t * 1263268075Sdelphijbuf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) 1264168404Spjd{ 1265268075Sdelphij const dva_t *dva = BP_IDENTITY(bp); 1266268075Sdelphij uint64_t birth = BP_PHYSICAL_BIRTH(bp); 1267168404Spjd uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 1268168404Spjd kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1269275811Sdelphij arc_buf_hdr_t *hdr; 1270168404Spjd 1271168404Spjd mutex_enter(hash_lock); 1272275811Sdelphij for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; 1273275811Sdelphij hdr = hdr->b_hash_next) { 1274275811Sdelphij if (BUF_EQUAL(spa, dva, birth, hdr)) { 1275168404Spjd *lockp = hash_lock; 1276275811Sdelphij return (hdr); 1277168404Spjd } 1278168404Spjd } 1279168404Spjd mutex_exit(hash_lock); 1280168404Spjd *lockp = NULL; 1281168404Spjd return (NULL); 1282168404Spjd} 1283168404Spjd 1284168404Spjd/* 1285168404Spjd * Insert an entry into the hash table. If there is already an element 1286168404Spjd * equal to elem in the hash table, then the already existing element 1287168404Spjd * will be returned and the new element will not be inserted. 1288168404Spjd * Otherwise returns NULL. 1289286570Smav * If lockp == NULL, the caller is assumed to already hold the hash lock. 1290168404Spjd */ 1291168404Spjdstatic arc_buf_hdr_t * 1292275811Sdelphijbuf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) 1293168404Spjd{ 1294275811Sdelphij uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1295168404Spjd kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1296275811Sdelphij arc_buf_hdr_t *fhdr; 1297168404Spjd uint32_t i; 1298168404Spjd 1299275811Sdelphij ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); 1300275811Sdelphij ASSERT(hdr->b_birth != 0); 1301275811Sdelphij ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1302286570Smav 1303286570Smav if (lockp != NULL) { 1304286570Smav *lockp = hash_lock; 1305286570Smav mutex_enter(hash_lock); 1306286570Smav } else { 1307286570Smav ASSERT(MUTEX_HELD(hash_lock)); 1308286570Smav } 1309286570Smav 1310275811Sdelphij for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; 1311275811Sdelphij fhdr = fhdr->b_hash_next, i++) { 1312275811Sdelphij if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) 1313275811Sdelphij return (fhdr); 1314168404Spjd } 1315168404Spjd 1316275811Sdelphij hdr->b_hash_next = buf_hash_table.ht_table[idx]; 1317275811Sdelphij buf_hash_table.ht_table[idx] = hdr; 1318275811Sdelphij hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 1319168404Spjd 1320168404Spjd /* collect some hash table performance data */ 1321168404Spjd if (i > 0) { 1322168404Spjd ARCSTAT_BUMP(arcstat_hash_collisions); 1323168404Spjd if (i == 1) 1324168404Spjd ARCSTAT_BUMP(arcstat_hash_chains); 1325168404Spjd 1326168404Spjd ARCSTAT_MAX(arcstat_hash_chain_max, i); 1327168404Spjd } 1328168404Spjd 1329168404Spjd ARCSTAT_BUMP(arcstat_hash_elements); 1330168404Spjd ARCSTAT_MAXSTAT(arcstat_hash_elements); 1331168404Spjd 1332168404Spjd return (NULL); 1333168404Spjd} 1334168404Spjd 1335168404Spjdstatic void 1336275811Sdelphijbuf_hash_remove(arc_buf_hdr_t *hdr) 1337168404Spjd{ 1338275811Sdelphij arc_buf_hdr_t *fhdr, **hdrp; 1339275811Sdelphij uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, 
&hdr->b_dva, hdr->b_birth); 1340168404Spjd 1341168404Spjd ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 1342275811Sdelphij ASSERT(HDR_IN_HASH_TABLE(hdr)); 1343168404Spjd 1344275811Sdelphij hdrp = &buf_hash_table.ht_table[idx]; 1345275811Sdelphij while ((fhdr = *hdrp) != hdr) { 1346275811Sdelphij ASSERT(fhdr != NULL); 1347275811Sdelphij hdrp = &fhdr->b_hash_next; 1348168404Spjd } 1349275811Sdelphij *hdrp = hdr->b_hash_next; 1350275811Sdelphij hdr->b_hash_next = NULL; 1351275811Sdelphij hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; 1352168404Spjd 1353168404Spjd /* collect some hash table performance data */ 1354168404Spjd ARCSTAT_BUMPDOWN(arcstat_hash_elements); 1355168404Spjd 1356168404Spjd if (buf_hash_table.ht_table[idx] && 1357168404Spjd buf_hash_table.ht_table[idx]->b_hash_next == NULL) 1358168404Spjd ARCSTAT_BUMPDOWN(arcstat_hash_chains); 1359168404Spjd} 1360168404Spjd 1361168404Spjd/* 1362168404Spjd * Global data structures and functions for the buf kmem cache. 1363168404Spjd */ 1364286570Smavstatic kmem_cache_t *hdr_full_cache; 1365286570Smavstatic kmem_cache_t *hdr_l2only_cache; 1366168404Spjdstatic kmem_cache_t *buf_cache; 1367168404Spjd 1368168404Spjdstatic void 1369168404Spjdbuf_fini(void) 1370168404Spjd{ 1371168404Spjd int i; 1372168404Spjd 1373168404Spjd kmem_free(buf_hash_table.ht_table, 1374168404Spjd (buf_hash_table.ht_mask + 1) * sizeof (void *)); 1375168404Spjd for (i = 0; i < BUF_LOCKS; i++) 1376168404Spjd mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 1377286570Smav kmem_cache_destroy(hdr_full_cache); 1378286570Smav kmem_cache_destroy(hdr_l2only_cache); 1379168404Spjd kmem_cache_destroy(buf_cache); 1380168404Spjd} 1381168404Spjd 1382168404Spjd/* 1383168404Spjd * Constructor callback - called when the cache is empty 1384168404Spjd * and a new buf is requested. 1385168404Spjd */ 1386168404Spjd/* ARGSUSED */ 1387168404Spjdstatic int 1388286570Smavhdr_full_cons(void *vbuf, void *unused, int kmflag) 1389168404Spjd{ 1390275811Sdelphij arc_buf_hdr_t *hdr = vbuf; 1391168404Spjd 1392286570Smav bzero(hdr, HDR_FULL_SIZE); 1393286570Smav cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); 1394286570Smav refcount_create(&hdr->b_l1hdr.b_refcnt); 1395286570Smav mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 1396286763Smav multilist_link_init(&hdr->b_l1hdr.b_arc_node); 1397286570Smav arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1398185029Spjd 1399168404Spjd return (0); 1400168404Spjd} 1401168404Spjd 1402185029Spjd/* ARGSUSED */ 1403185029Spjdstatic int 1404286570Smavhdr_l2only_cons(void *vbuf, void *unused, int kmflag) 1405286570Smav{ 1406286570Smav arc_buf_hdr_t *hdr = vbuf; 1407286570Smav 1408286570Smav bzero(hdr, HDR_L2ONLY_SIZE); 1409286570Smav arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1410286570Smav 1411286570Smav return (0); 1412286570Smav} 1413286570Smav 1414286570Smav/* ARGSUSED */ 1415286570Smavstatic int 1416185029Spjdbuf_cons(void *vbuf, void *unused, int kmflag) 1417185029Spjd{ 1418185029Spjd arc_buf_t *buf = vbuf; 1419185029Spjd 1420185029Spjd bzero(buf, sizeof (arc_buf_t)); 1421219089Spjd mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 1422208373Smm arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1423208373Smm 1424185029Spjd return (0); 1425185029Spjd} 1426185029Spjd 1427168404Spjd/* 1428168404Spjd * Destructor callback - called when a cached buf is 1429168404Spjd * no longer required. 
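 *
 * For illustration (an editorial sketch; the real wiring is in buf_init()
 * below), the constructor/destructor pairs are registered when the kmem
 * caches are created, e.g.:
 *
 *	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 *	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 *
 * so buf_dest() undoes exactly what buf_cons() set up (the evict-lock
 * mutex and the ARC space accounting).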
1430168404Spjd */ 1431168404Spjd/* ARGSUSED */ 1432168404Spjdstatic void 1433286570Smavhdr_full_dest(void *vbuf, void *unused) 1434168404Spjd{ 1435275811Sdelphij arc_buf_hdr_t *hdr = vbuf; 1436168404Spjd 1437275811Sdelphij ASSERT(BUF_EMPTY(hdr)); 1438286570Smav cv_destroy(&hdr->b_l1hdr.b_cv); 1439286570Smav refcount_destroy(&hdr->b_l1hdr.b_refcnt); 1440286570Smav mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); 1441286763Smav ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 1442286570Smav arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1443168404Spjd} 1444168404Spjd 1445185029Spjd/* ARGSUSED */ 1446185029Spjdstatic void 1447286570Smavhdr_l2only_dest(void *vbuf, void *unused) 1448286570Smav{ 1449286570Smav arc_buf_hdr_t *hdr = vbuf; 1450286570Smav 1451286570Smav ASSERT(BUF_EMPTY(hdr)); 1452286570Smav arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1453286570Smav} 1454286570Smav 1455286570Smav/* ARGSUSED */ 1456286570Smavstatic void 1457185029Spjdbuf_dest(void *vbuf, void *unused) 1458185029Spjd{ 1459185029Spjd arc_buf_t *buf = vbuf; 1460185029Spjd 1461219089Spjd mutex_destroy(&buf->b_evict_lock); 1462208373Smm arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1463185029Spjd} 1464185029Spjd 1465168404Spjd/* 1466168404Spjd * Reclaim callback -- invoked when memory is low. 1467168404Spjd */ 1468168404Spjd/* ARGSUSED */ 1469168404Spjdstatic void 1470168404Spjdhdr_recl(void *unused) 1471168404Spjd{ 1472168404Spjd dprintf("hdr_recl called\n"); 1473168404Spjd /* 1474168404Spjd * umem calls the reclaim func when we destroy the buf cache, 1475168404Spjd * which is after we do arc_fini(). 1476168404Spjd */ 1477168404Spjd if (!arc_dead) 1478286763Smav cv_signal(&arc_reclaim_thread_cv); 1479168404Spjd} 1480168404Spjd 1481168404Spjdstatic void 1482168404Spjdbuf_init(void) 1483168404Spjd{ 1484168404Spjd uint64_t *ct; 1485168404Spjd uint64_t hsize = 1ULL << 12; 1486168404Spjd int i, j; 1487168404Spjd 1488168404Spjd /* 1489168404Spjd * The hash table is big enough to fill all of physical memory 1490269230Sdelphij * with an average block size of zfs_arc_average_blocksize (default 8K). 1491269230Sdelphij * By default, the table will take up 1492269230Sdelphij * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 
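 *
 * A worked example (editorial; assumes 8-byte pointers and the default
 * zfs_arc_average_blocksize of 8K): with 16 GB of physical memory the
 * loop below doubles hsize from 2^12 until hsize * 8K covers 16 GB,
 * i.e. hsize = 2^21 buckets, so the table occupies
 * 2^21 * sizeof (void *) = 16 MB -- exactly the 1 MB per GB noted
 * above.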
1493168404Spjd */ 1494269230Sdelphij while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) 1495168404Spjd hsize <<= 1; 1496168404Spjdretry: 1497168404Spjd buf_hash_table.ht_mask = hsize - 1; 1498168404Spjd buf_hash_table.ht_table = 1499168404Spjd kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 1500168404Spjd if (buf_hash_table.ht_table == NULL) { 1501168404Spjd ASSERT(hsize > (1ULL << 8)); 1502168404Spjd hsize >>= 1; 1503168404Spjd goto retry; 1504168404Spjd } 1505168404Spjd 1506286570Smav hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 1507286570Smav 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); 1508286570Smav hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", 1509286570Smav HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, 1510286570Smav NULL, NULL, 0); 1511168404Spjd buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 1512185029Spjd 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1513168404Spjd 1514168404Spjd for (i = 0; i < 256; i++) 1515168404Spjd for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 1516168404Spjd *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 1517168404Spjd 1518168404Spjd for (i = 0; i < BUF_LOCKS; i++) { 1519168404Spjd mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1520168404Spjd NULL, MUTEX_DEFAULT, NULL); 1521168404Spjd } 1522168404Spjd} 1523168404Spjd 1524286570Smav/* 1525286570Smav * Transition between the two allocation states for the arc_buf_hdr struct. 1526286570Smav * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without 1527286570Smav * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller 1528286570Smav * version is used when a cache buffer is only in the L2ARC in order to reduce 1529286570Smav * memory usage. 1530286570Smav */ 1531286570Smavstatic arc_buf_hdr_t * 1532286570Smavarc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) 1533286570Smav{ 1534286570Smav ASSERT(HDR_HAS_L2HDR(hdr)); 1535286570Smav 1536286570Smav arc_buf_hdr_t *nhdr; 1537286570Smav l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 1538286570Smav 1539286570Smav ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || 1540286570Smav (old == hdr_l2only_cache && new == hdr_full_cache)); 1541286570Smav 1542286570Smav nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); 1543286570Smav 1544286570Smav ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); 1545286570Smav buf_hash_remove(hdr); 1546286570Smav 1547286570Smav bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); 1548286598Smav 1549286570Smav if (new == hdr_full_cache) { 1550286570Smav nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 1551286570Smav /* 1552286570Smav * arc_access and arc_change_state need to be aware that a 1553286570Smav * header has just come out of L2ARC, so we set its state to 1554286570Smav * l2c_only even though it's about to change. 
1555286570Smav */ 1556286570Smav nhdr->b_l1hdr.b_state = arc_l2c_only; 1557286763Smav 1558286763Smav /* Verify previous threads set to NULL before freeing */ 1559286763Smav ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL); 1560286570Smav } else { 1561286570Smav ASSERT(hdr->b_l1hdr.b_buf == NULL); 1562286570Smav ASSERT0(hdr->b_l1hdr.b_datacnt); 1563286763Smav 1564286570Smav /* 1565286763Smav * If we've reached here, We must have been called from 1566286763Smav * arc_evict_hdr(), as such we should have already been 1567286763Smav * removed from any ghost list we were previously on 1568286763Smav * (which protects us from racing with arc_evict_state), 1569286763Smav * thus no locking is needed during this check. 1570286570Smav */ 1571286763Smav ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 1572286763Smav 1573286763Smav /* 1574286763Smav * A buffer must not be moved into the arc_l2c_only 1575286763Smav * state if it's not finished being written out to the 1576286763Smav * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field 1577286763Smav * might try to be accessed, even though it was removed. 1578286763Smav */ 1579286763Smav VERIFY(!HDR_L2_WRITING(hdr)); 1580286763Smav VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); 1581286763Smav 1582288064Savg#ifdef ZFS_DEBUG 1583288064Savg if (hdr->b_l1hdr.b_thawed != NULL) { 1584288064Savg kmem_free(hdr->b_l1hdr.b_thawed, 1); 1585288064Savg hdr->b_l1hdr.b_thawed = NULL; 1586288064Savg } 1587288064Savg#endif 1588288064Savg 1589286570Smav nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; 1590286570Smav } 1591286570Smav /* 1592286570Smav * The header has been reallocated so we need to re-insert it into any 1593286570Smav * lists it was on. 1594286570Smav */ 1595286570Smav (void) buf_hash_insert(nhdr, NULL); 1596286570Smav 1597286570Smav ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); 1598286570Smav 1599286570Smav mutex_enter(&dev->l2ad_mtx); 1600286570Smav 1601286570Smav /* 1602286570Smav * We must place the realloc'ed header back into the list at 1603286570Smav * the same spot. Otherwise, if it's placed earlier in the list, 1604286570Smav * l2arc_write_buffers() could find it during the function's 1605286570Smav * write phase, and try to write it out to the l2arc. 1606286570Smav */ 1607286570Smav list_insert_after(&dev->l2ad_buflist, hdr, nhdr); 1608286570Smav list_remove(&dev->l2ad_buflist, hdr); 1609286570Smav 1610286570Smav mutex_exit(&dev->l2ad_mtx); 1611286570Smav 1612286598Smav /* 1613286598Smav * Since we're using the pointer address as the tag when 1614286598Smav * incrementing and decrementing the l2ad_alloc refcount, we 1615286598Smav * must remove the old pointer (that we're about to destroy) and 1616286598Smav * add the new pointer to the refcount. Otherwise we'd remove 1617286598Smav * the wrong pointer address when calling arc_hdr_destroy() later. 
1618286598Smav */ 1619286598Smav 1620286598Smav (void) refcount_remove_many(&dev->l2ad_alloc, 1621286598Smav hdr->b_l2hdr.b_asize, hdr); 1622286598Smav 1623286598Smav (void) refcount_add_many(&dev->l2ad_alloc, 1624286598Smav nhdr->b_l2hdr.b_asize, nhdr); 1625286598Smav 1626286570Smav buf_discard_identity(hdr); 1627286570Smav hdr->b_freeze_cksum = NULL; 1628286570Smav kmem_cache_free(old, hdr); 1629286570Smav 1630286570Smav return (nhdr); 1631286570Smav} 1632286570Smav 1633286570Smav 1634168404Spjd#define ARC_MINTIME (hz>>4) /* 62 ms */ 1635168404Spjd 1636168404Spjdstatic void 1637168404Spjdarc_cksum_verify(arc_buf_t *buf) 1638168404Spjd{ 1639168404Spjd zio_cksum_t zc; 1640168404Spjd 1641168404Spjd if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1642168404Spjd return; 1643168404Spjd 1644286570Smav mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1645286570Smav if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { 1646286570Smav mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1647168404Spjd return; 1648168404Spjd } 1649289422Smav fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc); 1650168404Spjd if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 1651168404Spjd panic("buffer modified while frozen!"); 1652286570Smav mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1653168404Spjd} 1654168404Spjd 1655185029Spjdstatic int 1656185029Spjdarc_cksum_equal(arc_buf_t *buf) 1657185029Spjd{ 1658185029Spjd zio_cksum_t zc; 1659185029Spjd int equal; 1660185029Spjd 1661286570Smav mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1662289422Smav fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc); 1663185029Spjd equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 1664286570Smav mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1665185029Spjd 1666185029Spjd return (equal); 1667185029Spjd} 1668185029Spjd 1669168404Spjdstatic void 1670185029Spjdarc_cksum_compute(arc_buf_t *buf, boolean_t force) 1671168404Spjd{ 1672185029Spjd if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 1673168404Spjd return; 1674168404Spjd 1675286570Smav mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1676168404Spjd if (buf->b_hdr->b_freeze_cksum != NULL) { 1677286570Smav mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1678168404Spjd return; 1679168404Spjd } 1680168404Spjd buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 1681168404Spjd fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 1682289422Smav NULL, buf->b_hdr->b_freeze_cksum); 1683286570Smav mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1684240133Smm#ifdef illumos 1685240133Smm arc_buf_watch(buf); 1686277300Ssmh#endif 1687168404Spjd} 1688168404Spjd 1689240133Smm#ifdef illumos 1690240133Smm#ifndef _KERNEL 1691240133Smmtypedef struct procctl { 1692240133Smm long cmd; 1693240133Smm prwatch_t prwatch; 1694240133Smm} procctl_t; 1695240133Smm#endif 1696240133Smm 1697240133Smm/* ARGSUSED */ 1698240133Smmstatic void 1699240133Smmarc_buf_unwatch(arc_buf_t *buf) 1700240133Smm{ 1701240133Smm#ifndef _KERNEL 1702240133Smm if (arc_watch) { 1703240133Smm int result; 1704240133Smm procctl_t ctl; 1705240133Smm ctl.cmd = PCWATCH; 1706240133Smm ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1707240133Smm ctl.prwatch.pr_size = 0; 1708240133Smm ctl.prwatch.pr_wflags = 0; 1709240133Smm result = write(arc_procfd, &ctl, sizeof (ctl)); 1710240133Smm ASSERT3U(result, ==, sizeof (ctl)); 1711240133Smm } 1712240133Smm#endif 1713240133Smm} 1714240133Smm 1715240133Smm/* ARGSUSED */ 1716240133Smmstatic void 1717240133Smmarc_buf_watch(arc_buf_t *buf) 
1718240133Smm{ 1719240133Smm#ifndef _KERNEL 1720240133Smm if (arc_watch) { 1721240133Smm int result; 1722240133Smm procctl_t ctl; 1723240133Smm ctl.cmd = PCWATCH; 1724240133Smm ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1725240133Smm ctl.prwatch.pr_size = buf->b_hdr->b_size; 1726240133Smm ctl.prwatch.pr_wflags = WA_WRITE; 1727240133Smm result = write(arc_procfd, &ctl, sizeof (ctl)); 1728240133Smm ASSERT3U(result, ==, sizeof (ctl)); 1729240133Smm } 1730240133Smm#endif 1731240133Smm} 1732240133Smm#endif /* illumos */ 1733240133Smm 1734286570Smavstatic arc_buf_contents_t 1735286570Smavarc_buf_type(arc_buf_hdr_t *hdr) 1736286570Smav{ 1737286570Smav if (HDR_ISTYPE_METADATA(hdr)) { 1738286570Smav return (ARC_BUFC_METADATA); 1739286570Smav } else { 1740286570Smav return (ARC_BUFC_DATA); 1741286570Smav } 1742286570Smav} 1743286570Smav 1744286570Smavstatic uint32_t 1745286570Smavarc_bufc_to_flags(arc_buf_contents_t type) 1746286570Smav{ 1747286570Smav switch (type) { 1748286570Smav case ARC_BUFC_DATA: 1749286570Smav /* metadata field is 0 if buffer contains normal data */ 1750286570Smav return (0); 1751286570Smav case ARC_BUFC_METADATA: 1752286570Smav return (ARC_FLAG_BUFC_METADATA); 1753286570Smav default: 1754286570Smav break; 1755286570Smav } 1756286570Smav panic("undefined ARC buffer type!"); 1757286570Smav return ((uint32_t)-1); 1758286570Smav} 1759286570Smav 1760168404Spjdvoid 1761168404Spjdarc_buf_thaw(arc_buf_t *buf) 1762168404Spjd{ 1763185029Spjd if (zfs_flags & ZFS_DEBUG_MODIFY) { 1764286570Smav if (buf->b_hdr->b_l1hdr.b_state != arc_anon) 1765185029Spjd panic("modifying non-anon buffer!"); 1766286570Smav if (HDR_IO_IN_PROGRESS(buf->b_hdr)) 1767185029Spjd panic("modifying buffer while i/o in progress!"); 1768185029Spjd arc_cksum_verify(buf); 1769185029Spjd } 1770168404Spjd 1771286570Smav mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1772168404Spjd if (buf->b_hdr->b_freeze_cksum != NULL) { 1773168404Spjd kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1774168404Spjd buf->b_hdr->b_freeze_cksum = NULL; 1775168404Spjd } 1776219089Spjd 1777286570Smav#ifdef ZFS_DEBUG 1778219089Spjd if (zfs_flags & ZFS_DEBUG_MODIFY) { 1779286570Smav if (buf->b_hdr->b_l1hdr.b_thawed != NULL) 1780286570Smav kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1); 1781286570Smav buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); 1782219089Spjd } 1783286570Smav#endif 1784219089Spjd 1785286570Smav mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1786240133Smm 1787240133Smm#ifdef illumos 1788240133Smm arc_buf_unwatch(buf); 1789277300Ssmh#endif 1790168404Spjd} 1791168404Spjd 1792168404Spjdvoid 1793168404Spjdarc_buf_freeze(arc_buf_t *buf) 1794168404Spjd{ 1795219089Spjd kmutex_t *hash_lock; 1796219089Spjd 1797168404Spjd if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1798168404Spjd return; 1799168404Spjd 1800219089Spjd hash_lock = HDR_LOCK(buf->b_hdr); 1801219089Spjd mutex_enter(hash_lock); 1802219089Spjd 1803168404Spjd ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 1804286570Smav buf->b_hdr->b_l1hdr.b_state == arc_anon); 1805185029Spjd arc_cksum_compute(buf, B_FALSE); 1806219089Spjd mutex_exit(hash_lock); 1807240133Smm 1808168404Spjd} 1809168404Spjd 1810168404Spjdstatic void 1811275811Sdelphijadd_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1812168404Spjd{ 1813286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1814168404Spjd ASSERT(MUTEX_HELD(hash_lock)); 1815286570Smav arc_state_t *state = hdr->b_l1hdr.b_state; 1816168404Spjd 1817286570Smav if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && 1818286570Smav 
(state != arc_anon)) { 1819286570Smav /* We don't use the L2-only state list. */ 1820286570Smav if (state != arc_l2c_only) { 1821286763Smav arc_buf_contents_t type = arc_buf_type(hdr); 1822286570Smav uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; 1823286763Smav multilist_t *list = &state->arcs_list[type]; 1824286763Smav uint64_t *size = &state->arcs_lsize[type]; 1825168404Spjd 1826286763Smav multilist_remove(list, hdr); 1827286763Smav 1828286570Smav if (GHOST_STATE(state)) { 1829286570Smav ASSERT0(hdr->b_l1hdr.b_datacnt); 1830286570Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 1831286570Smav delta = hdr->b_size; 1832286570Smav } 1833286570Smav ASSERT(delta > 0); 1834286570Smav ASSERT3U(*size, >=, delta); 1835286570Smav atomic_add_64(size, -delta); 1836168404Spjd } 1837185029Spjd /* remove the prefetch flag if we get a reference */ 1838286570Smav hdr->b_flags &= ~ARC_FLAG_PREFETCH; 1839168404Spjd } 1840168404Spjd} 1841168404Spjd 1842168404Spjdstatic int 1843275811Sdelphijremove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1844168404Spjd{ 1845168404Spjd int cnt; 1846286570Smav arc_state_t *state = hdr->b_l1hdr.b_state; 1847168404Spjd 1848286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1849168404Spjd ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1850168404Spjd ASSERT(!GHOST_STATE(state)); 1851168404Spjd 1852286570Smav /* 1853286570Smav * arc_l2c_only counts as a ghost state so we don't need to explicitly 1854286570Smav * check to prevent usage of the arc_l2c_only list. 1855286570Smav */ 1856286570Smav if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && 1857168404Spjd (state != arc_anon)) { 1858286763Smav arc_buf_contents_t type = arc_buf_type(hdr); 1859286763Smav multilist_t *list = &state->arcs_list[type]; 1860286763Smav uint64_t *size = &state->arcs_lsize[type]; 1861185029Spjd 1862286763Smav multilist_insert(list, hdr); 1863286763Smav 1864286570Smav ASSERT(hdr->b_l1hdr.b_datacnt > 0); 1865286570Smav atomic_add_64(size, hdr->b_size * 1866286570Smav hdr->b_l1hdr.b_datacnt); 1867168404Spjd } 1868168404Spjd return (cnt); 1869168404Spjd} 1870168404Spjd 1871168404Spjd/* 1872286763Smav * Move the supplied buffer to the indicated state. The hash lock 1873168404Spjd * for the buffer must be held by the caller. 1874168404Spjd */ 1875168404Spjdstatic void 1876275811Sdelphijarc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, 1877275811Sdelphij kmutex_t *hash_lock) 1878168404Spjd{ 1879286570Smav arc_state_t *old_state; 1880286570Smav int64_t refcnt; 1881286570Smav uint32_t datacnt; 1882168404Spjd uint64_t from_delta, to_delta; 1883286570Smav arc_buf_contents_t buftype = arc_buf_type(hdr); 1884168404Spjd 1885286570Smav /* 1886286570Smav * We almost always have an L1 hdr here, since we call arc_hdr_realloc() 1887286570Smav * in arc_read() when bringing a buffer out of the L2ARC. However, the 1888286570Smav * L1 hdr doesn't always exist when we change state to arc_anon before 1889286570Smav * destroying a header, in which case reallocating to add the L1 hdr is 1890286570Smav * pointless. 
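 *
 * A worked example of the accounting below (editorial): an evictable
 * header (refcnt == 0) with datacnt == 2 and b_size == 16K moving from
 * arc_mru to arc_mfu yields from_delta = to_delta = 2 * 16K = 32K, so
 * 32K is subtracted from arc_mru's arcs_lsize/arcs_size and added to
 * arc_mfu's. For ghost states the delta is instead a single b_size,
 * since ghost headers carry no data buffers.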
1891286570Smav */ 1892286570Smav if (HDR_HAS_L1HDR(hdr)) { 1893286570Smav old_state = hdr->b_l1hdr.b_state; 1894286570Smav refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); 1895286570Smav datacnt = hdr->b_l1hdr.b_datacnt; 1896286570Smav } else { 1897286570Smav old_state = arc_l2c_only; 1898286570Smav refcnt = 0; 1899286570Smav datacnt = 0; 1900286570Smav } 1901286570Smav 1902168404Spjd ASSERT(MUTEX_HELD(hash_lock)); 1903258632Savg ASSERT3P(new_state, !=, old_state); 1904286570Smav ASSERT(refcnt == 0 || datacnt > 0); 1905286570Smav ASSERT(!GHOST_STATE(new_state) || datacnt == 0); 1906286570Smav ASSERT(old_state != arc_anon || datacnt <= 1); 1907168404Spjd 1908286570Smav from_delta = to_delta = datacnt * hdr->b_size; 1909168404Spjd 1910168404Spjd /* 1911168404Spjd * If this buffer is evictable, transfer it from the 1912168404Spjd * old state list to the new state list. 1913168404Spjd */ 1914168404Spjd if (refcnt == 0) { 1915286570Smav if (old_state != arc_anon && old_state != arc_l2c_only) { 1916286570Smav uint64_t *size = &old_state->arcs_lsize[buftype]; 1917168404Spjd 1918286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1919286763Smav multilist_remove(&old_state->arcs_list[buftype], hdr); 1920168404Spjd 1921168404Spjd /* 1922168404Spjd * If prefetching out of the ghost cache, 1923219089Spjd * we will have a non-zero datacnt. 1924168404Spjd */ 1925286570Smav if (GHOST_STATE(old_state) && datacnt == 0) { 1926168404Spjd /* ghost elements have a ghost size */ 1927286570Smav ASSERT(hdr->b_l1hdr.b_buf == NULL); 1928275811Sdelphij from_delta = hdr->b_size; 1929168404Spjd } 1930185029Spjd ASSERT3U(*size, >=, from_delta); 1931185029Spjd atomic_add_64(size, -from_delta); 1932168404Spjd } 1933286570Smav if (new_state != arc_anon && new_state != arc_l2c_only) { 1934286570Smav uint64_t *size = &new_state->arcs_lsize[buftype]; 1935168404Spjd 1936286570Smav /* 1937286570Smav * An L1 header always exists here, since if we're 1938286570Smav * moving to some L1-cached state (i.e. not l2c_only or 1939286570Smav * anonymous), we realloc the header to add an L1hdr 1940286570Smav * beforehand. 1941286570Smav */ 1942286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1943286763Smav multilist_insert(&new_state->arcs_list[buftype], hdr); 1944168404Spjd 1945168404Spjd /* ghost elements have a ghost size */ 1946168404Spjd if (GHOST_STATE(new_state)) { 1947286762Smav ASSERT0(datacnt); 1948286570Smav ASSERT(hdr->b_l1hdr.b_buf == NULL); 1949275811Sdelphij to_delta = hdr->b_size; 1950168404Spjd } 1951185029Spjd atomic_add_64(size, to_delta); 1952168404Spjd } 1953168404Spjd } 1954168404Spjd 1955275811Sdelphij ASSERT(!BUF_EMPTY(hdr)); 1956275811Sdelphij if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) 1957275811Sdelphij buf_hash_remove(hdr); 1958168404Spjd 1959286570Smav /* adjust state sizes (ignore arc_l2c_only) */ 1960286766Smav 1961286766Smav if (to_delta && new_state != arc_l2c_only) { 1962286766Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1963286766Smav if (GHOST_STATE(new_state)) { 1964286766Smav ASSERT0(datacnt); 1965286766Smav 1966286766Smav /* 1967286766Smav * We moving a header to a ghost state, we first 1968286766Smav * remove all arc buffers. Thus, we'll have a 1969286766Smav * datacnt of zero, and no arc buffer to use for 1970286766Smav * the reference. As a result, we use the arc 1971286766Smav * header pointer for the reference. 
1972286766Smav */ 1973286766Smav (void) refcount_add_many(&new_state->arcs_size, 1974286766Smav hdr->b_size, hdr); 1975286766Smav } else { 1976286766Smav ASSERT3U(datacnt, !=, 0); 1977286766Smav 1978286766Smav /* 1979286766Smav * Each individual buffer holds a unique reference, 1980286766Smav * thus we must remove each of these references one 1981286766Smav * at a time. 1982286766Smav */ 1983286766Smav for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 1984286766Smav buf = buf->b_next) { 1985286766Smav (void) refcount_add_many(&new_state->arcs_size, 1986286766Smav hdr->b_size, buf); 1987286766Smav } 1988286766Smav } 1989286766Smav } 1990286766Smav 1991286570Smav if (from_delta && old_state != arc_l2c_only) { 1992286766Smav ASSERT(HDR_HAS_L1HDR(hdr)); 1993286766Smav if (GHOST_STATE(old_state)) { 1994286766Smav /* 1995286766Smav * When moving a header off of a ghost state, 1996286766Smav * there's the possibility for datacnt to be 1997286766Smav * non-zero. This is because we first add the 1998286766Smav * arc buffer to the header prior to changing 1999286766Smav * the header's state. Since we used the header 2000286766Smav * for the reference when putting the header on 2001286766Smav * the ghost state, we must balance that and use 2002286766Smav * the header when removing off the ghost state 2003286766Smav * (even though datacnt is non zero). 2004286766Smav */ 2005286766Smav 2006286766Smav IMPLY(datacnt == 0, new_state == arc_anon || 2007286766Smav new_state == arc_l2c_only); 2008286766Smav 2009286766Smav (void) refcount_remove_many(&old_state->arcs_size, 2010286766Smav hdr->b_size, hdr); 2011286766Smav } else { 2012286766Smav ASSERT3P(datacnt, !=, 0); 2013286766Smav 2014286766Smav /* 2015286766Smav * Each individual buffer holds a unique reference, 2016286766Smav * thus we must remove each of these references one 2017286766Smav * at a time. 2018286766Smav */ 2019286766Smav for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2020286766Smav buf = buf->b_next) { 2021286766Smav (void) refcount_remove_many( 2022286766Smav &old_state->arcs_size, hdr->b_size, buf); 2023286766Smav } 2024286766Smav } 2025168404Spjd } 2026286766Smav 2027286570Smav if (HDR_HAS_L1HDR(hdr)) 2028286570Smav hdr->b_l1hdr.b_state = new_state; 2029185029Spjd 2030286570Smav /* 2031286570Smav * L2 headers should never be on the L2 state list since they don't 2032286570Smav * have L1 headers allocated. 
2033286570Smav */ 2034286763Smav ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && 2035286763Smav multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); 2036168404Spjd} 2037168404Spjd 2038185029Spjdvoid 2039208373Smmarc_space_consume(uint64_t space, arc_space_type_t type) 2040185029Spjd{ 2041208373Smm ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 2042208373Smm 2043208373Smm switch (type) { 2044208373Smm case ARC_SPACE_DATA: 2045208373Smm ARCSTAT_INCR(arcstat_data_size, space); 2046208373Smm break; 2047286574Smav case ARC_SPACE_META: 2048286574Smav ARCSTAT_INCR(arcstat_metadata_size, space); 2049286574Smav break; 2050208373Smm case ARC_SPACE_OTHER: 2051208373Smm ARCSTAT_INCR(arcstat_other_size, space); 2052208373Smm break; 2053208373Smm case ARC_SPACE_HDRS: 2054208373Smm ARCSTAT_INCR(arcstat_hdr_size, space); 2055208373Smm break; 2056208373Smm case ARC_SPACE_L2HDRS: 2057208373Smm ARCSTAT_INCR(arcstat_l2_hdr_size, space); 2058208373Smm break; 2059208373Smm } 2060208373Smm 2061286574Smav if (type != ARC_SPACE_DATA) 2062286574Smav ARCSTAT_INCR(arcstat_meta_used, space); 2063286574Smav 2064185029Spjd atomic_add_64(&arc_size, space); 2065185029Spjd} 2066185029Spjd 2067185029Spjdvoid 2068208373Smmarc_space_return(uint64_t space, arc_space_type_t type) 2069185029Spjd{ 2070208373Smm ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 2071208373Smm 2072208373Smm switch (type) { 2073208373Smm case ARC_SPACE_DATA: 2074208373Smm ARCSTAT_INCR(arcstat_data_size, -space); 2075208373Smm break; 2076286574Smav case ARC_SPACE_META: 2077286574Smav ARCSTAT_INCR(arcstat_metadata_size, -space); 2078286574Smav break; 2079208373Smm case ARC_SPACE_OTHER: 2080208373Smm ARCSTAT_INCR(arcstat_other_size, -space); 2081208373Smm break; 2082208373Smm case ARC_SPACE_HDRS: 2083208373Smm ARCSTAT_INCR(arcstat_hdr_size, -space); 2084208373Smm break; 2085208373Smm case ARC_SPACE_L2HDRS: 2086208373Smm ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 2087208373Smm break; 2088208373Smm } 2089208373Smm 2090286574Smav if (type != ARC_SPACE_DATA) { 2091286574Smav ASSERT(arc_meta_used >= space); 2092286574Smav if (arc_meta_max < arc_meta_used) 2093286574Smav arc_meta_max = arc_meta_used; 2094286574Smav ARCSTAT_INCR(arcstat_meta_used, -space); 2095286574Smav } 2096286574Smav 2097185029Spjd ASSERT(arc_size >= space); 2098185029Spjd atomic_add_64(&arc_size, -space); 2099185029Spjd} 2100185029Spjd 2101168404Spjdarc_buf_t * 2102286570Smavarc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) 2103168404Spjd{ 2104168404Spjd arc_buf_hdr_t *hdr; 2105168404Spjd arc_buf_t *buf; 2106168404Spjd 2107168404Spjd ASSERT3U(size, >, 0); 2108286570Smav hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 2109168404Spjd ASSERT(BUF_EMPTY(hdr)); 2110286570Smav ASSERT3P(hdr->b_freeze_cksum, ==, NULL); 2111168404Spjd hdr->b_size = size; 2112228103Smm hdr->b_spa = spa_load_guid(spa); 2113286570Smav 2114185029Spjd buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2115168404Spjd buf->b_hdr = hdr; 2116168404Spjd buf->b_data = NULL; 2117168404Spjd buf->b_efunc = NULL; 2118168404Spjd buf->b_private = NULL; 2119168404Spjd buf->b_next = NULL; 2120286570Smav 2121286570Smav hdr->b_flags = arc_bufc_to_flags(type); 2122286570Smav hdr->b_flags |= ARC_FLAG_HAS_L1HDR; 2123286570Smav 2124286570Smav hdr->b_l1hdr.b_buf = buf; 2125286570Smav hdr->b_l1hdr.b_state = arc_anon; 2126286570Smav hdr->b_l1hdr.b_arc_access = 0; 2127286570Smav hdr->b_l1hdr.b_datacnt = 1; 2128286763Smav hdr->b_l1hdr.b_tmp_cdata = NULL; 2129286570Smav 2130168404Spjd 
arc_get_data_buf(buf); 2131286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2132286570Smav (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 2133168404Spjd 2134168404Spjd return (buf); 2135168404Spjd} 2136168404Spjd 2137209962Smmstatic char *arc_onloan_tag = "onloan"; 2138209962Smm 2139209962Smm/* 2140209962Smm * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 2141209962Smm * flight data by arc_tempreserve_space() until they are "returned". Loaned 2142209962Smm * buffers must be returned to the arc before they can be used by the DMU or 2143209962Smm * freed. 2144209962Smm */ 2145209962Smmarc_buf_t * 2146209962Smmarc_loan_buf(spa_t *spa, int size) 2147209962Smm{ 2148209962Smm arc_buf_t *buf; 2149209962Smm 2150209962Smm buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 2151209962Smm 2152209962Smm atomic_add_64(&arc_loaned_bytes, size); 2153209962Smm return (buf); 2154209962Smm} 2155209962Smm 2156209962Smm/* 2157209962Smm * Return a loaned arc buffer to the arc. 2158209962Smm */ 2159209962Smmvoid 2160209962Smmarc_return_buf(arc_buf_t *buf, void *tag) 2161209962Smm{ 2162209962Smm arc_buf_hdr_t *hdr = buf->b_hdr; 2163209962Smm 2164209962Smm ASSERT(buf->b_data != NULL); 2165286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2166286570Smav (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 2167286570Smav (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2168209962Smm 2169209962Smm atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 2170209962Smm} 2171209962Smm 2172219089Spjd/* Detach an arc_buf from a dbuf (tag) */ 2173219089Spjdvoid 2174219089Spjdarc_loan_inuse_buf(arc_buf_t *buf, void *tag) 2175219089Spjd{ 2176286570Smav arc_buf_hdr_t *hdr = buf->b_hdr; 2177219089Spjd 2178219089Spjd ASSERT(buf->b_data != NULL); 2179286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2180286570Smav (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 2181286570Smav (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); 2182219089Spjd buf->b_efunc = NULL; 2183219089Spjd buf->b_private = NULL; 2184219089Spjd 2185219089Spjd atomic_add_64(&arc_loaned_bytes, hdr->b_size); 2186219089Spjd} 2187219089Spjd 2188168404Spjdstatic arc_buf_t * 2189168404Spjdarc_buf_clone(arc_buf_t *from) 2190168404Spjd{ 2191168404Spjd arc_buf_t *buf; 2192168404Spjd arc_buf_hdr_t *hdr = from->b_hdr; 2193168404Spjd uint64_t size = hdr->b_size; 2194168404Spjd 2195286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2196286570Smav ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2197219089Spjd 2198185029Spjd buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2199168404Spjd buf->b_hdr = hdr; 2200168404Spjd buf->b_data = NULL; 2201168404Spjd buf->b_efunc = NULL; 2202168404Spjd buf->b_private = NULL; 2203286570Smav buf->b_next = hdr->b_l1hdr.b_buf; 2204286570Smav hdr->b_l1hdr.b_buf = buf; 2205168404Spjd arc_get_data_buf(buf); 2206168404Spjd bcopy(from->b_data, buf->b_data, size); 2207242845Sdelphij 2208242845Sdelphij /* 2209242845Sdelphij * This buffer already exists in the arc so create a duplicate 2210242845Sdelphij * copy for the caller. If the buffer is associated with user data 2211242845Sdelphij * then track the size and number of duplicates. These stats will be 2212242845Sdelphij * updated as duplicate buffers are created and destroyed. 
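 *
 * (Editorial example: cloning a 128K data buffer twice leaves the
 * header with b_datacnt == 3 and adds 2 * 128K to
 * arcstat_duplicate_buffers_size; destroying each duplicate backs
 * those statistics out again.)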
2213242845Sdelphij */ 2214286570Smav if (HDR_ISTYPE_DATA(hdr)) { 2215242845Sdelphij ARCSTAT_BUMP(arcstat_duplicate_buffers); 2216242845Sdelphij ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 2217242845Sdelphij } 2218286570Smav hdr->b_l1hdr.b_datacnt += 1; 2219168404Spjd return (buf); 2220168404Spjd} 2221168404Spjd 2222168404Spjdvoid 2223168404Spjdarc_buf_add_ref(arc_buf_t *buf, void* tag) 2224168404Spjd{ 2225168404Spjd arc_buf_hdr_t *hdr; 2226168404Spjd kmutex_t *hash_lock; 2227168404Spjd 2228168404Spjd /* 2229185029Spjd * Check to see if this buffer is evicted. Callers 2230185029Spjd * must verify b_data != NULL to know if the add_ref 2231185029Spjd * was successful. 2232168404Spjd */ 2233219089Spjd mutex_enter(&buf->b_evict_lock); 2234185029Spjd if (buf->b_data == NULL) { 2235219089Spjd mutex_exit(&buf->b_evict_lock); 2236168404Spjd return; 2237168404Spjd } 2238219089Spjd hash_lock = HDR_LOCK(buf->b_hdr); 2239219089Spjd mutex_enter(hash_lock); 2240185029Spjd hdr = buf->b_hdr; 2241286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 2242219089Spjd ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2243219089Spjd mutex_exit(&buf->b_evict_lock); 2244168404Spjd 2245286570Smav ASSERT(hdr->b_l1hdr.b_state == arc_mru || 2246286570Smav hdr->b_l1hdr.b_state == arc_mfu); 2247286570Smav 2248168404Spjd add_reference(hdr, hash_lock, tag); 2249208373Smm DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 2250168404Spjd arc_access(hdr, hash_lock); 2251168404Spjd mutex_exit(hash_lock); 2252168404Spjd ARCSTAT_BUMP(arcstat_hits); 2253286570Smav ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 2254286570Smav demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 2255168404Spjd data, metadata, hits); 2256168404Spjd} 2257168404Spjd 2258274172Savgstatic void 2259274172Savgarc_buf_free_on_write(void *data, size_t size, 2260274172Savg void (*free_func)(void *, size_t)) 2261274172Savg{ 2262274172Savg l2arc_data_free_t *df; 2263274172Savg 2264286763Smav df = kmem_alloc(sizeof (*df), KM_SLEEP); 2265274172Savg df->l2df_data = data; 2266274172Savg df->l2df_size = size; 2267274172Savg df->l2df_func = free_func; 2268274172Savg mutex_enter(&l2arc_free_on_write_mtx); 2269274172Savg list_insert_head(l2arc_free_on_write, df); 2270274172Savg mutex_exit(&l2arc_free_on_write_mtx); 2271274172Savg} 2272274172Savg 2273185029Spjd/* 2274185029Spjd * Free the arc data buffer. If it is an l2arc write in progress, 2275185029Spjd * the buffer is placed on l2arc_free_on_write to be freed later. 2276185029Spjd */ 2277168404Spjdstatic void 2278240133Smmarc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) 2279185029Spjd{ 2280240133Smm arc_buf_hdr_t *hdr = buf->b_hdr; 2281240133Smm 2282185029Spjd if (HDR_L2_WRITING(hdr)) { 2283274172Savg arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); 2284185029Spjd ARCSTAT_BUMP(arcstat_l2_free_on_write); 2285185029Spjd } else { 2286240133Smm free_func(buf->b_data, hdr->b_size); 2287185029Spjd } 2288185029Spjd} 2289185029Spjd 2290185029Spjdstatic void 2291274172Savgarc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) 2292274172Savg{ 2293297848Savg size_t align, asize, len; 2294297848Savg 2295286570Smav ASSERT(HDR_HAS_L2HDR(hdr)); 2296286570Smav ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx)); 2297274172Savg 2298286570Smav /* 2299286570Smav * The b_tmp_cdata field is linked off of the b_l1hdr, so if 2300286570Smav * that doesn't exist, the header is in the arc_l2c_only state, 2301286570Smav * and there isn't anything to free (it's already been freed). 
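 *
 * (Editorial summary of the cases below: no L1 header, or no L2 write
 * in progress, means there is no b_tmp_cdata to free; a b_daddr of
 * L2ARC_ADDR_UNSET means b_tmp_cdata still aliases b_data and is only
 * cleared; ZIO_COMPRESS_EMPTY means the data compressed away entirely;
 * only a distinct temporary buffer is actually freed, with its size
 * rounded up to the device's ashift.)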
2302286570Smav */
2303286570Smav if (!HDR_HAS_L1HDR(hdr))
2304286570Smav return;
2305274172Savg
2306286763Smav /*
2307286763Smav * The header isn't being written to the l2arc device, thus it
2308286763Smav * shouldn't have a b_tmp_cdata to free.
2309286763Smav */
2310286763Smav if (!HDR_L2_WRITING(hdr)) {
2311286763Smav ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
2312274172Savg return;
2313286763Smav }
2314274172Savg
2315286763Smav /*
2316297848Savg * The buffer has been chosen for writing to L2ARC, but it's
2317297848Savg * not being written just yet. In other words,
2318297848Savg * b_tmp_cdata still points to exactly the same buffer as b_data,
2319297848Savg * because l2arc_transform_buf() hasn't been called yet.
2320286763Smav */
2321297848Savg if (hdr->b_l2hdr.b_daddr == L2ARC_ADDR_UNSET) {
2322297848Savg ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==,
2323297848Savg hdr->b_l1hdr.b_buf->b_data);
2324297848Savg ASSERT3U(hdr->b_l2hdr.b_compress, ==, ZIO_COMPRESS_OFF);
2325301873Savg hdr->b_l1hdr.b_tmp_cdata = NULL;
2326286763Smav return;
2327286763Smav }
2328286570Smav
2329286763Smav /*
2330286763Smav * There's nothing to free since the buffer was all zeros and
2331286763Smav * compressed to a zero-length buffer.
2332286763Smav */
2333287706Sdelphij if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_EMPTY) {
2334286763Smav ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
2335286763Smav return;
2336286763Smav }
2337286763Smav
2338297848Savg /*
2339297848Savg * Nothing to do if the temporary buffer was not required.
2340297848Savg */
2341297848Savg if (hdr->b_l1hdr.b_tmp_cdata == NULL)
2342297848Savg return;
2343286763Smav
2344274172Savg ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
2345297848Savg len = hdr->b_size;
2346297848Savg align = (size_t)1 << hdr->b_l2hdr.b_dev->l2ad_vdev->vdev_ashift;
2347297848Savg asize = P2ROUNDUP(len, align);
2348297848Savg arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, asize,
2349297848Savg zio_data_buf_free);
2350286570Smav hdr->b_l1hdr.b_tmp_cdata = NULL;
2351274172Savg}
2352274172Savg
2353286767Smav/*
2354286767Smav * Free up buf->b_data and if 'remove' is set, then pull the
2355286767Smav * arc_buf_t off of the arc_buf_hdr_t's list and free it.
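 *
 * (Editorial note: callers such as arc_buf_free() pass remove == TRUE
 * to take the buffer down completely, while arc_hdr_destroy() and
 * arc_evict_hdr() pass FALSE when the arc_buf_t must survive on
 * arc_eviction_list to service a pending user eviction callback.)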
2356286767Smav */ 2357274172Savgstatic void 2358286763Smavarc_buf_destroy(arc_buf_t *buf, boolean_t remove) 2359168404Spjd{ 2360168404Spjd arc_buf_t **bufp; 2361168404Spjd 2362168404Spjd /* free up data associated with the buf */ 2363286570Smav if (buf->b_data != NULL) { 2364286570Smav arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; 2365168404Spjd uint64_t size = buf->b_hdr->b_size; 2366286570Smav arc_buf_contents_t type = arc_buf_type(buf->b_hdr); 2367168404Spjd 2368168404Spjd arc_cksum_verify(buf); 2369240133Smm#ifdef illumos 2370240133Smm arc_buf_unwatch(buf); 2371277300Ssmh#endif 2372219089Spjd 2373286763Smav if (type == ARC_BUFC_METADATA) { 2374286763Smav arc_buf_data_free(buf, zio_buf_free); 2375286763Smav arc_space_return(size, ARC_SPACE_META); 2376286763Smav } else { 2377286763Smav ASSERT(type == ARC_BUFC_DATA); 2378286763Smav arc_buf_data_free(buf, zio_data_buf_free); 2379286763Smav arc_space_return(size, ARC_SPACE_DATA); 2380168404Spjd } 2381286763Smav 2382286763Smav /* protected by hash lock, if in the hash table */ 2383286763Smav if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { 2384185029Spjd uint64_t *cnt = &state->arcs_lsize[type]; 2385185029Spjd 2386286570Smav ASSERT(refcount_is_zero( 2387286570Smav &buf->b_hdr->b_l1hdr.b_refcnt)); 2388286570Smav ASSERT(state != arc_anon && state != arc_l2c_only); 2389185029Spjd 2390185029Spjd ASSERT3U(*cnt, >=, size); 2391185029Spjd atomic_add_64(cnt, -size); 2392168404Spjd } 2393286766Smav 2394286766Smav (void) refcount_remove_many(&state->arcs_size, size, buf); 2395168404Spjd buf->b_data = NULL; 2396242845Sdelphij 2397242845Sdelphij /* 2398242845Sdelphij * If we're destroying a duplicate buffer make sure 2399242845Sdelphij * that the appropriate statistics are updated. 2400242845Sdelphij */ 2401286570Smav if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && 2402286570Smav HDR_ISTYPE_DATA(buf->b_hdr)) { 2403242845Sdelphij ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 2404242845Sdelphij ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); 2405242845Sdelphij } 2406286570Smav ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); 2407286570Smav buf->b_hdr->b_l1hdr.b_datacnt -= 1; 2408168404Spjd } 2409168404Spjd 2410168404Spjd /* only remove the buf if requested */ 2411268858Sdelphij if (!remove) 2412168404Spjd return; 2413168404Spjd 2414168404Spjd /* remove the buf from the hdr list */ 2415286570Smav for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; 2416286570Smav bufp = &(*bufp)->b_next) 2417168404Spjd continue; 2418168404Spjd *bufp = buf->b_next; 2419219089Spjd buf->b_next = NULL; 2420168404Spjd 2421168404Spjd ASSERT(buf->b_efunc == NULL); 2422168404Spjd 2423168404Spjd /* clean up the buf */ 2424168404Spjd buf->b_hdr = NULL; 2425168404Spjd kmem_cache_free(buf_cache, buf); 2426168404Spjd} 2427168404Spjd 2428168404Spjdstatic void 2429286598Smavarc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) 2430286598Smav{ 2431286598Smav l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 2432286598Smav l2arc_dev_t *dev = l2hdr->b_dev; 2433286598Smav 2434286598Smav ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); 2435286598Smav ASSERT(HDR_HAS_L2HDR(hdr)); 2436286598Smav 2437286598Smav list_remove(&dev->l2ad_buflist, hdr); 2438286598Smav 2439286598Smav /* 2440286598Smav * We don't want to leak the b_tmp_cdata buffer that was 2441286598Smav * allocated in l2arc_write_buffers() 2442286598Smav */ 2443286598Smav arc_buf_l2_cdata_free(hdr); 2444286598Smav 2445286598Smav /* 2446286598Smav * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then 2447286598Smav * this header is being processed by 
l2arc_write_buffers() (i.e. 2448286598Smav * it's in the first stage of l2arc_write_buffers()). 2449286598Smav * Re-affirming that truth here, just to serve as a reminder. If 2450286598Smav * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or 2451286598Smav * may not have its HDR_L2_WRITING flag set. (the write may have 2452286598Smav * completed, in which case HDR_L2_WRITING will be false and the 2453286598Smav * b_daddr field will point to the address of the buffer on disk). 2454286598Smav */ 2455286598Smav IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr)); 2456286598Smav 2457286598Smav /* 2458286598Smav * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with 2459286598Smav * l2arc_write_buffers(). Since we've just removed this header 2460286598Smav * from the l2arc buffer list, this header will never reach the 2461286598Smav * second stage of l2arc_write_buffers(), which increments the 2462286598Smav * accounting stats for this header. Thus, we must be careful 2463286598Smav * not to decrement them for this header either. 2464286598Smav */ 2465286598Smav if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) { 2466286598Smav ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 2467286598Smav ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 2468286598Smav 2469286598Smav vdev_space_update(dev->l2ad_vdev, 2470286598Smav -l2hdr->b_asize, 0, 0); 2471286598Smav 2472286598Smav (void) refcount_remove_many(&dev->l2ad_alloc, 2473286598Smav l2hdr->b_asize, hdr); 2474286598Smav } 2475286598Smav 2476286598Smav hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 2477286598Smav} 2478286598Smav 2479286598Smavstatic void 2480168404Spjdarc_hdr_destroy(arc_buf_hdr_t *hdr) 2481168404Spjd{ 2482286570Smav if (HDR_HAS_L1HDR(hdr)) { 2483286570Smav ASSERT(hdr->b_l1hdr.b_buf == NULL || 2484286570Smav hdr->b_l1hdr.b_datacnt > 0); 2485286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2486286570Smav ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 2487286570Smav } 2488168404Spjd ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2489286570Smav ASSERT(!HDR_IN_HASH_TABLE(hdr)); 2490168404Spjd 2491286570Smav if (HDR_HAS_L2HDR(hdr)) { 2492286598Smav l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 2493286598Smav boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); 2494286570Smav 2495286598Smav if (!buflist_held) 2496286598Smav mutex_enter(&dev->l2ad_mtx); 2497219089Spjd 2498286570Smav /* 2499286598Smav * Even though we checked this conditional above, we 2500286598Smav * need to check this again now that we have the 2501286598Smav * l2ad_mtx. This is because we could be racing with 2502286598Smav * another thread calling l2arc_evict() which might have 2503286598Smav * destroyed this header's L2 portion as we were waiting 2504286598Smav * to acquire the l2ad_mtx. If that happens, we don't 2505286598Smav * want to re-destroy the header's L2 portion. 
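 *
 * (Editorial note: this is the usual check/lock/re-check idiom:
 *
 *	if (HDR_HAS_L2HDR(hdr)) {		unlocked check
 *		mutex_enter(&dev->l2ad_mtx);
 *		if (HDR_HAS_L2HDR(hdr)) {	re-check under l2ad_mtx
 *			...destroy the L2 portion...
 *
 * only the test made while holding l2ad_mtx is authoritative.)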
2506286570Smav */ 2507286598Smav if (HDR_HAS_L2HDR(hdr)) { 2508290191Savg l2arc_trim(hdr); 2509286598Smav arc_hdr_l2hdr_destroy(hdr); 2510286598Smav } 2511286570Smav 2512219089Spjd if (!buflist_held) 2513286598Smav mutex_exit(&dev->l2ad_mtx); 2514185029Spjd } 2515185029Spjd 2516286570Smav if (!BUF_EMPTY(hdr)) 2517219089Spjd buf_discard_identity(hdr); 2518286776Smav 2519168404Spjd if (hdr->b_freeze_cksum != NULL) { 2520168404Spjd kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 2521168404Spjd hdr->b_freeze_cksum = NULL; 2522168404Spjd } 2523286570Smav 2524286570Smav if (HDR_HAS_L1HDR(hdr)) { 2525286570Smav while (hdr->b_l1hdr.b_buf) { 2526286570Smav arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2527286570Smav 2528286570Smav if (buf->b_efunc != NULL) { 2529286763Smav mutex_enter(&arc_user_evicts_lock); 2530286570Smav mutex_enter(&buf->b_evict_lock); 2531286570Smav ASSERT(buf->b_hdr != NULL); 2532286763Smav arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE); 2533286570Smav hdr->b_l1hdr.b_buf = buf->b_next; 2534286570Smav buf->b_hdr = &arc_eviction_hdr; 2535286570Smav buf->b_next = arc_eviction_list; 2536286570Smav arc_eviction_list = buf; 2537286570Smav mutex_exit(&buf->b_evict_lock); 2538286763Smav cv_signal(&arc_user_evicts_cv); 2539286763Smav mutex_exit(&arc_user_evicts_lock); 2540286570Smav } else { 2541286763Smav arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE); 2542286570Smav } 2543286570Smav } 2544286570Smav#ifdef ZFS_DEBUG 2545286570Smav if (hdr->b_l1hdr.b_thawed != NULL) { 2546286570Smav kmem_free(hdr->b_l1hdr.b_thawed, 1); 2547286570Smav hdr->b_l1hdr.b_thawed = NULL; 2548286570Smav } 2549286570Smav#endif 2550219089Spjd } 2551168404Spjd 2552168404Spjd ASSERT3P(hdr->b_hash_next, ==, NULL); 2553286570Smav if (HDR_HAS_L1HDR(hdr)) { 2554286763Smav ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 2555286570Smav ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 2556286570Smav kmem_cache_free(hdr_full_cache, hdr); 2557286570Smav } else { 2558286570Smav kmem_cache_free(hdr_l2only_cache, hdr); 2559286570Smav } 2560168404Spjd} 2561168404Spjd 2562168404Spjdvoid 2563168404Spjdarc_buf_free(arc_buf_t *buf, void *tag) 2564168404Spjd{ 2565168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 2566286570Smav int hashed = hdr->b_l1hdr.b_state != arc_anon; 2567168404Spjd 2568168404Spjd ASSERT(buf->b_efunc == NULL); 2569168404Spjd ASSERT(buf->b_data != NULL); 2570168404Spjd 2571168404Spjd if (hashed) { 2572168404Spjd kmutex_t *hash_lock = HDR_LOCK(hdr); 2573168404Spjd 2574168404Spjd mutex_enter(hash_lock); 2575219089Spjd hdr = buf->b_hdr; 2576219089Spjd ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2577219089Spjd 2578168404Spjd (void) remove_reference(hdr, hash_lock, tag); 2579286570Smav if (hdr->b_l1hdr.b_datacnt > 1) { 2580286763Smav arc_buf_destroy(buf, TRUE); 2581219089Spjd } else { 2582286570Smav ASSERT(buf == hdr->b_l1hdr.b_buf); 2583219089Spjd ASSERT(buf->b_efunc == NULL); 2584275811Sdelphij hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2585219089Spjd } 2586168404Spjd mutex_exit(hash_lock); 2587168404Spjd } else if (HDR_IO_IN_PROGRESS(hdr)) { 2588168404Spjd int destroy_hdr; 2589168404Spjd /* 2590168404Spjd * We are in the middle of an async write. Don't destroy 2591168404Spjd * this buffer unless the write completes before we finish 2592168404Spjd * decrementing the reference count. 
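 *
 * (Editorial note: the code below therefore re-samples
 * HDR_IO_IN_PROGRESS() while holding arc_user_evicts_lock and only
 * destroys the header if the write has already completed; otherwise
 * the final destroy is left to the write completion path.)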
2593168404Spjd */ 2594286763Smav mutex_enter(&arc_user_evicts_lock); 2595168404Spjd (void) remove_reference(hdr, NULL, tag); 2596286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2597168404Spjd destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 2598286763Smav mutex_exit(&arc_user_evicts_lock); 2599168404Spjd if (destroy_hdr) 2600168404Spjd arc_hdr_destroy(hdr); 2601168404Spjd } else { 2602219089Spjd if (remove_reference(hdr, NULL, tag) > 0) 2603286763Smav arc_buf_destroy(buf, TRUE); 2604219089Spjd else 2605168404Spjd arc_hdr_destroy(hdr); 2606168404Spjd } 2607168404Spjd} 2608168404Spjd 2609248571Smmboolean_t 2610168404Spjdarc_buf_remove_ref(arc_buf_t *buf, void* tag) 2611168404Spjd{ 2612168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 2613168404Spjd kmutex_t *hash_lock = HDR_LOCK(hdr); 2614248571Smm boolean_t no_callback = (buf->b_efunc == NULL); 2615168404Spjd 2616286570Smav if (hdr->b_l1hdr.b_state == arc_anon) { 2617286570Smav ASSERT(hdr->b_l1hdr.b_datacnt == 1); 2618168404Spjd arc_buf_free(buf, tag); 2619168404Spjd return (no_callback); 2620168404Spjd } 2621168404Spjd 2622168404Spjd mutex_enter(hash_lock); 2623219089Spjd hdr = buf->b_hdr; 2624286570Smav ASSERT(hdr->b_l1hdr.b_datacnt > 0); 2625219089Spjd ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2626286570Smav ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2627168404Spjd ASSERT(buf->b_data != NULL); 2628168404Spjd 2629168404Spjd (void) remove_reference(hdr, hash_lock, tag); 2630286570Smav if (hdr->b_l1hdr.b_datacnt > 1) { 2631168404Spjd if (no_callback) 2632286763Smav arc_buf_destroy(buf, TRUE); 2633168404Spjd } else if (no_callback) { 2634286570Smav ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); 2635219089Spjd ASSERT(buf->b_efunc == NULL); 2636275811Sdelphij hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2637168404Spjd } 2638286570Smav ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 || 2639286570Smav refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2640168404Spjd mutex_exit(hash_lock); 2641168404Spjd return (no_callback); 2642168404Spjd} 2643168404Spjd 2644286570Smavint32_t 2645168404Spjdarc_buf_size(arc_buf_t *buf) 2646168404Spjd{ 2647168404Spjd return (buf->b_hdr->b_size); 2648168404Spjd} 2649168404Spjd 2650168404Spjd/* 2651242845Sdelphij * Called from the DMU to determine if the current buffer should be 2652242845Sdelphij * evicted. In order to ensure proper locking, the eviction must be initiated 2653242845Sdelphij * from the DMU. Return true if the buffer is associated with user data and 2654242845Sdelphij * duplicate buffers still exist. 2655242845Sdelphij */ 2656242845Sdelphijboolean_t 2657242845Sdelphijarc_buf_eviction_needed(arc_buf_t *buf) 2658242845Sdelphij{ 2659242845Sdelphij arc_buf_hdr_t *hdr; 2660242845Sdelphij boolean_t evict_needed = B_FALSE; 2661242845Sdelphij 2662242845Sdelphij if (zfs_disable_dup_eviction) 2663242845Sdelphij return (B_FALSE); 2664242845Sdelphij 2665242845Sdelphij mutex_enter(&buf->b_evict_lock); 2666242845Sdelphij hdr = buf->b_hdr; 2667242845Sdelphij if (hdr == NULL) { 2668242845Sdelphij /* 2669242845Sdelphij * We are in arc_do_user_evicts(); let that function 2670242845Sdelphij * perform the eviction. 2671242845Sdelphij */ 2672242845Sdelphij ASSERT(buf->b_data == NULL); 2673242845Sdelphij mutex_exit(&buf->b_evict_lock); 2674242845Sdelphij return (B_FALSE); 2675242845Sdelphij } else if (buf->b_data == NULL) { 2676242845Sdelphij /* 2677242845Sdelphij * We have already been added to the arc eviction list; 2678242845Sdelphij * recommend eviction. 
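 *
 * (Editorial cross-reference: buf->b_hdr is redirected to the global
 * arc_eviction_hdr when a buffer is queued on arc_eviction_list, as
 * arc_hdr_destroy() above and arc_evict_hdr() below do; that is what
 * the assertion below checks.)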
2679242845Sdelphij		 */
2680242845Sdelphij		ASSERT3P(hdr, ==, &arc_eviction_hdr);
2681242845Sdelphij		mutex_exit(&buf->b_evict_lock);
2682242845Sdelphij		return (B_TRUE);
2683242845Sdelphij	}
2684242845Sdelphij
2685286570Smav	if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
2686242845Sdelphij		evict_needed = B_TRUE;
2687242845Sdelphij
2688242845Sdelphij	mutex_exit(&buf->b_evict_lock);
2689242845Sdelphij	return (evict_needed);
2690242845Sdelphij}
2691242845Sdelphij
2692242845Sdelphij/*
2693286763Smav * Evict the arc_buf_hdr that is provided as a parameter. The resultant
2694286763Smav * state of the header is dependent on its state prior to entering this
2695286763Smav * function. The following transitions are possible:
2696185029Spjd *
2697286763Smav *	- arc_mru -> arc_mru_ghost
2698286763Smav *	- arc_mfu -> arc_mfu_ghost
2699286763Smav *	- arc_mru_ghost -> arc_l2c_only
2700286763Smav *	- arc_mru_ghost -> deleted
2701286763Smav *	- arc_mfu_ghost -> arc_l2c_only
2702286763Smav *	- arc_mfu_ghost -> deleted
2703168404Spjd */
2704286763Smavstatic int64_t
2705286763Smavarc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
2706168404Spjd{
2707286763Smav	arc_state_t *evicted_state, *state;
2708286763Smav	int64_t bytes_evicted = 0;
2709168404Spjd
2710286763Smav	ASSERT(MUTEX_HELD(hash_lock));
2711286763Smav	ASSERT(HDR_HAS_L1HDR(hdr));
2712168404Spjd
2713286763Smav	state = hdr->b_l1hdr.b_state;
2714286763Smav	if (GHOST_STATE(state)) {
2715286763Smav		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2716286763Smav		ASSERT(hdr->b_l1hdr.b_buf == NULL);
2717206796Spjd
2718286763Smav		/*
2719286763Smav		 * l2arc_write_buffers() relies on a header's L1 portion
2720286763Smav		 * (i.e. its b_tmp_cdata field) during its write phase.
2721286763Smav		 * Thus, we cannot push a header onto the arc_l2c_only
2722286763Smav		 * state (removing its L1 piece) until the header is
2723286763Smav		 * done being written to the l2arc.
2724286763Smav		 */
2725286763Smav		if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
2726286763Smav			ARCSTAT_BUMP(arcstat_evict_l2_skip);
2727286763Smav			return (bytes_evicted);
2728286763Smav		}
2729286762Smav
2730286763Smav		ARCSTAT_BUMP(arcstat_deleted);
2731286763Smav		bytes_evicted += hdr->b_size;
2732286762Smav
2733286763Smav		DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
2734286763Smav
2735286763Smav		if (HDR_HAS_L2HDR(hdr)) {
2736275780Sdelphij			/*
2737286763Smav			 * This buffer is cached on the 2nd Level ARC;
2738286763Smav			 * don't destroy the header.
2739275780Sdelphij			 */
2740286763Smav			arc_change_state(arc_l2c_only, hdr, hash_lock);
2741286763Smav			/*
2742286763Smav			 * dropping from L1+L2 cached to L2-only,
2743286763Smav			 * realloc to remove the L1 header.
2744286763Smav			 */
2745286763Smav			hdr = arc_hdr_realloc(hdr, hdr_full_cache,
2746286763Smav			    hdr_l2only_cache);
2747286763Smav		} else {
2748286763Smav			arc_change_state(arc_anon, hdr, hash_lock);
2749286763Smav			arc_hdr_destroy(hdr);
2750275780Sdelphij		}
2751286763Smav		return (bytes_evicted);
2752275780Sdelphij	}
2753275780Sdelphij
2754286763Smav	ASSERT(state == arc_mru || state == arc_mfu);
2755286763Smav	evicted_state = (state == arc_mru) ?
arc_mru_ghost : arc_mfu_ghost; 2756206796Spjd 2757286763Smav /* prefetch buffers have a minimum lifespan */ 2758286763Smav if (HDR_IO_IN_PROGRESS(hdr) || 2759286763Smav ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && 2760286763Smav ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < 2761286763Smav arc_min_prefetch_lifespan)) { 2762286763Smav ARCSTAT_BUMP(arcstat_evict_skip); 2763286763Smav return (bytes_evicted); 2764286763Smav } 2765286763Smav 2766286763Smav ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 2767286763Smav ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); 2768286763Smav while (hdr->b_l1hdr.b_buf) { 2769286763Smav arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2770286763Smav if (!mutex_tryenter(&buf->b_evict_lock)) { 2771286763Smav ARCSTAT_BUMP(arcstat_mutex_miss); 2772286763Smav break; 2773168404Spjd } 2774286763Smav if (buf->b_data != NULL) 2775286763Smav bytes_evicted += hdr->b_size; 2776286763Smav if (buf->b_efunc != NULL) { 2777286763Smav mutex_enter(&arc_user_evicts_lock); 2778286763Smav arc_buf_destroy(buf, FALSE); 2779286763Smav hdr->b_l1hdr.b_buf = buf->b_next; 2780286763Smav buf->b_hdr = &arc_eviction_hdr; 2781286763Smav buf->b_next = arc_eviction_list; 2782286763Smav arc_eviction_list = buf; 2783286763Smav cv_signal(&arc_user_evicts_cv); 2784286763Smav mutex_exit(&arc_user_evicts_lock); 2785286763Smav mutex_exit(&buf->b_evict_lock); 2786286763Smav } else { 2787286763Smav mutex_exit(&buf->b_evict_lock); 2788286763Smav arc_buf_destroy(buf, TRUE); 2789286763Smav } 2790286763Smav } 2791258632Savg 2792286763Smav if (HDR_HAS_L2HDR(hdr)) { 2793286763Smav ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size); 2794286763Smav } else { 2795286763Smav if (l2arc_write_eligible(hdr->b_spa, hdr)) 2796286763Smav ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size); 2797286763Smav else 2798286763Smav ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size); 2799286763Smav } 2800258632Savg 2801286763Smav if (hdr->b_l1hdr.b_datacnt == 0) { 2802286763Smav arc_change_state(evicted_state, hdr, hash_lock); 2803286763Smav ASSERT(HDR_IN_HASH_TABLE(hdr)); 2804286763Smav hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 2805286763Smav hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 2806286763Smav DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); 2807286763Smav } 2808286763Smav 2809286763Smav return (bytes_evicted); 2810286763Smav} 2811286763Smav 2812286763Smavstatic uint64_t 2813286763Smavarc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, 2814286763Smav uint64_t spa, int64_t bytes) 2815286763Smav{ 2816286763Smav multilist_sublist_t *mls; 2817286763Smav uint64_t bytes_evicted = 0; 2818286763Smav arc_buf_hdr_t *hdr; 2819286763Smav kmutex_t *hash_lock; 2820286763Smav int evict_count = 0; 2821286763Smav 2822286763Smav ASSERT3P(marker, !=, NULL); 2823286763Smav IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); 2824286763Smav 2825286763Smav mls = multilist_sublist_lock(ml, idx); 2826286763Smav 2827286763Smav for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; 2828286763Smav hdr = multilist_sublist_prev(mls, marker)) { 2829286763Smav if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || 2830286763Smav (evict_count >= zfs_arc_evict_batch_limit)) 2831286763Smav break; 2832286763Smav 2833258632Savg /* 2834286763Smav * To keep our iteration location, move the marker 2835286763Smav * forward. Since we're not holding hdr's hash lock, we 2836286763Smav * must be very careful and not remove 'hdr' from the 2837286763Smav * sublist. 
Otherwise, other consumers might mistake the 2838286763Smav * 'hdr' as not being on a sublist when they call the 2839286763Smav * multilist_link_active() function (they all rely on 2840286763Smav * the hash lock protecting concurrent insertions and 2841286763Smav * removals). multilist_sublist_move_forward() was 2842286763Smav * specifically implemented to ensure this is the case 2843286763Smav * (only 'marker' will be removed and re-inserted). 2844258632Savg */ 2845286763Smav multilist_sublist_move_forward(mls, marker); 2846286763Smav 2847286763Smav /* 2848286763Smav * The only case where the b_spa field should ever be 2849286763Smav * zero, is the marker headers inserted by 2850286763Smav * arc_evict_state(). It's possible for multiple threads 2851286763Smav * to be calling arc_evict_state() concurrently (e.g. 2852286763Smav * dsl_pool_close() and zio_inject_fault()), so we must 2853286763Smav * skip any markers we see from these other threads. 2854286763Smav */ 2855286763Smav if (hdr->b_spa == 0) 2856258632Savg continue; 2857286763Smav 2858286763Smav /* we're only interested in evicting buffers of a certain spa */ 2859286763Smav if (spa != 0 && hdr->b_spa != spa) { 2860286763Smav ARCSTAT_BUMP(arcstat_evict_skip); 2861286763Smav continue; 2862258632Savg } 2863258632Savg 2864275811Sdelphij hash_lock = HDR_LOCK(hdr); 2865208373Smm 2866286763Smav /* 2867286763Smav * We aren't calling this function from any code path 2868286763Smav * that would already be holding a hash lock, so we're 2869286763Smav * asserting on this assumption to be defensive in case 2870286763Smav * this ever changes. Without this check, it would be 2871286763Smav * possible to incorrectly increment arcstat_mutex_miss 2872286763Smav * below (e.g. if the code changed such that we called 2873286763Smav * this function with a hash lock held). 2874286763Smav */ 2875286763Smav ASSERT(!MUTEX_HELD(hash_lock)); 2876208373Smm 2877286763Smav if (mutex_tryenter(hash_lock)) { 2878286763Smav uint64_t evicted = arc_evict_hdr(hdr, hash_lock); 2879286763Smav mutex_exit(hash_lock); 2880286763Smav 2881286763Smav bytes_evicted += evicted; 2882286763Smav 2883286763Smav /* 2884286763Smav * If evicted is zero, arc_evict_hdr() must have 2885286763Smav * decided to skip this header, don't increment 2886286763Smav * evict_count in this case. 2887286763Smav */ 2888286763Smav if (evicted != 0) 2889286763Smav evict_count++; 2890286763Smav 2891286763Smav /* 2892286763Smav * If arc_size isn't overflowing, signal any 2893286763Smav * threads that might happen to be waiting. 2894286763Smav * 2895286763Smav * For each header evicted, we wake up a single 2896286763Smav * thread. If we used cv_broadcast, we could 2897286763Smav * wake up "too many" threads causing arc_size 2898286763Smav * to significantly overflow arc_c; since 2899286763Smav * arc_get_data_buf() doesn't check for overflow 2900286763Smav * when it's woken up (it doesn't because it's 2901286763Smav * possible for the ARC to be overflowing while 2902286763Smav * full of un-evictable buffers, and the 2903286763Smav * function should proceed in this case). 2904286763Smav * 2905286763Smav * If threads are left sleeping, due to not 2906286763Smav * using cv_broadcast, they will be woken up 2907286763Smav * just before arc_reclaim_thread() sleeps. 
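 *
 * As a made-up example of this accounting: if three threads
 * are blocked in arc_get_data_buf() and this loop evicts two
 * headers, the two cv_signal() calls wake two of the waiters;
 * the third is woken by the cv_broadcast() that
 * arc_reclaim_thread() issues before going back to sleep.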
2908286763Smav */ 2909286763Smav mutex_enter(&arc_reclaim_lock); 2910286763Smav if (!arc_is_overflowing()) 2911286763Smav cv_signal(&arc_reclaim_waiters_cv); 2912286763Smav mutex_exit(&arc_reclaim_lock); 2913168404Spjd } else { 2914286763Smav ARCSTAT_BUMP(arcstat_mutex_miss); 2915168404Spjd } 2916168404Spjd } 2917168404Spjd 2918286763Smav multilist_sublist_unlock(mls); 2919206796Spjd 2920286763Smav return (bytes_evicted); 2921286763Smav} 2922168404Spjd 2923286763Smav/* 2924286763Smav * Evict buffers from the given arc state, until we've removed the 2925286763Smav * specified number of bytes. Move the removed buffers to the 2926286763Smav * appropriate evict state. 2927286763Smav * 2928286763Smav * This function makes a "best effort". It skips over any buffers 2929286763Smav * it can't get a hash_lock on, and so, may not catch all candidates. 2930286763Smav * It may also return without evicting as much space as requested. 2931286763Smav * 2932286763Smav * If bytes is specified using the special value ARC_EVICT_ALL, this 2933286763Smav * will evict all available (i.e. unlocked and evictable) buffers from 2934286763Smav * the given arc state; which is used by arc_flush(). 2935286763Smav */ 2936286763Smavstatic uint64_t 2937286763Smavarc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, 2938286763Smav arc_buf_contents_t type) 2939286763Smav{ 2940286763Smav uint64_t total_evicted = 0; 2941286763Smav multilist_t *ml = &state->arcs_list[type]; 2942286763Smav int num_sublists; 2943286763Smav arc_buf_hdr_t **markers; 2944168404Spjd 2945286763Smav IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); 2946168404Spjd 2947286763Smav num_sublists = multilist_get_num_sublists(ml); 2948286763Smav 2949185029Spjd /* 2950286763Smav * If we've tried to evict from each sublist, made some 2951286763Smav * progress, but still have not hit the target number of bytes 2952286763Smav * to evict, we want to keep trying. The markers allow us to 2953286763Smav * pick up where we left off for each individual sublist, rather 2954286763Smav * than starting from the tail each time. 2955185029Spjd */ 2956286763Smav markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); 2957286763Smav for (int i = 0; i < num_sublists; i++) { 2958286763Smav markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); 2959185029Spjd 2960286763Smav /* 2961286763Smav * A b_spa of 0 is used to indicate that this header is 2962286763Smav * a marker. This fact is used in arc_adjust_type() and 2963286763Smav * arc_evict_state_impl(). 2964286763Smav */ 2965286763Smav markers[i]->b_spa = 0; 2966168404Spjd 2967286763Smav multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 2968286763Smav multilist_sublist_insert_tail(mls, markers[i]); 2969286763Smav multilist_sublist_unlock(mls); 2970286763Smav } 2971168404Spjd 2972286763Smav /* 2973286763Smav * While we haven't hit our target number of bytes to evict, or 2974286763Smav * we're evicting all available buffers. 2975286763Smav */ 2976286763Smav while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { 2977286763Smav /* 2978286763Smav * Start eviction using a randomly selected sublist, 2979286763Smav * this is to try and evenly balance eviction across all 2980286763Smav * sublists. Always starting at the same sublist 2981286763Smav * (e.g. index 0) would cause evictions to favor certain 2982286763Smav * sublists over others. 
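 *
 * For example (illustrative only): with num_sublists == 4 and
 * a random starting index of 2, the loop below walks the
 * sublists in the order 2, 3, 0, 1, wrapping around via the
 * "++sublist_idx >= num_sublists" check.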
2983286763Smav */ 2984286763Smav int sublist_idx = multilist_get_random_index(ml); 2985286763Smav uint64_t scan_evicted = 0; 2986219089Spjd 2987286763Smav for (int i = 0; i < num_sublists; i++) { 2988286763Smav uint64_t bytes_remaining; 2989286763Smav uint64_t bytes_evicted; 2990219089Spjd 2991286763Smav if (bytes == ARC_EVICT_ALL) 2992286763Smav bytes_remaining = ARC_EVICT_ALL; 2993286763Smav else if (total_evicted < bytes) 2994286763Smav bytes_remaining = bytes - total_evicted; 2995286763Smav else 2996286763Smav break; 2997258632Savg 2998286763Smav bytes_evicted = arc_evict_state_impl(ml, sublist_idx, 2999286763Smav markers[sublist_idx], spa, bytes_remaining); 3000286763Smav 3001286763Smav scan_evicted += bytes_evicted; 3002286763Smav total_evicted += bytes_evicted; 3003286763Smav 3004286763Smav /* we've reached the end, wrap to the beginning */ 3005286763Smav if (++sublist_idx >= num_sublists) 3006286763Smav sublist_idx = 0; 3007286763Smav } 3008286763Smav 3009258632Savg /* 3010286763Smav * If we didn't evict anything during this scan, we have 3011286763Smav * no reason to believe we'll evict more during another 3012286763Smav * scan, so break the loop. 3013258632Savg */ 3014286763Smav if (scan_evicted == 0) { 3015286763Smav /* This isn't possible, let's make that obvious */ 3016286763Smav ASSERT3S(bytes, !=, 0); 3017185029Spjd 3018286763Smav /* 3019286763Smav * When bytes is ARC_EVICT_ALL, the only way to 3020286763Smav * break the loop is when scan_evicted is zero. 3021286763Smav * In that case, we actually have evicted enough, 3022286763Smav * so we don't want to increment the kstat. 3023286763Smav */ 3024286763Smav if (bytes != ARC_EVICT_ALL) { 3025286763Smav ASSERT3S(total_evicted, <, bytes); 3026286763Smav ARCSTAT_BUMP(arcstat_evict_not_enough); 3027185029Spjd } 3028185029Spjd 3029286763Smav break; 3030258632Savg } 3031286763Smav } 3032258632Savg 3033286763Smav for (int i = 0; i < num_sublists; i++) { 3034286763Smav multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 3035286763Smav multilist_sublist_remove(mls, markers[i]); 3036286763Smav multilist_sublist_unlock(mls); 3037286763Smav 3038286763Smav kmem_cache_free(hdr_full_cache, markers[i]); 3039168404Spjd } 3040286763Smav kmem_free(markers, sizeof (*markers) * num_sublists); 3041206796Spjd 3042286763Smav return (total_evicted); 3043286763Smav} 3044286763Smav 3045286763Smav/* 3046286763Smav * Flush all "evictable" data of the given type from the arc state 3047286763Smav * specified. This will not evict any "active" buffers (i.e. referenced). 3048286763Smav * 3049286763Smav * When 'retry' is set to FALSE, the function will make a single pass 3050286763Smav * over the state and evict any buffers that it can. Since it doesn't 3051286763Smav * continually retry the eviction, it might end up leaving some buffers 3052286763Smav * in the ARC due to lock misses. 3053286763Smav * 3054286763Smav * When 'retry' is set to TRUE, the function will continually retry the 3055286763Smav * eviction until *all* evictable buffers have been removed from the 3056286763Smav * state. As a result, if concurrent insertions into the state are 3057286763Smav * allowed (e.g. if the ARC isn't shutting down), this function might 3058286763Smav * wind up in an infinite loop, continually trying to evict buffers. 
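 *
 * For example, arc_flush() below makes calls of the form:
 *
 *	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
 *
 * With retry set to TRUE, this loops until
 * state->arcs_lsize[type] drains to zero, which is only safe
 * once new insertions into the state have stopped (e.g. during
 * pool export or ARC shutdown).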
3059286763Smav */
3060286763Smavstatic uint64_t
3061286763Smavarc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
3062286763Smav    boolean_t retry)
3063286763Smav{
3064286763Smav	uint64_t evicted = 0;
3065286763Smav
3066286763Smav	while (state->arcs_lsize[type] != 0) {
3067286763Smav		evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
3068286763Smav
3069286763Smav		if (!retry)
3070286763Smav			break;
3071185029Spjd	}
3072185029Spjd
3073286763Smav	return (evicted);
3074286763Smav}
3075286763Smav
3076286763Smav/*
3077286763Smav * Evict the specified number of bytes from the state specified,
3078286763Smav * restricting eviction to the spa and type given. This function
3079286763Smav * prevents us from trying to evict more from a state's list than
3080286763Smav * is "evictable", and skips evicting altogether when passed a
3081286763Smav * negative value for "bytes". In contrast, arc_evict_state() will
3082286763Smav * evict everything it can, when passed a negative value for "bytes".
3083286763Smav */
3084286763Smavstatic uint64_t
3085286763Smavarc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
3086286763Smav    arc_buf_contents_t type)
3087286763Smav{
3088286763Smav	int64_t delta;
3089286763Smav
3090286763Smav	if (bytes > 0 && state->arcs_lsize[type] > 0) {
3091286763Smav		delta = MIN(state->arcs_lsize[type], bytes);
3092286763Smav		return (arc_evict_state(state, spa, delta, type));
3093168404Spjd	}
3094168404Spjd
3095286763Smav	return (0);
3096168404Spjd}
3097168404Spjd
3098286763Smav/*
3099286763Smav * Evict metadata buffers from the cache, such that arc_meta_used is
3100286763Smav * capped by the arc_meta_limit tunable.
3101286763Smav */
3102286763Smavstatic uint64_t
3103286763Smavarc_adjust_meta(void)
3104286763Smav{
3105286763Smav	uint64_t total_evicted = 0;
3106286763Smav	int64_t target;
3107286763Smav
3108286763Smav	/*
3109286763Smav	 * If we're over the meta limit, we want to evict enough
3110286763Smav	 * metadata to get back under the meta limit. We don't want to
3111286763Smav	 * evict so much that we drop the MRU below arc_p, though. If
3112286763Smav	 * we're over the meta limit more than we're over arc_p, we
3113286763Smav	 * evict some from the MRU here, and some from the MFU below.
3114286763Smav	 */
3115286763Smav	target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
3116286766Smav	    (int64_t)(refcount_count(&arc_anon->arcs_size) +
3117286766Smav	    refcount_count(&arc_mru->arcs_size) - arc_p));
3118286763Smav
3119286763Smav	total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
3120286763Smav
3121286763Smav	/*
3122286763Smav	 * Similar to the above, we want to evict enough bytes to get us
3123286763Smav	 * below the meta limit, but not so much as to drop us below the
3124286763Smav	 * space allotted to the MFU (which is defined as arc_c - arc_p).
3125286763Smav	 */
3126286763Smav	target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
3127286766Smav	    (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
3128286763Smav
3129286763Smav	total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
3130286763Smav
3131286763Smav	return (total_evicted);
3132286763Smav}
3133286763Smav
3134286763Smav/*
3135286763Smav * Return the type of the oldest buffer in the given arc state.
3136286763Smav *
3137286763Smav * This function will select a random sublist of type ARC_BUFC_DATA and
3138286763Smav * a random sublist of type ARC_BUFC_METADATA.
The tail of each sublist 3139286763Smav * is compared, and the type which contains the "older" buffer will be 3140286763Smav * returned. 3141286763Smav */ 3142286763Smavstatic arc_buf_contents_t 3143286763Smavarc_adjust_type(arc_state_t *state) 3144286763Smav{ 3145286763Smav multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA]; 3146286763Smav multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA]; 3147286763Smav int data_idx = multilist_get_random_index(data_ml); 3148286763Smav int meta_idx = multilist_get_random_index(meta_ml); 3149286763Smav multilist_sublist_t *data_mls; 3150286763Smav multilist_sublist_t *meta_mls; 3151286763Smav arc_buf_contents_t type; 3152286763Smav arc_buf_hdr_t *data_hdr; 3153286763Smav arc_buf_hdr_t *meta_hdr; 3154286763Smav 3155286763Smav /* 3156286763Smav * We keep the sublist lock until we're finished, to prevent 3157286763Smav * the headers from being destroyed via arc_evict_state(). 3158286763Smav */ 3159286763Smav data_mls = multilist_sublist_lock(data_ml, data_idx); 3160286763Smav meta_mls = multilist_sublist_lock(meta_ml, meta_idx); 3161286763Smav 3162286763Smav /* 3163286763Smav * These two loops are to ensure we skip any markers that 3164286763Smav * might be at the tail of the lists due to arc_evict_state(). 3165286763Smav */ 3166286763Smav 3167286763Smav for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; 3168286763Smav data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { 3169286763Smav if (data_hdr->b_spa != 0) 3170286763Smav break; 3171286763Smav } 3172286763Smav 3173286763Smav for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; 3174286763Smav meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { 3175286763Smav if (meta_hdr->b_spa != 0) 3176286763Smav break; 3177286763Smav } 3178286763Smav 3179286763Smav if (data_hdr == NULL && meta_hdr == NULL) { 3180286763Smav type = ARC_BUFC_DATA; 3181286763Smav } else if (data_hdr == NULL) { 3182286763Smav ASSERT3P(meta_hdr, !=, NULL); 3183286763Smav type = ARC_BUFC_METADATA; 3184286763Smav } else if (meta_hdr == NULL) { 3185286763Smav ASSERT3P(data_hdr, !=, NULL); 3186286763Smav type = ARC_BUFC_DATA; 3187286763Smav } else { 3188286763Smav ASSERT3P(data_hdr, !=, NULL); 3189286763Smav ASSERT3P(meta_hdr, !=, NULL); 3190286763Smav 3191286763Smav /* The headers can't be on the sublist without an L1 header */ 3192286763Smav ASSERT(HDR_HAS_L1HDR(data_hdr)); 3193286763Smav ASSERT(HDR_HAS_L1HDR(meta_hdr)); 3194286763Smav 3195286763Smav if (data_hdr->b_l1hdr.b_arc_access < 3196286763Smav meta_hdr->b_l1hdr.b_arc_access) { 3197286763Smav type = ARC_BUFC_DATA; 3198286763Smav } else { 3199286763Smav type = ARC_BUFC_METADATA; 3200286763Smav } 3201286763Smav } 3202286763Smav 3203286763Smav multilist_sublist_unlock(meta_mls); 3204286763Smav multilist_sublist_unlock(data_mls); 3205286763Smav 3206286763Smav return (type); 3207286763Smav} 3208286763Smav 3209286763Smav/* 3210286763Smav * Evict buffers from the cache, such that arc_size is capped by arc_c. 3211286763Smav */ 3212286763Smavstatic uint64_t 3213168404Spjdarc_adjust(void) 3214168404Spjd{ 3215286763Smav uint64_t total_evicted = 0; 3216286763Smav uint64_t bytes; 3217286763Smav int64_t target; 3218168404Spjd 3219208373Smm /* 3220286763Smav * If we're over arc_meta_limit, we want to correct that before 3221286763Smav * potentially evicting data buffers below. 
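 *
 * To make the arc_adjust_meta() target computation concrete
 * (numbers purely illustrative): if arc_meta_used is 200MB over
 * arc_meta_limit, but anon + mru exceed arc_p by only 150MB,
 * the MRU pass is capped at 150MB of metadata; whatever overage
 * remains (50MB, if that pass succeeds in full) is then
 * requested from the MFU, subject to its own floor of
 * arc_c - arc_p.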
3222286763Smav */ 3223286763Smav total_evicted += arc_adjust_meta(); 3224286763Smav 3225286763Smav /* 3226208373Smm * Adjust MRU size 3227286763Smav * 3228286763Smav * If we're over the target cache size, we want to evict enough 3229286763Smav * from the list to get back to our target size. We don't want 3230286763Smav * to evict too much from the MRU, such that it drops below 3231286763Smav * arc_p. So, if we're over our target cache size more than 3232286763Smav * the MRU is over arc_p, we'll evict enough to get back to 3233286763Smav * arc_p here, and then evict more from the MFU below. 3234208373Smm */ 3235286763Smav target = MIN((int64_t)(arc_size - arc_c), 3236286766Smav (int64_t)(refcount_count(&arc_anon->arcs_size) + 3237286766Smav refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p)); 3238208373Smm 3239286763Smav /* 3240286763Smav * If we're below arc_meta_min, always prefer to evict data. 3241286763Smav * Otherwise, try to satisfy the requested number of bytes to 3242286763Smav * evict from the type which contains older buffers; in an 3243286763Smav * effort to keep newer buffers in the cache regardless of their 3244286763Smav * type. If we cannot satisfy the number of bytes from this 3245286763Smav * type, spill over into the next type. 3246286763Smav */ 3247286763Smav if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && 3248286763Smav arc_meta_used > arc_meta_min) { 3249286763Smav bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 3250286763Smav total_evicted += bytes; 3251168404Spjd 3252286763Smav /* 3253286763Smav * If we couldn't evict our target number of bytes from 3254286763Smav * metadata, we try to get the rest from data. 3255286763Smav */ 3256286763Smav target -= bytes; 3257286763Smav 3258286763Smav total_evicted += 3259286763Smav arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 3260286763Smav } else { 3261286763Smav bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 3262286763Smav total_evicted += bytes; 3263286763Smav 3264286763Smav /* 3265286763Smav * If we couldn't evict our target number of bytes from 3266286763Smav * data, we try to get the rest from metadata. 3267286763Smav */ 3268286763Smav target -= bytes; 3269286763Smav 3270286763Smav total_evicted += 3271286763Smav arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 3272185029Spjd } 3273185029Spjd 3274208373Smm /* 3275208373Smm * Adjust MFU size 3276286763Smav * 3277286763Smav * Now that we've tried to evict enough from the MRU to get its 3278286763Smav * size back to arc_p, if we're still above the target cache 3279286763Smav * size, we evict the rest from the MFU. 3280208373Smm */ 3281286763Smav target = arc_size - arc_c; 3282168404Spjd 3283286764Smav if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA && 3284286763Smav arc_meta_used > arc_meta_min) { 3285286763Smav bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 3286286763Smav total_evicted += bytes; 3287208373Smm 3288286763Smav /* 3289286763Smav * If we couldn't evict our target number of bytes from 3290286763Smav * metadata, we try to get the rest from data. 
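 *
 * A worked example of this spill-over (illustrative numbers):
 * with target == 100MB but only 60MB of evictable MFU metadata,
 * the first arc_adjust_impl() call returns 60MB, target drops
 * to 40MB, and the second call tries to evict the remaining
 * 40MB from MFU data.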
3291286763Smav		 */
3292286763Smav		target -= bytes;
3293168404Spjd
3294286763Smav		total_evicted +=
3295286763Smav		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
3296286763Smav	} else {
3297286763Smav		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
3298286763Smav		total_evicted += bytes;
3299286763Smav
3300286763Smav		/*
3301286763Smav		 * If we couldn't evict our target number of bytes from
3302286763Smav		 * data, we try to get the rest from metadata.
3303286763Smav		 */
3304286763Smav		target -= bytes;
3305286763Smav
3306286763Smav		total_evicted +=
3307286763Smav		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
3308208373Smm	}
3309168404Spjd
3310208373Smm	/*
3311208373Smm	 * Adjust ghost lists
3312286763Smav	 *
3313286763Smav	 * In addition to the above, the ARC also defines target values
3314286763Smav	 * for the ghost lists. The sum of the mru list and mru ghost
3315286763Smav	 * list should never exceed the target size of the cache, and
3316286763Smav	 * the sum of the mru list, mfu list, mru ghost list, and mfu
3317286763Smav	 * ghost list should never exceed twice the target size of the
3318286763Smav	 * cache. The following logic enforces these limits on the ghost
3319286763Smav	 * caches, and evicts from them as needed.
3320208373Smm	 */
3321286766Smav	target = refcount_count(&arc_mru->arcs_size) +
3322286766Smav	    refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
3323168404Spjd
3324286763Smav	bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
3325286763Smav	total_evicted += bytes;
3326168404Spjd
3327286763Smav	target -= bytes;
3328185029Spjd
3329286763Smav	total_evicted +=
3330286763Smav	    arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
3331208373Smm
3332286763Smav	/*
3333286763Smav	 * We assume the sum of the mru list and mfu list is less than
3334286763Smav	 * or equal to arc_c (we enforced this above), which means we
3335286763Smav	 * can use the simpler of the two equations below:
3336286763Smav	 *
3337286763Smav	 *	mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
3338286763Smav	 *	mru ghost + mfu ghost <= arc_c
3339286763Smav	 */
3340286766Smav	target = refcount_count(&arc_mru_ghost->arcs_size) +
3341286766Smav	    refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
3342286763Smav
3343286763Smav	bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
3344286763Smav	total_evicted += bytes;
3345286763Smav
3346286763Smav	target -= bytes;
3347286763Smav
3348286763Smav	total_evicted +=
3349286763Smav	    arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
3350286763Smav
3351286763Smav	return (total_evicted);
3352168404Spjd}
3353168404Spjd
3354168404Spjdstatic void
3355168404Spjdarc_do_user_evicts(void)
3356168404Spjd{
3357286763Smav	mutex_enter(&arc_user_evicts_lock);
3358286762Smav	while (arc_eviction_list != NULL) {
3359286762Smav		arc_buf_t *buf = arc_eviction_list;
3360286762Smav		arc_eviction_list = buf->b_next;
3361219089Spjd		mutex_enter(&buf->b_evict_lock);
3362168404Spjd		buf->b_hdr = NULL;
3363219089Spjd		mutex_exit(&buf->b_evict_lock);
3364286763Smav		mutex_exit(&arc_user_evicts_lock);
3365168404Spjd
3366168404Spjd		if (buf->b_efunc != NULL)
3367268858Sdelphij			VERIFY0(buf->b_efunc(buf->b_private));
3368168404Spjd
3369168404Spjd		buf->b_efunc = NULL;
3370168404Spjd		buf->b_private = NULL;
3371168404Spjd		kmem_cache_free(buf_cache, buf);
3372286763Smav		mutex_enter(&arc_user_evicts_lock);
3373168404Spjd	}
3374286763Smav	mutex_exit(&arc_user_evicts_lock);
3375168404Spjd}
3376168404Spjd
3377168404Spjdvoid
3378286763Smavarc_flush(spa_t *spa, boolean_t retry)
3379168404Spjd{ 3380209962Smm uint64_t guid = 0; 3381209962Smm 3382286763Smav /* 3383286763Smav * If retry is TRUE, a spa must not be specified since we have 3384286763Smav * no good way to determine if all of a spa's buffers have been 3385286763Smav * evicted from an arc state. 3386286763Smav */ 3387286763Smav ASSERT(!retry || spa == 0); 3388286763Smav 3389286570Smav if (spa != NULL) 3390228103Smm guid = spa_load_guid(spa); 3391209962Smm 3392286763Smav (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); 3393286763Smav (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); 3394168404Spjd 3395286763Smav (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); 3396286763Smav (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); 3397168404Spjd 3398286763Smav (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); 3399286763Smav (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); 3400286763Smav 3401286763Smav (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); 3402286763Smav (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); 3403286763Smav 3404168404Spjd arc_do_user_evicts(); 3405185029Spjd ASSERT(spa || arc_eviction_list == NULL); 3406168404Spjd} 3407168404Spjd 3408168404Spjdvoid 3409286625Smavarc_shrink(int64_t to_free) 3410168404Spjd{ 3411168404Spjd if (arc_c > arc_c_min) { 3412272483Ssmh DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, 3413272483Ssmh arc_c_min, uint64_t, arc_p, uint64_t, to_free); 3414168404Spjd if (arc_c > arc_c_min + to_free) 3415168404Spjd atomic_add_64(&arc_c, -to_free); 3416168404Spjd else 3417168404Spjd arc_c = arc_c_min; 3418168404Spjd 3419168404Spjd atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 3420168404Spjd if (arc_c > arc_size) 3421168404Spjd arc_c = MAX(arc_size, arc_c_min); 3422168404Spjd if (arc_p > arc_c) 3423168404Spjd arc_p = (arc_c >> 1); 3424272483Ssmh 3425272483Ssmh DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, 3426272483Ssmh arc_p); 3427272483Ssmh 3428168404Spjd ASSERT(arc_c >= arc_c_min); 3429168404Spjd ASSERT((int64_t)arc_p >= 0); 3430168404Spjd } 3431168404Spjd 3432270759Ssmh if (arc_size > arc_c) { 3433270759Ssmh DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size, 3434270759Ssmh uint64_t, arc_c); 3435286763Smav (void) arc_adjust(); 3436270759Ssmh } 3437168404Spjd} 3438168404Spjd 3439286625Smavstatic long needfree = 0; 3440168404Spjd 3441286625Smavtypedef enum free_memory_reason_t { 3442286625Smav FMR_UNKNOWN, 3443286625Smav FMR_NEEDFREE, 3444286625Smav FMR_LOTSFREE, 3445286625Smav FMR_SWAPFS_MINFREE, 3446286625Smav FMR_PAGES_PP_MAXIMUM, 3447286625Smav FMR_HEAP_ARENA, 3448286625Smav FMR_ZIO_ARENA, 3449286625Smav FMR_ZIO_FRAG, 3450286625Smav} free_memory_reason_t; 3451286625Smav 3452286625Smavint64_t last_free_memory; 3453286625Smavfree_memory_reason_t last_free_reason; 3454286625Smav 3455286625Smav/* 3456286625Smav * Additional reserve of pages for pp_reserve. 3457286625Smav */ 3458286625Smavint64_t arc_pages_pp_reserve = 64; 3459286625Smav 3460286625Smav/* 3461286625Smav * Additional reserve of pages for swapfs. 3462286625Smav */ 3463286625Smavint64_t arc_swapfs_reserve = 64; 3464286625Smav 3465286625Smav/* 3466286625Smav * Return the amount of memory that can be consumed before reclaim will be 3467286625Smav * needed. Positive if there is sufficient free memory, negative indicates 3468286625Smav * the amount of memory that needs to be freed up. 
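 *
 * A sketch of the intended use, mirroring arc_reclaim_needed()
 * and (slightly simplified) arc_reclaim_thread() below:
 *
 *	int64_t free_memory = arc_available_memory();
 *	if (free_memory < 0)
 *		arc_shrink((arc_c >> arc_shrink_shift) - free_memory);
 *
 * i.e. a negative return is both the trigger for reclamation
 * and a lower bound on how much must be freed.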
3469286625Smav */ 3470286625Smavstatic int64_t 3471286625Smavarc_available_memory(void) 3472168404Spjd{ 3473286625Smav int64_t lowest = INT64_MAX; 3474286625Smav int64_t n; 3475286625Smav free_memory_reason_t r = FMR_UNKNOWN; 3476168404Spjd 3477168404Spjd#ifdef _KERNEL 3478286625Smav if (needfree > 0) { 3479286625Smav n = PAGESIZE * (-needfree); 3480286625Smav if (n < lowest) { 3481286625Smav lowest = n; 3482286625Smav r = FMR_NEEDFREE; 3483286625Smav } 3484270759Ssmh } 3485168404Spjd 3486191902Skmacy /* 3487212780Savg * Cooperate with pagedaemon when it's time for it to scan 3488212780Savg * and reclaim some pages. 3489191902Skmacy */ 3490286655Smav n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); 3491286625Smav if (n < lowest) { 3492286625Smav lowest = n; 3493286625Smav r = FMR_LOTSFREE; 3494270759Ssmh } 3495191902Skmacy 3496277300Ssmh#ifdef illumos 3497168404Spjd /* 3498185029Spjd * check that we're out of range of the pageout scanner. It starts to 3499185029Spjd * schedule paging if freemem is less than lotsfree and needfree. 3500185029Spjd * lotsfree is the high-water mark for pageout, and needfree is the 3501185029Spjd * number of needed free pages. We add extra pages here to make sure 3502185029Spjd * the scanner doesn't start up while we're freeing memory. 3503185029Spjd */ 3504286625Smav n = PAGESIZE * (freemem - lotsfree - needfree - desfree); 3505286625Smav if (n < lowest) { 3506286625Smav lowest = n; 3507286625Smav r = FMR_LOTSFREE; 3508286625Smav } 3509185029Spjd 3510185029Spjd /* 3511168404Spjd * check to make sure that swapfs has enough space so that anon 3512185029Spjd * reservations can still succeed. anon_resvmem() checks that the 3513168404Spjd * availrmem is greater than swapfs_minfree, and the number of reserved 3514168404Spjd * swap pages. We also add a bit of extra here just to prevent 3515168404Spjd * circumstances from getting really dire. 3516168404Spjd */ 3517286625Smav n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - 3518286625Smav desfree - arc_swapfs_reserve); 3519286625Smav if (n < lowest) { 3520286625Smav lowest = n; 3521286625Smav r = FMR_SWAPFS_MINFREE; 3522286625Smav } 3523168404Spjd 3524286625Smav 3525168404Spjd /* 3526272483Ssmh * Check that we have enough availrmem that memory locking (e.g., via 3527272483Ssmh * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum 3528272483Ssmh * stores the number of pages that cannot be locked; when availrmem 3529272483Ssmh * drops below pages_pp_maximum, page locking mechanisms such as 3530272483Ssmh * page_pp_lock() will fail.) 3531272483Ssmh */ 3532286625Smav n = PAGESIZE * (availrmem - pages_pp_maximum - 3533286625Smav arc_pages_pp_reserve); 3534286625Smav if (n < lowest) { 3535286625Smav lowest = n; 3536286625Smav r = FMR_PAGES_PP_MAXIMUM; 3537286625Smav } 3538272483Ssmh 3539277300Ssmh#endif /* illumos */ 3540272483Ssmh#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 3541272483Ssmh /* 3542168404Spjd * If we're on an i386 platform, it's possible that we'll exhaust the 3543168404Spjd * kernel heap space before we ever run out of available physical 3544168404Spjd * memory. Most checks of the size of the heap_area compare against 3545168404Spjd * tune.t_minarmem, which is the minimum available real memory that we 3546168404Spjd * can have in the system. However, this is generally fixed at 25 pages 3547168404Spjd * which is so low that it's useless. 
In this comparison, we seek to
3548168404Spjd	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
3549185029Spjd	 * heap is allocated. (Or, in the calculation, if less than 1/4th is
3550168404Spjd	 * free)
3551168404Spjd	 */
3552286655Smav	n = (int64_t)vmem_size(heap_arena, VMEM_FREE) -
3553286628Smav	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
3554286625Smav	if (n < lowest) {
3555286625Smav		lowest = n;
3556286625Smav		r = FMR_HEAP_ARENA;
3557270861Ssmh	}
3558281026Smav#define	zio_arena	NULL
3559281026Smav#else
3560281026Smav#define	zio_arena	heap_arena
3561270861Ssmh#endif
3562281026Smav
3563272483Ssmh	/*
3564272483Ssmh	 * If zio data pages are being allocated out of a separate heap segment,
3565272483Ssmh	 * then enforce that the size of available vmem for this arena remains
3566272483Ssmh	 * above about 1/16th free.
3567272483Ssmh	 *
3568272483Ssmh	 * Note: The 1/16th arena free requirement was put in place
3569272483Ssmh	 * to aggressively evict memory from the arc in order to avoid
3570272483Ssmh	 * memory fragmentation issues.
3571272483Ssmh	 */
3572286625Smav	if (zio_arena != NULL) {
3573286655Smav		n = (int64_t)vmem_size(zio_arena, VMEM_FREE) -
3574286625Smav		    (vmem_size(zio_arena, VMEM_ALLOC) >> 4);
3575286625Smav		if (n < lowest) {
3576286625Smav			lowest = n;
3577286625Smav			r = FMR_ZIO_ARENA;
3578286625Smav		}
3579286625Smav	}
3580281026Smav
3581281026Smav	/*
3582281026Smav	 * The above limits know nothing about the real level of KVA fragmentation.
3583281026Smav	 * Start aggressive reclamation if too little sequential KVA is left.
3584281026Smav	 */
3585286625Smav	if (lowest > 0) {
3586286625Smav		n = (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) ?
3587286655Smav		    -((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) :
3588286655Smav		    INT64_MAX;
3589286625Smav		if (n < lowest) {
3590286625Smav			lowest = n;
3591286625Smav			r = FMR_ZIO_FRAG;
3592286625Smav		}
3593281109Smav	}
3594281026Smav
3595272483Ssmh#else	/* _KERNEL */
3596286625Smav	/* Every 100 calls, free a small amount */
3597168404Spjd	if (spa_get_random(100) == 0)
3598286625Smav		lowest = -1024;
3599272483Ssmh#endif	/* _KERNEL */
3600270759Ssmh
3601286625Smav	last_free_memory = lowest;
3602286625Smav	last_free_reason = r;
3603286625Smav	DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
3604286625Smav	return (lowest);
3605168404Spjd}
3606168404Spjd
3607286625Smav
3608286625Smav/*
3609286625Smav * Determine if the system is under memory pressure and is asking
3610286625Smav * to reclaim memory. A return value of TRUE indicates that the system
3611286625Smav * is under memory pressure and that the arc should adjust accordingly.
3612286625Smav */
3613286625Smavstatic boolean_t
3614286625Smavarc_reclaim_needed(void)
3615286625Smav{
3616286625Smav	return (arc_available_memory() < 0);
3617286625Smav}
3618286625Smav
3619208454Spjdextern kmem_cache_t *zio_buf_cache[];
3620208454Spjdextern kmem_cache_t *zio_data_buf_cache[];
3621272527Sdelphijextern kmem_cache_t *range_seg_cache;
3622208454Spjd
3623278040Ssmhstatic __noinline void
3624286625Smavarc_kmem_reap_now(void)
3625168404Spjd{
3626168404Spjd	size_t i;
3627168404Spjd	kmem_cache_t *prev_cache = NULL;
3628168404Spjd	kmem_cache_t *prev_data_cache = NULL;
3629168404Spjd
3630272483Ssmh	DTRACE_PROBE(arc__kmem_reap_start);
3631168404Spjd#ifdef _KERNEL
3632185029Spjd	if (arc_meta_used >= arc_meta_limit) {
3633185029Spjd		/*
3634185029Spjd		 * We are exceeding our meta-data cache limit.
3635185029Spjd		 * Purge some DNLC entries to release holds on meta-data.
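		 *
		 * (In this FreeBSD port, dnlc_reduce_cache() is emulated
		 * below: the percentage is handed off to a dedicated
		 * thread, which calls vnlru_free() to release vnodes and,
		 * with them, the holds on cached meta-data.)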
3636185029Spjd */ 3637185029Spjd dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 3638185029Spjd } 3639168404Spjd#if defined(__i386) 3640168404Spjd /* 3641168404Spjd * Reclaim unused memory from all kmem caches. 3642168404Spjd */ 3643168404Spjd kmem_reap(); 3644168404Spjd#endif 3645168404Spjd#endif 3646168404Spjd 3647168404Spjd for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 3648168404Spjd if (zio_buf_cache[i] != prev_cache) { 3649168404Spjd prev_cache = zio_buf_cache[i]; 3650168404Spjd kmem_cache_reap_now(zio_buf_cache[i]); 3651168404Spjd } 3652168404Spjd if (zio_data_buf_cache[i] != prev_data_cache) { 3653168404Spjd prev_data_cache = zio_data_buf_cache[i]; 3654168404Spjd kmem_cache_reap_now(zio_data_buf_cache[i]); 3655168404Spjd } 3656168404Spjd } 3657168404Spjd kmem_cache_reap_now(buf_cache); 3658286570Smav kmem_cache_reap_now(hdr_full_cache); 3659286570Smav kmem_cache_reap_now(hdr_l2only_cache); 3660272506Sdelphij kmem_cache_reap_now(range_seg_cache); 3661272483Ssmh 3662277300Ssmh#ifdef illumos 3663286625Smav if (zio_arena != NULL) { 3664286625Smav /* 3665286625Smav * Ask the vmem arena to reclaim unused memory from its 3666286625Smav * quantum caches. 3667286625Smav */ 3668272483Ssmh vmem_qcache_reap(zio_arena); 3669286625Smav } 3670272483Ssmh#endif 3671272483Ssmh DTRACE_PROBE(arc__kmem_reap_end); 3672168404Spjd} 3673168404Spjd 3674286763Smav/* 3675286763Smav * Threads can block in arc_get_data_buf() waiting for this thread to evict 3676286763Smav * enough data and signal them to proceed. When this happens, the threads in 3677286763Smav * arc_get_data_buf() are sleeping while holding the hash lock for their 3678286763Smav * particular arc header. Thus, we must be careful to never sleep on a 3679286763Smav * hash lock in this thread. This is to prevent the following deadlock: 3680286763Smav * 3681286763Smav * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L", 3682286763Smav * waiting for the reclaim thread to signal it. 3683286763Smav * 3684286763Smav * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, 3685286763Smav * fails, and goes to sleep forever. 3686286763Smav * 3687286763Smav * This possible deadlock is avoided by always acquiring a hash lock 3688286763Smav * using mutex_tryenter() from arc_reclaim_thread(). 3689286763Smav */ 3690168404Spjdstatic void 3691168404Spjdarc_reclaim_thread(void *dummy __unused) 3692168404Spjd{ 3693296530Smav hrtime_t growtime = 0; 3694168404Spjd callb_cpr_t cpr; 3695168404Spjd 3696286763Smav CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); 3697168404Spjd 3698286763Smav mutex_enter(&arc_reclaim_lock); 3699286763Smav while (!arc_reclaim_thread_exit) { 3700286625Smav int64_t free_memory = arc_available_memory(); 3701286763Smav uint64_t evicted = 0; 3702286763Smav 3703286763Smav mutex_exit(&arc_reclaim_lock); 3704286763Smav 3705286625Smav if (free_memory < 0) { 3706168404Spjd 3707286625Smav arc_no_grow = B_TRUE; 3708286625Smav arc_warm = B_TRUE; 3709168404Spjd 3710286625Smav /* 3711286625Smav * Wait at least zfs_grow_retry (default 60) seconds 3712286625Smav * before considering growing. 3713286625Smav */ 3714296530Smav growtime = gethrtime() + SEC2NSEC(arc_grow_retry); 3715168404Spjd 3716286625Smav arc_kmem_reap_now(); 3717286625Smav 3718286625Smav /* 3719286625Smav * If we are still low on memory, shrink the ARC 3720286625Smav * so that we have arc_shrink_min free space. 
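 *
 * A rough worked example (illustrative numbers only): with
 * arc_c == 8GB and arc_shrink_shift == 7, the headroom goal
 * is 64MB; if free_memory is still -32MB at this point,
 * to_free comes out to 64MB - (-32MB) = 96MB.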
3721286625Smav */ 3722286625Smav free_memory = arc_available_memory(); 3723286625Smav 3724286625Smav int64_t to_free = 3725286625Smav (arc_c >> arc_shrink_shift) - free_memory; 3726286625Smav if (to_free > 0) { 3727286625Smav#ifdef _KERNEL 3728286625Smav to_free = MAX(to_free, ptob(needfree)); 3729286625Smav#endif 3730286625Smav arc_shrink(to_free); 3731168404Spjd } 3732286625Smav } else if (free_memory < arc_c >> arc_no_grow_shift) { 3733286625Smav arc_no_grow = B_TRUE; 3734296530Smav } else if (gethrtime() >= growtime) { 3735286625Smav arc_no_grow = B_FALSE; 3736168404Spjd } 3737168404Spjd 3738286763Smav evicted = arc_adjust(); 3739168404Spjd 3740286763Smav mutex_enter(&arc_reclaim_lock); 3741168404Spjd 3742286763Smav /* 3743286763Smav * If evicted is zero, we couldn't evict anything via 3744286763Smav * arc_adjust(). This could be due to hash lock 3745286763Smav * collisions, but more likely due to the majority of 3746286763Smav * arc buffers being unevictable. Therefore, even if 3747286763Smav * arc_size is above arc_c, another pass is unlikely to 3748286763Smav * be helpful and could potentially cause us to enter an 3749286763Smav * infinite loop. 3750286763Smav */ 3751286763Smav if (arc_size <= arc_c || evicted == 0) { 3752211762Savg#ifdef _KERNEL 3753185029Spjd needfree = 0; 3754168404Spjd#endif 3755286763Smav /* 3756286763Smav * We're either no longer overflowing, or we 3757286763Smav * can't evict anything more, so we should wake 3758286763Smav * up any threads before we go to sleep. 3759286763Smav */ 3760286763Smav cv_broadcast(&arc_reclaim_waiters_cv); 3761168404Spjd 3762286763Smav /* 3763286763Smav * Block until signaled, or after one second (we 3764286763Smav * might need to perform arc_kmem_reap_now() 3765286763Smav * even if we aren't being signalled) 3766286763Smav */ 3767286763Smav CALLB_CPR_SAFE_BEGIN(&cpr); 3768296530Smav (void) cv_timedwait_hires(&arc_reclaim_thread_cv, 3769296530Smav &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); 3770286763Smav CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); 3771286763Smav } 3772286763Smav } 3773286763Smav 3774286763Smav arc_reclaim_thread_exit = FALSE; 3775286763Smav cv_broadcast(&arc_reclaim_thread_cv); 3776286763Smav CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ 3777286763Smav thread_exit(); 3778286763Smav} 3779286763Smav 3780286763Smavstatic void 3781286763Smavarc_user_evicts_thread(void *dummy __unused) 3782286763Smav{ 3783286763Smav callb_cpr_t cpr; 3784286763Smav 3785286763Smav CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG); 3786286763Smav 3787286763Smav mutex_enter(&arc_user_evicts_lock); 3788286763Smav while (!arc_user_evicts_thread_exit) { 3789286763Smav mutex_exit(&arc_user_evicts_lock); 3790286763Smav 3791286763Smav arc_do_user_evicts(); 3792286763Smav 3793286574Smav /* 3794286574Smav * This is necessary in order for the mdb ::arc dcmd to 3795286574Smav * show up to date information. Since the ::arc command 3796286574Smav * does not call the kstat's update function, without 3797286574Smav * this call, the command may show stale stats for the 3798286574Smav * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even 3799286574Smav * with this change, the data might be up to 1 second 3800286574Smav * out of date; but that should suffice. The arc_state_t 3801286574Smav * structures can be queried directly if more accurate 3802286574Smav * information is needed. 
3803286574Smav		 */
3804286574Smav		if (arc_ksp != NULL)
3805286574Smav			arc_ksp->ks_update(arc_ksp, KSTAT_READ);
3806286574Smav
3807286763Smav		mutex_enter(&arc_user_evicts_lock);
3808286763Smav
3809286763Smav		/*
3810286763Smav		 * Block until signaled, or after one second (we need to
3811286763Smav		 * call the arc's kstat update function regularly).
3812286763Smav		 */
3813168404Spjd		CALLB_CPR_SAFE_BEGIN(&cpr);
3814286763Smav		(void) cv_timedwait(&arc_user_evicts_cv,
3815286763Smav		    &arc_user_evicts_lock, hz);
3816286763Smav		CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock);
3817168404Spjd	}
3818168404Spjd
3819286763Smav	arc_user_evicts_thread_exit = FALSE;
3820286763Smav	cv_broadcast(&arc_user_evicts_cv);
3821286763Smav	CALLB_CPR_EXIT(&cpr);	/* drops arc_user_evicts_lock */
3822168404Spjd	thread_exit();
3823168404Spjd}
3824168404Spjd
3825301997Skibstatic u_int arc_dnlc_evicts_arg;
3826301997Skibextern struct vfsops zfs_vfsops;
3827301997Skib
3828301997Skibstatic void
3829301997Skibarc_dnlc_evicts_thread(void *dummy __unused)
3830301997Skib{
3831301997Skib	callb_cpr_t cpr;
3832301997Skib	u_int percent;
3833301997Skib
3834301997Skib	CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG);
3835301997Skib
3836301997Skib	mutex_enter(&arc_dnlc_evicts_lock);
3837301997Skib	while (!arc_dnlc_evicts_thread_exit) {
3838301997Skib		CALLB_CPR_SAFE_BEGIN(&cpr);
3839301997Skib		(void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
3840301997Skib		CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock);
3841301997Skib		if (arc_dnlc_evicts_arg != 0) {
3842301997Skib			percent = arc_dnlc_evicts_arg;
3843301997Skib			mutex_exit(&arc_dnlc_evicts_lock);
3844301997Skib#ifdef _KERNEL
3845301997Skib			vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops);
3846301997Skib#endif
3847301997Skib			mutex_enter(&arc_dnlc_evicts_lock);
3848301997Skib			/*
3849301997Skib			 * Clear our token only after the vnlru_free()
3850301997Skib			 * pass is done, to avoid false queueing of
3851301997Skib			 * the requests.
3852301997Skib			 */
3853301997Skib			arc_dnlc_evicts_arg = 0;
3854301997Skib		}
3855301997Skib	}
3856301997Skib	arc_dnlc_evicts_thread_exit = FALSE;
3857301997Skib	cv_broadcast(&arc_dnlc_evicts_cv);
3858301997Skib	CALLB_CPR_EXIT(&cpr);
3859301997Skib	thread_exit();
3860301997Skib}
3861301997Skib
3862301997Skibvoid
3863301997Skibdnlc_reduce_cache(void *arg)
3864301997Skib{
3865301997Skib	u_int percent;
3866301997Skib
3867302012Skib	percent = (u_int)(uintptr_t)arg;
3868301997Skib	mutex_enter(&arc_dnlc_evicts_lock);
3869301997Skib	if (arc_dnlc_evicts_arg == 0) {
3870301997Skib		arc_dnlc_evicts_arg = percent;
3871301997Skib		cv_broadcast(&arc_dnlc_evicts_cv);
3872301997Skib	}
3873301997Skib	mutex_exit(&arc_dnlc_evicts_lock);
3874301997Skib}
3875301997Skib
3876168404Spjd/*
3877168404Spjd * Adapt arc info given the number of bytes we are trying to add and
3878168404Spjd * the state that we are coming from. This function is only called
3879168404Spjd * when we are adding new content to the cache.
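 *
 * As an illustration of the adaptation rule below (numbers made
 * up): if the MRU ghost list holds 100MB and the MFU ghost list
 * holds 400MB, a hit in the MRU ghost list grows arc_p by
 * 4 * bytes (the multiplier is clamped at 10), while a hit in
 * the MFU ghost list shrinks arc_p by just 1 * bytes, since in
 * that direction the multiplier works out to 1.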
3880168404Spjd */ 3881168404Spjdstatic void 3882168404Spjdarc_adapt(int bytes, arc_state_t *state) 3883168404Spjd{ 3884168404Spjd int mult; 3885208373Smm uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 3886286766Smav int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size); 3887286766Smav int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size); 3888168404Spjd 3889185029Spjd if (state == arc_l2c_only) 3890185029Spjd return; 3891185029Spjd 3892168404Spjd ASSERT(bytes > 0); 3893168404Spjd /* 3894168404Spjd * Adapt the target size of the MRU list: 3895168404Spjd * - if we just hit in the MRU ghost list, then increase 3896168404Spjd * the target size of the MRU list. 3897168404Spjd * - if we just hit in the MFU ghost list, then increase 3898168404Spjd * the target size of the MFU list by decreasing the 3899168404Spjd * target size of the MRU list. 3900168404Spjd */ 3901168404Spjd if (state == arc_mru_ghost) { 3902286766Smav mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); 3903209275Smm mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 3904168404Spjd 3905208373Smm arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 3906168404Spjd } else if (state == arc_mfu_ghost) { 3907208373Smm uint64_t delta; 3908208373Smm 3909286766Smav mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size); 3910209275Smm mult = MIN(mult, 10); 3911168404Spjd 3912208373Smm delta = MIN(bytes * mult, arc_p); 3913208373Smm arc_p = MAX(arc_p_min, arc_p - delta); 3914168404Spjd } 3915168404Spjd ASSERT((int64_t)arc_p >= 0); 3916168404Spjd 3917168404Spjd if (arc_reclaim_needed()) { 3918286763Smav cv_signal(&arc_reclaim_thread_cv); 3919168404Spjd return; 3920168404Spjd } 3921168404Spjd 3922168404Spjd if (arc_no_grow) 3923168404Spjd return; 3924168404Spjd 3925168404Spjd if (arc_c >= arc_c_max) 3926168404Spjd return; 3927168404Spjd 3928168404Spjd /* 3929168404Spjd * If we're within (2 * maxblocksize) bytes of the target 3930168404Spjd * cache size, increment the target cache size 3931168404Spjd */ 3932168404Spjd if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 3933272483Ssmh DTRACE_PROBE1(arc__inc_adapt, int, bytes); 3934168404Spjd atomic_add_64(&arc_c, (int64_t)bytes); 3935168404Spjd if (arc_c > arc_c_max) 3936168404Spjd arc_c = arc_c_max; 3937168404Spjd else if (state == arc_anon) 3938168404Spjd atomic_add_64(&arc_p, (int64_t)bytes); 3939168404Spjd if (arc_p > arc_c) 3940168404Spjd arc_p = arc_c; 3941168404Spjd } 3942168404Spjd ASSERT((int64_t)arc_p >= 0); 3943168404Spjd} 3944168404Spjd 3945168404Spjd/* 3946286763Smav * Check if arc_size has grown past our upper threshold, determined by 3947286763Smav * zfs_arc_overflow_shift. 3948168404Spjd */ 3949286763Smavstatic boolean_t 3950286763Smavarc_is_overflowing(void) 3951168404Spjd{ 3952286763Smav /* Always allow at least one block of overflow */ 3953286763Smav uint64_t overflow = MAX(SPA_MAXBLOCKSIZE, 3954286763Smav arc_c >> zfs_arc_overflow_shift); 3955185029Spjd 3956286763Smav return (arc_size >= arc_c + overflow); 3957168404Spjd} 3958168404Spjd 3959168404Spjd/* 3960286763Smav * The buffer, supplied as the first argument, needs a data block. If we 3961286763Smav * are hitting the hard limit for the cache size, we must sleep, waiting 3962286763Smav * for the eviction thread to catch up. If we're past the target size 3963286763Smav * but below the hard limit, we'll only signal the reclaim thread and 3964286763Smav * continue on. 
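 *
 * The "hard limit" here is the arc_is_overflowing() threshold
 * above: arc_c plus MAX(SPA_MAXBLOCKSIZE, arc_c >>
 * zfs_arc_overflow_shift). For instance (illustrative values
 * only), with arc_c == 4GB and an overflow shift of 8, the
 * allowance is 16MB, so allocating threads begin to block once
 * arc_size reaches arc_c + 16MB.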
3965168404Spjd */
3966168404Spjdstatic void
3967168404Spjdarc_get_data_buf(arc_buf_t *buf)
3968168404Spjd{
3969286570Smav	arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
3970168404Spjd	uint64_t size = buf->b_hdr->b_size;
3971286570Smav	arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
3972168404Spjd
3973168404Spjd	arc_adapt(size, state);
3974168404Spjd
3975168404Spjd	/*
3976286763Smav	 * If arc_size is currently overflowing, and has grown past our
3977286763Smav	 * upper limit, we must be adding data faster than the evict
3978286763Smav	 * thread can evict. Thus, to ensure we don't compound the
3979286763Smav	 * problem by adding more data and forcing arc_size to grow even
3980286763Smav	 * further past its target size, we halt and wait for the
3981286763Smav	 * eviction thread to catch up.
3982286763Smav	 *
3983286763Smav	 * It's also possible that the reclaim thread is unable to evict
3984286763Smav	 * enough buffers to get arc_size below the overflow limit (e.g.
3985286763Smav	 * due to buffers being un-evictable, or hash lock collisions).
3986286763Smav	 * In this case, we want to proceed regardless of whether we're
3987286763Smav	 * overflowing; thus we don't use a while loop here.
3988168404Spjd	 */
3989286763Smav	if (arc_is_overflowing()) {
3990286763Smav		mutex_enter(&arc_reclaim_lock);
3991286763Smav
3992286763Smav		/*
3993286763Smav		 * Now that we've acquired the lock, we may no longer be
3994286763Smav		 * over the overflow limit, so let's check.
3995286763Smav		 *
3996286763Smav		 * We're ignoring the case of spurious wake ups. If that
3997286763Smav		 * were to happen, it'd let this thread consume an ARC
3998286763Smav		 * buffer before it should have (i.e. before we're under
3999286763Smav		 * the overflow limit and were signalled by the reclaim
4000286763Smav		 * thread). As long as that is a rare occurrence, it
4001286763Smav		 * shouldn't cause any harm.
4002286763Smav		 */
4003286763Smav		if (arc_is_overflowing()) {
4004286763Smav			cv_signal(&arc_reclaim_thread_cv);
4005286763Smav			cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
4006168404Spjd		}
4007286763Smav
4008286763Smav		mutex_exit(&arc_reclaim_lock);
4009168404Spjd	}
4010168404Spjd
4011286763Smav	if (type == ARC_BUFC_METADATA) {
4012286763Smav		buf->b_data = zio_buf_alloc(size);
4013286763Smav		arc_space_consume(size, ARC_SPACE_META);
4014168404Spjd	} else {
4015286763Smav		ASSERT(type == ARC_BUFC_DATA);
4016286763Smav		buf->b_data = zio_data_buf_alloc(size);
4017286763Smav		arc_space_consume(size, ARC_SPACE_DATA);
4018168404Spjd	}
4019286763Smav
4020168404Spjd	/*
4021168404Spjd	 * Update the state size. Note that ghost states have a
4022168404Spjd	 * "ghost size" and so don't need to be updated.
4023168404Spjd	 */
4024286570Smav	if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
4025168404Spjd		arc_buf_hdr_t *hdr = buf->b_hdr;
4026286766Smav		arc_state_t *state = hdr->b_l1hdr.b_state;
4027168404Spjd
4028286766Smav		(void) refcount_add_many(&state->arcs_size, size, buf);
4029286763Smav
4030286763Smav		/*
4031286763Smav		 * If this is reached via arc_read, the link is
4032286763Smav		 * protected by the hash lock. If reached via
4033286763Smav		 * arc_buf_alloc, the header should not be accessed by
4034286763Smav		 * any other thread. And, if reached via arc_read_done,
4035286763Smav		 * the hash lock will protect it if it's found in the
4036286763Smav		 * hash table; otherwise no other thread should be
4037286763Smav		 * trying to [add|remove]_reference it.
4038286763Smav */ 4039286763Smav if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 4040286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4041286570Smav atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], 4042286570Smav size); 4043168404Spjd } 4044168404Spjd /* 4045168404Spjd * If we are growing the cache, and we are adding anonymous 4046168404Spjd * data, and we have outgrown arc_p, update arc_p 4047168404Spjd */ 4048286570Smav if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && 4049286766Smav (refcount_count(&arc_anon->arcs_size) + 4050286766Smav refcount_count(&arc_mru->arcs_size) > arc_p)) 4051168404Spjd arc_p = MIN(arc_c, arc_p + size); 4052168404Spjd } 4053205231Skmacy ARCSTAT_BUMP(arcstat_allocated); 4054168404Spjd} 4055168404Spjd 4056168404Spjd/* 4057168404Spjd * This routine is called whenever a buffer is accessed. 4058168404Spjd * NOTE: the hash lock is dropped in this function. 4059168404Spjd */ 4060168404Spjdstatic void 4061275811Sdelphijarc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 4062168404Spjd{ 4063219089Spjd clock_t now; 4064219089Spjd 4065168404Spjd ASSERT(MUTEX_HELD(hash_lock)); 4066286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 4067168404Spjd 4068286570Smav if (hdr->b_l1hdr.b_state == arc_anon) { 4069168404Spjd /* 4070168404Spjd * This buffer is not in the cache, and does not 4071168404Spjd * appear in our "ghost" list. Add the new buffer 4072168404Spjd * to the MRU state. 4073168404Spjd */ 4074168404Spjd 4075286570Smav ASSERT0(hdr->b_l1hdr.b_arc_access); 4076286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4077275811Sdelphij DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 4078275811Sdelphij arc_change_state(arc_mru, hdr, hash_lock); 4079168404Spjd 4080286570Smav } else if (hdr->b_l1hdr.b_state == arc_mru) { 4081219089Spjd now = ddi_get_lbolt(); 4082219089Spjd 4083168404Spjd /* 4084168404Spjd * If this buffer is here because of a prefetch, then either: 4085168404Spjd * - clear the flag if this is a "referencing" read 4086168404Spjd * (any subsequent access will bump this into the MFU state). 4087168404Spjd * or 4088168404Spjd * - move the buffer to the head of the list if this is 4089168404Spjd * another prefetch (to make it less likely to be evicted). 4090168404Spjd */ 4091286570Smav if (HDR_PREFETCH(hdr)) { 4092286570Smav if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 4093286763Smav /* link protected by hash lock */ 4094286763Smav ASSERT(multilist_link_active( 4095286570Smav &hdr->b_l1hdr.b_arc_node)); 4096168404Spjd } else { 4097275811Sdelphij hdr->b_flags &= ~ARC_FLAG_PREFETCH; 4098168404Spjd ARCSTAT_BUMP(arcstat_mru_hits); 4099168404Spjd } 4100286570Smav hdr->b_l1hdr.b_arc_access = now; 4101168404Spjd return; 4102168404Spjd } 4103168404Spjd 4104168404Spjd /* 4105168404Spjd * This buffer has been "accessed" only once so far, 4106168404Spjd * but it is still in the cache. Move it to the MFU 4107168404Spjd * state. 4108168404Spjd */ 4109286570Smav if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { 4110168404Spjd /* 4111168404Spjd * More than 125ms have passed since we 4112168404Spjd * instantiated this buffer. Move it to the 4113168404Spjd * most frequently used state. 
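 *
 * Editor's sketch (not in the original source): in isolation, the
 * promotion test below is simply
 *
 *	if (ddi_get_lbolt() > hdr->b_l1hdr.b_arc_access + ARC_MINTIME)
 *		arc_change_state(arc_mfu, hdr, hash_lock);
 *
 * so a second hit arriving within ARC_MINTIME of the first access is
 * treated as part of the same burst and does not count as reuse.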
4114168404Spjd */ 4115286570Smav hdr->b_l1hdr.b_arc_access = now; 4116275811Sdelphij DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4117275811Sdelphij arc_change_state(arc_mfu, hdr, hash_lock); 4118168404Spjd } 4119168404Spjd ARCSTAT_BUMP(arcstat_mru_hits); 4120286570Smav } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { 4121168404Spjd arc_state_t *new_state; 4122168404Spjd /* 4123168404Spjd * This buffer has been "accessed" recently, but 4124168404Spjd * was evicted from the cache. Move it to the 4125168404Spjd * MFU state. 4126168404Spjd */ 4127168404Spjd 4128286570Smav if (HDR_PREFETCH(hdr)) { 4129168404Spjd new_state = arc_mru; 4130286570Smav if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) 4131275811Sdelphij hdr->b_flags &= ~ARC_FLAG_PREFETCH; 4132275811Sdelphij DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 4133168404Spjd } else { 4134168404Spjd new_state = arc_mfu; 4135275811Sdelphij DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4136168404Spjd } 4137168404Spjd 4138286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4139275811Sdelphij arc_change_state(new_state, hdr, hash_lock); 4140168404Spjd 4141168404Spjd ARCSTAT_BUMP(arcstat_mru_ghost_hits); 4142286570Smav } else if (hdr->b_l1hdr.b_state == arc_mfu) { 4143168404Spjd /* 4144168404Spjd * This buffer has been accessed more than once and is 4145168404Spjd * still in the cache. Keep it in the MFU state. 4146168404Spjd * 4147168404Spjd * NOTE: an add_reference() that occurred when we did 4148168404Spjd * the arc_read() will have kicked this off the list. 4149168404Spjd * If it was a prefetch, we will explicitly move it to 4150168404Spjd * the head of the list now. 4151168404Spjd */ 4152286570Smav if ((HDR_PREFETCH(hdr)) != 0) { 4153286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4154286763Smav /* link protected by hash_lock */ 4155286763Smav ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 4156168404Spjd } 4157168404Spjd ARCSTAT_BUMP(arcstat_mfu_hits); 4158286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4159286570Smav } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { 4160168404Spjd arc_state_t *new_state = arc_mfu; 4161168404Spjd /* 4162168404Spjd * This buffer has been accessed more than once but has 4163168404Spjd * been evicted from the cache. Move it back to the 4164168404Spjd * MFU state. 4165168404Spjd */ 4166168404Spjd 4167286570Smav if (HDR_PREFETCH(hdr)) { 4168168404Spjd /* 4169168404Spjd * This is a prefetch access... 4170168404Spjd * move this block back to the MRU state. 4171168404Spjd */ 4172286570Smav ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 4173168404Spjd new_state = arc_mru; 4174168404Spjd } 4175168404Spjd 4176286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4177275811Sdelphij DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4178275811Sdelphij arc_change_state(new_state, hdr, hash_lock); 4179168404Spjd 4180168404Spjd ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 4181286570Smav } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { 4182185029Spjd /* 4183185029Spjd * This buffer is on the 2nd Level ARC. 
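 *
 * Editor's summary (not in the original source): taken together, the
 * cases of arc_access() implement this transition table, with prefetch
 * hits as the exception that stays on (or returns to) the MRU side:
 *
 *	anon      -> mru
 *	mru       -> mfu	(second hit, more than ARC_MINTIME apart)
 *	mru_ghost -> mfu	(prefetch: -> mru)
 *	mfu       -> mfu
 *	mfu_ghost -> mfu	(prefetch: -> mru)
 *	l2c_only  -> mfu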
4184185029Spjd */ 4185185029Spjd 4186286570Smav hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4187275811Sdelphij DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4188275811Sdelphij arc_change_state(arc_mfu, hdr, hash_lock); 4189168404Spjd } else { 4190168404Spjd ASSERT(!"invalid arc state"); 4191168404Spjd } 4192168404Spjd} 4193168404Spjd 4194168404Spjd/* a generic arc_done_func_t which you can use */ 4195168404Spjd/* ARGSUSED */ 4196168404Spjdvoid 4197168404Spjdarc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 4198168404Spjd{ 4199219089Spjd if (zio == NULL || zio->io_error == 0) 4200219089Spjd bcopy(buf->b_data, arg, buf->b_hdr->b_size); 4201248571Smm VERIFY(arc_buf_remove_ref(buf, arg)); 4202168404Spjd} 4203168404Spjd 4204185029Spjd/* a generic arc_done_func_t */ 4205168404Spjdvoid 4206168404Spjdarc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 4207168404Spjd{ 4208168404Spjd arc_buf_t **bufp = arg; 4209168404Spjd if (zio && zio->io_error) { 4210248571Smm VERIFY(arc_buf_remove_ref(buf, arg)); 4211168404Spjd *bufp = NULL; 4212168404Spjd } else { 4213168404Spjd *bufp = buf; 4214219089Spjd ASSERT(buf->b_data); 4215168404Spjd } 4216168404Spjd} 4217168404Spjd 4218168404Spjdstatic void 4219168404Spjdarc_read_done(zio_t *zio) 4220168404Spjd{ 4221268075Sdelphij arc_buf_hdr_t *hdr; 4222168404Spjd arc_buf_t *buf; 4223168404Spjd arc_buf_t *abuf; /* buffer we're assigning to callback */ 4224268075Sdelphij kmutex_t *hash_lock = NULL; 4225168404Spjd arc_callback_t *callback_list, *acb; 4226168404Spjd int freeable = FALSE; 4227168404Spjd 4228168404Spjd buf = zio->io_private; 4229168404Spjd hdr = buf->b_hdr; 4230168404Spjd 4231168404Spjd /* 4232168404Spjd * The hdr was inserted into hash-table and removed from lists 4233168404Spjd * prior to starting I/O. We should find this header, since 4234168404Spjd * it's in the hash table, and it should be legit since it's 4235168404Spjd * not possible to evict it during the I/O. The only possible 4236168404Spjd * reason for it not to be found is if we were freed during the 4237168404Spjd * read. 4238168404Spjd */ 4239268075Sdelphij if (HDR_IN_HASH_TABLE(hdr)) { 4240268075Sdelphij ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); 4241268075Sdelphij ASSERT3U(hdr->b_dva.dva_word[0], ==, 4242268075Sdelphij BP_IDENTITY(zio->io_bp)->dva_word[0]); 4243268075Sdelphij ASSERT3U(hdr->b_dva.dva_word[1], ==, 4244268075Sdelphij BP_IDENTITY(zio->io_bp)->dva_word[1]); 4245168404Spjd 4246268075Sdelphij arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, 4247268075Sdelphij &hash_lock); 4248168404Spjd 4249268075Sdelphij ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && 4250268075Sdelphij hash_lock == NULL) || 4251268075Sdelphij (found == hdr && 4252268075Sdelphij DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 4253268075Sdelphij (found == hdr && HDR_L2_READING(hdr))); 4254268075Sdelphij } 4255268075Sdelphij 4256275811Sdelphij hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; 4257286570Smav if (l2arc_noprefetch && HDR_PREFETCH(hdr)) 4258275811Sdelphij hdr->b_flags &= ~ARC_FLAG_L2CACHE; 4259206796Spjd 4260168404Spjd /* byteswap if necessary */ 4261286570Smav callback_list = hdr->b_l1hdr.b_acb; 4262168404Spjd ASSERT(callback_list != NULL); 4263209101Smm if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 4264236884Smm dmu_object_byteswap_t bswap = 4265236884Smm DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 4266185029Spjd arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 
4267185029Spjd byteswap_uint64_array : 4268236884Smm dmu_ot_byteswap[bswap].ob_func; 4269185029Spjd func(buf->b_data, hdr->b_size); 4270185029Spjd } 4271168404Spjd 4272185029Spjd arc_cksum_compute(buf, B_FALSE); 4273240133Smm#ifdef illumos 4274240133Smm arc_buf_watch(buf); 4275277300Ssmh#endif 4276168404Spjd 4277286570Smav if (hash_lock && zio->io_error == 0 && 4278286570Smav hdr->b_l1hdr.b_state == arc_anon) { 4279219089Spjd /* 4280219089Spjd * Only call arc_access on anonymous buffers. This is because 4281219089Spjd * if we've issued an I/O for an evicted buffer, we've already 4282219089Spjd * called arc_access (to prevent any simultaneous readers from 4283219089Spjd * getting confused). 4284219089Spjd */ 4285219089Spjd arc_access(hdr, hash_lock); 4286219089Spjd } 4287219089Spjd 4288168404Spjd /* create copies of the data buffer for the callers */ 4289168404Spjd abuf = buf; 4290168404Spjd for (acb = callback_list; acb; acb = acb->acb_next) { 4291168404Spjd if (acb->acb_done) { 4292242845Sdelphij if (abuf == NULL) { 4293242845Sdelphij ARCSTAT_BUMP(arcstat_duplicate_reads); 4294168404Spjd abuf = arc_buf_clone(buf); 4295242845Sdelphij } 4296168404Spjd acb->acb_buf = abuf; 4297168404Spjd abuf = NULL; 4298168404Spjd } 4299168404Spjd } 4300286570Smav hdr->b_l1hdr.b_acb = NULL; 4301275811Sdelphij hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4302168404Spjd ASSERT(!HDR_BUF_AVAILABLE(hdr)); 4303219089Spjd if (abuf == buf) { 4304219089Spjd ASSERT(buf->b_efunc == NULL); 4305286570Smav ASSERT(hdr->b_l1hdr.b_datacnt == 1); 4306275811Sdelphij hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 4307219089Spjd } 4308168404Spjd 4309286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || 4310286570Smav callback_list != NULL); 4311168404Spjd 4312168404Spjd if (zio->io_error != 0) { 4313275811Sdelphij hdr->b_flags |= ARC_FLAG_IO_ERROR; 4314286570Smav if (hdr->b_l1hdr.b_state != arc_anon) 4315168404Spjd arc_change_state(arc_anon, hdr, hash_lock); 4316168404Spjd if (HDR_IN_HASH_TABLE(hdr)) 4317168404Spjd buf_hash_remove(hdr); 4318286570Smav freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 4319168404Spjd } 4320168404Spjd 4321168404Spjd /* 4322168404Spjd * Broadcast before we drop the hash_lock to avoid the possibility 4323168404Spjd * that the hdr (and hence the cv) might be freed before we get to 4324168404Spjd * the cv_broadcast(). 4325168404Spjd */ 4326286570Smav cv_broadcast(&hdr->b_l1hdr.b_cv); 4327168404Spjd 4328286570Smav if (hash_lock != NULL) { 4329168404Spjd mutex_exit(hash_lock); 4330168404Spjd } else { 4331168404Spjd /* 4332168404Spjd * This block was freed while we waited for the read to 4333168404Spjd * complete. It has been removed from the hash table and 4334168404Spjd * moved to the anonymous state (so that it won't show up 4335168404Spjd * in the cache). 
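 *
 * Editor's note (not in the original source): the fan-out loop earlier
 * in this function guarantees each "done" callback its own buffer; the
 * first consumer gets the original and every further one gets a clone:
 *
 *	for (acb = callback_list; acb; acb = acb->acb_next) {
 *		if (acb->acb_done) {
 *			if (abuf == NULL)	/* original already taken */
 *				abuf = arc_buf_clone(buf);
 *			acb->acb_buf = abuf;
 *			abuf = NULL;
 *		}
 *	}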
4336168404Spjd */ 4337286570Smav ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 4338286570Smav freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 4339168404Spjd } 4340168404Spjd 4341168404Spjd /* execute each callback and free its structure */ 4342168404Spjd while ((acb = callback_list) != NULL) { 4343168404Spjd if (acb->acb_done) 4344168404Spjd acb->acb_done(zio, acb->acb_buf, acb->acb_private); 4345168404Spjd 4346168404Spjd if (acb->acb_zio_dummy != NULL) { 4347168404Spjd acb->acb_zio_dummy->io_error = zio->io_error; 4348168404Spjd zio_nowait(acb->acb_zio_dummy); 4349168404Spjd } 4350168404Spjd 4351168404Spjd callback_list = acb->acb_next; 4352168404Spjd kmem_free(acb, sizeof (arc_callback_t)); 4353168404Spjd } 4354168404Spjd 4355168404Spjd if (freeable) 4356168404Spjd arc_hdr_destroy(hdr); 4357168404Spjd} 4358168404Spjd 4359168404Spjd/* 4360286762Smav * "Read" the block at the specified DVA (in bp) via the 4361168404Spjd * cache. If the block is found in the cache, invoke the provided 4362168404Spjd * callback immediately and return. Note that the `zio' parameter 4363168404Spjd * in the callback will be NULL in this case, since no IO was 4364168404Spjd * required. If the block is not in the cache pass the read request 4365168404Spjd * on to the spa with a substitute callback function, so that the 4366168404Spjd * requested block will be added to the cache. 4367168404Spjd * 4368168404Spjd * If a read request arrives for a block that has a read in-progress, 4369168404Spjd * either wait for the in-progress read to complete (and return the 4370168404Spjd * results); or, if this is a read with a "done" func, add a record 4371168404Spjd * to the read to invoke the "done" func when the read completes, 4372168404Spjd * and return; or just return. 4373168404Spjd * 4374168404Spjd * arc_read_done() will invoke all the requested "done" functions 4375168404Spjd * for readers of this block. 4376168404Spjd */ 4377168404Spjdint 4378246666Smmarc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, 4379275811Sdelphij void *private, zio_priority_t priority, int zio_flags, 4380275811Sdelphij arc_flags_t *arc_flags, const zbookmark_phys_t *zb) 4381168404Spjd{ 4382268075Sdelphij arc_buf_hdr_t *hdr = NULL; 4383247187Smm arc_buf_t *buf = NULL; 4384268075Sdelphij kmutex_t *hash_lock = NULL; 4385185029Spjd zio_t *rzio; 4386228103Smm uint64_t guid = spa_load_guid(spa); 4387168404Spjd 4388268075Sdelphij ASSERT(!BP_IS_EMBEDDED(bp) || 4389268075Sdelphij BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 4390268075Sdelphij 4391168404Spjdtop: 4392268075Sdelphij if (!BP_IS_EMBEDDED(bp)) { 4393268075Sdelphij /* 4394268075Sdelphij * Embedded BP's have no DVA and require no I/O to "read". 4395268075Sdelphij * Create an anonymous arc buf to back it. 4396268075Sdelphij */ 4397268075Sdelphij hdr = buf_hash_find(guid, bp, &hash_lock); 4398268075Sdelphij } 4399168404Spjd 4400286570Smav if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { 4401268075Sdelphij 4402275811Sdelphij *arc_flags |= ARC_FLAG_CACHED; 4403168404Spjd 4404168404Spjd if (HDR_IO_IN_PROGRESS(hdr)) { 4405168404Spjd 4406287702Sdelphij if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && 4407287702Sdelphij priority == ZIO_PRIORITY_SYNC_READ) { 4408287702Sdelphij /* 4409287702Sdelphij * This sync read must wait for an 4410287702Sdelphij * in-progress async read (e.g. a predictive 4411287702Sdelphij * prefetch). 
Async reads are queued 4412287702Sdelphij * separately at the vdev_queue layer, so 4413287702Sdelphij * this is a form of priority inversion. 4414287702Sdelphij * Ideally, we would "inherit" the demand 4415287702Sdelphij * i/o's priority by moving the i/o from 4416287702Sdelphij * the async queue to the synchronous queue, 4417287702Sdelphij * but there is currently no mechanism to do 4418287702Sdelphij * so. Track this so that we can evaluate 4419287702Sdelphij * the magnitude of this potential performance 4420287702Sdelphij * problem. 4421287702Sdelphij * 4422287702Sdelphij * Note that if the prefetch i/o is already 4423287702Sdelphij * active (has been issued to the device), 4424287702Sdelphij * the prefetch improved performance, because 4425287702Sdelphij * we issued it sooner than we would have 4426287702Sdelphij * without the prefetch. 4427287702Sdelphij */ 4428287702Sdelphij DTRACE_PROBE1(arc__sync__wait__for__async, 4429287702Sdelphij arc_buf_hdr_t *, hdr); 4430287702Sdelphij ARCSTAT_BUMP(arcstat_sync_wait_for_async); 4431287702Sdelphij } 4432287702Sdelphij if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 4433287702Sdelphij hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; 4434287702Sdelphij } 4435287702Sdelphij 4436275811Sdelphij if (*arc_flags & ARC_FLAG_WAIT) { 4437286570Smav cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); 4438168404Spjd mutex_exit(hash_lock); 4439168404Spjd goto top; 4440168404Spjd } 4441275811Sdelphij ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 4442168404Spjd 4443168404Spjd if (done) { 4444287702Sdelphij arc_callback_t *acb = NULL; 4445168404Spjd 4446168404Spjd acb = kmem_zalloc(sizeof (arc_callback_t), 4447168404Spjd KM_SLEEP); 4448168404Spjd acb->acb_done = done; 4449168404Spjd acb->acb_private = private; 4450168404Spjd if (pio != NULL) 4451168404Spjd acb->acb_zio_dummy = zio_null(pio, 4452209962Smm spa, NULL, NULL, NULL, zio_flags); 4453168404Spjd 4454168404Spjd ASSERT(acb->acb_done != NULL); 4455286570Smav acb->acb_next = hdr->b_l1hdr.b_acb; 4456286570Smav hdr->b_l1hdr.b_acb = acb; 4457168404Spjd add_reference(hdr, hash_lock, private); 4458168404Spjd mutex_exit(hash_lock); 4459168404Spjd return (0); 4460168404Spjd } 4461168404Spjd mutex_exit(hash_lock); 4462168404Spjd return (0); 4463168404Spjd } 4464168404Spjd 4465286570Smav ASSERT(hdr->b_l1hdr.b_state == arc_mru || 4466286570Smav hdr->b_l1hdr.b_state == arc_mfu); 4467168404Spjd 4468168404Spjd if (done) { 4469287702Sdelphij if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 4470287702Sdelphij /* 4471287702Sdelphij * This is a demand read which does not have to 4472287702Sdelphij * wait for i/o because we did a predictive 4473287702Sdelphij * prefetch i/o for it, which has completed. 4474287702Sdelphij */ 4475287702Sdelphij DTRACE_PROBE1( 4476287702Sdelphij arc__demand__hit__predictive__prefetch, 4477287702Sdelphij arc_buf_hdr_t *, hdr); 4478287702Sdelphij ARCSTAT_BUMP( 4479287702Sdelphij arcstat_demand_hit_predictive_prefetch); 4480287702Sdelphij hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; 4481287702Sdelphij } 4482168404Spjd add_reference(hdr, hash_lock, private); 4483168404Spjd /* 4484168404Spjd * If this block is already in use, create a new 4485168404Spjd * copy of the data so that we will be guaranteed 4486168404Spjd * that arc_release() will always succeed. 
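 *
 * Editor's sketch (not in the original source): the choice below
 * reduces to "hand out the cached buffer only if no one else holds it,
 * otherwise copy it":
 *
 *	if (HDR_BUF_AVAILABLE(hdr))		/* sole consumer */
 *		hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
 *	else					/* already claimed */
 *		buf = arc_buf_clone(buf);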
4487168404Spjd */ 4488286570Smav buf = hdr->b_l1hdr.b_buf; 4489168404Spjd ASSERT(buf); 4490168404Spjd ASSERT(buf->b_data); 4491168404Spjd if (HDR_BUF_AVAILABLE(hdr)) { 4492168404Spjd ASSERT(buf->b_efunc == NULL); 4493275811Sdelphij hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 4494168404Spjd } else { 4495168404Spjd buf = arc_buf_clone(buf); 4496168404Spjd } 4497219089Spjd 4498275811Sdelphij } else if (*arc_flags & ARC_FLAG_PREFETCH && 4499286570Smav refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 4500275811Sdelphij hdr->b_flags |= ARC_FLAG_PREFETCH; 4501168404Spjd } 4502168404Spjd DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 4503168404Spjd arc_access(hdr, hash_lock); 4504275811Sdelphij if (*arc_flags & ARC_FLAG_L2CACHE) 4505275811Sdelphij hdr->b_flags |= ARC_FLAG_L2CACHE; 4506275811Sdelphij if (*arc_flags & ARC_FLAG_L2COMPRESS) 4507275811Sdelphij hdr->b_flags |= ARC_FLAG_L2COMPRESS; 4508168404Spjd mutex_exit(hash_lock); 4509168404Spjd ARCSTAT_BUMP(arcstat_hits); 4510286570Smav ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 4511286570Smav demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 4512168404Spjd data, metadata, hits); 4513168404Spjd 4514168404Spjd if (done) 4515168404Spjd done(NULL, buf, private); 4516168404Spjd } else { 4517168404Spjd uint64_t size = BP_GET_LSIZE(bp); 4518268075Sdelphij arc_callback_t *acb; 4519185029Spjd vdev_t *vd = NULL; 4520247187Smm uint64_t addr = 0; 4521208373Smm boolean_t devw = B_FALSE; 4522258389Savg enum zio_compress b_compress = ZIO_COMPRESS_OFF; 4523286570Smav int32_t b_asize = 0; 4524168404Spjd 4525168404Spjd if (hdr == NULL) { 4526168404Spjd /* this block is not in the cache */ 4527268075Sdelphij arc_buf_hdr_t *exists = NULL; 4528168404Spjd arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 4529168404Spjd buf = arc_buf_alloc(spa, size, private, type); 4530168404Spjd hdr = buf->b_hdr; 4531268075Sdelphij if (!BP_IS_EMBEDDED(bp)) { 4532268075Sdelphij hdr->b_dva = *BP_IDENTITY(bp); 4533268075Sdelphij hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 4534268075Sdelphij exists = buf_hash_insert(hdr, &hash_lock); 4535268075Sdelphij } 4536268075Sdelphij if (exists != NULL) { 4537168404Spjd /* somebody beat us to the hash insert */ 4538168404Spjd mutex_exit(hash_lock); 4539219089Spjd buf_discard_identity(hdr); 4540168404Spjd (void) arc_buf_remove_ref(buf, private); 4541168404Spjd goto top; /* restart the IO request */ 4542168404Spjd } 4543275811Sdelphij 4544287702Sdelphij /* 4545287702Sdelphij * If there is a callback, we pass our reference to 4546287702Sdelphij * it; otherwise we remove our reference. 4547287702Sdelphij */ 4548287702Sdelphij if (done == NULL) { 4549168404Spjd (void) remove_reference(hdr, hash_lock, 4550168404Spjd private); 4551287702Sdelphij } 4552287702Sdelphij if (*arc_flags & ARC_FLAG_PREFETCH) 4553275811Sdelphij hdr->b_flags |= ARC_FLAG_PREFETCH; 4554275811Sdelphij if (*arc_flags & ARC_FLAG_L2CACHE) 4555275811Sdelphij hdr->b_flags |= ARC_FLAG_L2CACHE; 4556275811Sdelphij if (*arc_flags & ARC_FLAG_L2COMPRESS) 4557275811Sdelphij hdr->b_flags |= ARC_FLAG_L2COMPRESS; 4558168404Spjd if (BP_GET_LEVEL(bp) > 0) 4559275811Sdelphij hdr->b_flags |= ARC_FLAG_INDIRECT; 4560168404Spjd } else { 4561286570Smav /* 4562286570Smav * This block is in the ghost cache. If it was L2-only 4563286570Smav * (and thus didn't have an L1 hdr), we realloc the 4564286570Smav * header to add an L1 hdr. 
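 *
 * Editor's sketch (not in the original source): the miss path above
 * resolves the insertion race optimistically; allocate first, try to
 * insert, and if another thread won, discard ours and retry:
 *
 *	exists = buf_hash_insert(hdr, &hash_lock);
 *	if (exists != NULL) {			/* lost the race */
 *		buf_discard_identity(hdr);
 *		(void) arc_buf_remove_ref(buf, private);
 *		goto top;			/* restart the request */
 *	}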
4565286570Smav */ 4566286570Smav if (!HDR_HAS_L1HDR(hdr)) { 4567286570Smav hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, 4568286570Smav hdr_full_cache); 4569286570Smav } 4570286570Smav 4571286570Smav ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); 4572168404Spjd ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4573286570Smav ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4574286763Smav ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 4575168404Spjd 4576287702Sdelphij /* 4577287702Sdelphij * If there is a callback, we pass a reference to it. 4578287702Sdelphij */ 4579287702Sdelphij if (done != NULL) 4580287702Sdelphij add_reference(hdr, hash_lock, private); 4581275811Sdelphij if (*arc_flags & ARC_FLAG_PREFETCH) 4582275811Sdelphij hdr->b_flags |= ARC_FLAG_PREFETCH; 4583275811Sdelphij if (*arc_flags & ARC_FLAG_L2CACHE) 4584275811Sdelphij hdr->b_flags |= ARC_FLAG_L2CACHE; 4585275811Sdelphij if (*arc_flags & ARC_FLAG_L2COMPRESS) 4586275811Sdelphij hdr->b_flags |= ARC_FLAG_L2COMPRESS; 4587185029Spjd buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 4588168404Spjd buf->b_hdr = hdr; 4589168404Spjd buf->b_data = NULL; 4590168404Spjd buf->b_efunc = NULL; 4591168404Spjd buf->b_private = NULL; 4592168404Spjd buf->b_next = NULL; 4593286570Smav hdr->b_l1hdr.b_buf = buf; 4594286570Smav ASSERT0(hdr->b_l1hdr.b_datacnt); 4595286570Smav hdr->b_l1hdr.b_datacnt = 1; 4596219089Spjd arc_get_data_buf(buf); 4597219089Spjd arc_access(hdr, hash_lock); 4598168404Spjd } 4599168404Spjd 4600287702Sdelphij if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) 4601287702Sdelphij hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH; 4602286570Smav ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); 4603219089Spjd 4604168404Spjd acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 4605168404Spjd acb->acb_done = done; 4606168404Spjd acb->acb_private = private; 4607168404Spjd 4608286570Smav ASSERT(hdr->b_l1hdr.b_acb == NULL); 4609286570Smav hdr->b_l1hdr.b_acb = acb; 4610275811Sdelphij hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 4611168404Spjd 4612286570Smav if (HDR_HAS_L2HDR(hdr) && 4613286570Smav (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { 4614286570Smav devw = hdr->b_l2hdr.b_dev->l2ad_writing; 4615286570Smav addr = hdr->b_l2hdr.b_daddr; 4616287706Sdelphij b_compress = hdr->b_l2hdr.b_compress; 4617286570Smav b_asize = hdr->b_l2hdr.b_asize; 4618185029Spjd /* 4619185029Spjd * Lock out device removal. 4620185029Spjd */ 4621185029Spjd if (vdev_is_dead(vd) || 4622185029Spjd !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 4623185029Spjd vd = NULL; 4624185029Spjd } 4625185029Spjd 4626268075Sdelphij if (hash_lock != NULL) 4627268075Sdelphij mutex_exit(hash_lock); 4628168404Spjd 4629251629Sdelphij /* 4630251629Sdelphij * At this point, we have a level 1 cache miss. Try again in 4631251629Sdelphij * L2ARC if possible. 
4632251629Sdelphij */ 4633168404Spjd ASSERT3U(hdr->b_size, ==, size); 4634219089Spjd DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 4635268123Sdelphij uint64_t, size, zbookmark_phys_t *, zb); 4636168404Spjd ARCSTAT_BUMP(arcstat_misses); 4637286570Smav ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 4638286570Smav demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 4639168404Spjd data, metadata, misses); 4640228392Spjd#ifdef _KERNEL 4641297633Strasz#ifdef RACCT 4642297633Strasz if (racct_enable) { 4643297633Strasz PROC_LOCK(curproc); 4644297633Strasz racct_add_force(curproc, RACCT_READBPS, size); 4645297633Strasz racct_add_force(curproc, RACCT_READIOPS, 1); 4646297633Strasz PROC_UNLOCK(curproc); 4647297633Strasz } 4648297633Strasz#endif /* RACCT */ 4649228392Spjd curthread->td_ru.ru_inblock++; 4650228392Spjd#endif 4651168404Spjd 4652287702Sdelphij if (priority == ZIO_PRIORITY_ASYNC_READ) 4653287702Sdelphij hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ; 4654287702Sdelphij else 4655287702Sdelphij hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ; 4656287702Sdelphij 4657208373Smm if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 4658185029Spjd /* 4659185029Spjd * Read from the L2ARC if the following are true: 4660185029Spjd * 1. The L2ARC vdev was previously cached. 4661185029Spjd * 2. This buffer still has L2ARC metadata. 4662185029Spjd * 3. This buffer isn't currently writing to the L2ARC. 4663185029Spjd * 4. The L2ARC entry wasn't evicted, which may 4664185029Spjd * also have invalidated the vdev. 4665208373Smm * 5. This isn't prefetch and l2arc_noprefetch is set. 4666185029Spjd */ 4667286570Smav if (HDR_HAS_L2HDR(hdr) && 4668208373Smm !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 4669208373Smm !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 4670185029Spjd l2arc_read_callback_t *cb; 4671297848Savg void* b_data; 4672185029Spjd 4673185029Spjd DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 4674185029Spjd ARCSTAT_BUMP(arcstat_l2_hits); 4675185029Spjd 4676185029Spjd cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 4677185029Spjd KM_SLEEP); 4678185029Spjd cb->l2rcb_buf = buf; 4679185029Spjd cb->l2rcb_spa = spa; 4680185029Spjd cb->l2rcb_bp = *bp; 4681185029Spjd cb->l2rcb_zb = *zb; 4682185029Spjd cb->l2rcb_flags = zio_flags; 4683258389Savg cb->l2rcb_compress = b_compress; 4684297848Savg if (b_asize > hdr->b_size) { 4685297848Savg ASSERT3U(b_compress, ==, 4686297848Savg ZIO_COMPRESS_OFF); 4687297848Savg b_data = zio_data_buf_alloc(b_asize); 4688297848Savg cb->l2rcb_data = b_data; 4689297848Savg } else { 4690297848Savg b_data = buf->b_data; 4691297848Savg } 4692185029Spjd 4693247187Smm ASSERT(addr >= VDEV_LABEL_START_SIZE && 4694247187Smm addr + size < vd->vdev_psize - 4695247187Smm VDEV_LABEL_END_SIZE); 4696247187Smm 4697185029Spjd /* 4698185029Spjd * l2arc read. The SCL_L2ARC lock will be 4699185029Spjd * released by l2arc_read_done(). 4700251478Sdelphij * Issue a null zio if the underlying buffer 4701251478Sdelphij * was squashed to zero size by compression. 
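 *
 * Editor's sketch (not in the original source): conditions 1-4 above,
 * together with the prefetch exclusion of condition 5, collapse into
 * the predicate guarding this branch:
 *
 *	boolean_t l2_ok = vd != NULL && l2arc_ndev != 0 &&
 *	    !(l2arc_norw && devw) && HDR_HAS_L2HDR(hdr) &&
 *	    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
 *	    !(l2arc_noprefetch && HDR_PREFETCH(hdr));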
4702185029Spjd */ 4703258389Savg if (b_compress == ZIO_COMPRESS_EMPTY) { 4704297848Savg ASSERT3U(b_asize, ==, 0); 4705251478Sdelphij rzio = zio_null(pio, spa, vd, 4706251478Sdelphij l2arc_read_done, cb, 4707251478Sdelphij zio_flags | ZIO_FLAG_DONT_CACHE | 4708251478Sdelphij ZIO_FLAG_CANFAIL | 4709251478Sdelphij ZIO_FLAG_DONT_PROPAGATE | 4710251478Sdelphij ZIO_FLAG_DONT_RETRY); 4711251478Sdelphij } else { 4712251478Sdelphij rzio = zio_read_phys(pio, vd, addr, 4713297848Savg b_asize, b_data, 4714258389Savg ZIO_CHECKSUM_OFF, 4715251478Sdelphij l2arc_read_done, cb, priority, 4716251478Sdelphij zio_flags | ZIO_FLAG_DONT_CACHE | 4717251478Sdelphij ZIO_FLAG_CANFAIL | 4718251478Sdelphij ZIO_FLAG_DONT_PROPAGATE | 4719251478Sdelphij ZIO_FLAG_DONT_RETRY, B_FALSE); 4720251478Sdelphij } 4721185029Spjd DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 4722185029Spjd zio_t *, rzio); 4723258389Savg ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); 4724185029Spjd 4725275811Sdelphij if (*arc_flags & ARC_FLAG_NOWAIT) { 4726185029Spjd zio_nowait(rzio); 4727185029Spjd return (0); 4728185029Spjd } 4729185029Spjd 4730275811Sdelphij ASSERT(*arc_flags & ARC_FLAG_WAIT); 4731185029Spjd if (zio_wait(rzio) == 0) 4732185029Spjd return (0); 4733185029Spjd 4734185029Spjd /* l2arc read error; goto zio_read() */ 4735185029Spjd } else { 4736185029Spjd DTRACE_PROBE1(l2arc__miss, 4737185029Spjd arc_buf_hdr_t *, hdr); 4738185029Spjd ARCSTAT_BUMP(arcstat_l2_misses); 4739185029Spjd if (HDR_L2_WRITING(hdr)) 4740185029Spjd ARCSTAT_BUMP(arcstat_l2_rw_clash); 4741185029Spjd spa_config_exit(spa, SCL_L2ARC, vd); 4742185029Spjd } 4743208373Smm } else { 4744208373Smm if (vd != NULL) 4745208373Smm spa_config_exit(spa, SCL_L2ARC, vd); 4746208373Smm if (l2arc_ndev != 0) { 4747208373Smm DTRACE_PROBE1(l2arc__miss, 4748208373Smm arc_buf_hdr_t *, hdr); 4749208373Smm ARCSTAT_BUMP(arcstat_l2_misses); 4750208373Smm } 4751185029Spjd } 4752185029Spjd 4753168404Spjd rzio = zio_read(pio, spa, bp, buf->b_data, size, 4754185029Spjd arc_read_done, buf, priority, zio_flags, zb); 4755168404Spjd 4756275811Sdelphij if (*arc_flags & ARC_FLAG_WAIT) 4757168404Spjd return (zio_wait(rzio)); 4758168404Spjd 4759275811Sdelphij ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 4760168404Spjd zio_nowait(rzio); 4761168404Spjd } 4762168404Spjd return (0); 4763168404Spjd} 4764168404Spjd 4765168404Spjdvoid 4766168404Spjdarc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 4767168404Spjd{ 4768168404Spjd ASSERT(buf->b_hdr != NULL); 4769286570Smav ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); 4770286570Smav ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || 4771286570Smav func == NULL); 4772219089Spjd ASSERT(buf->b_efunc == NULL); 4773219089Spjd ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 4774219089Spjd 4775168404Spjd buf->b_efunc = func; 4776168404Spjd buf->b_private = private; 4777168404Spjd} 4778168404Spjd 4779168404Spjd/* 4780251520Sdelphij * Notify the arc that a block was freed, and thus will never be used again. 
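 *
 * Editor's note (not in the original source): in outline, the function
 * below drops an unclaimed cached copy of the freed block:
 *
 *	hdr = buf_hash_find(guid, bp, &hash_lock);
 *	if (hdr != NULL && HDR_BUF_AVAILABLE(hdr)) {
 *		add_reference(hdr, hash_lock, FTAG);	/* pin it */
 *		...
 *		arc_release(buf, FTAG);		/* make it anonymous */
 *		(void) arc_buf_remove_ref(buf, FTAG);
 *	}
 *
 * Headers with active external references are left untouched.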
4781251520Sdelphij */ 4782251520Sdelphijvoid 4783251520Sdelphijarc_freed(spa_t *spa, const blkptr_t *bp) 4784251520Sdelphij{ 4785251520Sdelphij arc_buf_hdr_t *hdr; 4786251520Sdelphij kmutex_t *hash_lock; 4787251520Sdelphij uint64_t guid = spa_load_guid(spa); 4788251520Sdelphij 4789268075Sdelphij ASSERT(!BP_IS_EMBEDDED(bp)); 4790268075Sdelphij 4791268075Sdelphij hdr = buf_hash_find(guid, bp, &hash_lock); 4792251520Sdelphij if (hdr == NULL) 4793251520Sdelphij return; 4794251520Sdelphij if (HDR_BUF_AVAILABLE(hdr)) { 4795286570Smav arc_buf_t *buf = hdr->b_l1hdr.b_buf; 4796251520Sdelphij add_reference(hdr, hash_lock, FTAG); 4797275811Sdelphij hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 4798251520Sdelphij mutex_exit(hash_lock); 4799251520Sdelphij 4800251520Sdelphij arc_release(buf, FTAG); 4801251520Sdelphij (void) arc_buf_remove_ref(buf, FTAG); 4802251520Sdelphij } else { 4803251520Sdelphij mutex_exit(hash_lock); 4804251520Sdelphij } 4805251520Sdelphij 4806251520Sdelphij} 4807251520Sdelphij 4808251520Sdelphij/* 4809268858Sdelphij * Clear the user eviction callback set by arc_set_callback(), first calling 4810268858Sdelphij * it if it exists. Because the presence of a callback keeps an arc_buf cached 4811268858Sdelphij * clearing the callback may result in the arc_buf being destroyed. However, 4812268858Sdelphij * it will not result in the *last* arc_buf being destroyed, hence the data 4813268858Sdelphij * will remain cached in the ARC. We make a copy of the arc buffer here so 4814268858Sdelphij * that we can process the callback without holding any locks. 4815268858Sdelphij * 4816268858Sdelphij * It's possible that the callback is already in the process of being cleared 4817268858Sdelphij * by another thread. In this case we can not clear the callback. 4818268858Sdelphij * 4819268858Sdelphij * Returns B_TRUE if the callback was successfully called and cleared. 4820168404Spjd */ 4821268858Sdelphijboolean_t 4822268858Sdelphijarc_clear_callback(arc_buf_t *buf) 4823168404Spjd{ 4824168404Spjd arc_buf_hdr_t *hdr; 4825168404Spjd kmutex_t *hash_lock; 4826268858Sdelphij arc_evict_func_t *efunc = buf->b_efunc; 4827268858Sdelphij void *private = buf->b_private; 4828206796Spjd 4829219089Spjd mutex_enter(&buf->b_evict_lock); 4830168404Spjd hdr = buf->b_hdr; 4831168404Spjd if (hdr == NULL) { 4832168404Spjd /* 4833168404Spjd * We are in arc_do_user_evicts(). 4834168404Spjd */ 4835168404Spjd ASSERT(buf->b_data == NULL); 4836219089Spjd mutex_exit(&buf->b_evict_lock); 4837268858Sdelphij return (B_FALSE); 4838185029Spjd } else if (buf->b_data == NULL) { 4839185029Spjd /* 4840185029Spjd * We are on the eviction list; process this buffer now 4841185029Spjd * but let arc_do_user_evicts() do the reaping. 
4842185029Spjd */ 4843185029Spjd buf->b_efunc = NULL; 4844219089Spjd mutex_exit(&buf->b_evict_lock); 4845268858Sdelphij VERIFY0(efunc(private)); 4846268858Sdelphij return (B_TRUE); 4847168404Spjd } 4848168404Spjd hash_lock = HDR_LOCK(hdr); 4849168404Spjd mutex_enter(hash_lock); 4850219089Spjd hdr = buf->b_hdr; 4851219089Spjd ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4852168404Spjd 4853286570Smav ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, 4854286570Smav hdr->b_l1hdr.b_datacnt); 4855286570Smav ASSERT(hdr->b_l1hdr.b_state == arc_mru || 4856286570Smav hdr->b_l1hdr.b_state == arc_mfu); 4857168404Spjd 4858268858Sdelphij buf->b_efunc = NULL; 4859268858Sdelphij buf->b_private = NULL; 4860168404Spjd 4861286570Smav if (hdr->b_l1hdr.b_datacnt > 1) { 4862268858Sdelphij mutex_exit(&buf->b_evict_lock); 4863286763Smav arc_buf_destroy(buf, TRUE); 4864268858Sdelphij } else { 4865286570Smav ASSERT(buf == hdr->b_l1hdr.b_buf); 4866275811Sdelphij hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 4867268858Sdelphij mutex_exit(&buf->b_evict_lock); 4868268858Sdelphij } 4869168404Spjd 4870168404Spjd mutex_exit(hash_lock); 4871268858Sdelphij VERIFY0(efunc(private)); 4872268858Sdelphij return (B_TRUE); 4873168404Spjd} 4874168404Spjd 4875168404Spjd/* 4876251629Sdelphij * Release this buffer from the cache, making it an anonymous buffer. This 4877251629Sdelphij * must be done after a read and prior to modifying the buffer contents. 4878168404Spjd * If the buffer has more than one reference, we must make 4879185029Spjd * a new hdr for the buffer. 4880168404Spjd */ 4881168404Spjdvoid 4882168404Spjdarc_release(arc_buf_t *buf, void *tag) 4883168404Spjd{ 4884286570Smav arc_buf_hdr_t *hdr = buf->b_hdr; 4885168404Spjd 4886219089Spjd /* 4887219089Spjd * It would be nice to assert that if it's DMU metadata (level > 4888219089Spjd * 0 || it's the dnode file), then it must be syncing context. 4889219089Spjd * But we don't know that information at this level. 4890219089Spjd */ 4891219089Spjd 4892219089Spjd mutex_enter(&buf->b_evict_lock); 4893286776Smav 4894286776Smav ASSERT(HDR_HAS_L1HDR(hdr)); 4895286776Smav 4896286570Smav /* 4897286570Smav * We don't grab the hash lock prior to this check, because if 4898286570Smav * the buffer's header is in the arc_anon state, it won't be 4899286570Smav * linked into the hash table. 4900286570Smav */ 4901286570Smav if (hdr->b_l1hdr.b_state == arc_anon) { 4902286570Smav mutex_exit(&buf->b_evict_lock); 4903286570Smav ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4904286570Smav ASSERT(!HDR_IN_HASH_TABLE(hdr)); 4905286570Smav ASSERT(!HDR_HAS_L2HDR(hdr)); 4906286570Smav ASSERT(BUF_EMPTY(hdr)); 4907286570Smav ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); 4908286570Smav ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); 4909286570Smav ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4910185029Spjd 4911286570Smav ASSERT3P(buf->b_efunc, ==, NULL); 4912286570Smav ASSERT3P(buf->b_private, ==, NULL); 4913168404Spjd 4914286570Smav hdr->b_l1hdr.b_arc_access = 0; 4915286570Smav arc_buf_thaw(buf); 4916286570Smav 4917286570Smav return; 4918168404Spjd } 4919168404Spjd 4920286570Smav kmutex_t *hash_lock = HDR_LOCK(hdr); 4921286570Smav mutex_enter(hash_lock); 4922286570Smav 4923286570Smav /* 4924286570Smav * This assignment is only valid as long as the hash_lock is 4925286570Smav * held, we must be careful not to reference state or the 4926286570Smav * b_state field after dropping the lock. 
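 *
 * Editor's note (not in the original source): the L2 header teardown
 * just below is the standard check / lock / re-check idiom:
 *
 *	if (HDR_HAS_L2HDR(hdr)) {
 *		mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
 *		if (HDR_HAS_L2HDR(hdr))		/* recheck under lock */
 *			arc_hdr_l2hdr_destroy(hdr);
 *		mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
 *	}
 *
 * since l2arc_evict() may destroy the L2 portion while we wait for the
 * l2ad_mtx.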
4927286570Smav */ 4928286570Smav arc_state_t *state = hdr->b_l1hdr.b_state; 4929286570Smav ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4930286570Smav ASSERT3P(state, !=, arc_anon); 4931286570Smav 4932286570Smav /* this buffer is not on any list */ 4933286570Smav ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); 4934286570Smav 4935286570Smav if (HDR_HAS_L2HDR(hdr)) { 4936286570Smav mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4937286570Smav 4938286570Smav /* 4939286598Smav * We have to recheck this conditional again now that 4940286598Smav * we're holding the l2ad_mtx to prevent a race with 4941286598Smav * another thread which might be concurrently calling 4942286598Smav * l2arc_evict(). In that case, l2arc_evict() might have 4943286598Smav * destroyed the header's L2 portion as we were waiting 4944286598Smav * to acquire the l2ad_mtx. 4945286570Smav */ 4946286598Smav if (HDR_HAS_L2HDR(hdr)) { 4947290191Savg l2arc_trim(hdr); 4948286598Smav arc_hdr_l2hdr_destroy(hdr); 4949286598Smav } 4950286570Smav 4951286570Smav mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4952185029Spjd } 4953185029Spjd 4954168404Spjd /* 4955168404Spjd * Do we have more than one buf? 4956168404Spjd */ 4957286570Smav if (hdr->b_l1hdr.b_datacnt > 1) { 4958168404Spjd arc_buf_hdr_t *nhdr; 4959168404Spjd arc_buf_t **bufp; 4960168404Spjd uint64_t blksz = hdr->b_size; 4961209962Smm uint64_t spa = hdr->b_spa; 4962286570Smav arc_buf_contents_t type = arc_buf_type(hdr); 4963185029Spjd uint32_t flags = hdr->b_flags; 4964168404Spjd 4965286570Smav ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); 4966168404Spjd /* 4967219089Spjd * Pull the data off of this hdr and attach it to 4968219089Spjd * a new anonymous hdr. 4969168404Spjd */ 4970168404Spjd (void) remove_reference(hdr, hash_lock, tag); 4971286570Smav bufp = &hdr->b_l1hdr.b_buf; 4972168404Spjd while (*bufp != buf) 4973168404Spjd bufp = &(*bufp)->b_next; 4974219089Spjd *bufp = buf->b_next; 4975168404Spjd buf->b_next = NULL; 4976168404Spjd 4977286570Smav ASSERT3P(state, !=, arc_l2c_only); 4978286766Smav 4979286766Smav (void) refcount_remove_many( 4980286766Smav &state->arcs_size, hdr->b_size, buf); 4981286766Smav 4982286570Smav if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { 4983286570Smav ASSERT3P(state, !=, arc_l2c_only); 4984286570Smav uint64_t *size = &state->arcs_lsize[type]; 4985185029Spjd ASSERT3U(*size, >=, hdr->b_size); 4986185029Spjd atomic_add_64(size, -hdr->b_size); 4987168404Spjd } 4988242845Sdelphij 4989242845Sdelphij /* 4990242845Sdelphij * We're releasing a duplicate user data buffer, update 4991242845Sdelphij * our statistics accordingly. 
4992242845Sdelphij */ 4993286570Smav if (HDR_ISTYPE_DATA(hdr)) { 4994242845Sdelphij ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 4995242845Sdelphij ARCSTAT_INCR(arcstat_duplicate_buffers_size, 4996242845Sdelphij -hdr->b_size); 4997242845Sdelphij } 4998286570Smav hdr->b_l1hdr.b_datacnt -= 1; 4999168404Spjd arc_cksum_verify(buf); 5000240133Smm#ifdef illumos 5001240133Smm arc_buf_unwatch(buf); 5002277300Ssmh#endif 5003168404Spjd 5004168404Spjd mutex_exit(hash_lock); 5005168404Spjd 5006286570Smav nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 5007168404Spjd nhdr->b_size = blksz; 5008168404Spjd nhdr->b_spa = spa; 5009286570Smav 5010275811Sdelphij nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; 5011286570Smav nhdr->b_flags |= arc_bufc_to_flags(type); 5012286570Smav nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 5013286570Smav 5014286570Smav nhdr->b_l1hdr.b_buf = buf; 5015286570Smav nhdr->b_l1hdr.b_datacnt = 1; 5016286570Smav nhdr->b_l1hdr.b_state = arc_anon; 5017286570Smav nhdr->b_l1hdr.b_arc_access = 0; 5018286763Smav nhdr->b_l1hdr.b_tmp_cdata = NULL; 5019168404Spjd nhdr->b_freeze_cksum = NULL; 5020286570Smav 5021286570Smav (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); 5022168404Spjd buf->b_hdr = nhdr; 5023219089Spjd mutex_exit(&buf->b_evict_lock); 5024286766Smav (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf); 5025168404Spjd } else { 5026219089Spjd mutex_exit(&buf->b_evict_lock); 5027286570Smav ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); 5028286763Smav /* protected by hash lock, or hdr is on arc_anon */ 5029286763Smav ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 5030168404Spjd ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 5031286570Smav arc_change_state(arc_anon, hdr, hash_lock); 5032286570Smav hdr->b_l1hdr.b_arc_access = 0; 5033286570Smav mutex_exit(hash_lock); 5034185029Spjd 5035219089Spjd buf_discard_identity(hdr); 5036168404Spjd arc_buf_thaw(buf); 5037168404Spjd } 5038168404Spjd buf->b_efunc = NULL; 5039168404Spjd buf->b_private = NULL; 5040168404Spjd} 5041168404Spjd 5042168404Spjdint 5043168404Spjdarc_released(arc_buf_t *buf) 5044168404Spjd{ 5045185029Spjd int released; 5046185029Spjd 5047219089Spjd mutex_enter(&buf->b_evict_lock); 5048286570Smav released = (buf->b_data != NULL && 5049286570Smav buf->b_hdr->b_l1hdr.b_state == arc_anon); 5050219089Spjd mutex_exit(&buf->b_evict_lock); 5051185029Spjd return (released); 5052168404Spjd} 5053168404Spjd 5054168404Spjd#ifdef ZFS_DEBUG 5055168404Spjdint 5056168404Spjdarc_referenced(arc_buf_t *buf) 5057168404Spjd{ 5058185029Spjd int referenced; 5059185029Spjd 5060219089Spjd mutex_enter(&buf->b_evict_lock); 5061286570Smav referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); 5062219089Spjd mutex_exit(&buf->b_evict_lock); 5063185029Spjd return (referenced); 5064168404Spjd} 5065168404Spjd#endif 5066168404Spjd 5067168404Spjdstatic void 5068168404Spjdarc_write_ready(zio_t *zio) 5069168404Spjd{ 5070168404Spjd arc_write_callback_t *callback = zio->io_private; 5071168404Spjd arc_buf_t *buf = callback->awcb_buf; 5072185029Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 5073168404Spjd 5074286570Smav ASSERT(HDR_HAS_L1HDR(hdr)); 5075286570Smav ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); 5076286570Smav ASSERT(hdr->b_l1hdr.b_datacnt > 0); 5077185029Spjd callback->awcb_ready(zio, buf, callback->awcb_private); 5078185029Spjd 5079185029Spjd /* 5080185029Spjd * If the IO is already in progress, then this is a re-write 5081185029Spjd * attempt, so we need to thaw and re-compute the cksum. 
5082185029Spjd * It is the responsibility of the callback to handle the 5083185029Spjd * accounting for any re-write attempt. 5084185029Spjd */ 5085185029Spjd if (HDR_IO_IN_PROGRESS(hdr)) { 5086286570Smav mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 5087185029Spjd if (hdr->b_freeze_cksum != NULL) { 5088185029Spjd kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 5089185029Spjd hdr->b_freeze_cksum = NULL; 5090185029Spjd } 5091286570Smav mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 5092168404Spjd } 5093185029Spjd arc_cksum_compute(buf, B_FALSE); 5094275811Sdelphij hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 5095168404Spjd} 5096168404Spjd 5097258632Savg/* 5098258632Savg * The SPA calls this callback for each physical write that happens on behalf 5099258632Savg * of a logical write. See the comment in dbuf_write_physdone() for details. 5100258632Savg */ 5101168404Spjdstatic void 5102258632Savgarc_write_physdone(zio_t *zio) 5103258632Savg{ 5104258632Savg arc_write_callback_t *cb = zio->io_private; 5105258632Savg if (cb->awcb_physdone != NULL) 5106258632Savg cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 5107258632Savg} 5108258632Savg 5109258632Savgstatic void 5110168404Spjdarc_write_done(zio_t *zio) 5111168404Spjd{ 5112168404Spjd arc_write_callback_t *callback = zio->io_private; 5113168404Spjd arc_buf_t *buf = callback->awcb_buf; 5114168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 5115168404Spjd 5116286570Smav ASSERT(hdr->b_l1hdr.b_acb == NULL); 5117168404Spjd 5118219089Spjd if (zio->io_error == 0) { 5119268075Sdelphij if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 5120260150Sdelphij buf_discard_identity(hdr); 5121260150Sdelphij } else { 5122260150Sdelphij hdr->b_dva = *BP_IDENTITY(zio->io_bp); 5123260150Sdelphij hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 5124260150Sdelphij } 5125219089Spjd } else { 5126219089Spjd ASSERT(BUF_EMPTY(hdr)); 5127219089Spjd } 5128219089Spjd 5129168404Spjd /* 5130268075Sdelphij * If the block to be written was all-zero or compressed enough to be 5131268075Sdelphij * embedded in the BP, no write was performed so there will be no 5132268075Sdelphij * dva/birth/checksum. The buffer must therefore remain anonymous 5133268075Sdelphij * (and uncached). 5134168404Spjd */ 5135168404Spjd if (!BUF_EMPTY(hdr)) { 5136168404Spjd arc_buf_hdr_t *exists; 5137168404Spjd kmutex_t *hash_lock; 5138168404Spjd 5139219089Spjd ASSERT(zio->io_error == 0); 5140219089Spjd 5141168404Spjd arc_cksum_verify(buf); 5142168404Spjd 5143168404Spjd exists = buf_hash_insert(hdr, &hash_lock); 5144286570Smav if (exists != NULL) { 5145168404Spjd /* 5146168404Spjd * This can only happen if we overwrite for 5147168404Spjd * sync-to-convergence, because we remove 5148168404Spjd * buffers from the hash table when we arc_free(). 
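 *
 * Editor's sketch (not in the original source): recovery from such a
 * collision, reduced to its steps, is "evict the stale header, then
 * re-insert ours":
 *
 *	arc_change_state(arc_anon, exists, hash_lock);
 *	mutex_exit(hash_lock);
 *	arc_hdr_destroy(exists);		/* drop stale identity */
 *	exists = buf_hash_insert(hdr, &hash_lock);
 *	ASSERT3P(exists, ==, NULL);		/* must succeed now */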
5149168404Spjd */ 5150219089Spjd if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 5151219089Spjd if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 5152219089Spjd panic("bad overwrite, hdr=%p exists=%p", 5153219089Spjd (void *)hdr, (void *)exists); 5154286570Smav ASSERT(refcount_is_zero( 5155286570Smav &exists->b_l1hdr.b_refcnt)); 5156219089Spjd arc_change_state(arc_anon, exists, hash_lock); 5157219089Spjd mutex_exit(hash_lock); 5158219089Spjd arc_hdr_destroy(exists); 5159219089Spjd exists = buf_hash_insert(hdr, &hash_lock); 5160219089Spjd ASSERT3P(exists, ==, NULL); 5161243524Smm } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 5162243524Smm /* nopwrite */ 5163243524Smm ASSERT(zio->io_prop.zp_nopwrite); 5164243524Smm if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 5165243524Smm panic("bad nopwrite, hdr=%p exists=%p", 5166243524Smm (void *)hdr, (void *)exists); 5167219089Spjd } else { 5168219089Spjd /* Dedup */ 5169286570Smav ASSERT(hdr->b_l1hdr.b_datacnt == 1); 5170286570Smav ASSERT(hdr->b_l1hdr.b_state == arc_anon); 5171219089Spjd ASSERT(BP_GET_DEDUP(zio->io_bp)); 5172219089Spjd ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 5173219089Spjd } 5174168404Spjd } 5175275811Sdelphij hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 5176185029Spjd /* if it's not anon, we are doing a scrub */ 5177286570Smav if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) 5178185029Spjd arc_access(hdr, hash_lock); 5179168404Spjd mutex_exit(hash_lock); 5180168404Spjd } else { 5181275811Sdelphij hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 5182168404Spjd } 5183168404Spjd 5184286570Smav ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 5185219089Spjd callback->awcb_done(zio, buf, callback->awcb_private); 5186168404Spjd 5187168404Spjd kmem_free(callback, sizeof (arc_write_callback_t)); 5188168404Spjd} 5189168404Spjd 5190168404Spjdzio_t * 5191219089Spjdarc_write(zio_t *pio, spa_t *spa, uint64_t txg, 5192251478Sdelphij blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, 5193258632Savg const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, 5194258632Savg arc_done_func_t *done, void *private, zio_priority_t priority, 5195268123Sdelphij int zio_flags, const zbookmark_phys_t *zb) 5196168404Spjd{ 5197168404Spjd arc_buf_hdr_t *hdr = buf->b_hdr; 5198168404Spjd arc_write_callback_t *callback; 5199185029Spjd zio_t *zio; 5200168404Spjd 5201185029Spjd ASSERT(ready != NULL); 5202219089Spjd ASSERT(done != NULL); 5203168404Spjd ASSERT(!HDR_IO_ERROR(hdr)); 5204286570Smav ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 5205286570Smav ASSERT(hdr->b_l1hdr.b_acb == NULL); 5206286570Smav ASSERT(hdr->b_l1hdr.b_datacnt > 0); 5207185029Spjd if (l2arc) 5208275811Sdelphij hdr->b_flags |= ARC_FLAG_L2CACHE; 5209251478Sdelphij if (l2arc_compress) 5210275811Sdelphij hdr->b_flags |= ARC_FLAG_L2COMPRESS; 5211168404Spjd callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 5212168404Spjd callback->awcb_ready = ready; 5213258632Savg callback->awcb_physdone = physdone; 5214168404Spjd callback->awcb_done = done; 5215168404Spjd callback->awcb_private = private; 5216168404Spjd callback->awcb_buf = buf; 5217168404Spjd 5218219089Spjd zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 5219258632Savg arc_write_ready, arc_write_physdone, arc_write_done, callback, 5220258632Savg priority, zio_flags, zb); 5221185029Spjd 5222168404Spjd return (zio); 5223168404Spjd} 5224168404Spjd 5225185029Spjdstatic int 5226258632Savgarc_memory_throttle(uint64_t reserve, uint64_t txg) 5227185029Spjd{ 5228185029Spjd#ifdef _KERNEL 5229272483Ssmh uint64_t 
available_memory = ptob(freemem); 5230185029Spjd static uint64_t page_load = 0; 5231185029Spjd static uint64_t last_txg = 0; 5232185029Spjd 5233272483Ssmh#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) 5234185029Spjd available_memory = 5235272483Ssmh MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); 5236185029Spjd#endif 5237258632Savg 5238272483Ssmh if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) 5239185029Spjd return (0); 5240185029Spjd 5241185029Spjd if (txg > last_txg) { 5242185029Spjd last_txg = txg; 5243185029Spjd page_load = 0; 5244185029Spjd } 5245185029Spjd /* 5246185029Spjd * If we are in pageout, we know that memory is already tight, 5247185029Spjd * the arc is already going to be evicting, so we just want to 5248185029Spjd * continue to let page writes occur as quickly as possible. 5249185029Spjd */ 5250185029Spjd if (curproc == pageproc) { 5251272483Ssmh if (page_load > MAX(ptob(minfree), available_memory) / 4) 5252249195Smm return (SET_ERROR(ERESTART)); 5253185029Spjd /* Note: reserve is inflated, so we deflate */ 5254185029Spjd page_load += reserve / 8; 5255185029Spjd return (0); 5256185029Spjd } else if (page_load > 0 && arc_reclaim_needed()) { 5257185029Spjd /* memory is low, delay before restarting */ 5258185029Spjd ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 5259249195Smm return (SET_ERROR(EAGAIN)); 5260185029Spjd } 5261185029Spjd page_load = 0; 5262185029Spjd#endif 5263185029Spjd return (0); 5264185029Spjd} 5265185029Spjd 5266168404Spjdvoid 5267185029Spjdarc_tempreserve_clear(uint64_t reserve) 5268168404Spjd{ 5269185029Spjd atomic_add_64(&arc_tempreserve, -reserve); 5270168404Spjd ASSERT((int64_t)arc_tempreserve >= 0); 5271168404Spjd} 5272168404Spjd 5273168404Spjdint 5274185029Spjdarc_tempreserve_space(uint64_t reserve, uint64_t txg) 5275168404Spjd{ 5276185029Spjd int error; 5277209962Smm uint64_t anon_size; 5278185029Spjd 5279272483Ssmh if (reserve > arc_c/4 && !arc_no_grow) { 5280185029Spjd arc_c = MIN(arc_c_max, reserve * 4); 5281272483Ssmh DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); 5282272483Ssmh } 5283185029Spjd if (reserve > arc_c) 5284249195Smm return (SET_ERROR(ENOMEM)); 5285168404Spjd 5286168404Spjd /* 5287209962Smm * Don't count loaned bufs as in flight dirty data to prevent long 5288209962Smm * network delays from blocking transactions that are ready to be 5289209962Smm * assigned to a txg. 5290209962Smm */ 5291286766Smav anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) - 5292286766Smav arc_loaned_bytes), 0); 5293209962Smm 5294209962Smm /* 5295185029Spjd * Writes will, almost always, require additional memory allocations 5296251631Sdelphij * in order to compress/encrypt/etc the data. We therefore need to 5297185029Spjd * make sure that there is sufficient available memory for this. 5298185029Spjd */ 5299258632Savg error = arc_memory_throttle(reserve, txg); 5300258632Savg if (error != 0) 5301185029Spjd return (error); 5302185029Spjd 5303185029Spjd /* 5304168404Spjd * Throttle writes when the amount of dirty data in the cache 5305168404Spjd * gets too large. We try to keep the cache less than half full 5306168404Spjd * of dirty blocks so that our sync times don't grow too large. 5307168404Spjd * Note: if two requests come in concurrently, we might let them 5308168404Spjd * both succeed, when one of them should fail. Not a huge deal. 
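 *
 * Editor's worked example (not in the original source): with a
 * hypothetical arc_c of 1 GB, the check below rejects a reservation
 * only when both
 *
 *	reserve + arc_tempreserve + anon_size > arc_c / 2   (512 MB)
 *	anon_size > arc_c / 4                               (256 MB)
 *
 * hold, i.e. once anonymous (dirty) data alone fills more than a
 * quarter of the target cache size.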
5309168404Spjd */ 5310209962Smm 5311209962Smm if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 5312209962Smm anon_size > arc_c / 4) { 5313185029Spjd dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 5314185029Spjd "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 5315185029Spjd arc_tempreserve>>10, 5316185029Spjd arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 5317185029Spjd arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 5318185029Spjd reserve>>10, arc_c>>10); 5319249195Smm return (SET_ERROR(ERESTART)); 5320168404Spjd } 5321185029Spjd atomic_add_64(&arc_tempreserve, reserve); 5322168404Spjd return (0); 5323168404Spjd} 5324168404Spjd 5325286626Smavstatic void 5326286626Smavarc_kstat_update_state(arc_state_t *state, kstat_named_t *size, 5327286626Smav kstat_named_t *evict_data, kstat_named_t *evict_metadata) 5328286626Smav{ 5329286766Smav size->value.ui64 = refcount_count(&state->arcs_size); 5330286626Smav evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; 5331286626Smav evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; 5332286626Smav} 5333286626Smav 5334286626Smavstatic int 5335286626Smavarc_kstat_update(kstat_t *ksp, int rw) 5336286626Smav{ 5337286626Smav arc_stats_t *as = ksp->ks_data; 5338286626Smav 5339286626Smav if (rw == KSTAT_WRITE) { 5340286626Smav return (EACCES); 5341286626Smav } else { 5342286626Smav arc_kstat_update_state(arc_anon, 5343286626Smav &as->arcstat_anon_size, 5344286626Smav &as->arcstat_anon_evictable_data, 5345286626Smav &as->arcstat_anon_evictable_metadata); 5346286626Smav arc_kstat_update_state(arc_mru, 5347286626Smav &as->arcstat_mru_size, 5348286626Smav &as->arcstat_mru_evictable_data, 5349286626Smav &as->arcstat_mru_evictable_metadata); 5350286626Smav arc_kstat_update_state(arc_mru_ghost, 5351286626Smav &as->arcstat_mru_ghost_size, 5352286626Smav &as->arcstat_mru_ghost_evictable_data, 5353286626Smav &as->arcstat_mru_ghost_evictable_metadata); 5354286626Smav arc_kstat_update_state(arc_mfu, 5355286626Smav &as->arcstat_mfu_size, 5356286626Smav &as->arcstat_mfu_evictable_data, 5357286626Smav &as->arcstat_mfu_evictable_metadata); 5358286626Smav arc_kstat_update_state(arc_mfu_ghost, 5359286626Smav &as->arcstat_mfu_ghost_size, 5360286626Smav &as->arcstat_mfu_ghost_evictable_data, 5361286626Smav &as->arcstat_mfu_ghost_evictable_metadata); 5362286626Smav } 5363286626Smav 5364286626Smav return (0); 5365286626Smav} 5366286626Smav 5367286763Smav/* 5368286763Smav * This function *must* return indices evenly distributed between all 5369286763Smav * sublists of the multilist. This is needed due to how the ARC eviction 5370286763Smav * code is laid out; arc_evict_state() assumes ARC buffers are evenly 5371286763Smav * distributed between all sublists and uses this assumption when 5372286763Smav * deciding which sublist to evict from and how much to evict from it. 5373286763Smav */ 5374286763Smavunsigned int 5375286763Smavarc_state_multilist_index_func(multilist_t *ml, void *obj) 5376286763Smav{ 5377286763Smav arc_buf_hdr_t *hdr = obj; 5378286763Smav 5379286763Smav /* 5380286763Smav * We rely on b_dva to generate evenly distributed index 5381286763Smav * numbers using buf_hash below. So, as an added precaution, 5382286763Smav * let's make sure we never add empty buffers to the arc lists. 5383286763Smav */ 5384286763Smav ASSERT(!BUF_EMPTY(hdr)); 5385286763Smav 5386286763Smav /* 5387286763Smav * The assumption here, is the hash value for a given 5388286763Smav * arc_buf_hdr_t will remain constant throughout it's lifetime 5389286763Smav * (i.e. 
5390286763Smav	 * Thus, we don't need to store the header's sublist index
5391286763Smav	 * on insertion, as this index can be recalculated on removal.
5392286763Smav	 *
5393286763Smav	 * Also, the low order bits of the hash value are thought to be
5394286763Smav	 * distributed evenly. Otherwise, in the case that the multilist
5395286763Smav	 * has a power of two number of sublists, each sublist's usage
5396286763Smav	 * would not be evenly distributed.
5397286763Smav	 */
5398286763Smav	return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
5399286763Smav	    multilist_get_num_sublists(ml));
5400286763Smav}
5401286763Smav
5402168404Spjd#ifdef _KERNEL
5403168566Spjdstatic eventhandler_tag arc_event_lowmem = NULL;
5404168404Spjd
5405168404Spjdstatic void
5406168566Spjdarc_lowmem(void *arg __unused, int howto __unused)
5407168404Spjd{
5408168404Spjd
5409286763Smav	mutex_enter(&arc_reclaim_lock);
5410286625Smav	/* XXX: Memory deficit should be passed as argument. */
5411286625Smav	needfree = btoc(arc_c >> arc_shrink_shift);
5412272483Ssmh	DTRACE_PROBE(arc__needfree);
5413286763Smav	cv_signal(&arc_reclaim_thread_cv);
5414241773Savg
5415241773Savg	/*
5416241773Savg	 * It is unsafe to block here in arbitrary threads, because we can come
5417241773Savg	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
5418241773Savg	 * with ARC reclaim thread.
5419241773Savg	 */
5420286623Smav	if (curproc == pageproc)
5421286763Smav		(void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
5422286763Smav	mutex_exit(&arc_reclaim_lock);
5423168404Spjd}
5424168404Spjd#endif
5425168404Spjd
5426168404Spjdvoid
5427168404Spjdarc_init(void)
5428168404Spjd{
5429219089Spjd	int i, prefetch_tunable_set = 0;
5430205231Skmacy
5431286763Smav	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
5432286763Smav	cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
5433286763Smav	cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
5434168404Spjd
5435286763Smav	mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
5436286763Smav	cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL);
5437286763Smav
5438301997Skib	mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
5439301997Skib	cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL);
5440301997Skib
5441168404Spjd	/* Convert seconds to clock ticks */
5442168404Spjd	arc_min_prefetch_lifespan = 1 * hz;
5443168404Spjd
5444168404Spjd	/* Start out with 1/8 of all memory */
5445168566Spjd	arc_c = kmem_size() / 8;
5446219089Spjd
5447277300Ssmh#ifdef illumos
5448192360Skmacy#ifdef _KERNEL
5449192360Skmacy	/*
5450192360Skmacy	 * On architectures where the physical memory can be larger
5451192360Skmacy	 * than the addressable space (intel in 32-bit mode), we may
5452192360Skmacy	 * need to limit the cache to 1/8 of VM size.
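	 * For example (illustrative figures): a 32-bit kernel with far
	 * more physical memory than kernel address space would
	 * otherwise start with arc_c = 1/8 of physical memory; the
	 * vmem_size() / 8 clamp below caps it at 1/8 of the heap arena
	 * instead (roughly 256MB for a ~2GB heap).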
5453192360Skmacy */ 5454192360Skmacy arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 5455192360Skmacy#endif 5456277300Ssmh#endif /* illumos */ 5457302265Ssmh /* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */ 5458302265Ssmh arc_c_min = MAX(arc_c / 4, arc_abs_min); 5459168566Spjd /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ 5460280822Smav if (arc_c * 8 >= 1 << 30) 5461280822Smav arc_c_max = (arc_c * 8) - (1 << 30); 5462168404Spjd else 5463168404Spjd arc_c_max = arc_c_min; 5464175633Spjd arc_c_max = MAX(arc_c * 5, arc_c_max); 5465219089Spjd 5466289305Smav /* 5467289305Smav * In userland, there's only the memory pressure that we artificially 5468289305Smav * create (see arc_available_memory()). Don't let arc_c get too 5469289305Smav * small, because it can cause transactions to be larger than 5470289305Smav * arc_c, causing arc_tempreserve_space() to fail. 5471289305Smav */ 5472289305Smav#ifndef _KERNEL 5473289305Smav arc_c_min = arc_c_max / 2; 5474289305Smav#endif 5475289305Smav 5476168481Spjd#ifdef _KERNEL 5477168404Spjd /* 5478168404Spjd * Allow the tunables to override our calculations if they are 5479302265Ssmh * reasonable. 5480168404Spjd */ 5481302265Ssmh if (zfs_arc_max > arc_abs_min && zfs_arc_max < kmem_size()) 5482168404Spjd arc_c_max = zfs_arc_max; 5483302265Ssmh if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max) 5484168404Spjd arc_c_min = zfs_arc_min; 5485168481Spjd#endif 5486219089Spjd 5487168404Spjd arc_c = arc_c_max; 5488168404Spjd arc_p = (arc_c >> 1); 5489168404Spjd 5490185029Spjd /* limit meta-data to 1/4 of the arc capacity */ 5491185029Spjd arc_meta_limit = arc_c_max / 4; 5492185029Spjd 5493185029Spjd /* Allow the tunable to override if it is reasonable */ 5494185029Spjd if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 5495185029Spjd arc_meta_limit = zfs_arc_meta_limit; 5496185029Spjd 5497185029Spjd if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 5498185029Spjd arc_c_min = arc_meta_limit / 2; 5499185029Spjd 5500275780Sdelphij if (zfs_arc_meta_min > 0) { 5501275780Sdelphij arc_meta_min = zfs_arc_meta_min; 5502275780Sdelphij } else { 5503275780Sdelphij arc_meta_min = arc_c_min / 2; 5504275780Sdelphij } 5505275780Sdelphij 5506208373Smm if (zfs_arc_grow_retry > 0) 5507208373Smm arc_grow_retry = zfs_arc_grow_retry; 5508208373Smm 5509208373Smm if (zfs_arc_shrink_shift > 0) 5510208373Smm arc_shrink_shift = zfs_arc_shrink_shift; 5511208373Smm 5512286625Smav /* 5513286625Smav * Ensure that arc_no_grow_shift is less than arc_shrink_shift. 
5514286625Smav */ 5515286625Smav if (arc_no_grow_shift >= arc_shrink_shift) 5516286625Smav arc_no_grow_shift = arc_shrink_shift - 1; 5517286625Smav 5518208373Smm if (zfs_arc_p_min_shift > 0) 5519208373Smm arc_p_min_shift = zfs_arc_p_min_shift; 5520208373Smm 5521286763Smav if (zfs_arc_num_sublists_per_state < 1) 5522286763Smav zfs_arc_num_sublists_per_state = MAX(max_ncpus, 1); 5523286763Smav 5524168404Spjd /* if kmem_flags are set, lets try to use less memory */ 5525168404Spjd if (kmem_debugging()) 5526168404Spjd arc_c = arc_c / 2; 5527168404Spjd if (arc_c < arc_c_min) 5528168404Spjd arc_c = arc_c_min; 5529168404Spjd 5530168473Spjd zfs_arc_min = arc_c_min; 5531168473Spjd zfs_arc_max = arc_c_max; 5532168473Spjd 5533168404Spjd arc_anon = &ARC_anon; 5534168404Spjd arc_mru = &ARC_mru; 5535168404Spjd arc_mru_ghost = &ARC_mru_ghost; 5536168404Spjd arc_mfu = &ARC_mfu; 5537168404Spjd arc_mfu_ghost = &ARC_mfu_ghost; 5538185029Spjd arc_l2c_only = &ARC_l2c_only; 5539168404Spjd arc_size = 0; 5540168404Spjd 5541286763Smav multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], 5542286762Smav sizeof (arc_buf_hdr_t), 5543286763Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5544286763Smav zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5545286763Smav multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], 5546286762Smav sizeof (arc_buf_hdr_t), 5547286763Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5548286763Smav zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5549286763Smav multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], 5550286762Smav sizeof (arc_buf_hdr_t), 5551286763Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5552286763Smav zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5553286763Smav multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], 5554286762Smav sizeof (arc_buf_hdr_t), 5555286763Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5556286763Smav zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5557286763Smav multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], 5558286762Smav sizeof (arc_buf_hdr_t), 5559286763Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5560286763Smav zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5561286763Smav multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], 5562286762Smav sizeof (arc_buf_hdr_t), 5563286763Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5564286763Smav zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5565286763Smav multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], 5566286762Smav sizeof (arc_buf_hdr_t), 5567286763Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5568286763Smav zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5569286763Smav multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], 5570286762Smav sizeof (arc_buf_hdr_t), 5571286763Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5572286763Smav zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5573286763Smav multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], 5574286762Smav sizeof (arc_buf_hdr_t), 5575286763Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5576286763Smav zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5577286763Smav multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], 5578286762Smav sizeof (arc_buf_hdr_t), 5579286763Smav offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5580286763Smav zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5581168404Spjd 
5582286766Smav refcount_create(&arc_anon->arcs_size); 5583286766Smav refcount_create(&arc_mru->arcs_size); 5584286766Smav refcount_create(&arc_mru_ghost->arcs_size); 5585286766Smav refcount_create(&arc_mfu->arcs_size); 5586286766Smav refcount_create(&arc_mfu_ghost->arcs_size); 5587286766Smav refcount_create(&arc_l2c_only->arcs_size); 5588286766Smav 5589168404Spjd buf_init(); 5590168404Spjd 5591286763Smav arc_reclaim_thread_exit = FALSE; 5592286763Smav arc_user_evicts_thread_exit = FALSE; 5593301997Skib arc_dnlc_evicts_thread_exit = FALSE; 5594168404Spjd arc_eviction_list = NULL; 5595168404Spjd bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 5596168404Spjd 5597168404Spjd arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 5598168404Spjd sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 5599168404Spjd 5600168404Spjd if (arc_ksp != NULL) { 5601168404Spjd arc_ksp->ks_data = &arc_stats; 5602286574Smav arc_ksp->ks_update = arc_kstat_update; 5603168404Spjd kstat_install(arc_ksp); 5604168404Spjd } 5605168404Spjd 5606168404Spjd (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 5607168404Spjd TS_RUN, minclsyspri); 5608168404Spjd 5609168404Spjd#ifdef _KERNEL 5610168566Spjd arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, 5611168404Spjd EVENTHANDLER_PRI_FIRST); 5612168404Spjd#endif 5613168404Spjd 5614286763Smav (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0, 5615286763Smav TS_RUN, minclsyspri); 5616286763Smav 5617301997Skib (void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0, 5618301997Skib TS_RUN, minclsyspri); 5619301997Skib 5620168404Spjd arc_dead = FALSE; 5621185029Spjd arc_warm = B_FALSE; 5622168566Spjd 5623258632Savg /* 5624258632Savg * Calculate maximum amount of dirty data per pool. 5625258632Savg * 5626258632Savg * If it has been set by /etc/system, take that. 5627258632Savg * Otherwise, use a percentage of physical memory defined by 5628258632Savg * zfs_dirty_data_max_percent (default 10%) with a cap at 5629258632Savg * zfs_dirty_data_max_max (default 4GB). 5630258632Savg */ 5631258632Savg if (zfs_dirty_data_max == 0) { 5632258632Savg zfs_dirty_data_max = ptob(physmem) * 5633258632Savg zfs_dirty_data_max_percent / 100; 5634258632Savg zfs_dirty_data_max = MIN(zfs_dirty_data_max, 5635258632Savg zfs_dirty_data_max_max); 5636258632Savg } 5637185029Spjd 5638168566Spjd#ifdef _KERNEL 5639194043Skmacy if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) 5640193953Skmacy prefetch_tunable_set = 1; 5641206796Spjd 5642193878Skmacy#ifdef __i386__ 5643193953Skmacy if (prefetch_tunable_set == 0) { 5644196863Strasz printf("ZFS NOTICE: Prefetch is disabled by default on i386 " 5645196863Strasz "-- to enable,\n"); 5646196863Strasz printf(" add \"vfs.zfs.prefetch_disable=0\" " 5647196863Strasz "to /boot/loader.conf.\n"); 5648219089Spjd zfs_prefetch_disable = 1; 5649193878Skmacy } 5650206796Spjd#else 5651193878Skmacy if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && 5652193953Skmacy prefetch_tunable_set == 0) { 5653196863Strasz printf("ZFS NOTICE: Prefetch is disabled by default if less " 5654196941Strasz "than 4GB of RAM is present;\n" 5655196863Strasz " to enable, add \"vfs.zfs.prefetch_disable=0\" " 5656196863Strasz "to /boot/loader.conf.\n"); 5657219089Spjd zfs_prefetch_disable = 1; 5658193878Skmacy } 5659206796Spjd#endif 5660175633Spjd /* Warn about ZFS memory and address space requirements. 
 */
5661168696Spjd	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
5662168987Sbmah		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
5663168987Sbmah		    "expect unstable behavior.\n");
5664175633Spjd	}
5665175633Spjd	if (kmem_size() < 512 * (1 << 20)) {
5666173419Spjd		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
5667168987Sbmah		    "expect unstable behavior.\n");
5668185029Spjd		printf("             Consider tuning vm.kmem_size and "
5669173419Spjd		    "vm.kmem_size_max\n");
5670185029Spjd		printf("             in /boot/loader.conf.\n");
5671168566Spjd	}
5672168566Spjd#endif
5673168404Spjd}
5674168404Spjd
5675168404Spjdvoid
5676168404Spjdarc_fini(void)
5677168404Spjd{
5678286763Smav	mutex_enter(&arc_reclaim_lock);
5679286763Smav	arc_reclaim_thread_exit = TRUE;
5680286763Smav	/*
5681286763Smav	 * The reclaim thread will set arc_reclaim_thread_exit back to
5682286763Smav	 * FALSE when it is finished exiting; we're waiting for that.
5683286763Smav	 */
5684286763Smav	while (arc_reclaim_thread_exit) {
5685286763Smav		cv_signal(&arc_reclaim_thread_cv);
5686286763Smav		cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
5687286763Smav	}
5688286763Smav	mutex_exit(&arc_reclaim_lock);
5689168404Spjd
5690286763Smav	mutex_enter(&arc_user_evicts_lock);
5691286763Smav	arc_user_evicts_thread_exit = TRUE;
5692286763Smav	/*
5693286763Smav	 * The user evicts thread will set arc_user_evicts_thread_exit
5694286763Smav	 * to FALSE when it is finished exiting; we're waiting for that.
5695286763Smav	 */
5696286763Smav	while (arc_user_evicts_thread_exit) {
5697286763Smav		cv_signal(&arc_user_evicts_cv);
5698286763Smav		cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock);
5699286763Smav	}
5700286763Smav	mutex_exit(&arc_user_evicts_lock);
5701168404Spjd
5702301997Skib	mutex_enter(&arc_dnlc_evicts_lock);
5703301997Skib	arc_dnlc_evicts_thread_exit = TRUE;
5704301997Skib	/*
5705301997Skib	 * The dnlc evicts thread will set arc_dnlc_evicts_thread_exit
5706301997Skib	 * to FALSE when it is finished exiting; we're waiting for that.
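	 *
	 * The worker's side of this handshake (a sketch, not the
	 * actual thread body) clears the flag and signals us under
	 * the same lock before exiting:
	 *
	 *	mutex_enter(&arc_dnlc_evicts_lock);
	 *	arc_dnlc_evicts_thread_exit = FALSE;
	 *	cv_broadcast(&arc_dnlc_evicts_cv);
	 *	mutex_exit(&arc_dnlc_evicts_lock);
	 *	thread_exit();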
5707301997Skib */ 5708301997Skib while (arc_dnlc_evicts_thread_exit) { 5709301997Skib cv_signal(&arc_dnlc_evicts_cv); 5710301997Skib cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); 5711301997Skib } 5712301997Skib mutex_exit(&arc_dnlc_evicts_lock); 5713301997Skib 5714286763Smav /* Use TRUE to ensure *all* buffers are evicted */ 5715286763Smav arc_flush(NULL, TRUE); 5716286763Smav 5717168404Spjd arc_dead = TRUE; 5718168404Spjd 5719168404Spjd if (arc_ksp != NULL) { 5720168404Spjd kstat_delete(arc_ksp); 5721168404Spjd arc_ksp = NULL; 5722168404Spjd } 5723168404Spjd 5724286763Smav mutex_destroy(&arc_reclaim_lock); 5725286763Smav cv_destroy(&arc_reclaim_thread_cv); 5726286763Smav cv_destroy(&arc_reclaim_waiters_cv); 5727168404Spjd 5728286763Smav mutex_destroy(&arc_user_evicts_lock); 5729286763Smav cv_destroy(&arc_user_evicts_cv); 5730168404Spjd 5731301997Skib mutex_destroy(&arc_dnlc_evicts_lock); 5732301997Skib cv_destroy(&arc_dnlc_evicts_cv); 5733301997Skib 5734286766Smav refcount_destroy(&arc_anon->arcs_size); 5735286766Smav refcount_destroy(&arc_mru->arcs_size); 5736286766Smav refcount_destroy(&arc_mru_ghost->arcs_size); 5737286766Smav refcount_destroy(&arc_mfu->arcs_size); 5738286766Smav refcount_destroy(&arc_mfu_ghost->arcs_size); 5739286766Smav refcount_destroy(&arc_l2c_only->arcs_size); 5740286766Smav 5741286763Smav multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); 5742286763Smav multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 5743286763Smav multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); 5744286763Smav multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 5745294809Smav multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); 5746286763Smav multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); 5747286763Smav multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 5748286763Smav multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); 5749286763Smav multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 5750294809Smav multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); 5751206796Spjd 5752168404Spjd buf_fini(); 5753168404Spjd 5754286570Smav ASSERT0(arc_loaned_bytes); 5755209962Smm 5756168404Spjd#ifdef _KERNEL 5757168566Spjd if (arc_event_lowmem != NULL) 5758168566Spjd EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); 5759168404Spjd#endif 5760168404Spjd} 5761185029Spjd 5762185029Spjd/* 5763185029Spjd * Level 2 ARC 5764185029Spjd * 5765185029Spjd * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 5766185029Spjd * It uses dedicated storage devices to hold cached data, which are populated 5767185029Spjd * using large infrequent writes. The main role of this cache is to boost 5768185029Spjd * the performance of random read workloads. The intended L2ARC devices 5769185029Spjd * include short-stroked disks, solid state disks, and other media with 5770185029Spjd * substantially faster read latency than disk. 
5771185029Spjd * 5772185029Spjd * +-----------------------+ 5773185029Spjd * | ARC | 5774185029Spjd * +-----------------------+ 5775185029Spjd * | ^ ^ 5776185029Spjd * | | | 5777185029Spjd * l2arc_feed_thread() arc_read() 5778185029Spjd * | | | 5779185029Spjd * | l2arc read | 5780185029Spjd * V | | 5781185029Spjd * +---------------+ | 5782185029Spjd * | L2ARC | | 5783185029Spjd * +---------------+ | 5784185029Spjd * | ^ | 5785185029Spjd * l2arc_write() | | 5786185029Spjd * | | | 5787185029Spjd * V | | 5788185029Spjd * +-------+ +-------+ 5789185029Spjd * | vdev | | vdev | 5790185029Spjd * | cache | | cache | 5791185029Spjd * +-------+ +-------+ 5792185029Spjd * +=========+ .-----. 5793185029Spjd * : L2ARC : |-_____-| 5794185029Spjd * : devices : | Disks | 5795185029Spjd * +=========+ `-_____-' 5796185029Spjd * 5797185029Spjd * Read requests are satisfied from the following sources, in order: 5798185029Spjd * 5799185029Spjd * 1) ARC 5800185029Spjd * 2) vdev cache of L2ARC devices 5801185029Spjd * 3) L2ARC devices 5802185029Spjd * 4) vdev cache of disks 5803185029Spjd * 5) disks 5804185029Spjd * 5805185029Spjd * Some L2ARC device types exhibit extremely slow write performance. 5806185029Spjd * To accommodate for this there are some significant differences between 5807185029Spjd * the L2ARC and traditional cache design: 5808185029Spjd * 5809185029Spjd * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 5810185029Spjd * the ARC behave as usual, freeing buffers and placing headers on ghost 5811185029Spjd * lists. The ARC does not send buffers to the L2ARC during eviction as 5812185029Spjd * this would add inflated write latencies for all ARC memory pressure. 5813185029Spjd * 5814185029Spjd * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 5815185029Spjd * It does this by periodically scanning buffers from the eviction-end of 5816185029Spjd * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 5817251478Sdelphij * not already there. It scans until a headroom of buffers is satisfied, 5818251478Sdelphij * which itself is a buffer for ARC eviction. If a compressible buffer is 5819251478Sdelphij * found during scanning and selected for writing to an L2ARC device, we 5820251478Sdelphij * temporarily boost scanning headroom during the next scan cycle to make 5821251478Sdelphij * sure we adapt to compression effects (which might significantly reduce 5822251478Sdelphij * the data volume we write to L2ARC). The thread that does this is 5823185029Spjd * l2arc_feed_thread(), illustrated below; example sizes are included to 5824185029Spjd * provide a better sense of ratio than this diagram: 5825185029Spjd * 5826185029Spjd * head --> tail 5827185029Spjd * +---------------------+----------+ 5828185029Spjd * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 5829185029Spjd * +---------------------+----------+ | o L2ARC eligible 5830185029Spjd * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 5831185029Spjd * +---------------------+----------+ | 5832185029Spjd * 15.9 Gbytes ^ 32 Mbytes | 5833185029Spjd * headroom | 5834185029Spjd * l2arc_feed_thread() 5835185029Spjd * | 5836185029Spjd * l2arc write hand <--[oooo]--' 5837185029Spjd * | 8 Mbyte 5838185029Spjd * | write max 5839185029Spjd * V 5840185029Spjd * +==============================+ 5841185029Spjd * L2ARC dev |####|#|###|###| |####| ... | 5842185029Spjd * +==============================+ 5843185029Spjd * 32 Gbytes 5844185029Spjd * 5845185029Spjd * 3. 
If an ARC buffer is copied to the L2ARC but then hit instead of 5846185029Spjd * evicted, then the L2ARC has cached a buffer much sooner than it probably 5847185029Spjd * needed to, potentially wasting L2ARC device bandwidth and storage. It is 5848185029Spjd * safe to say that this is an uncommon case, since buffers at the end of 5849185029Spjd * the ARC lists have moved there due to inactivity. 5850185029Spjd * 5851185029Spjd * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 5852185029Spjd * then the L2ARC simply misses copying some buffers. This serves as a 5853185029Spjd * pressure valve to prevent heavy read workloads from both stalling the ARC 5854185029Spjd * with waits and clogging the L2ARC with writes. This also helps prevent 5855185029Spjd * the potential for the L2ARC to churn if it attempts to cache content too 5856185029Spjd * quickly, such as during backups of the entire pool. 5857185029Spjd * 5858185029Spjd * 5. After system boot and before the ARC has filled main memory, there are 5859185029Spjd * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 5860185029Spjd * lists can remain mostly static. Instead of searching from tail of these 5861185029Spjd * lists as pictured, the l2arc_feed_thread() will search from the list heads 5862185029Spjd * for eligible buffers, greatly increasing its chance of finding them. 5863185029Spjd * 5864185029Spjd * The L2ARC device write speed is also boosted during this time so that 5865185029Spjd * the L2ARC warms up faster. Since there have been no ARC evictions yet, 5866185029Spjd * there are no L2ARC reads, and no fear of degrading read performance 5867185029Spjd * through increased writes. 5868185029Spjd * 5869185029Spjd * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 5870185029Spjd * the vdev queue can aggregate them into larger and fewer writes. Each 5871185029Spjd * device is written to in a rotor fashion, sweeping writes through 5872185029Spjd * available space then repeating. 5873185029Spjd * 5874185029Spjd * 7. The L2ARC does not store dirty content. It never needs to flush 5875185029Spjd * write buffers back to disk based storage. 5876185029Spjd * 5877185029Spjd * 8. If an ARC buffer is written (and dirtied) which also exists in the 5878185029Spjd * L2ARC, the now stale L2ARC buffer is immediately dropped. 5879185029Spjd * 5880185029Spjd * The performance of the L2ARC can be tweaked by a number of tunables, which 5881185029Spjd * may be necessary for different workloads: 5882185029Spjd * 5883185029Spjd * l2arc_write_max max write bytes per interval 5884185029Spjd * l2arc_write_boost extra write bytes during device warmup 5885185029Spjd * l2arc_noprefetch skip caching prefetched buffers 5886185029Spjd * l2arc_headroom number of max device writes to precache 5887251478Sdelphij * l2arc_headroom_boost when we find compressed buffers during ARC 5888251478Sdelphij * scanning, we multiply headroom by this 5889251478Sdelphij * percentage factor for the next scan cycle, 5890251478Sdelphij * since more compressed buffers are likely to 5891251478Sdelphij * be present 5892185029Spjd * l2arc_feed_secs seconds between L2ARC writing 5893185029Spjd * 5894185029Spjd * Tunables may be removed or added as future performance improvements are 5895185029Spjd * integrated, and also may become zpool properties. 
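 *
 * On FreeBSD these are exposed as vfs.zfs.* sysctls, settable at boot
 * via /boot/loader.conf; for example (illustrative values only):
 *
 *	vfs.zfs.l2arc_write_max="16777216"	# 16MB per feed interval
 *	vfs.zfs.l2arc_write_boost="33554432"	# 32MB while the ARC is cold
 *	vfs.zfs.l2arc_noprefetch="0"		# also cache prefetched buffers
 *	vfs.zfs.l2arc_feed_secs="1"		# one feed cycle per second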
5896208373Smm * 5897208373Smm * There are three key functions that control how the L2ARC warms up: 5898208373Smm * 5899208373Smm * l2arc_write_eligible() check if a buffer is eligible to cache 5900208373Smm * l2arc_write_size() calculate how much to write 5901208373Smm * l2arc_write_interval() calculate sleep delay between writes 5902208373Smm * 5903208373Smm * These three functions determine what to write, how much, and how quickly 5904208373Smm * to send writes. 5905185029Spjd */ 5906185029Spjd 5907208373Smmstatic boolean_t 5908275811Sdelphijl2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) 5909208373Smm{ 5910208373Smm /* 5911208373Smm * A buffer is *not* eligible for the L2ARC if it: 5912208373Smm * 1. belongs to a different spa. 5913208373Smm * 2. is already cached on the L2ARC. 5914208373Smm * 3. has an I/O in progress (it may be an incomplete read). 5915208373Smm * 4. is flagged not eligible (zfs property). 5916208373Smm */ 5917275811Sdelphij if (hdr->b_spa != spa_guid) { 5918208373Smm ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); 5919208373Smm return (B_FALSE); 5920208373Smm } 5921286570Smav if (HDR_HAS_L2HDR(hdr)) { 5922208373Smm ARCSTAT_BUMP(arcstat_l2_write_in_l2); 5923208373Smm return (B_FALSE); 5924208373Smm } 5925275811Sdelphij if (HDR_IO_IN_PROGRESS(hdr)) { 5926208373Smm ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); 5927208373Smm return (B_FALSE); 5928208373Smm } 5929275811Sdelphij if (!HDR_L2CACHE(hdr)) { 5930208373Smm ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); 5931208373Smm return (B_FALSE); 5932208373Smm } 5933208373Smm 5934208373Smm return (B_TRUE); 5935208373Smm} 5936208373Smm 5937208373Smmstatic uint64_t 5938251478Sdelphijl2arc_write_size(void) 5939208373Smm{ 5940208373Smm uint64_t size; 5941208373Smm 5942251478Sdelphij /* 5943251478Sdelphij * Make sure our globals have meaningful values in case the user 5944251478Sdelphij * altered them. 5945251478Sdelphij */ 5946251478Sdelphij size = l2arc_write_max; 5947251478Sdelphij if (size == 0) { 5948251478Sdelphij cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 5949251478Sdelphij "be greater than zero, resetting it to the default (%d)", 5950251478Sdelphij L2ARC_WRITE_SIZE); 5951251478Sdelphij size = l2arc_write_max = L2ARC_WRITE_SIZE; 5952251478Sdelphij } 5953208373Smm 5954208373Smm if (arc_warm == B_FALSE) 5955251478Sdelphij size += l2arc_write_boost; 5956208373Smm 5957208373Smm return (size); 5958208373Smm 5959208373Smm} 5960208373Smm 5961208373Smmstatic clock_t 5962208373Smml2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 5963208373Smm{ 5964219089Spjd clock_t interval, next, now; 5965208373Smm 5966208373Smm /* 5967208373Smm * If the ARC lists are busy, increase our write rate; if the 5968208373Smm * lists are stale, idle back. This is achieved by checking 5969208373Smm * how much we previously wrote - if it was more than half of 5970208373Smm * what we wanted, schedule the next write much sooner. 5971208373Smm */ 5972208373Smm if (l2arc_feed_again && wrote > (wanted / 2)) 5973208373Smm interval = (hz * l2arc_feed_min_ms) / 1000; 5974208373Smm else 5975208373Smm interval = hz * l2arc_feed_secs; 5976208373Smm 5977219089Spjd now = ddi_get_lbolt(); 5978219089Spjd next = MAX(now, MIN(now + interval, began + interval)); 5979208373Smm 5980208373Smm return (next); 5981208373Smm} 5982208373Smm 5983185029Spjd/* 5984185029Spjd * Cycle through L2ARC devices. This is how L2ARC load balances. 5985185029Spjd * If a device is returned, this also returns holding the spa config lock. 
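 *
 * A non-NULL return is expected to be paired with a config-lock
 * release once the caller is done with the device, along these lines
 * (a sketch of the l2arc_feed_thread() usage):
 *
 *	dev = l2arc_dev_get_next();
 *	if (dev != NULL) {
 *		... evict ahead and write buffers to dev ...
 *		spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev);
 *	}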
5986185029Spjd */ 5987185029Spjdstatic l2arc_dev_t * 5988185029Spjdl2arc_dev_get_next(void) 5989185029Spjd{ 5990185029Spjd l2arc_dev_t *first, *next = NULL; 5991185029Spjd 5992185029Spjd /* 5993185029Spjd * Lock out the removal of spas (spa_namespace_lock), then removal 5994185029Spjd * of cache devices (l2arc_dev_mtx). Once a device has been selected, 5995185029Spjd * both locks will be dropped and a spa config lock held instead. 5996185029Spjd */ 5997185029Spjd mutex_enter(&spa_namespace_lock); 5998185029Spjd mutex_enter(&l2arc_dev_mtx); 5999185029Spjd 6000185029Spjd /* if there are no vdevs, there is nothing to do */ 6001185029Spjd if (l2arc_ndev == 0) 6002185029Spjd goto out; 6003185029Spjd 6004185029Spjd first = NULL; 6005185029Spjd next = l2arc_dev_last; 6006185029Spjd do { 6007185029Spjd /* loop around the list looking for a non-faulted vdev */ 6008185029Spjd if (next == NULL) { 6009185029Spjd next = list_head(l2arc_dev_list); 6010185029Spjd } else { 6011185029Spjd next = list_next(l2arc_dev_list, next); 6012185029Spjd if (next == NULL) 6013185029Spjd next = list_head(l2arc_dev_list); 6014185029Spjd } 6015185029Spjd 6016185029Spjd /* if we have come back to the start, bail out */ 6017185029Spjd if (first == NULL) 6018185029Spjd first = next; 6019185029Spjd else if (next == first) 6020185029Spjd break; 6021185029Spjd 6022185029Spjd } while (vdev_is_dead(next->l2ad_vdev)); 6023185029Spjd 6024185029Spjd /* if we were unable to find any usable vdevs, return NULL */ 6025185029Spjd if (vdev_is_dead(next->l2ad_vdev)) 6026185029Spjd next = NULL; 6027185029Spjd 6028185029Spjd l2arc_dev_last = next; 6029185029Spjd 6030185029Spjdout: 6031185029Spjd mutex_exit(&l2arc_dev_mtx); 6032185029Spjd 6033185029Spjd /* 6034185029Spjd * Grab the config lock to prevent the 'next' device from being 6035185029Spjd * removed while we are writing to it. 6036185029Spjd */ 6037185029Spjd if (next != NULL) 6038185029Spjd spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 6039185029Spjd mutex_exit(&spa_namespace_lock); 6040185029Spjd 6041185029Spjd return (next); 6042185029Spjd} 6043185029Spjd 6044185029Spjd/* 6045185029Spjd * Free buffers that were tagged for destruction. 6046185029Spjd */ 6047185029Spjdstatic void 6048185029Spjdl2arc_do_free_on_write() 6049185029Spjd{ 6050185029Spjd list_t *buflist; 6051185029Spjd l2arc_data_free_t *df, *df_prev; 6052185029Spjd 6053185029Spjd mutex_enter(&l2arc_free_on_write_mtx); 6054185029Spjd buflist = l2arc_free_on_write; 6055185029Spjd 6056185029Spjd for (df = list_tail(buflist); df; df = df_prev) { 6057185029Spjd df_prev = list_prev(buflist, df); 6058185029Spjd ASSERT(df->l2df_data != NULL); 6059185029Spjd ASSERT(df->l2df_func != NULL); 6060185029Spjd df->l2df_func(df->l2df_data, df->l2df_size); 6061185029Spjd list_remove(buflist, df); 6062185029Spjd kmem_free(df, sizeof (l2arc_data_free_t)); 6063185029Spjd } 6064185029Spjd 6065185029Spjd mutex_exit(&l2arc_free_on_write_mtx); 6066185029Spjd} 6067185029Spjd 6068185029Spjd/* 6069185029Spjd * A write to a cache device has completed. Update all headers to allow 6070185029Spjd * reads from these buffers to begin. 
6071185029Spjd */ 6072185029Spjdstatic void 6073185029Spjdl2arc_write_done(zio_t *zio) 6074185029Spjd{ 6075185029Spjd l2arc_write_callback_t *cb; 6076185029Spjd l2arc_dev_t *dev; 6077185029Spjd list_t *buflist; 6078275811Sdelphij arc_buf_hdr_t *head, *hdr, *hdr_prev; 6079185029Spjd kmutex_t *hash_lock; 6080268085Sdelphij int64_t bytes_dropped = 0; 6081185029Spjd 6082185029Spjd cb = zio->io_private; 6083185029Spjd ASSERT(cb != NULL); 6084185029Spjd dev = cb->l2wcb_dev; 6085185029Spjd ASSERT(dev != NULL); 6086185029Spjd head = cb->l2wcb_head; 6087185029Spjd ASSERT(head != NULL); 6088286570Smav buflist = &dev->l2ad_buflist; 6089185029Spjd ASSERT(buflist != NULL); 6090185029Spjd DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 6091185029Spjd l2arc_write_callback_t *, cb); 6092185029Spjd 6093185029Spjd if (zio->io_error != 0) 6094185029Spjd ARCSTAT_BUMP(arcstat_l2_writes_error); 6095185029Spjd 6096185029Spjd /* 6097185029Spjd * All writes completed, or an error was hit. 6098185029Spjd */ 6099286763Smavtop: 6100286763Smav mutex_enter(&dev->l2ad_mtx); 6101275811Sdelphij for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { 6102275811Sdelphij hdr_prev = list_prev(buflist, hdr); 6103185029Spjd 6104275811Sdelphij hash_lock = HDR_LOCK(hdr); 6105286763Smav 6106286763Smav /* 6107286763Smav * We cannot use mutex_enter or else we can deadlock 6108286763Smav * with l2arc_write_buffers (due to swapping the order 6109286763Smav * the hash lock and l2ad_mtx are taken). 6110286763Smav */ 6111185029Spjd if (!mutex_tryenter(hash_lock)) { 6112185029Spjd /* 6113286763Smav * Missed the hash lock. We must retry so we 6114286763Smav * don't leave the ARC_FLAG_L2_WRITING bit set. 6115185029Spjd */ 6116286763Smav ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); 6117286763Smav 6118286763Smav /* 6119286763Smav * We don't want to rescan the headers we've 6120286763Smav * already marked as having been written out, so 6121286763Smav * we reinsert the head node so we can pick up 6122286763Smav * where we left off. 6123286763Smav */ 6124286763Smav list_remove(buflist, head); 6125286763Smav list_insert_after(buflist, hdr, head); 6126286763Smav 6127286763Smav mutex_exit(&dev->l2ad_mtx); 6128286763Smav 6129286763Smav /* 6130286763Smav * We wait for the hash lock to become available 6131286763Smav * to try and prevent busy waiting, and increase 6132286763Smav * the chance we'll be able to acquire the lock 6133286763Smav * the next time around. 6134286763Smav */ 6135286763Smav mutex_enter(hash_lock); 6136286763Smav mutex_exit(hash_lock); 6137286763Smav goto top; 6138185029Spjd } 6139185029Spjd 6140286570Smav /* 6141286763Smav * We could not have been moved into the arc_l2c_only 6142286763Smav * state while in-flight due to our ARC_FLAG_L2_WRITING 6143286763Smav * bit being set. Let's just ensure that's being enforced. 6144286570Smav */ 6145286763Smav ASSERT(HDR_HAS_L1HDR(hdr)); 6146286570Smav 6147286763Smav /* 6148286763Smav * We may have allocated a buffer for L2ARC compression, 6149286763Smav * we must release it to avoid leaking this data. 6150286763Smav */ 6151286763Smav l2arc_release_cdata_buf(hdr); 6152286763Smav 6153185029Spjd if (zio->io_error != 0) { 6154185029Spjd /* 6155185029Spjd * Error - drop L2ARC entry. 
6156185029Spjd */ 6157286776Smav list_remove(buflist, hdr); 6158290191Savg l2arc_trim(hdr); 6159286570Smav hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 6160286570Smav 6161286570Smav ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); 6162275811Sdelphij ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 6163286598Smav 6164286598Smav bytes_dropped += hdr->b_l2hdr.b_asize; 6165286598Smav (void) refcount_remove_many(&dev->l2ad_alloc, 6166286598Smav hdr->b_l2hdr.b_asize, hdr); 6167185029Spjd } 6168185029Spjd 6169185029Spjd /* 6170286763Smav * Allow ARC to begin reads and ghost list evictions to 6171286763Smav * this L2ARC entry. 6172185029Spjd */ 6173275811Sdelphij hdr->b_flags &= ~ARC_FLAG_L2_WRITING; 6174185029Spjd 6175185029Spjd mutex_exit(hash_lock); 6176185029Spjd } 6177185029Spjd 6178185029Spjd atomic_inc_64(&l2arc_writes_done); 6179185029Spjd list_remove(buflist, head); 6180286570Smav ASSERT(!HDR_HAS_L1HDR(head)); 6181286570Smav kmem_cache_free(hdr_l2only_cache, head); 6182286570Smav mutex_exit(&dev->l2ad_mtx); 6183185029Spjd 6184268085Sdelphij vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); 6185268085Sdelphij 6186185029Spjd l2arc_do_free_on_write(); 6187185029Spjd 6188185029Spjd kmem_free(cb, sizeof (l2arc_write_callback_t)); 6189185029Spjd} 6190185029Spjd 6191185029Spjd/* 6192185029Spjd * A read to a cache device completed. Validate buffer contents before 6193185029Spjd * handing over to the regular ARC routines. 6194185029Spjd */ 6195185029Spjdstatic void 6196185029Spjdl2arc_read_done(zio_t *zio) 6197185029Spjd{ 6198185029Spjd l2arc_read_callback_t *cb; 6199185029Spjd arc_buf_hdr_t *hdr; 6200185029Spjd arc_buf_t *buf; 6201185029Spjd kmutex_t *hash_lock; 6202185029Spjd int equal; 6203185029Spjd 6204185029Spjd ASSERT(zio->io_vd != NULL); 6205185029Spjd ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 6206185029Spjd 6207185029Spjd spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 6208185029Spjd 6209185029Spjd cb = zio->io_private; 6210185029Spjd ASSERT(cb != NULL); 6211185029Spjd buf = cb->l2rcb_buf; 6212185029Spjd ASSERT(buf != NULL); 6213185029Spjd 6214219089Spjd hash_lock = HDR_LOCK(buf->b_hdr); 6215185029Spjd mutex_enter(hash_lock); 6216219089Spjd hdr = buf->b_hdr; 6217219089Spjd ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 6218185029Spjd 6219185029Spjd /* 6220297848Savg * If the data was read into a temporary buffer, 6221297848Savg * move it and free the buffer. 6222297848Savg */ 6223297848Savg if (cb->l2rcb_data != NULL) { 6224297848Savg ASSERT3U(hdr->b_size, <, zio->io_size); 6225297848Savg ASSERT3U(cb->l2rcb_compress, ==, ZIO_COMPRESS_OFF); 6226297848Savg if (zio->io_error == 0) 6227297848Savg bcopy(cb->l2rcb_data, buf->b_data, hdr->b_size); 6228297848Savg 6229297848Savg /* 6230297848Savg * The following must be done regardless of whether 6231297848Savg * there was an error: 6232297848Savg * - free the temporary buffer 6233297848Savg * - point zio to the real ARC buffer 6234297848Savg * - set zio size accordingly 6235297848Savg * These are required because zio is either re-used for 6236297848Savg * an I/O of the block in the case of the error 6237297848Savg * or the zio is passed to arc_read_done() and it 6238297848Savg * needs real data. 6239297848Savg */ 6240297848Savg zio_data_buf_free(cb->l2rcb_data, zio->io_size); 6241297848Savg zio->io_size = zio->io_orig_size = hdr->b_size; 6242297848Savg zio->io_data = zio->io_orig_data = buf->b_data; 6243297848Savg } 6244297848Savg 6245297848Savg /* 6246251478Sdelphij * If the buffer was compressed, decompress it first. 
6247251478Sdelphij */ 6248251478Sdelphij if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) 6249251478Sdelphij l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); 6250251478Sdelphij ASSERT(zio->io_data != NULL); 6251287706Sdelphij ASSERT3U(zio->io_size, ==, hdr->b_size); 6252287706Sdelphij ASSERT3U(BP_GET_LSIZE(&cb->l2rcb_bp), ==, hdr->b_size); 6253251478Sdelphij 6254251478Sdelphij /* 6255185029Spjd * Check this survived the L2ARC journey. 6256185029Spjd */ 6257185029Spjd equal = arc_cksum_equal(buf); 6258185029Spjd if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 6259185029Spjd mutex_exit(hash_lock); 6260185029Spjd zio->io_private = buf; 6261185029Spjd zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 6262185029Spjd zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 6263185029Spjd arc_read_done(zio); 6264185029Spjd } else { 6265185029Spjd mutex_exit(hash_lock); 6266185029Spjd /* 6267185029Spjd * Buffer didn't survive caching. Increment stats and 6268185029Spjd * reissue to the original storage device. 6269185029Spjd */ 6270185029Spjd if (zio->io_error != 0) { 6271185029Spjd ARCSTAT_BUMP(arcstat_l2_io_error); 6272185029Spjd } else { 6273249195Smm zio->io_error = SET_ERROR(EIO); 6274185029Spjd } 6275185029Spjd if (!equal) 6276185029Spjd ARCSTAT_BUMP(arcstat_l2_cksum_bad); 6277185029Spjd 6278185029Spjd /* 6279185029Spjd * If there's no waiter, issue an async i/o to the primary 6280185029Spjd * storage now. If there *is* a waiter, the caller must 6281185029Spjd * issue the i/o in a context where it's OK to block. 6282185029Spjd */ 6283209962Smm if (zio->io_waiter == NULL) { 6284209962Smm zio_t *pio = zio_unique_parent(zio); 6285209962Smm 6286209962Smm ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 6287209962Smm 6288209962Smm zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, 6289287706Sdelphij buf->b_data, hdr->b_size, arc_read_done, buf, 6290185029Spjd zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); 6291209962Smm } 6292185029Spjd } 6293185029Spjd 6294185029Spjd kmem_free(cb, sizeof (l2arc_read_callback_t)); 6295185029Spjd} 6296185029Spjd 6297185029Spjd/* 6298185029Spjd * This is the list priority from which the L2ARC will search for pages to 6299185029Spjd * cache. This is used within loops (0..3) to cycle through lists in the 6300185029Spjd * desired order. This order can have a significant effect on cache 6301185029Spjd * performance. 6302185029Spjd * 6303185029Spjd * Currently the metadata lists are hit first, MFU then MRU, followed by 6304185029Spjd * the data lists. This function returns a locked list, and also returns 6305185029Spjd * the lock pointer. 6306185029Spjd */ 6307286763Smavstatic multilist_sublist_t * 6308286763Smavl2arc_sublist_lock(int list_num) 6309185029Spjd{ 6310286763Smav multilist_t *ml = NULL; 6311286763Smav unsigned int idx; 6312185029Spjd 6313286762Smav ASSERT(list_num >= 0 && list_num <= 3); 6314206796Spjd 6315286762Smav switch (list_num) { 6316286762Smav case 0: 6317286763Smav ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; 6318286762Smav break; 6319286762Smav case 1: 6320286763Smav ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; 6321286762Smav break; 6322286762Smav case 2: 6323286763Smav ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; 6324286762Smav break; 6325286762Smav case 3: 6326286763Smav ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; 6327286762Smav break; 6328185029Spjd } 6329185029Spjd 6330286763Smav /* 6331286763Smav * Return a randomly-selected sublist. 
This is acceptable 6332286763Smav * because the caller feeds only a little bit of data for each 6333286763Smav * call (8MB). Subsequent calls will result in different 6334286763Smav * sublists being selected. 6335286763Smav */ 6336286763Smav idx = multilist_get_random_index(ml); 6337286763Smav return (multilist_sublist_lock(ml, idx)); 6338185029Spjd} 6339185029Spjd 6340185029Spjd/* 6341185029Spjd * Evict buffers from the device write hand to the distance specified in 6342185029Spjd * bytes. This distance may span populated buffers, it may span nothing. 6343185029Spjd * This is clearing a region on the L2ARC device ready for writing. 6344185029Spjd * If the 'all' boolean is set, every buffer is evicted. 6345185029Spjd */ 6346185029Spjdstatic void 6347185029Spjdl2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 6348185029Spjd{ 6349185029Spjd list_t *buflist; 6350275811Sdelphij arc_buf_hdr_t *hdr, *hdr_prev; 6351185029Spjd kmutex_t *hash_lock; 6352185029Spjd uint64_t taddr; 6353185029Spjd 6354286570Smav buflist = &dev->l2ad_buflist; 6355185029Spjd 6356185029Spjd if (!all && dev->l2ad_first) { 6357185029Spjd /* 6358185029Spjd * This is the first sweep through the device. There is 6359185029Spjd * nothing to evict. 6360185029Spjd */ 6361185029Spjd return; 6362185029Spjd } 6363185029Spjd 6364185029Spjd if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 6365185029Spjd /* 6366185029Spjd * When nearing the end of the device, evict to the end 6367185029Spjd * before the device write hand jumps to the start. 6368185029Spjd */ 6369185029Spjd taddr = dev->l2ad_end; 6370185029Spjd } else { 6371185029Spjd taddr = dev->l2ad_hand + distance; 6372185029Spjd } 6373185029Spjd DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 6374185029Spjd uint64_t, taddr, boolean_t, all); 6375185029Spjd 6376185029Spjdtop: 6377286570Smav mutex_enter(&dev->l2ad_mtx); 6378275811Sdelphij for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { 6379275811Sdelphij hdr_prev = list_prev(buflist, hdr); 6380185029Spjd 6381275811Sdelphij hash_lock = HDR_LOCK(hdr); 6382286763Smav 6383286763Smav /* 6384286763Smav * We cannot use mutex_enter or else we can deadlock 6385286763Smav * with l2arc_write_buffers (due to swapping the order 6386286763Smav * the hash lock and l2ad_mtx are taken). 6387286763Smav */ 6388185029Spjd if (!mutex_tryenter(hash_lock)) { 6389185029Spjd /* 6390185029Spjd * Missed the hash lock. Retry. 6391185029Spjd */ 6392185029Spjd ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 6393286570Smav mutex_exit(&dev->l2ad_mtx); 6394185029Spjd mutex_enter(hash_lock); 6395185029Spjd mutex_exit(hash_lock); 6396185029Spjd goto top; 6397185029Spjd } 6398185029Spjd 6399275811Sdelphij if (HDR_L2_WRITE_HEAD(hdr)) { 6400185029Spjd /* 6401185029Spjd * We hit a write head node. Leave it for 6402185029Spjd * l2arc_write_done(). 6403185029Spjd */ 6404275811Sdelphij list_remove(buflist, hdr); 6405185029Spjd mutex_exit(hash_lock); 6406185029Spjd continue; 6407185029Spjd } 6408185029Spjd 6409286570Smav if (!all && HDR_HAS_L2HDR(hdr) && 6410286570Smav (hdr->b_l2hdr.b_daddr > taddr || 6411286570Smav hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { 6412185029Spjd /* 6413185029Spjd * We've evicted to the target address, 6414185029Spjd * or the end of the device. 
6415185029Spjd */ 6416185029Spjd mutex_exit(hash_lock); 6417185029Spjd break; 6418185029Spjd } 6419185029Spjd 6420286570Smav ASSERT(HDR_HAS_L2HDR(hdr)); 6421286570Smav if (!HDR_HAS_L1HDR(hdr)) { 6422275811Sdelphij ASSERT(!HDR_L2_READING(hdr)); 6423185029Spjd /* 6424185029Spjd * This doesn't exist in the ARC. Destroy. 6425185029Spjd * arc_hdr_destroy() will call list_remove() 6426185029Spjd * and decrement arcstat_l2_size. 6427185029Spjd */ 6428275811Sdelphij arc_change_state(arc_anon, hdr, hash_lock); 6429275811Sdelphij arc_hdr_destroy(hdr); 6430185029Spjd } else { 6431286570Smav ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); 6432286570Smav ARCSTAT_BUMP(arcstat_l2_evict_l1cached); 6433185029Spjd /* 6434185029Spjd * Invalidate issued or about to be issued 6435185029Spjd * reads, since we may be about to write 6436185029Spjd * over this location. 6437185029Spjd */ 6438275811Sdelphij if (HDR_L2_READING(hdr)) { 6439185029Spjd ARCSTAT_BUMP(arcstat_l2_evict_reading); 6440275811Sdelphij hdr->b_flags |= ARC_FLAG_L2_EVICTED; 6441185029Spjd } 6442185029Spjd 6443286763Smav /* Ensure this header has finished being written */ 6444286763Smav ASSERT(!HDR_L2_WRITING(hdr)); 6445286763Smav ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); 6446286763Smav 6447286598Smav arc_hdr_l2hdr_destroy(hdr); 6448185029Spjd } 6449185029Spjd mutex_exit(hash_lock); 6450185029Spjd } 6451286570Smav mutex_exit(&dev->l2ad_mtx); 6452185029Spjd} 6453185029Spjd 6454185029Spjd/* 6455185029Spjd * Find and write ARC buffers to the L2ARC device. 6456185029Spjd * 6457275811Sdelphij * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid 6458185029Spjd * for reading until they have completed writing. 6459251478Sdelphij * The headroom_boost is an in-out parameter used to maintain headroom boost 6460251478Sdelphij * state between calls to this function. 6461251478Sdelphij * 6462251478Sdelphij * Returns the number of bytes actually written (which may be smaller than 6463251478Sdelphij * the delta by which the device hand has changed due to alignment). 6464185029Spjd */ 6465208373Smmstatic uint64_t 6466251478Sdelphijl2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, 6467251478Sdelphij boolean_t *headroom_boost) 6468185029Spjd{ 6469275811Sdelphij arc_buf_hdr_t *hdr, *hdr_prev, *head; 6470289295Smav uint64_t write_asize, write_sz, headroom, 6471289295Smav buf_compress_minsz; 6472185029Spjd void *buf_data; 6473251478Sdelphij boolean_t full; 6474185029Spjd l2arc_write_callback_t *cb; 6475185029Spjd zio_t *pio, *wzio; 6476228103Smm uint64_t guid = spa_load_guid(spa); 6477251478Sdelphij const boolean_t do_headroom_boost = *headroom_boost; 6478185029Spjd int try; 6479185029Spjd 6480185029Spjd ASSERT(dev->l2ad_vdev != NULL); 6481185029Spjd 6482251478Sdelphij /* Lower the flag now, we might want to raise it again later. */ 6483251478Sdelphij *headroom_boost = B_FALSE; 6484251478Sdelphij 6485185029Spjd pio = NULL; 6486287099Savg write_sz = write_asize = 0; 6487185029Spjd full = B_FALSE; 6488286570Smav head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); 6489275811Sdelphij head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; 6490286570Smav head->b_flags |= ARC_FLAG_HAS_L2HDR; 6491185029Spjd 6492205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); 6493185029Spjd /* 6494251478Sdelphij * We will want to try to compress buffers that are at least 2x the 6495251478Sdelphij * device sector size. 
6496251478Sdelphij */ 6497251478Sdelphij buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; 6498251478Sdelphij 6499251478Sdelphij /* 6500185029Spjd * Copy buffers for L2ARC writing. 6501185029Spjd */ 6502286762Smav for (try = 0; try <= 3; try++) { 6503286763Smav multilist_sublist_t *mls = l2arc_sublist_lock(try); 6504251478Sdelphij uint64_t passed_sz = 0; 6505251478Sdelphij 6506205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); 6507185029Spjd 6508185029Spjd /* 6509185029Spjd * L2ARC fast warmup. 6510185029Spjd * 6511185029Spjd * Until the ARC is warm and starts to evict, read from the 6512185029Spjd * head of the ARC lists rather than the tail. 6513185029Spjd */ 6514185029Spjd if (arc_warm == B_FALSE) 6515286763Smav hdr = multilist_sublist_head(mls); 6516185029Spjd else 6517286763Smav hdr = multilist_sublist_tail(mls); 6518275811Sdelphij if (hdr == NULL) 6519205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); 6520185029Spjd 6521286762Smav headroom = target_sz * l2arc_headroom; 6522251478Sdelphij if (do_headroom_boost) 6523251478Sdelphij headroom = (headroom * l2arc_headroom_boost) / 100; 6524251478Sdelphij 6525275811Sdelphij for (; hdr; hdr = hdr_prev) { 6526251478Sdelphij kmutex_t *hash_lock; 6527251478Sdelphij uint64_t buf_sz; 6528287099Savg uint64_t buf_a_sz; 6529297848Savg size_t align; 6530251478Sdelphij 6531185029Spjd if (arc_warm == B_FALSE) 6532286763Smav hdr_prev = multilist_sublist_next(mls, hdr); 6533185029Spjd else 6534286763Smav hdr_prev = multilist_sublist_prev(mls, hdr); 6535275811Sdelphij ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size); 6536206796Spjd 6537275811Sdelphij hash_lock = HDR_LOCK(hdr); 6538251478Sdelphij if (!mutex_tryenter(hash_lock)) { 6539205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); 6540185029Spjd /* 6541185029Spjd * Skip this buffer rather than waiting. 6542185029Spjd */ 6543185029Spjd continue; 6544185029Spjd } 6545185029Spjd 6546275811Sdelphij passed_sz += hdr->b_size; 6547185029Spjd if (passed_sz > headroom) { 6548185029Spjd /* 6549185029Spjd * Searched too far. 6550185029Spjd */ 6551185029Spjd mutex_exit(hash_lock); 6552205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); 6553185029Spjd break; 6554185029Spjd } 6555185029Spjd 6556275811Sdelphij if (!l2arc_write_eligible(guid, hdr)) { 6557185029Spjd mutex_exit(hash_lock); 6558185029Spjd continue; 6559185029Spjd } 6560185029Spjd 6561287099Savg /* 6562287099Savg * Assume that the buffer is not going to be compressed 6563287099Savg * and could take more space on disk because of a larger 6564287099Savg * disk block size. 6565287099Savg */ 6566287099Savg buf_sz = hdr->b_size; 6567297848Savg align = (size_t)1 << dev->l2ad_vdev->vdev_ashift; 6568297848Savg buf_a_sz = P2ROUNDUP(buf_sz, align); 6569287099Savg 6570287099Savg if ((write_asize + buf_a_sz) > target_sz) { 6571185029Spjd full = B_TRUE; 6572185029Spjd mutex_exit(hash_lock); 6573205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_full); 6574185029Spjd break; 6575185029Spjd } 6576185029Spjd 6577185029Spjd if (pio == NULL) { 6578185029Spjd /* 6579185029Spjd * Insert a dummy header on the buflist so 6580185029Spjd * l2arc_write_done() can find where the 6581185029Spjd * write buffers begin without searching. 
6582185029Spjd */ 6583286763Smav mutex_enter(&dev->l2ad_mtx); 6584286570Smav list_insert_head(&dev->l2ad_buflist, head); 6585286763Smav mutex_exit(&dev->l2ad_mtx); 6586185029Spjd 6587185029Spjd cb = kmem_alloc( 6588185029Spjd sizeof (l2arc_write_callback_t), KM_SLEEP); 6589185029Spjd cb->l2wcb_dev = dev; 6590185029Spjd cb->l2wcb_head = head; 6591185029Spjd pio = zio_root(spa, l2arc_write_done, cb, 6592185029Spjd ZIO_FLAG_CANFAIL); 6593205231Skmacy ARCSTAT_BUMP(arcstat_l2_write_pios); 6594185029Spjd } 6595185029Spjd 6596185029Spjd /* 6597185029Spjd * Create and add a new L2ARC header. 6598185029Spjd */ 6599286570Smav hdr->b_l2hdr.b_dev = dev; 6600275811Sdelphij hdr->b_flags |= ARC_FLAG_L2_WRITING; 6601251478Sdelphij /* 6602251478Sdelphij * Temporarily stash the data buffer in b_tmp_cdata. 6603251478Sdelphij * The subsequent write step will pick it up from 6604286570Smav * there. This is because can't access b_l1hdr.b_buf 6605251478Sdelphij * without holding the hash_lock, which we in turn 6606251478Sdelphij * can't access without holding the ARC list locks 6607251478Sdelphij * (which we want to avoid during compression/writing). 6608251478Sdelphij */ 6609287706Sdelphij hdr->b_l2hdr.b_compress = ZIO_COMPRESS_OFF; 6610286570Smav hdr->b_l2hdr.b_asize = hdr->b_size; 6611286570Smav hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data; 6612251478Sdelphij 6613286598Smav /* 6614286598Smav * Explicitly set the b_daddr field to a known 6615286598Smav * value which means "invalid address". This 6616286598Smav * enables us to differentiate which stage of 6617286598Smav * l2arc_write_buffers() the particular header 6618286598Smav * is in (e.g. this loop, or the one below). 6619286598Smav * ARC_FLAG_L2_WRITING is not enough to make 6620286598Smav * this distinction, and we need to know in 6621286598Smav * order to do proper l2arc vdev accounting in 6622286598Smav * arc_release() and arc_hdr_destroy(). 6623286598Smav * 6624286598Smav * Note, we can't use a new flag to distinguish 6625286598Smav * the two stages because we don't hold the 6626286598Smav * header's hash_lock below, in the second stage 6627286598Smav * of this function. Thus, we can't simply 6628286598Smav * change the b_flags field to denote that the 6629286598Smav * IO has been sent. We can change the b_daddr 6630286598Smav * field of the L2 portion, though, since we'll 6631286598Smav * be holding the l2ad_mtx; which is why we're 6632286598Smav * using it to denote the header's state change. 6633286598Smav */ 6634286598Smav hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; 6635289295Smav 6636286570Smav hdr->b_flags |= ARC_FLAG_HAS_L2HDR; 6637185029Spjd 6638286763Smav mutex_enter(&dev->l2ad_mtx); 6639286570Smav list_insert_head(&dev->l2ad_buflist, hdr); 6640286763Smav mutex_exit(&dev->l2ad_mtx); 6641251478Sdelphij 6642185029Spjd /* 6643185029Spjd * Compute and store the buffer cksum before 6644185029Spjd * writing. On debug the cksum is verified first. 6645185029Spjd */ 6646286570Smav arc_cksum_verify(hdr->b_l1hdr.b_buf); 6647286570Smav arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE); 6648185029Spjd 6649185029Spjd mutex_exit(hash_lock); 6650185029Spjd 6651251478Sdelphij write_sz += buf_sz; 6652287099Savg write_asize += buf_a_sz; 6653251478Sdelphij } 6654251478Sdelphij 6655286763Smav multilist_sublist_unlock(mls); 6656251478Sdelphij 6657251478Sdelphij if (full == B_TRUE) 6658251478Sdelphij break; 6659251478Sdelphij } 6660251478Sdelphij 6661251478Sdelphij /* No buffers selected for writing? 
 */
6662251478Sdelphij	if (pio == NULL) {
6663251478Sdelphij		ASSERT0(write_sz);
6664286570Smav		ASSERT(!HDR_HAS_L1HDR(head));
6665286570Smav		kmem_cache_free(hdr_l2only_cache, head);
6666251478Sdelphij		return (0);
6667251478Sdelphij	}
6668251478Sdelphij
6669286763Smav	mutex_enter(&dev->l2ad_mtx);
6670286763Smav
6671251478Sdelphij	/*
6672251478Sdelphij	 * Now start writing the buffers. We're starting at the write head
6673251478Sdelphij	 * and work backwards, retracing the course of the buffer selector
6674251478Sdelphij	 * loop above.
6675251478Sdelphij	 */
6676297848Savg	write_asize = 0;
6677286570Smav	for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
6678286570Smav	    hdr = list_prev(&dev->l2ad_buflist, hdr)) {
6679251478Sdelphij		uint64_t buf_sz;
6680297848Savg		boolean_t compress;
6681251478Sdelphij
6682251478Sdelphij		/*
6683286763Smav		 * We rely on the L1 portion of the header below, so
6684286763Smav		 * it's invalid for this header to have been evicted out
6685286763Smav		 * of the ghost cache prior to being written out. The
6686286763Smav		 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
6687286763Smav		 */
6688286763Smav		ASSERT(HDR_HAS_L1HDR(hdr));
6689286763Smav
6690286763Smav		/*
6691251478Sdelphij		 * We shouldn't need to lock the buffer here, since we flagged
6692275811Sdelphij		 * it as ARC_FLAG_L2_WRITING in the previous step, but we must
6693275811Sdelphij		 * take care to only access its L2 cache parameters. In
6694286570Smav		 * particular, hdr->l1hdr.b_buf may be invalid by now due to
6695275811Sdelphij		 * ARC eviction.
6696251478Sdelphij		 */
6697286570Smav		hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
6698251478Sdelphij
6699297848Savg		/*
6700297848Savg		 * Save a pointer to the original buffer data we had previously
6701297848Savg		 * stashed away.
6702297848Savg		 */
6703297848Savg		buf_data = hdr->b_l1hdr.b_tmp_cdata;
6704297848Savg
6705297848Savg		compress = HDR_L2COMPRESS(hdr) &&
6706297848Savg		    hdr->b_l2hdr.b_asize >= buf_compress_minsz;
6707297848Savg		if (l2arc_transform_buf(hdr, compress)) {
6708297848Savg			/*
6709297848Savg			 * If compression succeeded, enable headroom
6710297848Savg			 * boost on the next scan cycle.
6711297848Savg			 */
6712297848Savg			*headroom_boost = B_TRUE;
6713251478Sdelphij		}
6714251478Sdelphij
6715251478Sdelphij		/*
6716297848Savg		 * Get the new buffer size that accounts for compression
6717297848Savg		 * and padding.
6718251478Sdelphij		 */
6719286570Smav		buf_sz = hdr->b_l2hdr.b_asize;
6720251478Sdelphij
6721274172Savg		/*
6722286598Smav		 * We need to do this regardless of whether buf_sz is
6723286598Smav		 * zero; otherwise, when this l2hdr is evicted we'll
6724286598Smav		 * remove a reference that was never added.
6725286598Smav		 */
6726286598Smav		(void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);
6727286598Smav
6728251478Sdelphij		/* Compression may have squashed the buffer to zero length. */
6729251478Sdelphij		if (buf_sz != 0) {
6730297848Savg			/*
6731297848Savg			 * If the data was padded or compressed, then it
6732297848Savg			 * is in a new buffer.
			/*
			 * If the data was padded or compressed, then it
			 * is in a new buffer.
			 */
			if (hdr->b_l1hdr.b_tmp_cdata != NULL)
				buf_data = hdr->b_l1hdr.b_tmp_cdata;

			wzio = zio_write_phys(pio, dev->l2ad_vdev,
			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_CANFAIL, B_FALSE);

			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
			    zio_t *, wzio);
			(void) zio_nowait(wzio);

			write_asize += buf_sz;
			dev->l2ad_hand += buf_sz;
		}
	}

	mutex_exit(&dev->l2ad_mtx);

	ASSERT3U(write_asize, <=, target_sz);
	ARCSTAT_BUMP(arcstat_l2_writes_sent);
	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
	ARCSTAT_INCR(arcstat_l2_size, write_sz);
	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
	vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);

	/*
	 * Bump device hand to the device start if it is approaching the end.
	 * l2arc_evict() will already have evicted ahead for this case.
	 */
	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
		dev->l2ad_hand = dev->l2ad_start;
		dev->l2ad_first = B_FALSE;
	}

	dev->l2ad_writing = B_TRUE;
	(void) zio_wait(pio);
	dev->l2ad_writing = B_FALSE;

	return (write_asize);
}
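/*
 * Illustrative note (not part of the original source) on the list order
 * used by l2arc_write_buffers() above. The selection loop inserts each
 * chosen header at the head of l2ad_buflist, in front of the dummy "head"
 * marker that was inserted first:
 *
 *	list head -> hdrN ... hdr2 hdr1 head-marker -> older entries
 *
 * Walking list_prev() from the marker therefore visits hdr1, hdr2, ...,
 * hdrN: the buffers are issued to the device in the same order they were
 * selected, and l2arc_write_done() can find where the batch begins by
 * locating the marker.
 */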
/*
 * Transforms, possibly compresses and pads, an L2ARC buffer.
 * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its
 * size in l2hdr->b_asize. This routine tries to compress the data and
 * depending on the compression result there are four possible outcomes:
 * *) The buffer was incompressible and its size was already ashift aligned.
 *    The original hdr contents were left untouched except for b_tmp_cdata,
 *    which is reset to NULL. The caller must keep a pointer to the original
 *    data.
 * *) The buffer was incompressible and its size was not ashift aligned.
 *    b_tmp_cdata was replaced with a temporary data buffer which holds a
 *    padded (aligned) copy of the data. Once writing is done, invoke
 *    l2arc_release_cdata_buf on this hdr to free the temporary buffer.
 * *) The buffer was all-zeros, so there is no need to write it to an L2
 *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
 *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
 *    data buffer which holds the compressed data to be written, and b_asize
 *    tells us how much data there is. b_compress is set to the appropriate
 *    compression algorithm. Once writing is done, invoke
 *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
 *
 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
 * buffer was incompressible).
 */
static boolean_t
l2arc_transform_buf(arc_buf_hdr_t *hdr, boolean_t compress)
{
	void *cdata;
	size_t align, asize, csize, len, rounded;

	ASSERT(HDR_HAS_L2HDR(hdr));
	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;

	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT3S(l2hdr->b_compress, ==, ZIO_COMPRESS_OFF);
	ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);

	len = l2hdr->b_asize;
	align = (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift;
	asize = P2ROUNDUP(len, align);
	cdata = zio_data_buf_alloc(asize);
	ASSERT3P(cdata, !=, NULL);
	if (compress)
		csize = zio_compress_data(ZIO_COMPRESS_LZ4,
		    hdr->b_l1hdr.b_tmp_cdata, cdata, len);
	else
		csize = len;

	if (csize == 0) {
		/* zero block, indicate that there's nothing to write */
		zio_data_buf_free(cdata, asize);
		l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
		l2hdr->b_asize = 0;
		hdr->b_l1hdr.b_tmp_cdata = NULL;
		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
		return (B_TRUE);
	}

	rounded = P2ROUNDUP(csize, align);
	ASSERT3U(rounded, <=, asize);
	if (rounded < len) {
		/*
		 * Compression succeeded, we'll keep the cdata around for
		 * writing and release it afterwards.
		 */
		if (rounded > csize) {
			bzero((char *)cdata + csize, rounded - csize);
			csize = rounded;
		}
		l2hdr->b_compress = ZIO_COMPRESS_LZ4;
		l2hdr->b_asize = csize;
		hdr->b_l1hdr.b_tmp_cdata = cdata;
		ARCSTAT_BUMP(arcstat_l2_compress_successes);
		return (B_TRUE);
	} else {
		/*
		 * Compression did not save space.
		 */
		if (P2PHASE(len, align) != 0) {
			/*
			 * Use the compression buffer for a copy of the data
			 * padded to the proper size. The compression
			 * algorithm remains set to ZIO_COMPRESS_OFF.
			 */
			ASSERT3U(len, <, asize);
			bcopy(hdr->b_l1hdr.b_tmp_cdata, cdata, len);
			bzero((char *)cdata + len, asize - len);
			l2hdr->b_asize = asize;
			hdr->b_l1hdr.b_tmp_cdata = cdata;
			ARCSTAT_BUMP(arcstat_l2_padding_needed);
		} else {
			ASSERT3U(len, ==, asize);
			/*
			 * The original buffer is good as is; release the
			 * compressed buffer. l2hdr will be left unmodified
			 * except for b_tmp_cdata.
			 */
			zio_data_buf_free(cdata, asize);
			hdr->b_l1hdr.b_tmp_cdata = NULL;
		}
		if (compress)
			ARCSTAT_BUMP(arcstat_l2_compress_failures);
		return (B_FALSE);
	}
}
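/*
 * Worked example (illustrative, not part of the original source) of the
 * alignment arithmetic in l2arc_transform_buf() above, assuming a vdev
 * with vdev_ashift == 9 (512-byte sectors). The function name and the
 * "notdef" guard are hypothetical; the block is never compiled and exists
 * purely for exposition.
 */
#ifdef notdef
static void
l2arc_transform_example(void)
{
	size_t align = (size_t)1 << 9;		/* 512 */
	size_t len = 2600;			/* input size (b_asize) */

	/* Space reserved on the device: next multiple of the sector. */
	ASSERT3U(P2ROUNDUP(len, align), ==, 3072);

	/*
	 * csize == 1000 after LZ4: rounded = 1024 < len, so compression
	 * wins; bytes [1000, 1024) are zeroed and b_asize becomes 1024.
	 */
	ASSERT3U(P2ROUNDUP(1000, align), ==, 1024);

	/*
	 * csize == len (incompressible): rounded = 3072 >= len, and since
	 * P2PHASE(len, align) != 0 the data is copied into the aligned
	 * 3072-byte buffer and zero-padded, with b_compress left at
	 * ZIO_COMPRESS_OFF.
	 */
	ASSERT3U(P2PHASE(len, align), ==, 40);
}
#endif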
/*
 * Decompresses a zio read back from an l2arc device. On success, the
 * underlying zio's io_data buffer is overwritten by the uncompressed
 * version. On decompression error (corrupt compressed stream), the
 * zio->io_error value is set to signal an I/O error.
 *
 * Note that the compressed data stream is not checksummed, so if the
 * underlying device is experiencing data corruption, we may feed corrupt
 * data to the decompressor; the decompressor therefore needs to be able
 * to handle this situation (LZ4 does).
 */
static void
l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
{
	ASSERT(L2ARC_IS_VALID_COMPRESS(c));

	if (zio->io_error != 0) {
		/*
		 * An I/O error has occurred, just restore the original I/O
		 * size in preparation for a main pool read.
		 */
		zio->io_orig_size = zio->io_size = hdr->b_size;
		return;
	}

	if (c == ZIO_COMPRESS_EMPTY) {
		/*
		 * An empty buffer results in a null zio, which means we
		 * need to fill its io_data after we're done restoring the
		 * buffer's contents.
		 */
		ASSERT(hdr->b_l1hdr.b_buf != NULL);
		bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
		zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
	} else {
		ASSERT(zio->io_data != NULL);
		/*
		 * We copy the compressed data from the start of the arc buffer
		 * (the zio_read will have pulled in only what we need, the
		 * rest is garbage which we will overwrite at decompression)
		 * and then decompress back to the ARC data buffer. This way we
		 * can minimize copying by simply decompressing back over the
		 * original compressed data (rather than decompressing to an
		 * aux buffer and then copying back the uncompressed buffer,
		 * which is likely to be much larger).
		 */
		uint64_t csize;
		void *cdata;

		csize = zio->io_size;
		cdata = zio_data_buf_alloc(csize);
		bcopy(zio->io_data, cdata, csize);
		if (zio_decompress_data(c, cdata, zio->io_data, csize,
		    hdr->b_size) != 0)
			zio->io_error = EIO;
		zio_data_buf_free(cdata, csize);
	}

	/* Restore the expected uncompressed IO size. */
	zio->io_orig_size = zio->io_size = hdr->b_size;
}
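/*
 * Illustrative sketch (not part of the original source) of the copy
 * strategy used by l2arc_decompress_zio() above, for a hypothetical 16K
 * logical buffer that compressed to 4K on the L2ARC device:
 *
 *	io_data (16K):	[ 4K compressed | 12K garbage from the read ]
 *	cdata (4K):	[ 4K compressed ]	(temporary copy)
 *	io_data (16K):	[ 16K uncompressed ]	(decompressed in place)
 *
 * Only the small (csize) temporary is allocated; decompressing into a
 * separate 16K buffer and copying back would require allocating the
 * larger, uncompressed size instead.
 */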
/*
 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
 * This buffer serves as a temporary holder of compressed or padded data
 * while the buffer entry is being written to an l2arc device. Once that
 * is done, we can dispose of it.
 */
static void
l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
{
	size_t align, asize, len;
	enum zio_compress comp = hdr->b_l2hdr.b_compress;

	ASSERT(HDR_HAS_L2HDR(hdr));
	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp));

	if (hdr->b_l1hdr.b_tmp_cdata != NULL) {
		ASSERT(comp != ZIO_COMPRESS_EMPTY);
		len = hdr->b_size;
		align = (size_t)1 << hdr->b_l2hdr.b_dev->l2ad_vdev->vdev_ashift;
		asize = P2ROUNDUP(len, align);
		zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata, asize);
		hdr->b_l1hdr.b_tmp_cdata = NULL;
	} else {
		ASSERT(comp == ZIO_COMPRESS_OFF || comp == ZIO_COMPRESS_EMPTY);
	}
}
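/*
 * Illustrative summary (not part of the original source) of the
 * b_tmp_cdata lifecycle across an L2ARC write, as implemented by the
 * functions above:
 *
 *	1. selection (l2arc_write_buffers, first loop):
 *		b_tmp_cdata = b_l1hdr.b_buf->b_data	(borrowed pointer)
 *	2. transform (l2arc_transform_buf):
 *		either reset to NULL (all-zero buffer, or incompressible
 *		and already aligned), or pointed at a freshly allocated
 *		aligned buffer holding the compressed or padded copy
 *	3. completion (l2arc_release_cdata_buf):
 *		the temporary buffer, if any, is freed and the field is
 *		reset to NULL
 */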
/*
 * This thread feeds the L2ARC at regular intervals. This is the beating
 * heart of the L2ARC.
 */
static void
l2arc_feed_thread(void *dummy __unused)
{
	callb_cpr_t cpr;
	l2arc_dev_t *dev;
	spa_t *spa;
	uint64_t size, wrote;
	clock_t begin, next = ddi_get_lbolt();
	boolean_t headroom_boost = B_FALSE;

	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&l2arc_feed_thr_lock);

	while (l2arc_thread_exit == 0) {
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
		    next - ddi_get_lbolt());
		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
		next = ddi_get_lbolt() + hz;

		/*
		 * Quick check for L2ARC devices.
		 */
		mutex_enter(&l2arc_dev_mtx);
		if (l2arc_ndev == 0) {
			mutex_exit(&l2arc_dev_mtx);
			continue;
		}
		mutex_exit(&l2arc_dev_mtx);
		begin = ddi_get_lbolt();

		/*
		 * This selects the next l2arc device to write to, and in
		 * doing so the next spa to feed from: dev->l2ad_spa. This
		 * will return NULL if there are now no l2arc devices or if
		 * they are all faulted.
		 *
		 * If a device is returned, its spa's config lock is also
		 * held to prevent device removal. l2arc_dev_get_next()
		 * will grab and release l2arc_dev_mtx.
		 */
		if ((dev = l2arc_dev_get_next()) == NULL)
			continue;

		spa = dev->l2ad_spa;
		ASSERT(spa != NULL);

		/*
		 * If the pool is read-only then force the feed thread to
		 * sleep a little longer.
		 */
		if (!spa_writeable(spa)) {
			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		/*
		 * Avoid contributing to memory pressure.
		 */
		if (arc_reclaim_needed()) {
			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		ARCSTAT_BUMP(arcstat_l2_feeds);

		size = l2arc_write_size();

		/*
		 * Evict L2ARC buffers that will be overwritten.
		 */
		l2arc_evict(dev, size, B_FALSE);

		/*
		 * Write ARC buffers.
		 */
		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);

		/*
		 * Calculate interval between writes.
		 */
		next = l2arc_write_interval(begin, size, wrote);
		spa_config_exit(spa, SCL_L2ARC, dev);
	}

	l2arc_thread_exit = 0;
	cv_broadcast(&l2arc_feed_thr_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
	thread_exit();
}

boolean_t
l2arc_vdev_present(vdev_t *vd)
{
	l2arc_dev_t *dev;

	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev != NULL;
	    dev = list_next(l2arc_dev_list, dev)) {
		if (dev->l2ad_vdev == vd)
			break;
	}
	mutex_exit(&l2arc_dev_mtx);

	return (dev != NULL);
}

/*
 * Add a vdev for use by the L2ARC. By this point the spa has already
 * validated the vdev and opened it.
 */
void
l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{
	l2arc_dev_t *adddev;

	ASSERT(!l2arc_vdev_present(vd));

	vdev_ashift_optimize(vd);

	/*
	 * Create a new l2arc device entry.
	 */
	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
	adddev->l2ad_spa = spa;
	adddev->l2ad_vdev = vd;
	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
	adddev->l2ad_hand = adddev->l2ad_start;
	adddev->l2ad_first = B_TRUE;
	adddev->l2ad_writing = B_FALSE;

	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
	/*
	 * This is a list of all ARC buffers that are still valid on the
	 * device.
	 */
	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));

	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
	refcount_create(&adddev->l2ad_alloc);

	/*
	 * Add device to global list.
	 */
	mutex_enter(&l2arc_dev_mtx);
	list_insert_head(l2arc_dev_list, adddev);
	atomic_inc_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);
}
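/*
 * Illustrative layout (not part of the original source) of the L2ARC
 * address space set up by l2arc_add_vdev() above:
 *
 *	0          l2ad_start == VDEV_LABEL_START_SIZE        l2ad_end
 *	+----------+------------------------------------------+
 *	| labels   |  l2ad_hand --> (writes advance the hand)  |
 *	+----------+------------------------------------------+
 *
 * where l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd).
 * The hand only moves forward; when it nears l2ad_end,
 * l2arc_write_buffers() wraps it back to l2ad_start and the device is
 * reused circularly (l2arc_evict() clears the region first).
 */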
/*
 * Remove a vdev from the L2ARC.
 */
void
l2arc_remove_vdev(vdev_t *vd)
{
	l2arc_dev_t *dev, *nextdev, *remdev = NULL;

	/*
	 * Find the device by vdev.
	 */
	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
		nextdev = list_next(l2arc_dev_list, dev);
		if (vd == dev->l2ad_vdev) {
			remdev = dev;
			break;
		}
	}
	ASSERT(remdev != NULL);

	/*
	 * Remove device from global list.
	 */
	list_remove(l2arc_dev_list, remdev);
	l2arc_dev_last = NULL;		/* may have been invalidated */
	atomic_dec_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);

	/*
	 * Clear all buflists and ARC references. L2ARC device flush.
	 */
	l2arc_evict(remdev, 0, B_TRUE);
	list_destroy(&remdev->l2ad_buflist);
	mutex_destroy(&remdev->l2ad_mtx);
	refcount_destroy(&remdev->l2ad_alloc);
	kmem_free(remdev, sizeof (l2arc_dev_t));
}

void
l2arc_init(void)
{
	l2arc_thread_exit = 0;
	l2arc_ndev = 0;
	l2arc_writes_sent = 0;
	l2arc_writes_done = 0;

	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);

	l2arc_dev_list = &L2ARC_dev_list;
	l2arc_free_on_write = &L2ARC_free_on_write;
	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
	    offsetof(l2arc_dev_t, l2ad_node));
	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
	    offsetof(l2arc_data_free_t, l2df_list_node));
}

void
l2arc_fini(void)
{
	/*
	 * This is called from dmu_fini(), which is called from spa_fini().
	 * Because of this, we can assume that all l2arc devices have
	 * already been removed when the pools themselves were removed.
	 */

	l2arc_do_free_on_write();

	mutex_destroy(&l2arc_feed_thr_lock);
	cv_destroy(&l2arc_feed_thr_cv);
	mutex_destroy(&l2arc_dev_mtx);
	mutex_destroy(&l2arc_free_on_write_mtx);

	list_destroy(l2arc_dev_list);
	list_destroy(l2arc_free_on_write);
}

void
l2arc_start(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);
}

void
l2arc_stop(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	mutex_enter(&l2arc_feed_thr_lock);
	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
	l2arc_thread_exit = 1;
	while (l2arc_thread_exit != 0)
		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
	mutex_exit(&l2arc_feed_thr_lock);
}
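/*
 * Illustrative note (not part of the original source): l2arc_stop() and
 * l2arc_feed_thread() shut down with a simple flag-and-cv handshake:
 *
 *	l2arc_stop()			l2arc_feed_thread()
 *	------------			-------------------
 *	cv_signal(...)			while loop observes the flag,
 *	l2arc_thread_exit = 1		breaks out, resets the flag to 0,
 *	cv_wait() until flag == 0	cv_broadcast()s, then thread_exit()s
 *
 * so when l2arc_stop() returns, the feed thread is guaranteed gone,
 * presumably allowing l2arc_fini() to destroy l2arc_feed_thr_lock and
 * l2arc_feed_thr_cv safely afterwards.
 */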