arc.c revision 251629
1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23228103Smm * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24249195Smm * Copyright (c) 2013 by Delphix. All rights reserved.
25251478Sdelphij * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26168404Spjd */
27168404Spjd
28168404Spjd/*
29168404Spjd * DVA-based Adjustable Replacement Cache
30168404Spjd *
31168404Spjd * While much of the theory of operation used here is
32168404Spjd * based on the self-tuning, low overhead replacement cache
33168404Spjd * presented by Megiddo and Modha at FAST 2003, there are some
34168404Spjd * significant differences:
35168404Spjd *
36168404Spjd * 1. The Megiddo and Modha model assumes any page is evictable.
37168404Spjd * Pages in its cache cannot be "locked" into memory.  This makes
38168404Spjd * the eviction algorithm simple: evict the last page in the list.
39168404Spjd * This also makes the performance characteristics easy to reason
40168404Spjd * about.  Our cache is not so simple.  At any given moment, some
41168404Spjd * subset of the blocks in the cache are un-evictable because we
42168404Spjd * have handed out a reference to them.  Blocks are only evictable
43168404Spjd * when there are no external references active.  This makes
44168404Spjd * eviction far more problematic:  we choose to evict the evictable
45168404Spjd * blocks that are the "lowest" in the list.
46168404Spjd *
47168404Spjd * There are times when it is not possible to evict the requested
48168404Spjd * space.  In these circumstances we are unable to adjust the cache
49168404Spjd * size.  To prevent the cache from growing unbounded at these times, we
50185029Spjd * implement a "cache throttle" that slows the flow of new data
51185029Spjd * into the cache until we can make space available.
52168404Spjd *
53168404Spjd * 2. The Megiddo and Modha model assumes a fixed cache size.
54168404Spjd * Pages are evicted when the cache is full and there is a cache
55168404Spjd * miss.  Our model has a variable sized cache.  It grows with
56185029Spjd * high use, but also tries to react to memory pressure from the
57168404Spjd * operating system: decreasing its size when system memory is
58168404Spjd * tight.
59168404Spjd *
60168404Spjd * 3. The Megiddo and Modha model assumes a fixed page size. All
61168404Spjd * elements of the cache are therefore exactly the same size.  So
62168404Spjd * when adjusting the cache size following a cache miss, it is simply
63168404Spjd * a matter of choosing a single page to evict.  In our model, we
64168404Spjd * have variable sized cache blocks (ranging from 512 bytes to
65168404Spjd * 128K bytes).  We therefore choose a set of blocks to evict to make
66168404Spjd * space for a cache miss that approximates as closely as possible
67168404Spjd * the space used by the new block.
68168404Spjd *
69168404Spjd * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
70168404Spjd * by N. Megiddo & D. Modha, FAST 2003
71168404Spjd */
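
/*
 * Illustrative sketch only (hypothetical helper names, not functions in
 * this file): the eviction described in point 3 above amounts to walking
 * an evictable list and releasing buffers until roughly the requested
 * number of bytes has been recovered:
 *
 *	uint64_t evicted = 0;
 *	while (evicted < bytes_needed && (ab = next_evictable()) != NULL)
 *		evicted += release_buffer(ab);
 *
 * The real work is done by arc_evict() later in this file.
 */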
72168404Spjd
73168404Spjd/*
74168404Spjd * The locking model:
75168404Spjd *
76168404Spjd * A new reference to a cache buffer can be obtained in two
77168404Spjd * ways: 1) via a hash table lookup using the DVA as a key,
78185029Spjd * or 2) via one of the ARC lists.  The arc_read() interface
79168404Spjd * uses method 1, while the internal arc algorithms for
80168404Spjd * adjusting the cache use method 2.  We therefore provide two
81168404Spjd * types of locks: 1) the hash table lock array, and 2) the
82168404Spjd * arc list locks.
83168404Spjd *
84168404Spjd * Buffers do not have their own mutexes; rather, they rely on the
85168404Spjd * hash table mutexes for the bulk of their protection (i.e. most
86168404Spjd * fields in the arc_buf_hdr_t are protected by these mutexes).
87168404Spjd *
88168404Spjd * buf_hash_find() returns the appropriate mutex (held) when it
89168404Spjd * locates the requested buffer in the hash table.  It returns
90168404Spjd * NULL for the mutex if the buffer was not in the table.
91168404Spjd *
92168404Spjd * buf_hash_remove() expects the appropriate hash mutex to be
93168404Spjd * already held before it is invoked.
94168404Spjd *
95168404Spjd * Each arc state also has a mutex which is used to protect the
96168404Spjd * buffer list associated with the state.  When attempting to
97168404Spjd * obtain a hash table lock while holding an arc list lock you
98168404Spjd * must use mutex_tryenter() to avoid deadlock.  Also note that
99168404Spjd * the active state mutex must be held before the ghost state mutex.
100168404Spjd *
101168404Spjd * Arc buffers may have an associated eviction callback function.
102168404Spjd * This function will be invoked prior to removing the buffer (e.g.
103168404Spjd * in arc_do_user_evicts()).  Note however that the data associated
104168404Spjd * with the buffer may be evicted prior to the callback.  The callback
105168404Spjd * must be made with *no locks held* (to prevent deadlock).  Additionally,
106168404Spjd * the users of callbacks must ensure that their private data is
107168404Spjd * protected from simultaneous callbacks from arc_buf_evict()
108168404Spjd * and arc_do_user_evicts().
109168404Spjd *
110168404Spjd * Note that the majority of the performance stats are manipulated
111168404Spjd * with atomic operations.
112185029Spjd *
113185029Spjd * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
114185029Spjd *
115185029Spjd *	- L2ARC buflist creation
116185029Spjd *	- L2ARC buflist eviction
117185029Spjd *	- L2ARC write completion, which walks L2ARC buflists
118185029Spjd *	- ARC header destruction, as it removes from L2ARC buflists
119185029Spjd *	- ARC header release, as it removes from L2ARC buflists
120168404Spjd */
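
/*
 * Illustrative sketch only (not part of the implementation): the lookup
 * pattern implied by method 1 above.  buf_hash_find() returns the header
 * with the appropriate hash lock held, and the caller must drop that lock
 * when it is done with the header:
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *hdr = buf_hash_find(spa, dva, birth, &hash_lock);
 *	if (hdr != NULL) {
 *		(examine or update fields protected by the hash lock)
 *		mutex_exit(hash_lock);
 *	}
 */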
121168404Spjd
122168404Spjd#include <sys/spa.h>
123168404Spjd#include <sys/zio.h>
124251478Sdelphij#include <sys/zio_compress.h>
125168404Spjd#include <sys/zfs_context.h>
126168404Spjd#include <sys/arc.h>
127168404Spjd#include <sys/refcount.h>
128185029Spjd#include <sys/vdev.h>
129219089Spjd#include <sys/vdev_impl.h>
130168404Spjd#ifdef _KERNEL
131168404Spjd#include <sys/dnlc.h>
132168404Spjd#endif
133168404Spjd#include <sys/callb.h>
134168404Spjd#include <sys/kstat.h>
135248572Ssmh#include <sys/trim_map.h>
136219089Spjd#include <zfs_fletcher.h>
137168404Spjd#include <sys/sdt.h>
138168404Spjd
139191902Skmacy#include <vm/vm_pageout.h>
140191902Skmacy
141240133Smm#ifdef illumos
142240133Smm#ifndef _KERNEL
143240133Smm/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
144240133Smmboolean_t arc_watch = B_FALSE;
145240133Smmint arc_procfd;
146240133Smm#endif
147240133Smm#endif /* illumos */
148240133Smm
149168404Spjdstatic kmutex_t		arc_reclaim_thr_lock;
150168404Spjdstatic kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
151168404Spjdstatic uint8_t		arc_thread_exit;
152168404Spjd
153185029Spjdextern int zfs_write_limit_shift;
154185029Spjdextern uint64_t zfs_write_limit_max;
155185029Spjdextern kmutex_t zfs_write_limit_lock;
156185029Spjd
157168404Spjd#define	ARC_REDUCE_DNLC_PERCENT	3
158168404Spjduint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
159168404Spjd
160168404Spjdtypedef enum arc_reclaim_strategy {
161168404Spjd	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
162168404Spjd	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
163168404Spjd} arc_reclaim_strategy_t;
164168404Spjd
165168404Spjd/* number of seconds before growing cache again */
166168404Spjdstatic int		arc_grow_retry = 60;
167168404Spjd
168208373Smm/* shift of arc_c for calculating both min and max arc_p */
169208373Smmstatic int		arc_p_min_shift = 4;
170208373Smm
171208373Smm/* log2(fraction of arc to reclaim) */
172208373Smmstatic int		arc_shrink_shift = 5;
173208373Smm
174168404Spjd/*
175168404Spjd * minimum lifespan of a prefetch block in clock ticks
176168404Spjd * (initialized in arc_init())
177168404Spjd */
178168404Spjdstatic int		arc_min_prefetch_lifespan;
179168404Spjd
180208373Smmstatic int arc_dead;
181194043Skmacyextern int zfs_prefetch_disable;
182168404Spjd
183168404Spjd/*
184185029Spjd * The arc has filled available memory and has now warmed up.
185185029Spjd */
186185029Spjdstatic boolean_t arc_warm;
187185029Spjd
188185029Spjd/*
189168404Spjd * These tunables are for performance analysis.
190168404Spjd */
191185029Spjduint64_t zfs_arc_max;
192185029Spjduint64_t zfs_arc_min;
193185029Spjduint64_t zfs_arc_meta_limit = 0;
194208373Smmint zfs_arc_grow_retry = 0;
195208373Smmint zfs_arc_shrink_shift = 0;
196208373Smmint zfs_arc_p_min_shift = 0;
197242845Sdelphijint zfs_disable_dup_eviction = 0;
198185029Spjd
199185029SpjdTUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
200185029SpjdTUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
201185029SpjdTUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
202168473SpjdSYSCTL_DECL(_vfs_zfs);
203217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
204168473Spjd    "Maximum ARC size");
205217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
206168473Spjd    "Minimum ARC size");
207168404Spjd
208168404Spjd/*
209185029Spjd * Note that buffers can be in one of 6 states:
210168404Spjd *	ARC_anon	- anonymous (discussed below)
211168404Spjd *	ARC_mru		- recently used, currently cached
212168404Spjd *	ARC_mru_ghost	- recently used, no longer in cache
213168404Spjd *	ARC_mfu		- frequently used, currently cached
214168404Spjd *	ARC_mfu_ghost	- frequently used, no longer in cache
215185029Spjd *	ARC_l2c_only	- exists in L2ARC but not other states
216185029Spjd * When there are no active references to the buffer, they are
217185029Spjd * linked onto a list in one of these arc states.  These are
218185029Spjd * the only buffers that can be evicted or deleted.  Within each
219185029Spjd * state there are multiple lists, one for meta-data and one for
220185029Spjd * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
221185029Spjd * etc.) is tracked separately so that it can be managed more
222185029Spjd * explicitly: favored over data, limited explicitly.
223168404Spjd *
224168404Spjd * Anonymous buffers are buffers that are not associated with
225168404Spjd * a DVA.  These are buffers that hold dirty block copies
226168404Spjd * before they are written to stable storage.  By definition,
227168404Spjd * they are "ref'd" and are considered part of arc_mru
228168404Spjd * that cannot be freed.  Generally, they will acquire a DVA
229168404Spjd * as they are written and migrate onto the arc_mru list.
230185029Spjd *
231185029Spjd * The ARC_l2c_only state is for buffers that are in the second
232185029Spjd * level ARC but no longer in any of the ARC_m* lists.  The second
233185029Spjd * level ARC itself may also contain buffers that are in any of
234185029Spjd * the ARC_m* states - meaning that a buffer can exist in two
235185029Spjd * places.  The reason for the ARC_l2c_only state is to keep the
236185029Spjd * buffer header in the hash table, so that reads that hit the
237185029Spjd * second level ARC benefit from these fast lookups.
238168404Spjd */
239168404Spjd
240205264Skmacy#define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
241205231Skmacystruct arcs_lock {
242205231Skmacy	kmutex_t	arcs_lock;
243205231Skmacy#ifdef _KERNEL
244205231Skmacy	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
245205231Skmacy#endif
246205231Skmacy};
247205231Skmacy
248205231Skmacy/*
249205231Skmacy * Must be a power of two for mask use to work.
251205231Skmacy */
252205231Skmacy#define ARC_BUFC_NUMDATALISTS		16
253205231Skmacy#define ARC_BUFC_NUMMETADATALISTS	16
254206796Spjd#define ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
255205231Skmacy
256168404Spjdtypedef struct arc_state {
257185029Spjd	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
258185029Spjd	uint64_t arcs_size;	/* total amount of data in this state */
259205231Skmacy	list_t	arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
260205264Skmacy	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
261168404Spjd} arc_state_t;
262168404Spjd
263206796Spjd#define ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))
264205231Skmacy
265185029Spjd/* The 6 states: */
266168404Spjdstatic arc_state_t ARC_anon;
267168404Spjdstatic arc_state_t ARC_mru;
268168404Spjdstatic arc_state_t ARC_mru_ghost;
269168404Spjdstatic arc_state_t ARC_mfu;
270168404Spjdstatic arc_state_t ARC_mfu_ghost;
271185029Spjdstatic arc_state_t ARC_l2c_only;
272168404Spjd
273168404Spjdtypedef struct arc_stats {
274168404Spjd	kstat_named_t arcstat_hits;
275168404Spjd	kstat_named_t arcstat_misses;
276168404Spjd	kstat_named_t arcstat_demand_data_hits;
277168404Spjd	kstat_named_t arcstat_demand_data_misses;
278168404Spjd	kstat_named_t arcstat_demand_metadata_hits;
279168404Spjd	kstat_named_t arcstat_demand_metadata_misses;
280168404Spjd	kstat_named_t arcstat_prefetch_data_hits;
281168404Spjd	kstat_named_t arcstat_prefetch_data_misses;
282168404Spjd	kstat_named_t arcstat_prefetch_metadata_hits;
283168404Spjd	kstat_named_t arcstat_prefetch_metadata_misses;
284168404Spjd	kstat_named_t arcstat_mru_hits;
285168404Spjd	kstat_named_t arcstat_mru_ghost_hits;
286168404Spjd	kstat_named_t arcstat_mfu_hits;
287168404Spjd	kstat_named_t arcstat_mfu_ghost_hits;
288205231Skmacy	kstat_named_t arcstat_allocated;
289168404Spjd	kstat_named_t arcstat_deleted;
290205231Skmacy	kstat_named_t arcstat_stolen;
291168404Spjd	kstat_named_t arcstat_recycle_miss;
292251629Sdelphij	/*
293251629Sdelphij	 * Number of buffers that could not be evicted because the hash lock
294251629Sdelphij	 * was held by another thread.  The lock may not necessarily be held
295251629Sdelphij	 * by something using the same buffer, since hash locks are shared
296251629Sdelphij	 * by multiple buffers.
297251629Sdelphij	 */
298168404Spjd	kstat_named_t arcstat_mutex_miss;
299251629Sdelphij	/*
300251629Sdelphij	 * Number of buffers skipped because they have I/O in progress, are
301251629Sdelphij	 * indirect prefetch buffers that have not lived long enough, or are
302251629Sdelphij	 * not from the spa we're trying to evict from.
303251629Sdelphij	 */
304168404Spjd	kstat_named_t arcstat_evict_skip;
305208373Smm	kstat_named_t arcstat_evict_l2_cached;
306208373Smm	kstat_named_t arcstat_evict_l2_eligible;
307208373Smm	kstat_named_t arcstat_evict_l2_ineligible;
308168404Spjd	kstat_named_t arcstat_hash_elements;
309168404Spjd	kstat_named_t arcstat_hash_elements_max;
310168404Spjd	kstat_named_t arcstat_hash_collisions;
311168404Spjd	kstat_named_t arcstat_hash_chains;
312168404Spjd	kstat_named_t arcstat_hash_chain_max;
313168404Spjd	kstat_named_t arcstat_p;
314168404Spjd	kstat_named_t arcstat_c;
315168404Spjd	kstat_named_t arcstat_c_min;
316168404Spjd	kstat_named_t arcstat_c_max;
317168404Spjd	kstat_named_t arcstat_size;
318185029Spjd	kstat_named_t arcstat_hdr_size;
319208373Smm	kstat_named_t arcstat_data_size;
320208373Smm	kstat_named_t arcstat_other_size;
321185029Spjd	kstat_named_t arcstat_l2_hits;
322185029Spjd	kstat_named_t arcstat_l2_misses;
323185029Spjd	kstat_named_t arcstat_l2_feeds;
324185029Spjd	kstat_named_t arcstat_l2_rw_clash;
325208373Smm	kstat_named_t arcstat_l2_read_bytes;
326208373Smm	kstat_named_t arcstat_l2_write_bytes;
327185029Spjd	kstat_named_t arcstat_l2_writes_sent;
328185029Spjd	kstat_named_t arcstat_l2_writes_done;
329185029Spjd	kstat_named_t arcstat_l2_writes_error;
330185029Spjd	kstat_named_t arcstat_l2_writes_hdr_miss;
331185029Spjd	kstat_named_t arcstat_l2_evict_lock_retry;
332185029Spjd	kstat_named_t arcstat_l2_evict_reading;
333185029Spjd	kstat_named_t arcstat_l2_free_on_write;
334185029Spjd	kstat_named_t arcstat_l2_abort_lowmem;
335185029Spjd	kstat_named_t arcstat_l2_cksum_bad;
336185029Spjd	kstat_named_t arcstat_l2_io_error;
337185029Spjd	kstat_named_t arcstat_l2_size;
338251478Sdelphij	kstat_named_t arcstat_l2_asize;
339185029Spjd	kstat_named_t arcstat_l2_hdr_size;
340251478Sdelphij	kstat_named_t arcstat_l2_compress_successes;
341251478Sdelphij	kstat_named_t arcstat_l2_compress_zeros;
342251478Sdelphij	kstat_named_t arcstat_l2_compress_failures;
343205231Skmacy	kstat_named_t arcstat_l2_write_trylock_fail;
344205231Skmacy	kstat_named_t arcstat_l2_write_passed_headroom;
345205231Skmacy	kstat_named_t arcstat_l2_write_spa_mismatch;
346206796Spjd	kstat_named_t arcstat_l2_write_in_l2;
347205231Skmacy	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
348205231Skmacy	kstat_named_t arcstat_l2_write_not_cacheable;
349205231Skmacy	kstat_named_t arcstat_l2_write_full;
350205231Skmacy	kstat_named_t arcstat_l2_write_buffer_iter;
351205231Skmacy	kstat_named_t arcstat_l2_write_pios;
352205231Skmacy	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
353205231Skmacy	kstat_named_t arcstat_l2_write_buffer_list_iter;
354205231Skmacy	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
355242845Sdelphij	kstat_named_t arcstat_memory_throttle_count;
356242845Sdelphij	kstat_named_t arcstat_duplicate_buffers;
357242845Sdelphij	kstat_named_t arcstat_duplicate_buffers_size;
358242845Sdelphij	kstat_named_t arcstat_duplicate_reads;
359168404Spjd} arc_stats_t;
360168404Spjd
361168404Spjdstatic arc_stats_t arc_stats = {
362168404Spjd	{ "hits",			KSTAT_DATA_UINT64 },
363168404Spjd	{ "misses",			KSTAT_DATA_UINT64 },
364168404Spjd	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
365168404Spjd	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
366168404Spjd	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
367168404Spjd	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
368168404Spjd	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
369168404Spjd	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
370168404Spjd	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
371168404Spjd	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
372168404Spjd	{ "mru_hits",			KSTAT_DATA_UINT64 },
373168404Spjd	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
374168404Spjd	{ "mfu_hits",			KSTAT_DATA_UINT64 },
375168404Spjd	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
376205231Skmacy	{ "allocated",			KSTAT_DATA_UINT64 },
377168404Spjd	{ "deleted",			KSTAT_DATA_UINT64 },
378205231Skmacy	{ "stolen",			KSTAT_DATA_UINT64 },
379168404Spjd	{ "recycle_miss",		KSTAT_DATA_UINT64 },
380168404Spjd	{ "mutex_miss",			KSTAT_DATA_UINT64 },
381168404Spjd	{ "evict_skip",			KSTAT_DATA_UINT64 },
382208373Smm	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
383208373Smm	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
384208373Smm	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
385168404Spjd	{ "hash_elements",		KSTAT_DATA_UINT64 },
386168404Spjd	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
387168404Spjd	{ "hash_collisions",		KSTAT_DATA_UINT64 },
388168404Spjd	{ "hash_chains",		KSTAT_DATA_UINT64 },
389168404Spjd	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
390168404Spjd	{ "p",				KSTAT_DATA_UINT64 },
391168404Spjd	{ "c",				KSTAT_DATA_UINT64 },
392168404Spjd	{ "c_min",			KSTAT_DATA_UINT64 },
393168404Spjd	{ "c_max",			KSTAT_DATA_UINT64 },
394185029Spjd	{ "size",			KSTAT_DATA_UINT64 },
395185029Spjd	{ "hdr_size",			KSTAT_DATA_UINT64 },
396208373Smm	{ "data_size",			KSTAT_DATA_UINT64 },
397208373Smm	{ "other_size",			KSTAT_DATA_UINT64 },
398185029Spjd	{ "l2_hits",			KSTAT_DATA_UINT64 },
399185029Spjd	{ "l2_misses",			KSTAT_DATA_UINT64 },
400185029Spjd	{ "l2_feeds",			KSTAT_DATA_UINT64 },
401185029Spjd	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
402208373Smm	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
403208373Smm	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
404185029Spjd	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
405185029Spjd	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
406185029Spjd	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
407185029Spjd	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
408185029Spjd	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
409185029Spjd	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
410185029Spjd	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
411185029Spjd	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
412185029Spjd	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
413185029Spjd	{ "l2_io_error",		KSTAT_DATA_UINT64 },
414185029Spjd	{ "l2_size",			KSTAT_DATA_UINT64 },
415251478Sdelphij	{ "l2_asize",			KSTAT_DATA_UINT64 },
416185029Spjd	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
417251478Sdelphij	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
418251478Sdelphij	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
419251478Sdelphij	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
420206796Spjd	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
421206796Spjd	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
422206796Spjd	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
423206796Spjd	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
424206796Spjd	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
425206796Spjd	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
426206796Spjd	{ "l2_write_full",		KSTAT_DATA_UINT64 },
427206796Spjd	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
428206796Spjd	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
429206796Spjd	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
430206796Spjd	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
431242845Sdelphij	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
432242845Sdelphij	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
433242845Sdelphij	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
434242845Sdelphij	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
435242845Sdelphij	{ "duplicate_reads",		KSTAT_DATA_UINT64 }
436168404Spjd};
437168404Spjd
438168404Spjd#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
439168404Spjd
440168404Spjd#define	ARCSTAT_INCR(stat, val) \
441168404Spjd	atomic_add_64(&arc_stats.stat.value.ui64, (val));
442168404Spjd
443206796Spjd#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
444168404Spjd#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
445168404Spjd
446168404Spjd#define	ARCSTAT_MAX(stat, val) {					\
447168404Spjd	uint64_t m;							\
448168404Spjd	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
449168404Spjd	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
450168404Spjd		continue;						\
451168404Spjd}
452168404Spjd
453168404Spjd#define	ARCSTAT_MAXSTAT(stat) \
454168404Spjd	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
455168404Spjd
456168404Spjd/*
457168404Spjd * We define a macro to allow ARC hits/misses to be easily broken down by
458168404Spjd * two separate conditions, giving a total of four different subtypes for
459168404Spjd * each of hits and misses (so eight statistics total).
460168404Spjd */
461168404Spjd#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
462168404Spjd	if (cond1) {							\
463168404Spjd		if (cond2) {						\
464168404Spjd			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
465168404Spjd		} else {						\
466168404Spjd			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
467168404Spjd		}							\
468168404Spjd	} else {							\
469168404Spjd		if (cond2) {						\
470168404Spjd			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
471168404Spjd		} else {						\
472168404Spjd			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
473168404Spjd		}							\
474168404Spjd	}
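
/*
 * Hypothetical usage sketch (the "hdr" variable is illustrative):
 * classifying a cache hit as demand vs. prefetch and data vs. metadata
 * in a single invocation:
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), demand, prefetch,
 *	    hdr->b_type != ARC_BUFC_METADATA, data, metadata, hits);
 *
 * which bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
 * arcstat_prefetch_metadata_hits.
 */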
475168404Spjd
476168404Spjdkstat_t			*arc_ksp;
477206796Spjdstatic arc_state_t	*arc_anon;
478168404Spjdstatic arc_state_t	*arc_mru;
479168404Spjdstatic arc_state_t	*arc_mru_ghost;
480168404Spjdstatic arc_state_t	*arc_mfu;
481168404Spjdstatic arc_state_t	*arc_mfu_ghost;
482185029Spjdstatic arc_state_t	*arc_l2c_only;
483168404Spjd
484168404Spjd/*
485168404Spjd * There are several ARC variables that are critical to export as kstats --
486168404Spjd * but we don't want to have to grovel around in the kstat whenever we wish to
487168404Spjd * manipulate them.  For these variables, we therefore define them to be in
488168404Spjd * terms of the statistic variable.  This assures that we are not introducing
489168404Spjd * the possibility of inconsistency by having shadow copies of the variables,
490168404Spjd * while still allowing the code to be readable.
491168404Spjd */
492168404Spjd#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
493168404Spjd#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
494168404Spjd#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
495168404Spjd#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
496168404Spjd#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
497168404Spjd
498251478Sdelphij#define	L2ARC_IS_VALID_COMPRESS(_c_) \
499251478Sdelphij	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
500251478Sdelphij
501168404Spjdstatic int		arc_no_grow;	/* Don't try to grow cache size */
502168404Spjdstatic uint64_t		arc_tempreserve;
503209962Smmstatic uint64_t		arc_loaned_bytes;
504185029Spjdstatic uint64_t		arc_meta_used;
505185029Spjdstatic uint64_t		arc_meta_limit;
506185029Spjdstatic uint64_t		arc_meta_max = 0;
507229663SpjdSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RD, &arc_meta_used, 0,
508229663Spjd    "ARC metadata used");
509229663SpjdSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RW, &arc_meta_limit, 0,
510229663Spjd    "ARC metadata limit");
511168404Spjd
512185029Spjdtypedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
513185029Spjd
514168404Spjdtypedef struct arc_callback arc_callback_t;
515168404Spjd
516168404Spjdstruct arc_callback {
517168404Spjd	void			*acb_private;
518168404Spjd	arc_done_func_t		*acb_done;
519168404Spjd	arc_buf_t		*acb_buf;
520168404Spjd	zio_t			*acb_zio_dummy;
521168404Spjd	arc_callback_t		*acb_next;
522168404Spjd};
523168404Spjd
524168404Spjdtypedef struct arc_write_callback arc_write_callback_t;
525168404Spjd
526168404Spjdstruct arc_write_callback {
527168404Spjd	void		*awcb_private;
528168404Spjd	arc_done_func_t	*awcb_ready;
529168404Spjd	arc_done_func_t	*awcb_done;
530168404Spjd	arc_buf_t	*awcb_buf;
531168404Spjd};
532168404Spjd
533168404Spjdstruct arc_buf_hdr {
534168404Spjd	/* protected by hash lock */
535168404Spjd	dva_t			b_dva;
536168404Spjd	uint64_t		b_birth;
537168404Spjd	uint64_t		b_cksum0;
538168404Spjd
539168404Spjd	kmutex_t		b_freeze_lock;
540168404Spjd	zio_cksum_t		*b_freeze_cksum;
541219089Spjd	void			*b_thawed;
542168404Spjd
543168404Spjd	arc_buf_hdr_t		*b_hash_next;
544168404Spjd	arc_buf_t		*b_buf;
545168404Spjd	uint32_t		b_flags;
546168404Spjd	uint32_t		b_datacnt;
547168404Spjd
548168404Spjd	arc_callback_t		*b_acb;
549168404Spjd	kcondvar_t		b_cv;
550168404Spjd
551168404Spjd	/* immutable */
552168404Spjd	arc_buf_contents_t	b_type;
553168404Spjd	uint64_t		b_size;
554209962Smm	uint64_t		b_spa;
555168404Spjd
556168404Spjd	/* protected by arc state mutex */
557168404Spjd	arc_state_t		*b_state;
558168404Spjd	list_node_t		b_arc_node;
559168404Spjd
560168404Spjd	/* updated atomically */
561168404Spjd	clock_t			b_arc_access;
562168404Spjd
563168404Spjd	/* self protecting */
564168404Spjd	refcount_t		b_refcnt;
565185029Spjd
566185029Spjd	l2arc_buf_hdr_t		*b_l2hdr;
567185029Spjd	list_node_t		b_l2node;
568168404Spjd};
569168404Spjd
570168404Spjdstatic arc_buf_t *arc_eviction_list;
571168404Spjdstatic kmutex_t arc_eviction_mtx;
572168404Spjdstatic arc_buf_hdr_t arc_eviction_hdr;
573168404Spjdstatic void arc_get_data_buf(arc_buf_t *buf);
574168404Spjdstatic void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
575185029Spjdstatic int arc_evict_needed(arc_buf_contents_t type);
576209962Smmstatic void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
577240133Smm#ifdef illumos
578240133Smmstatic void arc_buf_watch(arc_buf_t *buf);
579240133Smm#endif /* illumos */
580168404Spjd
581209962Smmstatic boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
582208373Smm
583168404Spjd#define	GHOST_STATE(state)	\
584185029Spjd	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
585185029Spjd	(state) == arc_l2c_only)
586168404Spjd
587168404Spjd/*
588168404Spjd * Private ARC flags.  These flags are private ARC only flags that will show up
589168404Spjd * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
590168404Spjd * be passed in as arc_flags in things like arc_read.  However, these flags
591168404Spjd * should never be passed and should only be set by ARC code.  When adding new
592168404Spjd * public flags, make sure not to smash the private ones.
593168404Spjd */
594168404Spjd
595168404Spjd#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
596168404Spjd#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
597168404Spjd#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
598168404Spjd#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
599168404Spjd#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
600168404Spjd#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
601185029Spjd#define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
602185029Spjd#define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
603185029Spjd#define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
604185029Spjd#define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */
605168404Spjd
606168404Spjd#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
607168404Spjd#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
608168404Spjd#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
609208373Smm#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_PREFETCH)
610168404Spjd#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
611168404Spjd#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
612185029Spjd#define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
613185029Spjd#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_L2CACHE)
614185029Spjd#define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
615185029Spjd				    (hdr)->b_l2hdr != NULL)
616185029Spjd#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
617185029Spjd#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
618185029Spjd#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)
619168404Spjd
620168404Spjd/*
621185029Spjd * Other sizes
622185029Spjd */
623185029Spjd
624185029Spjd#define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
625185029Spjd#define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
626185029Spjd
627185029Spjd/*
628168404Spjd * Hash table routines
629168404Spjd */
630168404Spjd
631205253Skmacy#define	HT_LOCK_PAD	CACHE_LINE_SIZE
632168404Spjd
633168404Spjdstruct ht_lock {
634168404Spjd	kmutex_t	ht_lock;
635168404Spjd#ifdef _KERNEL
636168404Spjd	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
637168404Spjd#endif
638168404Spjd};
639168404Spjd
640168404Spjd#define	BUF_LOCKS 256
641168404Spjdtypedef struct buf_hash_table {
642168404Spjd	uint64_t ht_mask;
643168404Spjd	arc_buf_hdr_t **ht_table;
644205264Skmacy	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
645168404Spjd} buf_hash_table_t;
646168404Spjd
647168404Spjdstatic buf_hash_table_t buf_hash_table;
648168404Spjd
649168404Spjd#define	BUF_HASH_INDEX(spa, dva, birth) \
650168404Spjd	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
651168404Spjd#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
652168404Spjd#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
653219089Spjd#define	HDR_LOCK(hdr) \
654219089Spjd	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
655168404Spjd
656168404Spjduint64_t zfs_crc64_table[256];
657168404Spjd
658185029Spjd/*
659185029Spjd * Level 2 ARC
660185029Spjd */
661185029Spjd
662208373Smm#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
663251478Sdelphij#define	L2ARC_HEADROOM		2			/* num of writes */
664251478Sdelphij/*
665251478Sdelphij * If we discover during ARC scan any buffers to be compressed, we boost
666251478Sdelphij * our headroom for the next scanning cycle by this percentage multiple.
667251478Sdelphij */
668251478Sdelphij#define	L2ARC_HEADROOM_BOOST	200
669208373Smm#define	L2ARC_FEED_SECS		1		/* caching interval secs */
670208373Smm#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
671185029Spjd
672185029Spjd#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
673185029Spjd#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
674185029Spjd
675185029Spjd/*
676185029Spjd * L2ARC Performance Tunables
677185029Spjd */
678185029Spjduint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
679185029Spjduint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
680185029Spjduint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
681251478Sdelphijuint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
682185029Spjduint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
683208373Smmuint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
684219089Spjdboolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
685208373Smmboolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
686208373Smmboolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
687185029Spjd
688217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
689205231Skmacy    &l2arc_write_max, 0, "max write size");
690217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
691205231Skmacy    &l2arc_write_boost, 0, "extra write during warmup");
692217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
693205231Skmacy    &l2arc_headroom, 0, "number of dev writes");
694217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
695205231Skmacy    &l2arc_feed_secs, 0, "interval seconds");
696217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
697208373Smm    &l2arc_feed_min_ms, 0, "min interval milliseconds");
698205231Skmacy
699205231SkmacySYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
700205231Skmacy    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
701208373SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
702208373Smm    &l2arc_feed_again, 0, "turbo warmup");
703208373SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
704208373Smm    &l2arc_norw, 0, "no reads during writes");
705205231Skmacy
706217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
707205231Skmacy    &ARC_anon.arcs_size, 0, "size of anonymous state");
708217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
709205231Skmacy    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
710217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
711205231Skmacy    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");
712205231Skmacy
713217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
714205231Skmacy    &ARC_mru.arcs_size, 0, "size of mru state");
715217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
716205231Skmacy    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
717217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
718205231Skmacy    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
719205231Skmacy
720217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
721205231Skmacy    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
722217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
723205231Skmacy    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
724205231Skmacy    "size of metadata in mru ghost state");
725217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
726205231Skmacy    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
727205231Skmacy    "size of data in mru ghost state");
728205231Skmacy
729217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
730205231Skmacy    &ARC_mfu.arcs_size, 0, "size of mfu state");
731217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
732205231Skmacy    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
733217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
734205231Skmacy    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
735205231Skmacy
736217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
737205231Skmacy    &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
738217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
739205231Skmacy    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
740205231Skmacy    "size of metadata in mfu ghost state");
741217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
742205231Skmacy    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
743205231Skmacy    "size of data in mfu ghost state");
744205231Skmacy
745217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
746205231Skmacy    &ARC_l2c_only.arcs_size, 0, "size of l2c_only state");
747205231Skmacy
748185029Spjd/*
749185029Spjd * L2ARC Internals
750185029Spjd */
751185029Spjdtypedef struct l2arc_dev {
752185029Spjd	vdev_t			*l2ad_vdev;	/* vdev */
753185029Spjd	spa_t			*l2ad_spa;	/* spa */
754185029Spjd	uint64_t		l2ad_hand;	/* next write location */
755185029Spjd	uint64_t		l2ad_start;	/* first addr on device */
756185029Spjd	uint64_t		l2ad_end;	/* last addr on device */
757185029Spjd	uint64_t		l2ad_evict;	/* last addr eviction reached */
758185029Spjd	boolean_t		l2ad_first;	/* first sweep through */
759208373Smm	boolean_t		l2ad_writing;	/* currently writing */
760185029Spjd	list_t			*l2ad_buflist;	/* buffer list */
761185029Spjd	list_node_t		l2ad_node;	/* device list node */
762185029Spjd} l2arc_dev_t;
763185029Spjd
764185029Spjdstatic list_t L2ARC_dev_list;			/* device list */
765185029Spjdstatic list_t *l2arc_dev_list;			/* device list pointer */
766185029Spjdstatic kmutex_t l2arc_dev_mtx;			/* device list mutex */
767185029Spjdstatic l2arc_dev_t *l2arc_dev_last;		/* last device used */
768185029Spjdstatic kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
769185029Spjdstatic list_t L2ARC_free_on_write;		/* free after write buf list */
770185029Spjdstatic list_t *l2arc_free_on_write;		/* free after write list ptr */
771185029Spjdstatic kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
772185029Spjdstatic uint64_t l2arc_ndev;			/* number of devices */
773185029Spjd
774185029Spjdtypedef struct l2arc_read_callback {
775251478Sdelphij	arc_buf_t		*l2rcb_buf;		/* read buffer */
776251478Sdelphij	spa_t			*l2rcb_spa;		/* spa */
777251478Sdelphij	blkptr_t		l2rcb_bp;		/* original blkptr */
778251478Sdelphij	zbookmark_t		l2rcb_zb;		/* original bookmark */
779251478Sdelphij	int			l2rcb_flags;		/* original flags */
780251478Sdelphij	enum zio_compress	l2rcb_compress;		/* applied compress */
781185029Spjd} l2arc_read_callback_t;
782185029Spjd
783185029Spjdtypedef struct l2arc_write_callback {
784185029Spjd	l2arc_dev_t	*l2wcb_dev;		/* device info */
785185029Spjd	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
786185029Spjd} l2arc_write_callback_t;
787185029Spjd
788185029Spjdstruct l2arc_buf_hdr {
789185029Spjd	/* protected by arc_buf_hdr  mutex */
790251478Sdelphij	l2arc_dev_t		*b_dev;		/* L2ARC device */
791251478Sdelphij	uint64_t		b_daddr;	/* disk address, offset byte */
792251478Sdelphij	/* compression applied to buffer data */
793251478Sdelphij	enum zio_compress	b_compress;
794251478Sdelphij	/* real alloc'd buffer size depending on b_compress applied */
795251478Sdelphij	int			b_asize;
796251478Sdelphij	/* temporary buffer holder for in-flight compressed data */
797251478Sdelphij	void			*b_tmp_cdata;
798185029Spjd};
799185029Spjd
800185029Spjdtypedef struct l2arc_data_free {
801185029Spjd	/* protected by l2arc_free_on_write_mtx */
802185029Spjd	void		*l2df_data;
803185029Spjd	size_t		l2df_size;
804185029Spjd	void		(*l2df_func)(void *, size_t);
805185029Spjd	list_node_t	l2df_list_node;
806185029Spjd} l2arc_data_free_t;
807185029Spjd
808185029Spjdstatic kmutex_t l2arc_feed_thr_lock;
809185029Spjdstatic kcondvar_t l2arc_feed_thr_cv;
810185029Spjdstatic uint8_t l2arc_thread_exit;
811185029Spjd
812185029Spjdstatic void l2arc_read_done(zio_t *zio);
813185029Spjdstatic void l2arc_hdr_stat_add(void);
814185029Spjdstatic void l2arc_hdr_stat_remove(void);
815185029Spjd
816251478Sdelphijstatic boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
817251478Sdelphijstatic void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
818251478Sdelphij    enum zio_compress c);
819251478Sdelphijstatic void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
820251478Sdelphij
821168404Spjdstatic uint64_t
822209962Smmbuf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
823168404Spjd{
824168404Spjd	uint8_t *vdva = (uint8_t *)dva;
825168404Spjd	uint64_t crc = -1ULL;
826168404Spjd	int i;
827168404Spjd
828168404Spjd	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
829168404Spjd
830168404Spjd	for (i = 0; i < sizeof (dva_t); i++)
831168404Spjd		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
832168404Spjd
833209962Smm	crc ^= (spa>>8) ^ birth;
834168404Spjd
835168404Spjd	return (crc);
836168404Spjd}
837168404Spjd
838168404Spjd#define	BUF_EMPTY(buf)						\
839168404Spjd	((buf)->b_dva.dva_word[0] == 0 &&			\
840168404Spjd	(buf)->b_dva.dva_word[1] == 0 &&			\
841168404Spjd	(buf)->b_birth == 0)
842168404Spjd
843168404Spjd#define	BUF_EQUAL(spa, dva, birth, buf)				\
844168404Spjd	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
845168404Spjd	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
846168404Spjd	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
847168404Spjd
848219089Spjdstatic void
849219089Spjdbuf_discard_identity(arc_buf_hdr_t *hdr)
850219089Spjd{
851219089Spjd	hdr->b_dva.dva_word[0] = 0;
852219089Spjd	hdr->b_dva.dva_word[1] = 0;
853219089Spjd	hdr->b_birth = 0;
854219089Spjd	hdr->b_cksum0 = 0;
855219089Spjd}
856219089Spjd
857168404Spjdstatic arc_buf_hdr_t *
858209962Smmbuf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
859168404Spjd{
860168404Spjd	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
861168404Spjd	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
862168404Spjd	arc_buf_hdr_t *buf;
863168404Spjd
864168404Spjd	mutex_enter(hash_lock);
865168404Spjd	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
866168404Spjd	    buf = buf->b_hash_next) {
867168404Spjd		if (BUF_EQUAL(spa, dva, birth, buf)) {
868168404Spjd			*lockp = hash_lock;
869168404Spjd			return (buf);
870168404Spjd		}
871168404Spjd	}
872168404Spjd	mutex_exit(hash_lock);
873168404Spjd	*lockp = NULL;
874168404Spjd	return (NULL);
875168404Spjd}
876168404Spjd
877168404Spjd/*
878168404Spjd * Insert an entry into the hash table.  If there is already an element
879168404Spjd * equal to elem in the hash table, then the already existing element
880168404Spjd * will be returned and the new element will not be inserted.
881168404Spjd * Otherwise returns NULL.
882168404Spjd */
883168404Spjdstatic arc_buf_hdr_t *
884168404Spjdbuf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
885168404Spjd{
886168404Spjd	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
887168404Spjd	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
888168404Spjd	arc_buf_hdr_t *fbuf;
889168404Spjd	uint32_t i;
890168404Spjd
891168404Spjd	ASSERT(!HDR_IN_HASH_TABLE(buf));
892168404Spjd	*lockp = hash_lock;
893168404Spjd	mutex_enter(hash_lock);
894168404Spjd	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
895168404Spjd	    fbuf = fbuf->b_hash_next, i++) {
896168404Spjd		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
897168404Spjd			return (fbuf);
898168404Spjd	}
899168404Spjd
900168404Spjd	buf->b_hash_next = buf_hash_table.ht_table[idx];
901168404Spjd	buf_hash_table.ht_table[idx] = buf;
902168404Spjd	buf->b_flags |= ARC_IN_HASH_TABLE;
903168404Spjd
904168404Spjd	/* collect some hash table performance data */
905168404Spjd	if (i > 0) {
906168404Spjd		ARCSTAT_BUMP(arcstat_hash_collisions);
907168404Spjd		if (i == 1)
908168404Spjd			ARCSTAT_BUMP(arcstat_hash_chains);
909168404Spjd
910168404Spjd		ARCSTAT_MAX(arcstat_hash_chain_max, i);
911168404Spjd	}
912168404Spjd
913168404Spjd	ARCSTAT_BUMP(arcstat_hash_elements);
914168404Spjd	ARCSTAT_MAXSTAT(arcstat_hash_elements);
915168404Spjd
916168404Spjd	return (NULL);
917168404Spjd}
918168404Spjd
919168404Spjdstatic void
920168404Spjdbuf_hash_remove(arc_buf_hdr_t *buf)
921168404Spjd{
922168404Spjd	arc_buf_hdr_t *fbuf, **bufp;
923168404Spjd	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
924168404Spjd
925168404Spjd	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
926168404Spjd	ASSERT(HDR_IN_HASH_TABLE(buf));
927168404Spjd
928168404Spjd	bufp = &buf_hash_table.ht_table[idx];
929168404Spjd	while ((fbuf = *bufp) != buf) {
930168404Spjd		ASSERT(fbuf != NULL);
931168404Spjd		bufp = &fbuf->b_hash_next;
932168404Spjd	}
933168404Spjd	*bufp = buf->b_hash_next;
934168404Spjd	buf->b_hash_next = NULL;
935168404Spjd	buf->b_flags &= ~ARC_IN_HASH_TABLE;
936168404Spjd
937168404Spjd	/* collect some hash table performance data */
938168404Spjd	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
939168404Spjd
940168404Spjd	if (buf_hash_table.ht_table[idx] &&
941168404Spjd	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
942168404Spjd		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
943168404Spjd}
944168404Spjd
945168404Spjd/*
946168404Spjd * Global data structures and functions for the buf kmem cache.
947168404Spjd */
948168404Spjdstatic kmem_cache_t *hdr_cache;
949168404Spjdstatic kmem_cache_t *buf_cache;
950168404Spjd
951168404Spjdstatic void
952168404Spjdbuf_fini(void)
953168404Spjd{
954168404Spjd	int i;
955168404Spjd
956168404Spjd	kmem_free(buf_hash_table.ht_table,
957168404Spjd	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
958168404Spjd	for (i = 0; i < BUF_LOCKS; i++)
959168404Spjd		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
960168404Spjd	kmem_cache_destroy(hdr_cache);
961168404Spjd	kmem_cache_destroy(buf_cache);
962168404Spjd}
963168404Spjd
964168404Spjd/*
965168404Spjd * Constructor callback - called when the cache is empty
966168404Spjd * and a new buf is requested.
967168404Spjd */
968168404Spjd/* ARGSUSED */
969168404Spjdstatic int
970168404Spjdhdr_cons(void *vbuf, void *unused, int kmflag)
971168404Spjd{
972168404Spjd	arc_buf_hdr_t *buf = vbuf;
973168404Spjd
974168404Spjd	bzero(buf, sizeof (arc_buf_hdr_t));
975168404Spjd	refcount_create(&buf->b_refcnt);
976168404Spjd	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
977185029Spjd	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
978208373Smm	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
979185029Spjd
980168404Spjd	return (0);
981168404Spjd}
982168404Spjd
983185029Spjd/* ARGSUSED */
984185029Spjdstatic int
985185029Spjdbuf_cons(void *vbuf, void *unused, int kmflag)
986185029Spjd{
987185029Spjd	arc_buf_t *buf = vbuf;
988185029Spjd
989185029Spjd	bzero(buf, sizeof (arc_buf_t));
990219089Spjd	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
991208373Smm	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
992208373Smm
993185029Spjd	return (0);
994185029Spjd}
995185029Spjd
996168404Spjd/*
997168404Spjd * Destructor callback - called when a cached buf is
998168404Spjd * no longer required.
999168404Spjd */
1000168404Spjd/* ARGSUSED */
1001168404Spjdstatic void
1002168404Spjdhdr_dest(void *vbuf, void *unused)
1003168404Spjd{
1004168404Spjd	arc_buf_hdr_t *buf = vbuf;
1005168404Spjd
1006219089Spjd	ASSERT(BUF_EMPTY(buf));
1007168404Spjd	refcount_destroy(&buf->b_refcnt);
1008168404Spjd	cv_destroy(&buf->b_cv);
1009185029Spjd	mutex_destroy(&buf->b_freeze_lock);
1010208373Smm	arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1011168404Spjd}
1012168404Spjd
1013185029Spjd/* ARGSUSED */
1014185029Spjdstatic void
1015185029Spjdbuf_dest(void *vbuf, void *unused)
1016185029Spjd{
1017185029Spjd	arc_buf_t *buf = vbuf;
1018185029Spjd
1019219089Spjd	mutex_destroy(&buf->b_evict_lock);
1020208373Smm	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1021185029Spjd}
1022185029Spjd
1023168404Spjd/*
1024168404Spjd * Reclaim callback -- invoked when memory is low.
1025168404Spjd */
1026168404Spjd/* ARGSUSED */
1027168404Spjdstatic void
1028168404Spjdhdr_recl(void *unused)
1029168404Spjd{
1030168404Spjd	dprintf("hdr_recl called\n");
1031168404Spjd	/*
1032168404Spjd	 * umem calls the reclaim func when we destroy the buf cache,
1033168404Spjd	 * which is after we do arc_fini().
1034168404Spjd	 */
1035168404Spjd	if (!arc_dead)
1036168404Spjd		cv_signal(&arc_reclaim_thr_cv);
1037168404Spjd}
1038168404Spjd
1039168404Spjdstatic void
1040168404Spjdbuf_init(void)
1041168404Spjd{
1042168404Spjd	uint64_t *ct;
1043168404Spjd	uint64_t hsize = 1ULL << 12;
1044168404Spjd	int i, j;
1045168404Spjd
1046168404Spjd	/*
1047168404Spjd	 * The hash table is big enough to fill all of physical memory
1048168404Spjd	 * with an average 64K block size.  The table will take up
1049168404Spjd	 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
1050168404Spjd	 */
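	/*
	 * Worked example (illustrative arithmetic only): with 16 GB of
	 * physical memory, hsize doubles from 2^12 until hsize * 65536 is
	 * no longer below 2^34, i.e. hsize = 2^18 buckets, which is a 2 MB
	 * table of 8-byte pointers.
	 */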
1051168696Spjd	while (hsize * 65536 < (uint64_t)physmem * PAGESIZE)
1052168404Spjd		hsize <<= 1;
1053168404Spjdretry:
1054168404Spjd	buf_hash_table.ht_mask = hsize - 1;
1055168404Spjd	buf_hash_table.ht_table =
1056168404Spjd	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1057168404Spjd	if (buf_hash_table.ht_table == NULL) {
1058168404Spjd		ASSERT(hsize > (1ULL << 8));
1059168404Spjd		hsize >>= 1;
1060168404Spjd		goto retry;
1061168404Spjd	}
1062168404Spjd
1063168404Spjd	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
1064168404Spjd	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
1065168404Spjd	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1066185029Spjd	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1067168404Spjd
1068168404Spjd	for (i = 0; i < 256; i++)
1069168404Spjd		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1070168404Spjd			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1071168404Spjd
1072168404Spjd	for (i = 0; i < BUF_LOCKS; i++) {
1073168404Spjd		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1074168404Spjd		    NULL, MUTEX_DEFAULT, NULL);
1075168404Spjd	}
1076168404Spjd}
1077168404Spjd
1078168404Spjd#define	ARC_MINTIME	(hz>>4) /* 62 ms */
1079168404Spjd
1080168404Spjdstatic void
1081168404Spjdarc_cksum_verify(arc_buf_t *buf)
1082168404Spjd{
1083168404Spjd	zio_cksum_t zc;
1084168404Spjd
1085168404Spjd	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1086168404Spjd		return;
1087168404Spjd
1088168404Spjd	mutex_enter(&buf->b_hdr->b_freeze_lock);
1089168404Spjd	if (buf->b_hdr->b_freeze_cksum == NULL ||
1090168404Spjd	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
1091168404Spjd		mutex_exit(&buf->b_hdr->b_freeze_lock);
1092168404Spjd		return;
1093168404Spjd	}
1094168404Spjd	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1095168404Spjd	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1096168404Spjd		panic("buffer modified while frozen!");
1097168404Spjd	mutex_exit(&buf->b_hdr->b_freeze_lock);
1098168404Spjd}
1099168404Spjd
1100185029Spjdstatic int
1101185029Spjdarc_cksum_equal(arc_buf_t *buf)
1102185029Spjd{
1103185029Spjd	zio_cksum_t zc;
1104185029Spjd	int equal;
1105185029Spjd
1106185029Spjd	mutex_enter(&buf->b_hdr->b_freeze_lock);
1107185029Spjd	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1108185029Spjd	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1109185029Spjd	mutex_exit(&buf->b_hdr->b_freeze_lock);
1110185029Spjd
1111185029Spjd	return (equal);
1112185029Spjd}
1113185029Spjd
1114168404Spjdstatic void
1115185029Spjdarc_cksum_compute(arc_buf_t *buf, boolean_t force)
1116168404Spjd{
1117185029Spjd	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1118168404Spjd		return;
1119168404Spjd
1120168404Spjd	mutex_enter(&buf->b_hdr->b_freeze_lock);
1121168404Spjd	if (buf->b_hdr->b_freeze_cksum != NULL) {
1122168404Spjd		mutex_exit(&buf->b_hdr->b_freeze_lock);
1123168404Spjd		return;
1124168404Spjd	}
1125168404Spjd	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1126168404Spjd	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1127168404Spjd	    buf->b_hdr->b_freeze_cksum);
1128168404Spjd	mutex_exit(&buf->b_hdr->b_freeze_lock);
1129240133Smm#ifdef illumos
1130240133Smm	arc_buf_watch(buf);
1131240133Smm#endif /* illumos */
1132168404Spjd}
1133168404Spjd
1134240133Smm#ifdef illumos
1135240133Smm#ifndef _KERNEL
1136240133Smmtypedef struct procctl {
1137240133Smm	long cmd;
1138240133Smm	prwatch_t prwatch;
1139240133Smm} procctl_t;
1140240133Smm#endif
1141240133Smm
1142240133Smm/* ARGSUSED */
1143240133Smmstatic void
1144240133Smmarc_buf_unwatch(arc_buf_t *buf)
1145240133Smm{
1146240133Smm#ifndef _KERNEL
1147240133Smm	if (arc_watch) {
1148240133Smm		int result;
1149240133Smm		procctl_t ctl;
1150240133Smm		ctl.cmd = PCWATCH;
1151240133Smm		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1152240133Smm		ctl.prwatch.pr_size = 0;
1153240133Smm		ctl.prwatch.pr_wflags = 0;
1154240133Smm		result = write(arc_procfd, &ctl, sizeof (ctl));
1155240133Smm		ASSERT3U(result, ==, sizeof (ctl));
1156240133Smm	}
1157240133Smm#endif
1158240133Smm}
1159240133Smm
1160240133Smm/* ARGSUSED */
1161240133Smmstatic void
1162240133Smmarc_buf_watch(arc_buf_t *buf)
1163240133Smm{
1164240133Smm#ifndef _KERNEL
1165240133Smm	if (arc_watch) {
1166240133Smm		int result;
1167240133Smm		procctl_t ctl;
1168240133Smm		ctl.cmd = PCWATCH;
1169240133Smm		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1170240133Smm		ctl.prwatch.pr_size = buf->b_hdr->b_size;
1171240133Smm		ctl.prwatch.pr_wflags = WA_WRITE;
1172240133Smm		result = write(arc_procfd, &ctl, sizeof (ctl));
1173240133Smm		ASSERT3U(result, ==, sizeof (ctl));
1174240133Smm	}
1175240133Smm#endif
1176240133Smm}
1177240133Smm#endif /* illumos */
1178240133Smm
1179168404Spjdvoid
1180168404Spjdarc_buf_thaw(arc_buf_t *buf)
1181168404Spjd{
1182185029Spjd	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1183185029Spjd		if (buf->b_hdr->b_state != arc_anon)
1184185029Spjd			panic("modifying non-anon buffer!");
1185185029Spjd		if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1186185029Spjd			panic("modifying buffer while i/o in progress!");
1187185029Spjd		arc_cksum_verify(buf);
1188185029Spjd	}
1189168404Spjd
1190168404Spjd	mutex_enter(&buf->b_hdr->b_freeze_lock);
1191168404Spjd	if (buf->b_hdr->b_freeze_cksum != NULL) {
1192168404Spjd		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1193168404Spjd		buf->b_hdr->b_freeze_cksum = NULL;
1194168404Spjd	}
1195219089Spjd
1196219089Spjd	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1197219089Spjd		if (buf->b_hdr->b_thawed)
1198219089Spjd			kmem_free(buf->b_hdr->b_thawed, 1);
1199219089Spjd		buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1200219089Spjd	}
1201219089Spjd
1202168404Spjd	mutex_exit(&buf->b_hdr->b_freeze_lock);
1203240133Smm
1204240133Smm#ifdef illumos
1205240133Smm	arc_buf_unwatch(buf);
1206240133Smm#endif /* illumos */
1207168404Spjd}
1208168404Spjd
1209168404Spjdvoid
1210168404Spjdarc_buf_freeze(arc_buf_t *buf)
1211168404Spjd{
1212219089Spjd	kmutex_t *hash_lock;
1213219089Spjd
1214168404Spjd	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1215168404Spjd		return;
1216168404Spjd
1217219089Spjd	hash_lock = HDR_LOCK(buf->b_hdr);
1218219089Spjd	mutex_enter(hash_lock);
1219219089Spjd
1220168404Spjd	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1221168404Spjd	    buf->b_hdr->b_state == arc_anon);
1222185029Spjd	arc_cksum_compute(buf, B_FALSE);
1223219089Spjd	mutex_exit(hash_lock);
1224240133Smm
1225168404Spjd}
1226168404Spjd
1227168404Spjdstatic void
1228205231Skmacyget_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock)
1229205231Skmacy{
1230205231Skmacy	uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth);
1231205231Skmacy
1232206796Spjd	if (ab->b_type == ARC_BUFC_METADATA)
1233206796Spjd		buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
1234205231Skmacy	else {
1235206796Spjd		buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
1236205231Skmacy		buf_hashid += ARC_BUFC_NUMMETADATALISTS;
1237205231Skmacy	}
1238205231Skmacy
1239205231Skmacy	*list = &state->arcs_lists[buf_hashid];
1240205231Skmacy	*lock = ARCS_LOCK(state, buf_hashid);
1241205231Skmacy}
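/*
 * Worked example of the list selection in get_buf_info() above.  The
 * constants here are illustrative; the real ARC_BUFC_NUM*LISTS values are
 * defined earlier in this file and must be powers of two for the masking
 * to work:
 *
 *	Assume ARC_BUFC_NUMMETADATALISTS == 16, ARC_BUFC_NUMDATALISTS == 16
 *	and buf_hash() yields 0x2b (43) for a given header.
 *
 *	metadata header:  43 & 15 == 11		-> state->arcs_lists[11]
 *	data header:      (43 & 15) + 16 == 27	-> state->arcs_lists[27]
 *
 * Hashing headers across several lists, each with its own ARCS_LOCK(),
 * spreads eviction-list contention across CPUs on large SMP machines.
 */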
1242205231Skmacy
1243205231Skmacy
1244205231Skmacystatic void
1245168404Spjdadd_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1246168404Spjd{
1247168404Spjd	ASSERT(MUTEX_HELD(hash_lock));
1248168404Spjd
1249168404Spjd	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1250168404Spjd	    (ab->b_state != arc_anon)) {
1251206796Spjd		uint64_t delta = ab->b_size * ab->b_datacnt;
1252206796Spjd		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1253205231Skmacy		list_t *list;
1254205231Skmacy		kmutex_t *lock;
1255168404Spjd
1256205231Skmacy		get_buf_info(ab, ab->b_state, &list, &lock);
1257205231Skmacy		ASSERT(!MUTEX_HELD(lock));
1258205231Skmacy		mutex_enter(lock);
1259168404Spjd		ASSERT(list_link_active(&ab->b_arc_node));
1260185029Spjd		list_remove(list, ab);
1261168404Spjd		if (GHOST_STATE(ab->b_state)) {
1262240415Smm			ASSERT0(ab->b_datacnt);
1263168404Spjd			ASSERT3P(ab->b_buf, ==, NULL);
1264168404Spjd			delta = ab->b_size;
1265168404Spjd		}
1266168404Spjd		ASSERT(delta > 0);
1267185029Spjd		ASSERT3U(*size, >=, delta);
1268185029Spjd		atomic_add_64(size, -delta);
1269206794Spjd		mutex_exit(lock);
1270185029Spjd		/* remove the prefetch flag if we get a reference */
1271168404Spjd		if (ab->b_flags & ARC_PREFETCH)
1272168404Spjd			ab->b_flags &= ~ARC_PREFETCH;
1273168404Spjd	}
1274168404Spjd}
1275168404Spjd
1276168404Spjdstatic int
1277168404Spjdremove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1278168404Spjd{
1279168404Spjd	int cnt;
1280168404Spjd	arc_state_t *state = ab->b_state;
1281168404Spjd
1282168404Spjd	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1283168404Spjd	ASSERT(!GHOST_STATE(state));
1284168404Spjd
1285168404Spjd	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1286168404Spjd	    (state != arc_anon)) {
1287185029Spjd		uint64_t *size = &state->arcs_lsize[ab->b_type];
1288205231Skmacy		list_t *list;
1289205231Skmacy		kmutex_t *lock;
1290185029Spjd
1291205231Skmacy		get_buf_info(ab, state, &list, &lock);
1292205231Skmacy		ASSERT(!MUTEX_HELD(lock));
1293205231Skmacy		mutex_enter(lock);
1294168404Spjd		ASSERT(!list_link_active(&ab->b_arc_node));
1295205231Skmacy		list_insert_head(list, ab);
1296168404Spjd		ASSERT(ab->b_datacnt > 0);
1297185029Spjd		atomic_add_64(size, ab->b_size * ab->b_datacnt);
1298206794Spjd		mutex_exit(lock);
1299168404Spjd	}
1300168404Spjd	return (cnt);
1301168404Spjd}
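/*
 * Hedged usage sketch for add_reference()/remove_reference() above; the
 * hdr, hash_lock and tag names are hypothetical.  Holding a reference
 * pulls the header off its state's eviction list (and out of arcs_lsize),
 * making it unevictable; dropping the last reference puts it back:
 *
 *	mutex_enter(hash_lock);
 *	add_reference(hdr, hash_lock, tag);		// now unevictable
 *	mutex_exit(hash_lock);
 *	...the buffer data cannot be evicted while the reference is held...
 *	mutex_enter(hash_lock);
 *	(void) remove_reference(hdr, hash_lock, tag);	// evictable again
 *	mutex_exit(hash_lock);
 */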
1302168404Spjd
1303168404Spjd/*
1304168404Spjd * Move the supplied buffer to the indicated state.  The mutex
1305168404Spjd * for the buffer must be held by the caller.
1306168404Spjd */
1307168404Spjdstatic void
1308168404Spjdarc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1309168404Spjd{
1310168404Spjd	arc_state_t *old_state = ab->b_state;
1311168404Spjd	int64_t refcnt = refcount_count(&ab->b_refcnt);
1312168404Spjd	uint64_t from_delta, to_delta;
1313205231Skmacy	list_t *list;
1314205231Skmacy	kmutex_t *lock;
1315168404Spjd
1316168404Spjd	ASSERT(MUTEX_HELD(hash_lock));
1317168404Spjd	ASSERT(new_state != old_state);
1318168404Spjd	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1319168404Spjd	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1320219089Spjd	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1321168404Spjd
1322168404Spjd	from_delta = to_delta = ab->b_datacnt * ab->b_size;
1323168404Spjd
1324168404Spjd	/*
1325168404Spjd	 * If this buffer is evictable, transfer it from the
1326168404Spjd	 * old state list to the new state list.
1327168404Spjd	 */
1328168404Spjd	if (refcnt == 0) {
1329168404Spjd		if (old_state != arc_anon) {
1330205231Skmacy			int use_mutex;
1331185029Spjd			uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1332168404Spjd
1333205231Skmacy			get_buf_info(ab, old_state, &list, &lock);
1334205231Skmacy			use_mutex = !MUTEX_HELD(lock);
1335168404Spjd			if (use_mutex)
1336205231Skmacy				mutex_enter(lock);
1337168404Spjd
1338168404Spjd			ASSERT(list_link_active(&ab->b_arc_node));
1339205231Skmacy			list_remove(list, ab);
1340168404Spjd
1341168404Spjd			/*
1342168404Spjd			 * If prefetching out of the ghost cache,
1343219089Spjd			 * we will have a non-zero datacnt.
1344168404Spjd			 */
1345168404Spjd			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1346168404Spjd				/* ghost elements have a ghost size */
1347168404Spjd				ASSERT(ab->b_buf == NULL);
1348168404Spjd				from_delta = ab->b_size;
1349168404Spjd			}
1350185029Spjd			ASSERT3U(*size, >=, from_delta);
1351185029Spjd			atomic_add_64(size, -from_delta);
1352168404Spjd
1353168404Spjd			if (use_mutex)
1354205231Skmacy				mutex_exit(lock);
1355168404Spjd		}
1356168404Spjd		if (new_state != arc_anon) {
1357206796Spjd			int use_mutex;
1358185029Spjd			uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1359168404Spjd
1360205231Skmacy			get_buf_info(ab, new_state, &list, &lock);
1361205231Skmacy			use_mutex = !MUTEX_HELD(lock);
1362168404Spjd			if (use_mutex)
1363205231Skmacy				mutex_enter(lock);
1364168404Spjd
1365205231Skmacy			list_insert_head(list, ab);
1366168404Spjd
1367168404Spjd			/* ghost elements have a ghost size */
1368168404Spjd			if (GHOST_STATE(new_state)) {
1369168404Spjd				ASSERT(ab->b_datacnt == 0);
1370168404Spjd				ASSERT(ab->b_buf == NULL);
1371168404Spjd				to_delta = ab->b_size;
1372168404Spjd			}
1373185029Spjd			atomic_add_64(size, to_delta);
1374168404Spjd
1375168404Spjd			if (use_mutex)
1376205231Skmacy				mutex_exit(lock);
1377168404Spjd		}
1378168404Spjd	}
1379168404Spjd
1380168404Spjd	ASSERT(!BUF_EMPTY(ab));
1381219089Spjd	if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1382168404Spjd		buf_hash_remove(ab);
1383168404Spjd
1384168404Spjd	/* adjust state sizes */
1385168404Spjd	if (to_delta)
1386168404Spjd		atomic_add_64(&new_state->arcs_size, to_delta);
1387168404Spjd	if (from_delta) {
1388168404Spjd		ASSERT3U(old_state->arcs_size, >=, from_delta);
1389168404Spjd		atomic_add_64(&old_state->arcs_size, -from_delta);
1390168404Spjd	}
1391168404Spjd	ab->b_state = new_state;
1392185029Spjd
1393185029Spjd	/* adjust l2arc hdr stats */
1394185029Spjd	if (new_state == arc_l2c_only)
1395185029Spjd		l2arc_hdr_stat_add();
1396185029Spjd	else if (old_state == arc_l2c_only)
1397185029Spjd		l2arc_hdr_stat_remove();
1398168404Spjd}
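/*
 * Worked example of the accounting in arc_change_state().  Assume a header
 * with b_size == 128K and b_datacnt == 2 (two buffers sharing the header)
 * moving from arc_mru to arc_mfu with no external references held:
 *
 *	from_delta = to_delta = 2 * 128K = 256K
 *	arc_mru->arcs_lsize[type] -= 256K;  arc_mru->arcs_size -= 256K
 *	arc_mfu->arcs_lsize[type] += 256K;  arc_mfu->arcs_size += 256K
 *
 * If the destination were a ghost state instead, b_datacnt must be zero and
 * only the nominal b_size (128K) is charged, since ghost headers keep no
 * data.  These numbers are illustrative only.
 */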
1399168404Spjd
1400185029Spjdvoid
1401208373Smmarc_space_consume(uint64_t space, arc_space_type_t type)
1402185029Spjd{
1403208373Smm	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1404208373Smm
1405208373Smm	switch (type) {
1406208373Smm	case ARC_SPACE_DATA:
1407208373Smm		ARCSTAT_INCR(arcstat_data_size, space);
1408208373Smm		break;
1409208373Smm	case ARC_SPACE_OTHER:
1410208373Smm		ARCSTAT_INCR(arcstat_other_size, space);
1411208373Smm		break;
1412208373Smm	case ARC_SPACE_HDRS:
1413208373Smm		ARCSTAT_INCR(arcstat_hdr_size, space);
1414208373Smm		break;
1415208373Smm	case ARC_SPACE_L2HDRS:
1416208373Smm		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1417208373Smm		break;
1418208373Smm	}
1419208373Smm
1420185029Spjd	atomic_add_64(&arc_meta_used, space);
1421185029Spjd	atomic_add_64(&arc_size, space);
1422185029Spjd}
1423185029Spjd
1424185029Spjdvoid
1425208373Smmarc_space_return(uint64_t space, arc_space_type_t type)
1426185029Spjd{
1427208373Smm	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1428208373Smm
1429208373Smm	switch (type) {
1430208373Smm	case ARC_SPACE_DATA:
1431208373Smm		ARCSTAT_INCR(arcstat_data_size, -space);
1432208373Smm		break;
1433208373Smm	case ARC_SPACE_OTHER:
1434208373Smm		ARCSTAT_INCR(arcstat_other_size, -space);
1435208373Smm		break;
1436208373Smm	case ARC_SPACE_HDRS:
1437208373Smm		ARCSTAT_INCR(arcstat_hdr_size, -space);
1438208373Smm		break;
1439208373Smm	case ARC_SPACE_L2HDRS:
1440208373Smm		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1441208373Smm		break;
1442208373Smm	}
1443208373Smm
1444185029Spjd	ASSERT(arc_meta_used >= space);
1445185029Spjd	if (arc_meta_max < arc_meta_used)
1446185029Spjd		arc_meta_max = arc_meta_used;
1447185029Spjd	atomic_add_64(&arc_meta_used, -space);
1448185029Spjd	ASSERT(arc_size >= space);
1449185029Spjd	atomic_add_64(&arc_size, -space);
1450185029Spjd}
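/*
 * Hedged sketch of how callers outside this file pair the two functions
 * above to account metadata allocated on behalf of the ARC.  The dbuf
 * structure is only an example of such a consumer:
 *
 *	db = kmem_zalloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
 *	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
 *	...
 *	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
 *	kmem_free(db, sizeof (dmu_buf_impl_t));
 *
 * Every byte charged with arc_space_consume() must eventually be returned
 * with the same arc_space_type_t, or arc_meta_used and arc_size will drift.
 */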
1451185029Spjd
1452185029Spjdvoid *
1453185029Spjdarc_data_buf_alloc(uint64_t size)
1454185029Spjd{
1455185029Spjd	if (arc_evict_needed(ARC_BUFC_DATA))
1456185029Spjd		cv_signal(&arc_reclaim_thr_cv);
1457185029Spjd	atomic_add_64(&arc_size, size);
1458185029Spjd	return (zio_data_buf_alloc(size));
1459185029Spjd}
1460185029Spjd
1461185029Spjdvoid
1462185029Spjdarc_data_buf_free(void *buf, uint64_t size)
1463185029Spjd{
1464185029Spjd	zio_data_buf_free(buf, size);
1465185029Spjd	ASSERT(arc_size >= size);
1466185029Spjd	atomic_add_64(&arc_size, -size);
1467185029Spjd}
1468185029Spjd
1469168404Spjdarc_buf_t *
1470168404Spjdarc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1471168404Spjd{
1472168404Spjd	arc_buf_hdr_t *hdr;
1473168404Spjd	arc_buf_t *buf;
1474168404Spjd
1475168404Spjd	ASSERT3U(size, >, 0);
1476185029Spjd	hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1477168404Spjd	ASSERT(BUF_EMPTY(hdr));
1478168404Spjd	hdr->b_size = size;
1479168404Spjd	hdr->b_type = type;
1480228103Smm	hdr->b_spa = spa_load_guid(spa);
1481168404Spjd	hdr->b_state = arc_anon;
1482168404Spjd	hdr->b_arc_access = 0;
1483185029Spjd	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1484168404Spjd	buf->b_hdr = hdr;
1485168404Spjd	buf->b_data = NULL;
1486168404Spjd	buf->b_efunc = NULL;
1487168404Spjd	buf->b_private = NULL;
1488168404Spjd	buf->b_next = NULL;
1489168404Spjd	hdr->b_buf = buf;
1490168404Spjd	arc_get_data_buf(buf);
1491168404Spjd	hdr->b_datacnt = 1;
1492168404Spjd	hdr->b_flags = 0;
1493168404Spjd	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1494168404Spjd	(void) refcount_add(&hdr->b_refcnt, tag);
1495168404Spjd
1496168404Spjd	return (buf);
1497168404Spjd}
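/*
 * Minimal usage sketch for arc_buf_alloc(); the size and tag below are
 * hypothetical.  The tag passed at allocation is the one that must later
 * drop the reference:
 *
 *	arc_buf_t *buf = arc_buf_alloc(spa, 16384, FTAG, ARC_BUFC_DATA);
 *	bzero(buf->b_data, 16384);
 *	...
 *	(void) arc_buf_remove_ref(buf, FTAG);	// or arc_buf_free(buf, FTAG)
 *
 * The new buffer starts out anonymous (arc_anon) with a single reference
 * held by the tag; it only gains a hash-table identity later, e.g. once an
 * arc_write() of it completes.
 */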
1498168404Spjd
1499209962Smmstatic char *arc_onloan_tag = "onloan";
1500209962Smm
1501209962Smm/*
1502209962Smm * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1503209962Smm * flight data by arc_tempreserve_space() until they are "returned". Loaned
1504209962Smm * buffers must be returned to the arc before they can be used by the DMU or
1505209962Smm * freed.
1506209962Smm */
1507209962Smmarc_buf_t *
1508209962Smmarc_loan_buf(spa_t *spa, int size)
1509209962Smm{
1510209962Smm	arc_buf_t *buf;
1511209962Smm
1512209962Smm	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1513209962Smm
1514209962Smm	atomic_add_64(&arc_loaned_bytes, size);
1515209962Smm	return (buf);
1516209962Smm}
1517209962Smm
1518209962Smm/*
1519209962Smm * Return a loaned arc buffer to the arc.
1520209962Smm */
1521209962Smmvoid
1522209962Smmarc_return_buf(arc_buf_t *buf, void *tag)
1523209962Smm{
1524209962Smm	arc_buf_hdr_t *hdr = buf->b_hdr;
1525209962Smm
1526209962Smm	ASSERT(buf->b_data != NULL);
1527219089Spjd	(void) refcount_add(&hdr->b_refcnt, tag);
1528219089Spjd	(void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1529209962Smm
1530209962Smm	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1531209962Smm}
1532209962Smm
1533219089Spjd/* Detach an arc_buf from a dbuf (tag) */
1534219089Spjdvoid
1535219089Spjdarc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1536219089Spjd{
1537219089Spjd	arc_buf_hdr_t *hdr;
1538219089Spjd
1539219089Spjd	ASSERT(buf->b_data != NULL);
1540219089Spjd	hdr = buf->b_hdr;
1541219089Spjd	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1542219089Spjd	(void) refcount_remove(&hdr->b_refcnt, tag);
1543219089Spjd	buf->b_efunc = NULL;
1544219089Spjd	buf->b_private = NULL;
1545219089Spjd
1546219089Spjd	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1547219089Spjd}
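/*
 * Hedged sketch of the loan cycle implemented by the three functions above.
 * The DMU's dmu_request_arcbuf()/dmu_assign_arcbuf() path is the typical
 * consumer, but the shape is simply:
 *
 *	arc_buf_t *abuf = arc_loan_buf(spa, blksz);	// held by arc_onloan_tag
 *	bcopy(src, abuf->b_data, blksz);		// fill while on loan
 *	...
 *	arc_return_buf(abuf, tag);			// ownership back to "tag"
 *
 * While on loan the buffer is excluded from the anonymous in-flight
 * accounting used by arc_tempreserve_space(), which is why arc_loaned_bytes
 * is adjusted on both ends of the exchange.
 */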
1548219089Spjd
1549168404Spjdstatic arc_buf_t *
1550168404Spjdarc_buf_clone(arc_buf_t *from)
1551168404Spjd{
1552168404Spjd	arc_buf_t *buf;
1553168404Spjd	arc_buf_hdr_t *hdr = from->b_hdr;
1554168404Spjd	uint64_t size = hdr->b_size;
1555168404Spjd
1556219089Spjd	ASSERT(hdr->b_state != arc_anon);
1557219089Spjd
1558185029Spjd	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1559168404Spjd	buf->b_hdr = hdr;
1560168404Spjd	buf->b_data = NULL;
1561168404Spjd	buf->b_efunc = NULL;
1562168404Spjd	buf->b_private = NULL;
1563168404Spjd	buf->b_next = hdr->b_buf;
1564168404Spjd	hdr->b_buf = buf;
1565168404Spjd	arc_get_data_buf(buf);
1566168404Spjd	bcopy(from->b_data, buf->b_data, size);
1567242845Sdelphij
1568242845Sdelphij	/*
1569242845Sdelphij	 * This buffer already exists in the arc, so create a duplicate
1570242845Sdelphij	 * copy for the caller.  If the buffer is associated with user data
1571242845Sdelphij	 * then track the size and number of duplicates.  These stats will be
1572242845Sdelphij	 * updated as duplicate buffers are created and destroyed.
1573242845Sdelphij	 */
1574242845Sdelphij	if (hdr->b_type == ARC_BUFC_DATA) {
1575242845Sdelphij		ARCSTAT_BUMP(arcstat_duplicate_buffers);
1576242845Sdelphij		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1577242845Sdelphij	}
1578168404Spjd	hdr->b_datacnt += 1;
1579168404Spjd	return (buf);
1580168404Spjd}
1581168404Spjd
1582168404Spjdvoid
1583168404Spjdarc_buf_add_ref(arc_buf_t *buf, void* tag)
1584168404Spjd{
1585168404Spjd	arc_buf_hdr_t *hdr;
1586168404Spjd	kmutex_t *hash_lock;
1587168404Spjd
1588168404Spjd	/*
1589185029Spjd	 * Check to see if this buffer is evicted.  Callers
1590185029Spjd	 * must verify b_data != NULL to know if the add_ref
1591185029Spjd	 * was successful.
1592168404Spjd	 */
1593219089Spjd	mutex_enter(&buf->b_evict_lock);
1594185029Spjd	if (buf->b_data == NULL) {
1595219089Spjd		mutex_exit(&buf->b_evict_lock);
1596168404Spjd		return;
1597168404Spjd	}
1598219089Spjd	hash_lock = HDR_LOCK(buf->b_hdr);
1599219089Spjd	mutex_enter(hash_lock);
1600185029Spjd	hdr = buf->b_hdr;
1601219089Spjd	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1602219089Spjd	mutex_exit(&buf->b_evict_lock);
1603168404Spjd
1604168404Spjd	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1605168404Spjd	add_reference(hdr, hash_lock, tag);
1606208373Smm	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1607168404Spjd	arc_access(hdr, hash_lock);
1608168404Spjd	mutex_exit(hash_lock);
1609168404Spjd	ARCSTAT_BUMP(arcstat_hits);
1610168404Spjd	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1611168404Spjd	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1612168404Spjd	    data, metadata, hits);
1613168404Spjd}
1614168404Spjd
1615185029Spjd/*
1616185029Spjd * Free the arc data buffer.  If an l2arc write is in progress,
1617185029Spjd * the buffer is placed on l2arc_free_on_write to be freed later.
1618185029Spjd */
1619168404Spjdstatic void
1620240133Smmarc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1621185029Spjd{
1622240133Smm	arc_buf_hdr_t *hdr = buf->b_hdr;
1623240133Smm
1624185029Spjd	if (HDR_L2_WRITING(hdr)) {
1625185029Spjd		l2arc_data_free_t *df;
1626185029Spjd		df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1627240133Smm		df->l2df_data = buf->b_data;
1628240133Smm		df->l2df_size = hdr->b_size;
1629185029Spjd		df->l2df_func = free_func;
1630185029Spjd		mutex_enter(&l2arc_free_on_write_mtx);
1631185029Spjd		list_insert_head(l2arc_free_on_write, df);
1632185029Spjd		mutex_exit(&l2arc_free_on_write_mtx);
1633185029Spjd		ARCSTAT_BUMP(arcstat_l2_free_on_write);
1634185029Spjd	} else {
1635240133Smm		free_func(buf->b_data, hdr->b_size);
1636185029Spjd	}
1637185029Spjd}
1638185029Spjd
1639185029Spjdstatic void
1640168404Spjdarc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1641168404Spjd{
1642168404Spjd	arc_buf_t **bufp;
1643168404Spjd
1644168404Spjd	/* free up data associated with the buf */
1645168404Spjd	if (buf->b_data) {
1646168404Spjd		arc_state_t *state = buf->b_hdr->b_state;
1647168404Spjd		uint64_t size = buf->b_hdr->b_size;
1648168404Spjd		arc_buf_contents_t type = buf->b_hdr->b_type;
1649168404Spjd
1650168404Spjd		arc_cksum_verify(buf);
1651240133Smm#ifdef illumos
1652240133Smm		arc_buf_unwatch(buf);
1653240133Smm#endif /* illumos */
1654219089Spjd
1655168404Spjd		if (!recycle) {
1656168404Spjd			if (type == ARC_BUFC_METADATA) {
1657240133Smm				arc_buf_data_free(buf, zio_buf_free);
1658208373Smm				arc_space_return(size, ARC_SPACE_DATA);
1659168404Spjd			} else {
1660168404Spjd				ASSERT(type == ARC_BUFC_DATA);
1661240133Smm				arc_buf_data_free(buf, zio_data_buf_free);
1662208373Smm				ARCSTAT_INCR(arcstat_data_size, -size);
1663185029Spjd				atomic_add_64(&arc_size, -size);
1664168404Spjd			}
1665168404Spjd		}
1666168404Spjd		if (list_link_active(&buf->b_hdr->b_arc_node)) {
1667185029Spjd			uint64_t *cnt = &state->arcs_lsize[type];
1668185029Spjd
1669168404Spjd			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1670168404Spjd			ASSERT(state != arc_anon);
1671185029Spjd
1672185029Spjd			ASSERT3U(*cnt, >=, size);
1673185029Spjd			atomic_add_64(cnt, -size);
1674168404Spjd		}
1675168404Spjd		ASSERT3U(state->arcs_size, >=, size);
1676168404Spjd		atomic_add_64(&state->arcs_size, -size);
1677168404Spjd		buf->b_data = NULL;
1678242845Sdelphij
1679242845Sdelphij		/*
1680242845Sdelphij		 * If we're destroying a duplicate buffer make sure
1681242845Sdelphij		 * that the appropriate statistics are updated.
1682242845Sdelphij		 */
1683242845Sdelphij		if (buf->b_hdr->b_datacnt > 1 &&
1684242845Sdelphij		    buf->b_hdr->b_type == ARC_BUFC_DATA) {
1685242845Sdelphij			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1686242845Sdelphij			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1687242845Sdelphij		}
1688168404Spjd		ASSERT(buf->b_hdr->b_datacnt > 0);
1689168404Spjd		buf->b_hdr->b_datacnt -= 1;
1690168404Spjd	}
1691168404Spjd
1692168404Spjd	/* only remove the buf if requested */
1693168404Spjd	if (!all)
1694168404Spjd		return;
1695168404Spjd
1696168404Spjd	/* remove the buf from the hdr list */
1697168404Spjd	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1698168404Spjd		continue;
1699168404Spjd	*bufp = buf->b_next;
1700219089Spjd	buf->b_next = NULL;
1701168404Spjd
1702168404Spjd	ASSERT(buf->b_efunc == NULL);
1703168404Spjd
1704168404Spjd	/* clean up the buf */
1705168404Spjd	buf->b_hdr = NULL;
1706168404Spjd	kmem_cache_free(buf_cache, buf);
1707168404Spjd}
1708168404Spjd
1709168404Spjdstatic void
1710168404Spjdarc_hdr_destroy(arc_buf_hdr_t *hdr)
1711168404Spjd{
1712168404Spjd	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1713168404Spjd	ASSERT3P(hdr->b_state, ==, arc_anon);
1714168404Spjd	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1715219089Spjd	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1716168404Spjd
1717219089Spjd	if (l2hdr != NULL) {
1718219089Spjd		boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1719219089Spjd		/*
1720219089Spjd		 * To prevent arc_free() and l2arc_evict() from
1721219089Spjd		 * attempting to free the same buffer at the same time,
1722219089Spjd		 * a FREE_IN_PROGRESS flag is given to arc_free() to
1723219089Spjd		 * give it priority.  l2arc_evict() can't destroy this
1724219089Spjd		 * header while we are waiting on l2arc_buflist_mtx.
1725219089Spjd		 *
1726219089Spjd		 * The hdr may be removed from l2ad_buflist before we
1727219089Spjd		 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1728219089Spjd		 */
1729219089Spjd		if (!buflist_held) {
1730185029Spjd			mutex_enter(&l2arc_buflist_mtx);
1731219089Spjd			l2hdr = hdr->b_l2hdr;
1732219089Spjd		}
1733219089Spjd
1734219089Spjd		if (l2hdr != NULL) {
1735248572Ssmh			trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
1736248574Ssmh			    hdr->b_size, 0);
1737219089Spjd			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1738219089Spjd			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1739251478Sdelphij			ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1740219089Spjd			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1741219089Spjd			if (hdr->b_state == arc_l2c_only)
1742219089Spjd				l2arc_hdr_stat_remove();
1743219089Spjd			hdr->b_l2hdr = NULL;
1744219089Spjd		}
1745219089Spjd
1746219089Spjd		if (!buflist_held)
1747185029Spjd			mutex_exit(&l2arc_buflist_mtx);
1748185029Spjd	}
1749185029Spjd
1750168404Spjd	if (!BUF_EMPTY(hdr)) {
1751168404Spjd		ASSERT(!HDR_IN_HASH_TABLE(hdr));
1752219089Spjd		buf_discard_identity(hdr);
1753168404Spjd	}
1754168404Spjd	while (hdr->b_buf) {
1755168404Spjd		arc_buf_t *buf = hdr->b_buf;
1756168404Spjd
1757168404Spjd		if (buf->b_efunc) {
1758168404Spjd			mutex_enter(&arc_eviction_mtx);
1759219089Spjd			mutex_enter(&buf->b_evict_lock);
1760168404Spjd			ASSERT(buf->b_hdr != NULL);
1761168404Spjd			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1762168404Spjd			hdr->b_buf = buf->b_next;
1763168404Spjd			buf->b_hdr = &arc_eviction_hdr;
1764168404Spjd			buf->b_next = arc_eviction_list;
1765168404Spjd			arc_eviction_list = buf;
1766219089Spjd			mutex_exit(&buf->b_evict_lock);
1767168404Spjd			mutex_exit(&arc_eviction_mtx);
1768168404Spjd		} else {
1769168404Spjd			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1770168404Spjd		}
1771168404Spjd	}
1772168404Spjd	if (hdr->b_freeze_cksum != NULL) {
1773168404Spjd		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1774168404Spjd		hdr->b_freeze_cksum = NULL;
1775168404Spjd	}
1776219089Spjd	if (hdr->b_thawed) {
1777219089Spjd		kmem_free(hdr->b_thawed, 1);
1778219089Spjd		hdr->b_thawed = NULL;
1779219089Spjd	}
1780168404Spjd
1781168404Spjd	ASSERT(!list_link_active(&hdr->b_arc_node));
1782168404Spjd	ASSERT3P(hdr->b_hash_next, ==, NULL);
1783168404Spjd	ASSERT3P(hdr->b_acb, ==, NULL);
1784168404Spjd	kmem_cache_free(hdr_cache, hdr);
1785168404Spjd}
1786168404Spjd
1787168404Spjdvoid
1788168404Spjdarc_buf_free(arc_buf_t *buf, void *tag)
1789168404Spjd{
1790168404Spjd	arc_buf_hdr_t *hdr = buf->b_hdr;
1791168404Spjd	int hashed = hdr->b_state != arc_anon;
1792168404Spjd
1793168404Spjd	ASSERT(buf->b_efunc == NULL);
1794168404Spjd	ASSERT(buf->b_data != NULL);
1795168404Spjd
1796168404Spjd	if (hashed) {
1797168404Spjd		kmutex_t *hash_lock = HDR_LOCK(hdr);
1798168404Spjd
1799168404Spjd		mutex_enter(hash_lock);
1800219089Spjd		hdr = buf->b_hdr;
1801219089Spjd		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1802219089Spjd
1803168404Spjd		(void) remove_reference(hdr, hash_lock, tag);
1804219089Spjd		if (hdr->b_datacnt > 1) {
1805168404Spjd			arc_buf_destroy(buf, FALSE, TRUE);
1806219089Spjd		} else {
1807219089Spjd			ASSERT(buf == hdr->b_buf);
1808219089Spjd			ASSERT(buf->b_efunc == NULL);
1809168404Spjd			hdr->b_flags |= ARC_BUF_AVAILABLE;
1810219089Spjd		}
1811168404Spjd		mutex_exit(hash_lock);
1812168404Spjd	} else if (HDR_IO_IN_PROGRESS(hdr)) {
1813168404Spjd		int destroy_hdr;
1814168404Spjd		/*
1815168404Spjd		 * We are in the middle of an async write.  Don't destroy
1816168404Spjd		 * this buffer unless the write completes before we finish
1817168404Spjd		 * decrementing the reference count.
1818168404Spjd		 */
1819168404Spjd		mutex_enter(&arc_eviction_mtx);
1820168404Spjd		(void) remove_reference(hdr, NULL, tag);
1821168404Spjd		ASSERT(refcount_is_zero(&hdr->b_refcnt));
1822168404Spjd		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1823168404Spjd		mutex_exit(&arc_eviction_mtx);
1824168404Spjd		if (destroy_hdr)
1825168404Spjd			arc_hdr_destroy(hdr);
1826168404Spjd	} else {
1827219089Spjd		if (remove_reference(hdr, NULL, tag) > 0)
1828168404Spjd			arc_buf_destroy(buf, FALSE, TRUE);
1829219089Spjd		else
1830168404Spjd			arc_hdr_destroy(hdr);
1831168404Spjd	}
1832168404Spjd}
1833168404Spjd
1834248571Smmboolean_t
1835168404Spjdarc_buf_remove_ref(arc_buf_t *buf, void* tag)
1836168404Spjd{
1837168404Spjd	arc_buf_hdr_t *hdr = buf->b_hdr;
1838168404Spjd	kmutex_t *hash_lock = HDR_LOCK(hdr);
1839248571Smm	boolean_t no_callback = (buf->b_efunc == NULL);
1840168404Spjd
1841168404Spjd	if (hdr->b_state == arc_anon) {
1842219089Spjd		ASSERT(hdr->b_datacnt == 1);
1843168404Spjd		arc_buf_free(buf, tag);
1844168404Spjd		return (no_callback);
1845168404Spjd	}
1846168404Spjd
1847168404Spjd	mutex_enter(hash_lock);
1848219089Spjd	hdr = buf->b_hdr;
1849219089Spjd	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1850168404Spjd	ASSERT(hdr->b_state != arc_anon);
1851168404Spjd	ASSERT(buf->b_data != NULL);
1852168404Spjd
1853168404Spjd	(void) remove_reference(hdr, hash_lock, tag);
1854168404Spjd	if (hdr->b_datacnt > 1) {
1855168404Spjd		if (no_callback)
1856168404Spjd			arc_buf_destroy(buf, FALSE, TRUE);
1857168404Spjd	} else if (no_callback) {
1858168404Spjd		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1859219089Spjd		ASSERT(buf->b_efunc == NULL);
1860168404Spjd		hdr->b_flags |= ARC_BUF_AVAILABLE;
1861168404Spjd	}
1862168404Spjd	ASSERT(no_callback || hdr->b_datacnt > 1 ||
1863168404Spjd	    refcount_is_zero(&hdr->b_refcnt));
1864168404Spjd	mutex_exit(hash_lock);
1865168404Spjd	return (no_callback);
1866168404Spjd}
1867168404Spjd
1868168404Spjdint
1869168404Spjdarc_buf_size(arc_buf_t *buf)
1870168404Spjd{
1871168404Spjd	return (buf->b_hdr->b_size);
1872168404Spjd}
1873168404Spjd
1874168404Spjd/*
1875242845Sdelphij * Called from the DMU to determine if the current buffer should be
1876242845Sdelphij * evicted. In order to ensure proper locking, the eviction must be initiated
1877242845Sdelphij * from the DMU. Return true if the buffer is associated with user data and
1878242845Sdelphij * duplicate buffers still exist.
1879242845Sdelphij */
1880242845Sdelphijboolean_t
1881242845Sdelphijarc_buf_eviction_needed(arc_buf_t *buf)
1882242845Sdelphij{
1883242845Sdelphij	arc_buf_hdr_t *hdr;
1884242845Sdelphij	boolean_t evict_needed = B_FALSE;
1885242845Sdelphij
1886242845Sdelphij	if (zfs_disable_dup_eviction)
1887242845Sdelphij		return (B_FALSE);
1888242845Sdelphij
1889242845Sdelphij	mutex_enter(&buf->b_evict_lock);
1890242845Sdelphij	hdr = buf->b_hdr;
1891242845Sdelphij	if (hdr == NULL) {
1892242845Sdelphij		/*
1893242845Sdelphij		 * We are in arc_do_user_evicts(); let that function
1894242845Sdelphij		 * perform the eviction.
1895242845Sdelphij		 */
1896242845Sdelphij		ASSERT(buf->b_data == NULL);
1897242845Sdelphij		mutex_exit(&buf->b_evict_lock);
1898242845Sdelphij		return (B_FALSE);
1899242845Sdelphij	} else if (buf->b_data == NULL) {
1900242845Sdelphij		/*
1901242845Sdelphij		 * We have already been added to the arc eviction list;
1902242845Sdelphij		 * recommend eviction.
1903242845Sdelphij		 */
1904242845Sdelphij		ASSERT3P(hdr, ==, &arc_eviction_hdr);
1905242845Sdelphij		mutex_exit(&buf->b_evict_lock);
1906242845Sdelphij		return (B_TRUE);
1907242845Sdelphij	}
1908242845Sdelphij
1909242845Sdelphij	if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1910242845Sdelphij		evict_needed = B_TRUE;
1911242845Sdelphij
1912242845Sdelphij	mutex_exit(&buf->b_evict_lock);
1913242845Sdelphij	return (evict_needed);
1914242845Sdelphij}
1915242845Sdelphij
1916242845Sdelphij/*
1917168404Spjd * Evict buffers from list until we've removed the specified number of
1918168404Spjd * bytes.  Move the removed buffers to the appropriate evict state.
1919168404Spjd * If the recycle flag is set, then attempt to "recycle" a buffer:
1920168404Spjd * - look for a buffer to evict that is `bytes' long.
1921168404Spjd * - return the data block from this buffer rather than freeing it.
1922168404Spjd * This flag is used by callers that are trying to make space for a
1923168404Spjd * new buffer in a full arc cache.
1924185029Spjd *
1925185029Spjd * This function makes a "best effort".  It skips over any buffers
1926185029Spjd * it can't get a hash_lock on, and so may not catch all candidates.
1927185029Spjd * It may also return without evicting as much space as requested.
1928168404Spjd */
1929168404Spjdstatic void *
1930209962Smmarc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1931168404Spjd    arc_buf_contents_t type)
1932168404Spjd{
1933168404Spjd	arc_state_t *evicted_state;
1934168404Spjd	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1935205231Skmacy	int64_t bytes_remaining;
1936168404Spjd	arc_buf_hdr_t *ab, *ab_prev = NULL;
1937205231Skmacy	list_t *evicted_list, *list, *evicted_list_start, *list_start;
1938205231Skmacy	kmutex_t *lock, *evicted_lock;
1939168404Spjd	kmutex_t *hash_lock;
1940168404Spjd	boolean_t have_lock;
1941168404Spjd	void *stolen = NULL;
1942205231Skmacy	static int evict_metadata_offset, evict_data_offset;
1943205231Skmacy	int i, idx, offset, list_count, count;
1944168404Spjd
1945168404Spjd	ASSERT(state == arc_mru || state == arc_mfu);
1946168404Spjd
1947168404Spjd	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1948206796Spjd
1949205231Skmacy	if (type == ARC_BUFC_METADATA) {
1950205231Skmacy		offset = 0;
1951205231Skmacy		list_count = ARC_BUFC_NUMMETADATALISTS;
1952205231Skmacy		list_start = &state->arcs_lists[0];
1953205231Skmacy		evicted_list_start = &evicted_state->arcs_lists[0];
1954205231Skmacy		idx = evict_metadata_offset;
1955205231Skmacy	} else {
1956205231Skmacy		offset = ARC_BUFC_NUMMETADATALISTS;
1957205231Skmacy		list_start = &state->arcs_lists[offset];
1958205231Skmacy		evicted_list_start = &evicted_state->arcs_lists[offset];
1959205231Skmacy		list_count = ARC_BUFC_NUMDATALISTS;
1960205231Skmacy		idx = evict_data_offset;
1961205231Skmacy	}
1962205231Skmacy	bytes_remaining = evicted_state->arcs_lsize[type];
1963205231Skmacy	count = 0;
1964206796Spjd
1965205231Skmacyevict_start:
1966205231Skmacy	list = &list_start[idx];
1967205231Skmacy	evicted_list = &evicted_list_start[idx];
1968205231Skmacy	lock = ARCS_LOCK(state, (offset + idx));
1969206796Spjd	evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
1970168404Spjd
1971205231Skmacy	mutex_enter(lock);
1972205231Skmacy	mutex_enter(evicted_lock);
1973205231Skmacy
1974185029Spjd	for (ab = list_tail(list); ab; ab = ab_prev) {
1975185029Spjd		ab_prev = list_prev(list, ab);
1976205231Skmacy		bytes_remaining -= (ab->b_size * ab->b_datacnt);
1977168404Spjd		/* prefetch buffers have a minimum lifespan */
1978168404Spjd		if (HDR_IO_IN_PROGRESS(ab) ||
1979185029Spjd		    (spa && ab->b_spa != spa) ||
1980168404Spjd		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1981219089Spjd		    ddi_get_lbolt() - ab->b_arc_access <
1982219089Spjd		    arc_min_prefetch_lifespan)) {
1983168404Spjd			skipped++;
1984168404Spjd			continue;
1985168404Spjd		}
1986168404Spjd		/* "lookahead" for better eviction candidate */
1987168404Spjd		if (recycle && ab->b_size != bytes &&
1988168404Spjd		    ab_prev && ab_prev->b_size == bytes)
1989168404Spjd			continue;
1990168404Spjd		hash_lock = HDR_LOCK(ab);
1991168404Spjd		have_lock = MUTEX_HELD(hash_lock);
1992168404Spjd		if (have_lock || mutex_tryenter(hash_lock)) {
1993240415Smm			ASSERT0(refcount_count(&ab->b_refcnt));
1994168404Spjd			ASSERT(ab->b_datacnt > 0);
1995168404Spjd			while (ab->b_buf) {
1996168404Spjd				arc_buf_t *buf = ab->b_buf;
1997219089Spjd				if (!mutex_tryenter(&buf->b_evict_lock)) {
1998185029Spjd					missed += 1;
1999185029Spjd					break;
2000185029Spjd				}
2001168404Spjd				if (buf->b_data) {
2002168404Spjd					bytes_evicted += ab->b_size;
2003168404Spjd					if (recycle && ab->b_type == type &&
2004185029Spjd					    ab->b_size == bytes &&
2005185029Spjd					    !HDR_L2_WRITING(ab)) {
2006168404Spjd						stolen = buf->b_data;
2007168404Spjd						recycle = FALSE;
2008168404Spjd					}
2009168404Spjd				}
2010168404Spjd				if (buf->b_efunc) {
2011168404Spjd					mutex_enter(&arc_eviction_mtx);
2012168404Spjd					arc_buf_destroy(buf,
2013168404Spjd					    buf->b_data == stolen, FALSE);
2014168404Spjd					ab->b_buf = buf->b_next;
2015168404Spjd					buf->b_hdr = &arc_eviction_hdr;
2016168404Spjd					buf->b_next = arc_eviction_list;
2017168404Spjd					arc_eviction_list = buf;
2018168404Spjd					mutex_exit(&arc_eviction_mtx);
2019219089Spjd					mutex_exit(&buf->b_evict_lock);
2020168404Spjd				} else {
2021219089Spjd					mutex_exit(&buf->b_evict_lock);
2022168404Spjd					arc_buf_destroy(buf,
2023168404Spjd					    buf->b_data == stolen, TRUE);
2024168404Spjd				}
2025168404Spjd			}
2026208373Smm
2027208373Smm			if (ab->b_l2hdr) {
2028208373Smm				ARCSTAT_INCR(arcstat_evict_l2_cached,
2029208373Smm				    ab->b_size);
2030208373Smm			} else {
2031208373Smm				if (l2arc_write_eligible(ab->b_spa, ab)) {
2032208373Smm					ARCSTAT_INCR(arcstat_evict_l2_eligible,
2033208373Smm					    ab->b_size);
2034208373Smm				} else {
2035208373Smm					ARCSTAT_INCR(
2036208373Smm					    arcstat_evict_l2_ineligible,
2037208373Smm					    ab->b_size);
2038208373Smm				}
2039208373Smm			}
2040208373Smm
2041185029Spjd			if (ab->b_datacnt == 0) {
2042185029Spjd				arc_change_state(evicted_state, ab, hash_lock);
2043185029Spjd				ASSERT(HDR_IN_HASH_TABLE(ab));
2044185029Spjd				ab->b_flags |= ARC_IN_HASH_TABLE;
2045185029Spjd				ab->b_flags &= ~ARC_BUF_AVAILABLE;
2046185029Spjd				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
2047185029Spjd			}
2048168404Spjd			if (!have_lock)
2049168404Spjd				mutex_exit(hash_lock);
2050168404Spjd			if (bytes >= 0 && bytes_evicted >= bytes)
2051168404Spjd				break;
2052205231Skmacy			if (bytes_remaining > 0) {
2053205231Skmacy				mutex_exit(evicted_lock);
2054205231Skmacy				mutex_exit(lock);
2055206796Spjd				idx  = ((idx + 1) & (list_count - 1));
2056205231Skmacy				count++;
2057205231Skmacy				goto evict_start;
2058205231Skmacy			}
2059168404Spjd		} else {
2060168404Spjd			missed += 1;
2061168404Spjd		}
2062168404Spjd	}
2063168404Spjd
2064205231Skmacy	mutex_exit(evicted_lock);
2065205231Skmacy	mutex_exit(lock);
2066206796Spjd
2067206796Spjd	idx  = ((idx + 1) & (list_count - 1));
2068205231Skmacy	count++;
2069168404Spjd
2070205231Skmacy	if (bytes_evicted < bytes) {
2071205231Skmacy		if (count < list_count)
2072205231Skmacy			goto evict_start;
2073205231Skmacy		else
2074205231Skmacy			dprintf("only evicted %lld bytes from %p",
2075205231Skmacy			    (longlong_t)bytes_evicted, state);
2076205231Skmacy	}
2077206796Spjd	if (type == ARC_BUFC_METADATA)
2078205231Skmacy		evict_metadata_offset = idx;
2079205231Skmacy	else
2080205231Skmacy		evict_data_offset = idx;
2081206796Spjd
2082168404Spjd	if (skipped)
2083168404Spjd		ARCSTAT_INCR(arcstat_evict_skip, skipped);
2084168404Spjd
2085168404Spjd	if (missed)
2086168404Spjd		ARCSTAT_INCR(arcstat_mutex_miss, missed);
2087168404Spjd
2088185029Spjd	/*
2089248571Smm	 * We have just evicted some data into the ghost state; make
2090185029Spjd	 * sure we also adjust the ghost state size if necessary.
2091185029Spjd	 */
2092185029Spjd	if (arc_no_grow &&
2093185029Spjd	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
2094185029Spjd		int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
2095185029Spjd		    arc_mru_ghost->arcs_size - arc_c;
2096185029Spjd
2097185029Spjd		if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
2098185029Spjd			int64_t todelete =
2099185029Spjd			    MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
2100209962Smm			arc_evict_ghost(arc_mru_ghost, 0, todelete);
2101185029Spjd		} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
2102185029Spjd			int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
2103185029Spjd			    arc_mru_ghost->arcs_size +
2104185029Spjd			    arc_mfu_ghost->arcs_size - arc_c);
2105209962Smm			arc_evict_ghost(arc_mfu_ghost, 0, todelete);
2106185029Spjd		}
2107185029Spjd	}
2108205231Skmacy	if (stolen)
2109205231Skmacy		ARCSTAT_BUMP(arcstat_stolen);
2110185029Spjd
2111168404Spjd	return (stolen);
2112168404Spjd}
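/*
 * Hedged example of the "recycle" path through arc_evict() above.  When
 * arc_get_data_buf() needs, say, a 16K data block and the cache is full,
 * it can ask for an eviction of exactly that size:
 *
 *	stolen = arc_evict(arc_mru, 0, 16384, TRUE, ARC_BUFC_DATA);
 *
 * If an evictable 16K buffer of the same type is found, its b_data block
 * comes back in "stolen" and is reused directly for the new buffer,
 * avoiding a free/alloc round trip through the zio kmem caches; a NULL
 * return means nothing suitable was found and the caller allocates
 * normally.  The state and spa arguments used by real callers vary; the
 * values above are illustrative.
 */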
2113168404Spjd
2114168404Spjd/*
2115168404Spjd * Remove buffers from list until we've removed the specified number of
2116168404Spjd * bytes.  Destroy the buffers that are removed.
2117168404Spjd */
2118168404Spjdstatic void
2119209962Smmarc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
2120168404Spjd{
2121168404Spjd	arc_buf_hdr_t *ab, *ab_prev;
2122219089Spjd	arc_buf_hdr_t marker = { 0 };
2123205231Skmacy	list_t *list, *list_start;
2124205231Skmacy	kmutex_t *hash_lock, *lock;
2125168404Spjd	uint64_t bytes_deleted = 0;
2126168404Spjd	uint64_t bufs_skipped = 0;
2127205231Skmacy	static int evict_offset;
2128205231Skmacy	int list_count, idx = evict_offset;
2129205231Skmacy	int offset, count = 0;
2130168404Spjd
2131168404Spjd	ASSERT(GHOST_STATE(state));
2132205231Skmacy
2133205231Skmacy	/*
2134205231Skmacy	 * data lists come after metadata lists
2135205231Skmacy	 */
2136205231Skmacy	list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
2137205231Skmacy	list_count = ARC_BUFC_NUMDATALISTS;
2138205231Skmacy	offset = ARC_BUFC_NUMMETADATALISTS;
2139206796Spjd
2140205231Skmacyevict_start:
2141205231Skmacy	list = &list_start[idx];
2142205231Skmacy	lock = ARCS_LOCK(state, idx + offset);
2143205231Skmacy
2144205231Skmacy	mutex_enter(lock);
2145185029Spjd	for (ab = list_tail(list); ab; ab = ab_prev) {
2146185029Spjd		ab_prev = list_prev(list, ab);
2147185029Spjd		if (spa && ab->b_spa != spa)
2148185029Spjd			continue;
2149219089Spjd
2150219089Spjd		/* ignore markers */
2151219089Spjd		if (ab->b_spa == 0)
2152219089Spjd			continue;
2153219089Spjd
2154168404Spjd		hash_lock = HDR_LOCK(ab);
2155219089Spjd		/* caller may be trying to modify this buffer, skip it */
2156219089Spjd		if (MUTEX_HELD(hash_lock))
2157219089Spjd			continue;
2158168404Spjd		if (mutex_tryenter(hash_lock)) {
2159168404Spjd			ASSERT(!HDR_IO_IN_PROGRESS(ab));
2160168404Spjd			ASSERT(ab->b_buf == NULL);
2161168404Spjd			ARCSTAT_BUMP(arcstat_deleted);
2162168404Spjd			bytes_deleted += ab->b_size;
2163185029Spjd
2164185029Spjd			if (ab->b_l2hdr != NULL) {
2165185029Spjd				/*
2166185029Spjd				 * This buffer is cached on the 2nd Level ARC;
2167185029Spjd				 * don't destroy the header.
2168185029Spjd				 */
2169185029Spjd				arc_change_state(arc_l2c_only, ab, hash_lock);
2170185029Spjd				mutex_exit(hash_lock);
2171185029Spjd			} else {
2172185029Spjd				arc_change_state(arc_anon, ab, hash_lock);
2173185029Spjd				mutex_exit(hash_lock);
2174185029Spjd				arc_hdr_destroy(ab);
2175185029Spjd			}
2176185029Spjd
2177168404Spjd			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2178168404Spjd			if (bytes >= 0 && bytes_deleted >= bytes)
2179168404Spjd				break;
2180219089Spjd		} else if (bytes < 0) {
2181219089Spjd			/*
2182219089Spjd			 * Insert a list marker and then wait for the
2183219089Spjd			 * hash lock to become available. Once it's
2184219089Spjd			 * available, restart from where we left off.
2185219089Spjd			 */
2186219089Spjd			list_insert_after(list, ab, &marker);
2187219089Spjd			mutex_exit(lock);
2188219089Spjd			mutex_enter(hash_lock);
2189219089Spjd			mutex_exit(hash_lock);
2190219089Spjd			mutex_enter(lock);
2191219089Spjd			ab_prev = list_prev(list, &marker);
2192219089Spjd			list_remove(list, &marker);
2193219089Spjd		} else
2194168404Spjd			bufs_skipped += 1;
2195168404Spjd	}
2196205231Skmacy	mutex_exit(lock);
2197206796Spjd	idx  = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
2198205231Skmacy	count++;
2199206796Spjd
2200205231Skmacy	if (count < list_count)
2201205231Skmacy		goto evict_start;
2202206796Spjd
2203205231Skmacy	evict_offset = idx;
2204205231Skmacy	if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
2205185029Spjd	    (bytes < 0 || bytes_deleted < bytes)) {
2206205231Skmacy		list_start = &state->arcs_lists[0];
2207205231Skmacy		list_count = ARC_BUFC_NUMMETADATALISTS;
2208205231Skmacy		offset = count = 0;
2209205231Skmacy		goto evict_start;
2210185029Spjd	}
2211185029Spjd
2212168404Spjd	if (bufs_skipped) {
2213168404Spjd		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2214168404Spjd		ASSERT(bytes >= 0);
2215168404Spjd	}
2216168404Spjd
2217168404Spjd	if (bytes_deleted < bytes)
2218168404Spjd		dprintf("only deleted %lld bytes from %p",
2219168404Spjd		    (longlong_t)bytes_deleted, state);
2220168404Spjd}
2221168404Spjd
2222168404Spjdstatic void
2223168404Spjdarc_adjust(void)
2224168404Spjd{
2225208373Smm	int64_t adjustment, delta;
2226168404Spjd
2227208373Smm	/*
2228208373Smm	 * Adjust MRU size
2229208373Smm	 */
2230168404Spjd
2231209275Smm	adjustment = MIN((int64_t)(arc_size - arc_c),
2232209275Smm	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2233209275Smm	    arc_p));
2234208373Smm
2235208373Smm	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2236208373Smm		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2237209962Smm		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
2238208373Smm		adjustment -= delta;
2239168404Spjd	}
2240168404Spjd
2241208373Smm	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2242208373Smm		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2243209962Smm		(void) arc_evict(arc_mru, 0, delta, FALSE,
2244185029Spjd		    ARC_BUFC_METADATA);
2245185029Spjd	}
2246185029Spjd
2247208373Smm	/*
2248208373Smm	 * Adjust MFU size
2249208373Smm	 */
2250168404Spjd
2251208373Smm	adjustment = arc_size - arc_c;
2252208373Smm
2253208373Smm	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2254208373Smm		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2255209962Smm		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2256208373Smm		adjustment -= delta;
2257168404Spjd	}
2258168404Spjd
2259208373Smm	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2260208373Smm		int64_t delta = MIN(adjustment,
2261208373Smm		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2262209962Smm		(void) arc_evict(arc_mfu, 0, delta, FALSE,
2263208373Smm		    ARC_BUFC_METADATA);
2264208373Smm	}
2265168404Spjd
2266208373Smm	/*
2267208373Smm	 * Adjust ghost lists
2268208373Smm	 */
2269168404Spjd
2270208373Smm	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2271168404Spjd
2272208373Smm	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2273208373Smm		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2274209962Smm		arc_evict_ghost(arc_mru_ghost, 0, delta);
2275208373Smm	}
2276185029Spjd
2277208373Smm	adjustment =
2278208373Smm	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2279208373Smm
2280208373Smm	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2281208373Smm		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2282209962Smm		arc_evict_ghost(arc_mfu_ghost, 0, delta);
2283168404Spjd	}
2284168404Spjd}
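/*
 * Worked example of the MRU adjustment computed at the top of arc_adjust()
 * (all numbers illustrative):
 *
 *	arc_size = 10G, arc_c = 8G, arc_p = 4G
 *	arc_anon->arcs_size = 1G, arc_mru->arcs_size = 5G, arc_meta_used = 1G
 *
 *	adjustment = MIN(arc_size - arc_c,
 *	    arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p)
 *	           = MIN(2G, 3G) = 2G
 *
 * Up to 2G is then evicted from the MRU lists, data before metadata, after
 * which the MFU and ghost lists are trimmed toward their own targets using
 * the same pattern.
 */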
2285168404Spjd
2286168404Spjdstatic void
2287168404Spjdarc_do_user_evicts(void)
2288168404Spjd{
2289191903Skmacy	static arc_buf_t *tmp_arc_eviction_list;
2290191903Skmacy
2291191903Skmacy	/*
2292191903Skmacy	 * Move the eviction list to a local pointer to avoid a lock order reversal (LOR)
2293191903Skmacy	 */
2294206796Spjdrestart:
2295168404Spjd	mutex_enter(&arc_eviction_mtx);
2296191903Skmacy	tmp_arc_eviction_list = arc_eviction_list;
2297191903Skmacy	arc_eviction_list = NULL;
2298191903Skmacy	mutex_exit(&arc_eviction_mtx);
2299191903Skmacy
2300191903Skmacy	while (tmp_arc_eviction_list != NULL) {
2301191903Skmacy		arc_buf_t *buf = tmp_arc_eviction_list;
2302191903Skmacy		tmp_arc_eviction_list = buf->b_next;
2303219089Spjd		mutex_enter(&buf->b_evict_lock);
2304168404Spjd		buf->b_hdr = NULL;
2305219089Spjd		mutex_exit(&buf->b_evict_lock);
2306168404Spjd
2307168404Spjd		if (buf->b_efunc != NULL)
2308168404Spjd			VERIFY(buf->b_efunc(buf) == 0);
2309168404Spjd
2310168404Spjd		buf->b_efunc = NULL;
2311168404Spjd		buf->b_private = NULL;
2312168404Spjd		kmem_cache_free(buf_cache, buf);
2313168404Spjd	}
2314191903Skmacy
2315191903Skmacy	if (arc_eviction_list != NULL)
2316191903Skmacy		goto restart;
2317168404Spjd}
2318168404Spjd
2319168404Spjd/*
2320185029Spjd * Flush all *evictable* data from the cache for the given spa.
2321168404Spjd * NOTE: this will not touch "active" (i.e. referenced) data.
2322168404Spjd */
2323168404Spjdvoid
2324185029Spjdarc_flush(spa_t *spa)
2325168404Spjd{
2326209962Smm	uint64_t guid = 0;
2327209962Smm
2328209962Smm	if (spa)
2329228103Smm		guid = spa_load_guid(spa);
2330209962Smm
2331205231Skmacy	while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
2332209962Smm		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2333185029Spjd		if (spa)
2334185029Spjd			break;
2335185029Spjd	}
2336205231Skmacy	while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
2337209962Smm		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2338185029Spjd		if (spa)
2339185029Spjd			break;
2340185029Spjd	}
2341205231Skmacy	while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
2342209962Smm		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2343185029Spjd		if (spa)
2344185029Spjd			break;
2345185029Spjd	}
2346205231Skmacy	while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
2347209962Smm		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2348185029Spjd		if (spa)
2349185029Spjd			break;
2350185029Spjd	}
2351168404Spjd
2352209962Smm	arc_evict_ghost(arc_mru_ghost, guid, -1);
2353209962Smm	arc_evict_ghost(arc_mfu_ghost, guid, -1);
2354168404Spjd
2355168404Spjd	mutex_enter(&arc_reclaim_thr_lock);
2356168404Spjd	arc_do_user_evicts();
2357168404Spjd	mutex_exit(&arc_reclaim_thr_lock);
2358185029Spjd	ASSERT(spa || arc_eviction_list == NULL);
2359168404Spjd}
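/*
 * Hedged usage note for arc_flush(): passing a specific spa evicts only the
 * evictable buffers tagged with that pool's load guid (one best-effort pass
 * per list), while passing NULL keeps looping until every evictable buffer
 * from every pool is gone and the user-eviction list has been drained:
 *
 *	arc_flush(spa);		// drop one pool's evictable buffers
 *	arc_flush(NULL);	// drain the whole cache
 *
 * Referenced ("active") buffers are never touched, so this cannot be used
 * to force out data that the DMU still holds.
 */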
2360168404Spjd
2361168404Spjdvoid
2362168404Spjdarc_shrink(void)
2363168404Spjd{
2364168404Spjd	if (arc_c > arc_c_min) {
2365168404Spjd		uint64_t to_free;
2366168404Spjd
2368168404Spjd		to_free = arc_c >> arc_shrink_shift;
2372168404Spjd		if (arc_c > arc_c_min + to_free)
2373168404Spjd			atomic_add_64(&arc_c, -to_free);
2374168404Spjd		else
2375168404Spjd			arc_c = arc_c_min;
2376168404Spjd
2377168404Spjd		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2378168404Spjd		if (arc_c > arc_size)
2379168404Spjd			arc_c = MAX(arc_size, arc_c_min);
2380168404Spjd		if (arc_p > arc_c)
2381168404Spjd			arc_p = (arc_c >> 1);
2382168404Spjd		ASSERT(arc_c >= arc_c_min);
2383168404Spjd		ASSERT((int64_t)arc_p >= 0);
2384168404Spjd	}
2385168404Spjd
2386168404Spjd	if (arc_size > arc_c)
2387168404Spjd		arc_adjust();
2388168404Spjd}
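/*
 * Worked example for arc_shrink(), assuming the default arc_shrink_shift
 * of 5 (the tunable itself is defined earlier in this file):
 *
 *	arc_c = 32G  ->  to_free = 32G >> 5 = 1G, new arc_c = 31G
 *	(clamped to arc_c_min if the subtraction would undershoot it)
 *
 * arc_p is reduced by arc_p >> arc_shrink_shift as well, keeping the
 * MRU/MFU split roughly proportional, and arc_adjust() runs afterwards if
 * arc_size now exceeds the lowered target.
 */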
2389168404Spjd
2390185029Spjdstatic int needfree = 0;
2391168404Spjd
2392168404Spjdstatic int
2393168404Spjdarc_reclaim_needed(void)
2394168404Spjd{
2395168404Spjd
2396168404Spjd#ifdef _KERNEL
2397219089Spjd
2398197816Skmacy	if (needfree)
2399197816Skmacy		return (1);
2400168404Spjd
2401191902Skmacy	/*
2402212780Savg	 * Cooperate with pagedaemon when it's time for it to scan
2403212780Savg	 * and reclaim some pages.
2404191902Skmacy	 */
2405212783Savg	if (vm_paging_needed())
2406191902Skmacy		return (1);
2407191902Skmacy
2408219089Spjd#ifdef sun
2409168404Spjd	/*
2410185029Spjd	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2411185029Spjd	 */
2412185029Spjd	extra = desfree;
2413185029Spjd
2414185029Spjd	/*
2415185029Spjd	 * check that we're out of range of the pageout scanner.  It starts to
2416185029Spjd	 * schedule paging if freemem is less than lotsfree and needfree.
2417185029Spjd	 * lotsfree is the high-water mark for pageout, and needfree is the
2418185029Spjd	 * number of needed free pages.  We add extra pages here to make sure
2419185029Spjd	 * the scanner doesn't start up while we're freeing memory.
2420185029Spjd	 */
2421185029Spjd	if (freemem < lotsfree + needfree + extra)
2422185029Spjd		return (1);
2423185029Spjd
2424185029Spjd	/*
2425168404Spjd	 * check to make sure that swapfs has enough space so that anon
2426185029Spjd	 * reservations can still succeed. anon_resvmem() checks that the
2427168404Spjd	 * availrmem is greater than swapfs_minfree, and the number of reserved
2428168404Spjd	 * swap pages.  We also add a bit of extra here just to prevent
2429168404Spjd	 * circumstances from getting really dire.
2430168404Spjd	 */
2431168404Spjd	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2432168404Spjd		return (1);
2433168404Spjd
2434168404Spjd#if defined(__i386)
2435168404Spjd	/*
2436168404Spjd	 * If we're on an i386 platform, it's possible that we'll exhaust the
2437168404Spjd	 * kernel heap space before we ever run out of available physical
2438168404Spjd	 * memory.  Most checks of the size of the heap_area compare against
2439168404Spjd	 * tune.t_minarmem, which is the minimum available real memory that we
2440168404Spjd	 * can have in the system.  However, this is generally fixed at 25 pages
2441168404Spjd	 * which is so low that it's useless.  In this comparison, we seek to
2442168404Spjd	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2443185029Spjd	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
2444168404Spjd	 * free)
2445168404Spjd	 */
2446168404Spjd	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
2447168404Spjd	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
2448168404Spjd		return (1);
2449168404Spjd#endif
2450219089Spjd#else	/* !sun */
2451175633Spjd	if (kmem_used() > (kmem_size() * 3) / 4)
2452168404Spjd		return (1);
2453219089Spjd#endif	/* sun */
2454168404Spjd
2455168404Spjd#else
2456168404Spjd	if (spa_get_random(100) == 0)
2457168404Spjd		return (1);
2458168404Spjd#endif
2459168404Spjd	return (0);
2460168404Spjd}
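/*
 * Hedged numeric illustration of the FreeBSD check above: with a 4G kmem
 * arena, reclamation is requested once kmem_used() exceeds 4G * 3 / 4 = 3G,
 * regardless of how large the ARC itself is.  Together with the needfree
 * flag (set by the vm_lowmem hook) and the vm_paging_needed() test, this is
 * what throttles ARC growth under memory pressure on FreeBSD.
 */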
2461168404Spjd
2462208454Spjdextern kmem_cache_t	*zio_buf_cache[];
2463208454Spjdextern kmem_cache_t	*zio_data_buf_cache[];
2464208454Spjd
2465168404Spjdstatic void
2466168404Spjdarc_kmem_reap_now(arc_reclaim_strategy_t strat)
2467168404Spjd{
2468168404Spjd	size_t			i;
2469168404Spjd	kmem_cache_t		*prev_cache = NULL;
2470168404Spjd	kmem_cache_t		*prev_data_cache = NULL;
2471168404Spjd
2472168404Spjd#ifdef _KERNEL
2473185029Spjd	if (arc_meta_used >= arc_meta_limit) {
2474185029Spjd		/*
2475185029Spjd		 * We are exceeding our meta-data cache limit.
2476185029Spjd		 * Purge some DNLC entries to release holds on meta-data.
2477185029Spjd		 */
2478185029Spjd		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2479185029Spjd	}
2480168404Spjd#if defined(__i386)
2481168404Spjd	/*
2482168404Spjd	 * Reclaim unused memory from all kmem caches.
2483168404Spjd	 */
2484168404Spjd	kmem_reap();
2485168404Spjd#endif
2486168404Spjd#endif
2487168404Spjd
2488168404Spjd	/*
2489185029Spjd	 * An aggressive reclamation will shrink the cache size as well as
2490168404Spjd	 * reap free buffers from the arc kmem caches.
2491168404Spjd	 */
2492168404Spjd	if (strat == ARC_RECLAIM_AGGR)
2493168404Spjd		arc_shrink();
2494168404Spjd
2495168404Spjd	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2496168404Spjd		if (zio_buf_cache[i] != prev_cache) {
2497168404Spjd			prev_cache = zio_buf_cache[i];
2498168404Spjd			kmem_cache_reap_now(zio_buf_cache[i]);
2499168404Spjd		}
2500168404Spjd		if (zio_data_buf_cache[i] != prev_data_cache) {
2501168404Spjd			prev_data_cache = zio_data_buf_cache[i];
2502168404Spjd			kmem_cache_reap_now(zio_data_buf_cache[i]);
2503168404Spjd		}
2504168404Spjd	}
2505168404Spjd	kmem_cache_reap_now(buf_cache);
2506168404Spjd	kmem_cache_reap_now(hdr_cache);
2507168404Spjd}
2508168404Spjd
2509168404Spjdstatic void
2510168404Spjdarc_reclaim_thread(void *dummy __unused)
2511168404Spjd{
2512168404Spjd	clock_t			growtime = 0;
2513168404Spjd	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
2514168404Spjd	callb_cpr_t		cpr;
2515168404Spjd
2516168404Spjd	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2517168404Spjd
2518168404Spjd	mutex_enter(&arc_reclaim_thr_lock);
2519168404Spjd	while (arc_thread_exit == 0) {
2520168404Spjd		if (arc_reclaim_needed()) {
2521168404Spjd
2522168404Spjd			if (arc_no_grow) {
2523168404Spjd				if (last_reclaim == ARC_RECLAIM_CONS) {
2524168404Spjd					last_reclaim = ARC_RECLAIM_AGGR;
2525168404Spjd				} else {
2526168404Spjd					last_reclaim = ARC_RECLAIM_CONS;
2527168404Spjd				}
2528168404Spjd			} else {
2529168404Spjd				arc_no_grow = TRUE;
2530168404Spjd				last_reclaim = ARC_RECLAIM_AGGR;
2531168404Spjd				membar_producer();
2532168404Spjd			}
2533168404Spjd
2534168404Spjd			/* reset the growth delay for every reclaim */
2535219089Spjd			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2536168404Spjd
2537185029Spjd			if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
2538168404Spjd				/*
2539185029Spjd				 * If needfree is TRUE, our vm_lowmem hook
2540168404Spjd				 * was called; in that case we must free some
2541168404Spjd				 * memory, so switch to aggressive mode.
2542168404Spjd				 */
2543168404Spjd				arc_no_grow = TRUE;
2544168404Spjd				last_reclaim = ARC_RECLAIM_AGGR;
2545168404Spjd			}
2546168404Spjd			arc_kmem_reap_now(last_reclaim);
2547185029Spjd			arc_warm = B_TRUE;
2548185029Spjd
2549219089Spjd		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2550168404Spjd			arc_no_grow = FALSE;
2551168404Spjd		}
2552168404Spjd
2553209275Smm		arc_adjust();
2554168404Spjd
2555168404Spjd		if (arc_eviction_list != NULL)
2556168404Spjd			arc_do_user_evicts();
2557168404Spjd
2558211762Savg#ifdef _KERNEL
2559211762Savg		if (needfree) {
2560185029Spjd			needfree = 0;
2561185029Spjd			wakeup(&needfree);
2562211762Savg		}
2563168404Spjd#endif
2564168404Spjd
2565168404Spjd		/* block until needed, or one second, whichever is shorter */
2566168404Spjd		CALLB_CPR_SAFE_BEGIN(&cpr);
2567168404Spjd		(void) cv_timedwait(&arc_reclaim_thr_cv,
2568168404Spjd		    &arc_reclaim_thr_lock, hz);
2569168404Spjd		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2570168404Spjd	}
2571168404Spjd
2572168404Spjd	arc_thread_exit = 0;
2573168404Spjd	cv_broadcast(&arc_reclaim_thr_cv);
2574168404Spjd	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
2575168404Spjd	thread_exit();
2576168404Spjd}
2577168404Spjd
2578168404Spjd/*
2579168404Spjd * Adapt arc info given the number of bytes we are trying to add and
2580168404Spjd * the state that we are coming from.  This function is only called
2581168404Spjd * when we are adding new content to the cache.
2582168404Spjd */
2583168404Spjdstatic void
2584168404Spjdarc_adapt(int bytes, arc_state_t *state)
2585168404Spjd{
2586168404Spjd	int mult;
2587208373Smm	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2588168404Spjd
2589185029Spjd	if (state == arc_l2c_only)
2590185029Spjd		return;
2591185029Spjd
2592168404Spjd	ASSERT(bytes > 0);
2593168404Spjd	/*
2594168404Spjd	 * Adapt the target size of the MRU list:
2595168404Spjd	 *	- if we just hit in the MRU ghost list, then increase
2596168404Spjd	 *	  the target size of the MRU list.
2597168404Spjd	 *	- if we just hit in the MFU ghost list, then increase
2598168404Spjd	 *	  the target size of the MFU list by decreasing the
2599168404Spjd	 *	  target size of the MRU list.
2600168404Spjd	 */
2601168404Spjd	if (state == arc_mru_ghost) {
2602168404Spjd		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2603168404Spjd		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2604209275Smm		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2605168404Spjd
2606208373Smm		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2607168404Spjd	} else if (state == arc_mfu_ghost) {
2608208373Smm		uint64_t delta;
2609208373Smm
2610168404Spjd		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2611168404Spjd		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2612209275Smm		mult = MIN(mult, 10);
2613168404Spjd
2614208373Smm		delta = MIN(bytes * mult, arc_p);
2615208373Smm		arc_p = MAX(arc_p_min, arc_p - delta);
2616168404Spjd	}
2617168404Spjd	ASSERT((int64_t)arc_p >= 0);
2618168404Spjd
2619168404Spjd	if (arc_reclaim_needed()) {
2620168404Spjd		cv_signal(&arc_reclaim_thr_cv);
2621168404Spjd		return;
2622168404Spjd	}
2623168404Spjd
2624168404Spjd	if (arc_no_grow)
2625168404Spjd		return;
2626168404Spjd
2627168404Spjd	if (arc_c >= arc_c_max)
2628168404Spjd		return;
2629168404Spjd
2630168404Spjd	/*
2631168404Spjd	 * If we're within (2 * maxblocksize) bytes of the target
2632168404Spjd	 * cache size, increment the target cache size
2633168404Spjd	 */
2634168404Spjd	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2635168404Spjd		atomic_add_64(&arc_c, (int64_t)bytes);
2636168404Spjd		if (arc_c > arc_c_max)
2637168404Spjd			arc_c = arc_c_max;
2638168404Spjd		else if (state == arc_anon)
2639168404Spjd			atomic_add_64(&arc_p, (int64_t)bytes);
2640168404Spjd		if (arc_p > arc_c)
2641168404Spjd			arc_p = arc_c;
2642168404Spjd	}
2643168404Spjd	ASSERT((int64_t)arc_p >= 0);
2644168404Spjd}
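/*
 * Worked example of the adaptation above (illustrative numbers).  Suppose a
 * read hits in the MRU ghost list while:
 *
 *	arc_c = 8G, arc_p = 2G, bytes = 128K
 *	arc_mru_ghost->arcs_size = 1G, arc_mfu_ghost->arcs_size = 3G
 *
 *	mult = 3G / 1G = 3 (capped at 10)
 *	arc_p = MIN(arc_c - arc_p_min, 2G + 128K * 3) = 2G + 384K
 *
 * A hit in the MFU ghost list moves arc_p the other way by the analogous
 * delta, so repeated ghost hits steer the MRU/MFU balance toward whichever
 * side is currently missing on blocks it recently held.
 */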
2645168404Spjd
2646168404Spjd/*
2647168404Spjd * Check whether the cache has reached its limits and eviction is required
2648168404Spjd * prior to insertion.
2649168404Spjd */
2650168404Spjdstatic int
2651185029Spjdarc_evict_needed(arc_buf_contents_t type)
2652168404Spjd{
2653185029Spjd	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2654185029Spjd		return (1);
2655185029Spjd
2656219089Spjd#ifdef sun
2657185029Spjd#ifdef _KERNEL
2658185029Spjd	/*
2659185029Spjd	 * If zio data pages are being allocated out of a separate heap segment,
2660185029Spjd	 * then enforce that at least about 1/32nd of that segment's vmem
2661185029Spjd	 * remains free.
2662185029Spjd	 */
2663185029Spjd	if (type == ARC_BUFC_DATA && zio_arena != NULL &&
2664185029Spjd	    vmem_size(zio_arena, VMEM_FREE) <
2665185029Spjd	    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
2666185029Spjd		return (1);
2667185029Spjd#endif
2668219089Spjd#endif	/* sun */
2669185029Spjd
2670168404Spjd	if (arc_reclaim_needed())
2671168404Spjd		return (1);
2672168404Spjd
2673168404Spjd	return (arc_size > arc_c);
2674168404Spjd}
2675168404Spjd
2676168404Spjd/*
2677168404Spjd * The buffer, supplied as the first argument, needs a data block.
2678168404Spjd * So, if we are at cache max, determine which cache should be victimized.
2679168404Spjd * We have the following cases:
2680168404Spjd *
2681168404Spjd * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2682168404Spjd * In this situation if we're out of space, but the resident size of the MFU is
2683168404Spjd * under the limit, victimize the MFU cache to satisfy this insertion request.
2684168404Spjd *
2685168404Spjd * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2686168404Spjd * Here, we've used up all of the available space for the MRU, so we need to
2687168404Spjd * evict from our own cache instead.  Evict from the set of resident MRU
2688168404Spjd * entries.
2689168404Spjd *
2690168404Spjd * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2691168404Spjd * c minus p represents the MFU space in the cache, since p is the size of the
2692168404Spjd * cache that is dedicated to the MRU.  In this situation there's still space on
2693168404Spjd * the MFU side, so the MRU side needs to be victimized.
2694168404Spjd *
2695168404Spjd * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2696168404Spjd * MFU's resident set is consuming more space than it has been allotted.  In
2697168404Spjd * this situation, we must victimize our own cache, the MFU, for this insertion.
2698168404Spjd */
2699168404Spjdstatic void
2700168404Spjdarc_get_data_buf(arc_buf_t *buf)
2701168404Spjd{
2702168404Spjd	arc_state_t		*state = buf->b_hdr->b_state;
2703168404Spjd	uint64_t		size = buf->b_hdr->b_size;
2704168404Spjd	arc_buf_contents_t	type = buf->b_hdr->b_type;
2705168404Spjd
2706168404Spjd	arc_adapt(size, state);
2707168404Spjd
2708168404Spjd	/*
2709168404Spjd	 * We have not yet reached cache maximum size,
2710168404Spjd	 * just allocate a new buffer.
2711168404Spjd	 */
2712185029Spjd	if (!arc_evict_needed(type)) {
2713168404Spjd		if (type == ARC_BUFC_METADATA) {
2714168404Spjd			buf->b_data = zio_buf_alloc(size);
2715208373Smm			arc_space_consume(size, ARC_SPACE_DATA);
2716168404Spjd		} else {
2717168404Spjd			ASSERT(type == ARC_BUFC_DATA);
2718168404Spjd			buf->b_data = zio_data_buf_alloc(size);
2719208373Smm			ARCSTAT_INCR(arcstat_data_size, size);
2720185029Spjd			atomic_add_64(&arc_size, size);
2721168404Spjd		}
2722168404Spjd		goto out;
2723168404Spjd	}
2724168404Spjd
2725168404Spjd	/*
2726168404Spjd	 * If we are prefetching from the mfu ghost list, this buffer
2727168404Spjd	 * will end up on the mru list; so steal space from there.
2728168404Spjd	 */
2729168404Spjd	if (state == arc_mfu_ghost)
2730168404Spjd		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2731168404Spjd	else if (state == arc_mru_ghost)
2732168404Spjd		state = arc_mru;
2733168404Spjd
2734168404Spjd	if (state == arc_mru || state == arc_anon) {
2735168404Spjd		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2736208373Smm		state = (arc_mfu->arcs_lsize[type] >= size &&
2737185029Spjd		    arc_p > mru_used) ? arc_mfu : arc_mru;
2738168404Spjd	} else {
2739168404Spjd		/* MFU cases */
2740168404Spjd		uint64_t mfu_space = arc_c - arc_p;
2741208373Smm		state =  (arc_mru->arcs_lsize[type] >= size &&
2742185029Spjd		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2743168404Spjd	}
2744209962Smm	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
2745168404Spjd		if (type == ARC_BUFC_METADATA) {
2746168404Spjd			buf->b_data = zio_buf_alloc(size);
2747208373Smm			arc_space_consume(size, ARC_SPACE_DATA);
2748168404Spjd		} else {
2749168404Spjd			ASSERT(type == ARC_BUFC_DATA);
2750168404Spjd			buf->b_data = zio_data_buf_alloc(size);
2751208373Smm			ARCSTAT_INCR(arcstat_data_size, size);
2752185029Spjd			atomic_add_64(&arc_size, size);
2753168404Spjd		}
2754168404Spjd		ARCSTAT_BUMP(arcstat_recycle_miss);
2755168404Spjd	}
2756168404Spjd	ASSERT(buf->b_data != NULL);
2757168404Spjdout:
2758168404Spjd	/*
2759168404Spjd	 * Update the state size.  Note that ghost states have a
2760168404Spjd	 * "ghost size" and so don't need to be updated.
2761168404Spjd	 */
2762168404Spjd	if (!GHOST_STATE(buf->b_hdr->b_state)) {
2763168404Spjd		arc_buf_hdr_t *hdr = buf->b_hdr;
2764168404Spjd
2765168404Spjd		atomic_add_64(&hdr->b_state->arcs_size, size);
2766168404Spjd		if (list_link_active(&hdr->b_arc_node)) {
2767168404Spjd			ASSERT(refcount_is_zero(&hdr->b_refcnt));
2768185029Spjd			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2769168404Spjd		}
2770168404Spjd		/*
2771168404Spjd		 * If we are growing the cache, and we are adding anonymous
2772168404Spjd		 * data, and we have outgrown arc_p, update arc_p
2773168404Spjd		 */
2774168404Spjd		if (arc_size < arc_c && hdr->b_state == arc_anon &&
2775168404Spjd		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2776168404Spjd			arc_p = MIN(arc_c, arc_p + size);
2777168404Spjd	}
2778205231Skmacy	ARCSTAT_BUMP(arcstat_allocated);
2779168404Spjd}
2780168404Spjd
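/*
 * Informal summary of the victim choice made in arc_get_data_buf()
 * above, matching the four cases described before the function:
 *
 *	- insert for MRU/anon, MRU below arc_p:        evict from MFU
 *	- insert for MRU/anon, MRU at or over arc_p:   evict from MRU
 *	- insert for MFU, MFU below (c - p):           evict from MRU
 *	- insert for MFU, MFU at or over (c - p):      evict from MFU
 *
 * The cross-list choice additionally requires the victim list to hold
 * at least "size" evictable bytes of the requested type; when
 * arc_evict() cannot recycle a buffer, a fresh one is allocated and
 * arcstat_recycle_miss is bumped.
 */
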
2781168404Spjd/*
2782168404Spjd * This routine is called whenever a buffer is accessed.
2783168404Spjd * NOTE: the hash lock is dropped in this function.
2784168404Spjd */
2785168404Spjdstatic void
2786168404Spjdarc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2787168404Spjd{
2788219089Spjd	clock_t now;
2789219089Spjd
2790168404Spjd	ASSERT(MUTEX_HELD(hash_lock));
2791168404Spjd
2792168404Spjd	if (buf->b_state == arc_anon) {
2793168404Spjd		/*
2794168404Spjd		 * This buffer is not in the cache, and does not
2795168404Spjd		 * appear in our "ghost" list.  Add the new buffer
2796168404Spjd		 * to the MRU state.
2797168404Spjd		 */
2798168404Spjd
2799168404Spjd		ASSERT(buf->b_arc_access == 0);
2800219089Spjd		buf->b_arc_access = ddi_get_lbolt();
2801168404Spjd		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2802168404Spjd		arc_change_state(arc_mru, buf, hash_lock);
2803168404Spjd
2804168404Spjd	} else if (buf->b_state == arc_mru) {
2805219089Spjd		now = ddi_get_lbolt();
2806219089Spjd
2807168404Spjd		/*
2808168404Spjd		 * If this buffer is here because of a prefetch, then either:
2809168404Spjd		 * - clear the flag if this is a "referencing" read
2810168404Spjd		 *   (any subsequent access will bump this into the MFU state).
2811168404Spjd		 * or
2812168404Spjd		 * - move the buffer to the head of the list if this is
2813168404Spjd		 *   another prefetch (to make it less likely to be evicted).
2814168404Spjd		 */
2815168404Spjd		if ((buf->b_flags & ARC_PREFETCH) != 0) {
2816168404Spjd			if (refcount_count(&buf->b_refcnt) == 0) {
2817168404Spjd				ASSERT(list_link_active(&buf->b_arc_node));
2818168404Spjd			} else {
2819168404Spjd				buf->b_flags &= ~ARC_PREFETCH;
2820168404Spjd				ARCSTAT_BUMP(arcstat_mru_hits);
2821168404Spjd			}
2822219089Spjd			buf->b_arc_access = now;
2823168404Spjd			return;
2824168404Spjd		}
2825168404Spjd
2826168404Spjd		/*
2827168404Spjd		 * This buffer has been "accessed" only once so far,
2828168404Spjd		 * but it is still in the cache. Move it to the MFU
2829168404Spjd		 * state.
2830168404Spjd		 */
2831219089Spjd		if (now > buf->b_arc_access + ARC_MINTIME) {
2832168404Spjd			/*
2833168404Spjd			 * More than 125ms have passed since we
2834168404Spjd			 * instantiated this buffer.  Move it to the
2835168404Spjd			 * most frequently used state.
2836168404Spjd			 */
2837219089Spjd			buf->b_arc_access = now;
2838168404Spjd			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2839168404Spjd			arc_change_state(arc_mfu, buf, hash_lock);
2840168404Spjd		}
2841168404Spjd		ARCSTAT_BUMP(arcstat_mru_hits);
2842168404Spjd	} else if (buf->b_state == arc_mru_ghost) {
2843168404Spjd		arc_state_t	*new_state;
2844168404Spjd		/*
2845168404Spjd		 * This buffer has been "accessed" recently, but
2846168404Spjd		 * was evicted from the cache.  Move it to the
2847168404Spjd		 * MFU state.
2848168404Spjd		 */
2849168404Spjd
2850168404Spjd		if (buf->b_flags & ARC_PREFETCH) {
2851168404Spjd			new_state = arc_mru;
2852168404Spjd			if (refcount_count(&buf->b_refcnt) > 0)
2853168404Spjd				buf->b_flags &= ~ARC_PREFETCH;
2854168404Spjd			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2855168404Spjd		} else {
2856168404Spjd			new_state = arc_mfu;
2857168404Spjd			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2858168404Spjd		}
2859168404Spjd
2860219089Spjd		buf->b_arc_access = ddi_get_lbolt();
2861168404Spjd		arc_change_state(new_state, buf, hash_lock);
2862168404Spjd
2863168404Spjd		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2864168404Spjd	} else if (buf->b_state == arc_mfu) {
2865168404Spjd		/*
2866168404Spjd		 * This buffer has been accessed more than once and is
2867168404Spjd		 * still in the cache.  Keep it in the MFU state.
2868168404Spjd		 *
2869168404Spjd		 * NOTE: an add_reference() that occurred when we did
2870168404Spjd		 * the arc_read() will have kicked this off the list.
2871168404Spjd		 * If it was a prefetch, we will explicitly move it to
2872168404Spjd		 * the head of the list now.
2873168404Spjd		 */
2874168404Spjd		if ((buf->b_flags & ARC_PREFETCH) != 0) {
2875168404Spjd			ASSERT(refcount_count(&buf->b_refcnt) == 0);
2876168404Spjd			ASSERT(list_link_active(&buf->b_arc_node));
2877168404Spjd		}
2878168404Spjd		ARCSTAT_BUMP(arcstat_mfu_hits);
2879219089Spjd		buf->b_arc_access = ddi_get_lbolt();
2880168404Spjd	} else if (buf->b_state == arc_mfu_ghost) {
2881168404Spjd		arc_state_t	*new_state = arc_mfu;
2882168404Spjd		/*
2883168404Spjd		 * This buffer has been accessed more than once but has
2884168404Spjd		 * been evicted from the cache.  Move it back to the
2885168404Spjd		 * MFU state.
2886168404Spjd		 */
2887168404Spjd
2888168404Spjd		if (buf->b_flags & ARC_PREFETCH) {
2889168404Spjd			/*
2890168404Spjd			 * This is a prefetch access...
2891168404Spjd			 * move this block back to the MRU state.
2892168404Spjd			 */
2893240415Smm			ASSERT0(refcount_count(&buf->b_refcnt));
2894168404Spjd			new_state = arc_mru;
2895168404Spjd		}
2896168404Spjd
2897219089Spjd		buf->b_arc_access = ddi_get_lbolt();
2898168404Spjd		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2899168404Spjd		arc_change_state(new_state, buf, hash_lock);
2900168404Spjd
2901168404Spjd		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2902185029Spjd	} else if (buf->b_state == arc_l2c_only) {
2903185029Spjd		/*
2904185029Spjd		 * This buffer is on the 2nd Level ARC.
2905185029Spjd		 */
2906185029Spjd
2907219089Spjd		buf->b_arc_access = ddi_get_lbolt();
2908185029Spjd		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2909185029Spjd		arc_change_state(arc_mfu, buf, hash_lock);
2910168404Spjd	} else {
2911168404Spjd		ASSERT(!"invalid arc state");
2912168404Spjd	}
2913168404Spjd}
2914168404Spjd
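/*
 * Informal state-transition summary for arc_access() above:
 *
 *	anon      -> mru	(first access)
 *	mru       -> mfu	(re-access more than ARC_MINTIME later)
 *	mru_ghost -> mfu	(or mru, if the access is a prefetch)
 *	mfu       -> mfu	(stays put)
 *	mfu_ghost -> mfu	(or mru, if the access is a prefetch)
 *	l2c_only  -> mfu	(header was resident only in the L2ARC)
 */
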
2915168404Spjd/* a generic arc_done_func_t which you can use */
2916168404Spjd/* ARGSUSED */
2917168404Spjdvoid
2918168404Spjdarc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2919168404Spjd{
2920219089Spjd	if (zio == NULL || zio->io_error == 0)
2921219089Spjd		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2922248571Smm	VERIFY(arc_buf_remove_ref(buf, arg));
2923168404Spjd}
2924168404Spjd
2925185029Spjd/* a generic arc_done_func_t */
2926168404Spjdvoid
2927168404Spjdarc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2928168404Spjd{
2929168404Spjd	arc_buf_t **bufp = arg;
2930168404Spjd	if (zio && zio->io_error) {
2931248571Smm		VERIFY(arc_buf_remove_ref(buf, arg));
2932168404Spjd		*bufp = NULL;
2933168404Spjd	} else {
2934168404Spjd		*bufp = buf;
2935219089Spjd		ASSERT(buf->b_data);
2936168404Spjd	}
2937168404Spjd}
2938168404Spjd
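/*
 * Illustrative sketch, modeled on typical synchronous callers elsewhere
 * in ZFS and not taken from this file: arc_getbuf_func() is paired with
 * the ARC_WAIT flag and the reference is dropped once the data has been
 * consumed.  The local names (abuf, aflags, err) and the surrounding
 * spa/bp/zb variables are hypothetical stand-ins for the caller's
 * context.
 */
#if 0
	arc_buf_t *abuf = NULL;
	uint32_t aflags = ARC_WAIT;
	int err;

	err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
	if (err == 0 && abuf != NULL) {
		/* ... consume abuf->b_data ... */
		(void) arc_buf_remove_ref(abuf, &abuf);
	}
#endif
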
2939168404Spjdstatic void
2940168404Spjdarc_read_done(zio_t *zio)
2941168404Spjd{
2942168404Spjd	arc_buf_hdr_t	*hdr, *found;
2943168404Spjd	arc_buf_t	*buf;
2944168404Spjd	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
2945168404Spjd	kmutex_t	*hash_lock;
2946168404Spjd	arc_callback_t	*callback_list, *acb;
2947168404Spjd	int		freeable = FALSE;
2948168404Spjd
2949168404Spjd	buf = zio->io_private;
2950168404Spjd	hdr = buf->b_hdr;
2951168404Spjd
2952168404Spjd	/*
2953168404Spjd	 * The hdr was inserted into hash-table and removed from lists
2954168404Spjd	 * prior to starting I/O.  We should find this header, since
2955168404Spjd	 * it's in the hash table, and it should be legit since it's
2956168404Spjd	 * not possible to evict it during the I/O.  The only possible
2957168404Spjd	 * reason for it not to be found is if we were freed during the
2958168404Spjd	 * read.
2959168404Spjd	 */
2960209962Smm	found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
2961168404Spjd	    &hash_lock);
2962168404Spjd
2963168404Spjd	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2964185029Spjd	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2965185029Spjd	    (found == hdr && HDR_L2_READING(hdr)));
2966168404Spjd
2967185029Spjd	hdr->b_flags &= ~ARC_L2_EVICTED;
2968185029Spjd	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2969185029Spjd		hdr->b_flags &= ~ARC_L2CACHE;
2970206796Spjd
2971168404Spjd	/* byteswap if necessary */
2972168404Spjd	callback_list = hdr->b_acb;
2973168404Spjd	ASSERT(callback_list != NULL);
2974209101Smm	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2975236884Smm		dmu_object_byteswap_t bswap =
2976236884Smm		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
2977185029Spjd		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2978185029Spjd		    byteswap_uint64_array :
2979236884Smm		    dmu_ot_byteswap[bswap].ob_func;
2980185029Spjd		func(buf->b_data, hdr->b_size);
2981185029Spjd	}
2982168404Spjd
2983185029Spjd	arc_cksum_compute(buf, B_FALSE);
2984240133Smm#ifdef illumos
2985240133Smm	arc_buf_watch(buf);
2986240133Smm#endif /* illumos */
2987168404Spjd
2988219089Spjd	if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2989219089Spjd		/*
2990219089Spjd		 * Only call arc_access on anonymous buffers.  This is because
2991219089Spjd		 * if we've issued an I/O for an evicted buffer, we've already
2992219089Spjd		 * called arc_access (to prevent any simultaneous readers from
2993219089Spjd		 * getting confused).
2994219089Spjd		 */
2995219089Spjd		arc_access(hdr, hash_lock);
2996219089Spjd	}
2997219089Spjd
2998168404Spjd	/* create copies of the data buffer for the callers */
2999168404Spjd	abuf = buf;
3000168404Spjd	for (acb = callback_list; acb; acb = acb->acb_next) {
3001168404Spjd		if (acb->acb_done) {
3002242845Sdelphij			if (abuf == NULL) {
3003242845Sdelphij				ARCSTAT_BUMP(arcstat_duplicate_reads);
3004168404Spjd				abuf = arc_buf_clone(buf);
3005242845Sdelphij			}
3006168404Spjd			acb->acb_buf = abuf;
3007168404Spjd			abuf = NULL;
3008168404Spjd		}
3009168404Spjd	}
3010168404Spjd	hdr->b_acb = NULL;
3011168404Spjd	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3012168404Spjd	ASSERT(!HDR_BUF_AVAILABLE(hdr));
3013219089Spjd	if (abuf == buf) {
3014219089Spjd		ASSERT(buf->b_efunc == NULL);
3015219089Spjd		ASSERT(hdr->b_datacnt == 1);
3016168404Spjd		hdr->b_flags |= ARC_BUF_AVAILABLE;
3017219089Spjd	}
3018168404Spjd
3019168404Spjd	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
3020168404Spjd
3021168404Spjd	if (zio->io_error != 0) {
3022168404Spjd		hdr->b_flags |= ARC_IO_ERROR;
3023168404Spjd		if (hdr->b_state != arc_anon)
3024168404Spjd			arc_change_state(arc_anon, hdr, hash_lock);
3025168404Spjd		if (HDR_IN_HASH_TABLE(hdr))
3026168404Spjd			buf_hash_remove(hdr);
3027168404Spjd		freeable = refcount_is_zero(&hdr->b_refcnt);
3028168404Spjd	}
3029168404Spjd
3030168404Spjd	/*
3031168404Spjd	 * Broadcast before we drop the hash_lock to avoid the possibility
3032168404Spjd	 * that the hdr (and hence the cv) might be freed before we get to
3033168404Spjd	 * the cv_broadcast().
3034168404Spjd	 */
3035168404Spjd	cv_broadcast(&hdr->b_cv);
3036168404Spjd
3037168404Spjd	if (hash_lock) {
3038168404Spjd		mutex_exit(hash_lock);
3039168404Spjd	} else {
3040168404Spjd		/*
3041168404Spjd		 * This block was freed while we waited for the read to
3042168404Spjd		 * complete.  It has been removed from the hash table and
3043168404Spjd		 * moved to the anonymous state (so that it won't show up
3044168404Spjd		 * in the cache).
3045168404Spjd		 */
3046168404Spjd		ASSERT3P(hdr->b_state, ==, arc_anon);
3047168404Spjd		freeable = refcount_is_zero(&hdr->b_refcnt);
3048168404Spjd	}
3049168404Spjd
3050168404Spjd	/* execute each callback and free its structure */
3051168404Spjd	while ((acb = callback_list) != NULL) {
3052168404Spjd		if (acb->acb_done)
3053168404Spjd			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3054168404Spjd
3055168404Spjd		if (acb->acb_zio_dummy != NULL) {
3056168404Spjd			acb->acb_zio_dummy->io_error = zio->io_error;
3057168404Spjd			zio_nowait(acb->acb_zio_dummy);
3058168404Spjd		}
3059168404Spjd
3060168404Spjd		callback_list = acb->acb_next;
3061168404Spjd		kmem_free(acb, sizeof (arc_callback_t));
3062168404Spjd	}
3063168404Spjd
3064168404Spjd	if (freeable)
3065168404Spjd		arc_hdr_destroy(hdr);
3066168404Spjd}
3067168404Spjd
3068168404Spjd/*
3069168404Spjd * "Read" the block at the specified DVA (in bp) via the
3070168404Spjd * cache.  If the block is found in the cache, invoke the provided
3071168404Spjd * callback immediately and return.  Note that the `zio' parameter
3072168404Spjd * in the callback will be NULL in this case, since no IO was
3073168404Spjd * required.  If the block is not in the cache, pass the read request
3074168404Spjd * on to the spa with a substitute callback function, so that the
3075168404Spjd * requested block will be added to the cache.
3076168404Spjd *
3077168404Spjd * If a read request arrives for a block that has a read in-progress,
3078168404Spjd * either wait for the in-progress read to complete (and return the
3079168404Spjd * results); or, if this is a read with a "done" func, add a record
3080168404Spjd * to the read to invoke the "done" func when the read completes,
3081168404Spjd * and return; or just return.
3082168404Spjd *
3083168404Spjd * arc_read_done() will invoke all the requested "done" functions
3084168404Spjd * for readers of this block.
3085168404Spjd */
3086168404Spjdint
3087246666Smmarc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3088246666Smm    void *private, int priority, int zio_flags, uint32_t *arc_flags,
3089246666Smm    const zbookmark_t *zb)
3090168404Spjd{
3091168404Spjd	arc_buf_hdr_t *hdr;
3092247187Smm	arc_buf_t *buf = NULL;
3093168404Spjd	kmutex_t *hash_lock;
3094185029Spjd	zio_t *rzio;
3095228103Smm	uint64_t guid = spa_load_guid(spa);
3096168404Spjd
3097168404Spjdtop:
3098219089Spjd	hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3099219089Spjd	    &hash_lock);
3100168404Spjd	if (hdr && hdr->b_datacnt > 0) {
3101168404Spjd
3102168404Spjd		*arc_flags |= ARC_CACHED;
3103168404Spjd
3104168404Spjd		if (HDR_IO_IN_PROGRESS(hdr)) {
3105168404Spjd
3106168404Spjd			if (*arc_flags & ARC_WAIT) {
3107168404Spjd				cv_wait(&hdr->b_cv, hash_lock);
3108168404Spjd				mutex_exit(hash_lock);
3109168404Spjd				goto top;
3110168404Spjd			}
3111168404Spjd			ASSERT(*arc_flags & ARC_NOWAIT);
3112168404Spjd
3113168404Spjd			if (done) {
3114168404Spjd				arc_callback_t	*acb = NULL;
3115168404Spjd
3116168404Spjd				acb = kmem_zalloc(sizeof (arc_callback_t),
3117168404Spjd				    KM_SLEEP);
3118168404Spjd				acb->acb_done = done;
3119168404Spjd				acb->acb_private = private;
3120168404Spjd				if (pio != NULL)
3121168404Spjd					acb->acb_zio_dummy = zio_null(pio,
3122209962Smm					    spa, NULL, NULL, NULL, zio_flags);
3123168404Spjd
3124168404Spjd				ASSERT(acb->acb_done != NULL);
3125168404Spjd				acb->acb_next = hdr->b_acb;
3126168404Spjd				hdr->b_acb = acb;
3127168404Spjd				add_reference(hdr, hash_lock, private);
3128168404Spjd				mutex_exit(hash_lock);
3129168404Spjd				return (0);
3130168404Spjd			}
3131168404Spjd			mutex_exit(hash_lock);
3132168404Spjd			return (0);
3133168404Spjd		}
3134168404Spjd
3135168404Spjd		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3136168404Spjd
3137168404Spjd		if (done) {
3138168404Spjd			add_reference(hdr, hash_lock, private);
3139168404Spjd			/*
3140168404Spjd			 * If this block is already in use, create a new
3141168404Spjd			 * copy of the data so that we will be guaranteed
3142168404Spjd			 * that arc_release() will always succeed.
3143168404Spjd			 */
3144168404Spjd			buf = hdr->b_buf;
3145168404Spjd			ASSERT(buf);
3146168404Spjd			ASSERT(buf->b_data);
3147168404Spjd			if (HDR_BUF_AVAILABLE(hdr)) {
3148168404Spjd				ASSERT(buf->b_efunc == NULL);
3149168404Spjd				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3150168404Spjd			} else {
3151168404Spjd				buf = arc_buf_clone(buf);
3152168404Spjd			}
3153219089Spjd
3154168404Spjd		} else if (*arc_flags & ARC_PREFETCH &&
3155168404Spjd		    refcount_count(&hdr->b_refcnt) == 0) {
3156168404Spjd			hdr->b_flags |= ARC_PREFETCH;
3157168404Spjd		}
3158168404Spjd		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3159168404Spjd		arc_access(hdr, hash_lock);
3160185029Spjd		if (*arc_flags & ARC_L2CACHE)
3161185029Spjd			hdr->b_flags |= ARC_L2CACHE;
3162251478Sdelphij		if (*arc_flags & ARC_L2COMPRESS)
3163251478Sdelphij			hdr->b_flags |= ARC_L2COMPRESS;
3164168404Spjd		mutex_exit(hash_lock);
3165168404Spjd		ARCSTAT_BUMP(arcstat_hits);
3166168404Spjd		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3167168404Spjd		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3168168404Spjd		    data, metadata, hits);
3169168404Spjd
3170168404Spjd		if (done)
3171168404Spjd			done(NULL, buf, private);
3172168404Spjd	} else {
3173168404Spjd		uint64_t size = BP_GET_LSIZE(bp);
3174168404Spjd		arc_callback_t	*acb;
3175185029Spjd		vdev_t *vd = NULL;
3176247187Smm		uint64_t addr = 0;
3177208373Smm		boolean_t devw = B_FALSE;
3178168404Spjd
3179168404Spjd		if (hdr == NULL) {
3180168404Spjd			/* this block is not in the cache */
3181168404Spjd			arc_buf_hdr_t	*exists;
3182168404Spjd			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3183168404Spjd			buf = arc_buf_alloc(spa, size, private, type);
3184168404Spjd			hdr = buf->b_hdr;
3185168404Spjd			hdr->b_dva = *BP_IDENTITY(bp);
3186219089Spjd			hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3187168404Spjd			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3188168404Spjd			exists = buf_hash_insert(hdr, &hash_lock);
3189168404Spjd			if (exists) {
3190168404Spjd				/* somebody beat us to the hash insert */
3191168404Spjd				mutex_exit(hash_lock);
3192219089Spjd				buf_discard_identity(hdr);
3193168404Spjd				(void) arc_buf_remove_ref(buf, private);
3194168404Spjd				goto top; /* restart the IO request */
3195168404Spjd			}
3196168404Spjd			/* if this is a prefetch, we don't have a reference */
3197168404Spjd			if (*arc_flags & ARC_PREFETCH) {
3198168404Spjd				(void) remove_reference(hdr, hash_lock,
3199168404Spjd				    private);
3200168404Spjd				hdr->b_flags |= ARC_PREFETCH;
3201168404Spjd			}
3202185029Spjd			if (*arc_flags & ARC_L2CACHE)
3203185029Spjd				hdr->b_flags |= ARC_L2CACHE;
3204251478Sdelphij			if (*arc_flags & ARC_L2COMPRESS)
3205251478Sdelphij				hdr->b_flags |= ARC_L2COMPRESS;
3206168404Spjd			if (BP_GET_LEVEL(bp) > 0)
3207168404Spjd				hdr->b_flags |= ARC_INDIRECT;
3208168404Spjd		} else {
3209168404Spjd			/* this block is in the ghost cache */
3210168404Spjd			ASSERT(GHOST_STATE(hdr->b_state));
3211168404Spjd			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3212240415Smm			ASSERT0(refcount_count(&hdr->b_refcnt));
3213168404Spjd			ASSERT(hdr->b_buf == NULL);
3214168404Spjd
3215168404Spjd			/* if this is a prefetch, we don't have a reference */
3216168404Spjd			if (*arc_flags & ARC_PREFETCH)
3217168404Spjd				hdr->b_flags |= ARC_PREFETCH;
3218168404Spjd			else
3219168404Spjd				add_reference(hdr, hash_lock, private);
3220185029Spjd			if (*arc_flags & ARC_L2CACHE)
3221185029Spjd				hdr->b_flags |= ARC_L2CACHE;
3222251478Sdelphij			if (*arc_flags & ARC_L2COMPRESS)
3223251478Sdelphij				hdr->b_flags |= ARC_L2COMPRESS;
3224185029Spjd			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3225168404Spjd			buf->b_hdr = hdr;
3226168404Spjd			buf->b_data = NULL;
3227168404Spjd			buf->b_efunc = NULL;
3228168404Spjd			buf->b_private = NULL;
3229168404Spjd			buf->b_next = NULL;
3230168404Spjd			hdr->b_buf = buf;
3231168404Spjd			ASSERT(hdr->b_datacnt == 0);
3232168404Spjd			hdr->b_datacnt = 1;
3233219089Spjd			arc_get_data_buf(buf);
3234219089Spjd			arc_access(hdr, hash_lock);
3235168404Spjd		}
3236168404Spjd
3237219089Spjd		ASSERT(!GHOST_STATE(hdr->b_state));
3238219089Spjd
3239168404Spjd		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3240168404Spjd		acb->acb_done = done;
3241168404Spjd		acb->acb_private = private;
3242168404Spjd
3243168404Spjd		ASSERT(hdr->b_acb == NULL);
3244168404Spjd		hdr->b_acb = acb;
3245168404Spjd		hdr->b_flags |= ARC_IO_IN_PROGRESS;
3246168404Spjd
3247185029Spjd		if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
3248185029Spjd		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3249208373Smm			devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3250185029Spjd			addr = hdr->b_l2hdr->b_daddr;
3251185029Spjd			/*
3252185029Spjd			 * Lock out device removal.
3253185029Spjd			 */
3254185029Spjd			if (vdev_is_dead(vd) ||
3255185029Spjd			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3256185029Spjd				vd = NULL;
3257185029Spjd		}
3258185029Spjd
3259168404Spjd		mutex_exit(hash_lock);
3260168404Spjd
3261251629Sdelphij		/*
3262251629Sdelphij		 * At this point, we have a level 1 cache miss.  Try again in
3263251629Sdelphij		 * L2ARC if possible.
3264251629Sdelphij		 */
3265168404Spjd		ASSERT3U(hdr->b_size, ==, size);
3266219089Spjd		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3267219089Spjd		    uint64_t, size, zbookmark_t *, zb);
3268168404Spjd		ARCSTAT_BUMP(arcstat_misses);
3269168404Spjd		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3270168404Spjd		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3271168404Spjd		    data, metadata, misses);
3272228392Spjd#ifdef _KERNEL
3273228392Spjd		curthread->td_ru.ru_inblock++;
3274228392Spjd#endif
3275168404Spjd
3276208373Smm		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3277185029Spjd			/*
3278185029Spjd			 * Read from the L2ARC if the following are true:
3279185029Spjd			 * 1. The L2ARC vdev was previously cached.
3280185029Spjd			 * 2. This buffer still has L2ARC metadata.
3281185029Spjd			 * 3. This buffer isn't currently being written to the L2ARC.
3282185029Spjd			 * 4. The L2ARC entry wasn't evicted, which may
3283185029Spjd			 *    also have invalidated the vdev.
3284208373Smm			 * 5. This isn't a prefetch while l2arc_noprefetch is set.
3285185029Spjd			 */
3286185029Spjd			if (hdr->b_l2hdr != NULL &&
3287208373Smm			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3288208373Smm			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3289185029Spjd				l2arc_read_callback_t *cb;
3290185029Spjd
3291185029Spjd				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3292185029Spjd				ARCSTAT_BUMP(arcstat_l2_hits);
3293185029Spjd
3294185029Spjd				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3295185029Spjd				    KM_SLEEP);
3296185029Spjd				cb->l2rcb_buf = buf;
3297185029Spjd				cb->l2rcb_spa = spa;
3298185029Spjd				cb->l2rcb_bp = *bp;
3299185029Spjd				cb->l2rcb_zb = *zb;
3300185029Spjd				cb->l2rcb_flags = zio_flags;
3301251478Sdelphij				cb->l2rcb_compress = hdr->b_l2hdr->b_compress;
3302185029Spjd
3303247187Smm				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3304247187Smm				    addr + size < vd->vdev_psize -
3305247187Smm				    VDEV_LABEL_END_SIZE);
3306247187Smm
3307185029Spjd				/*
3308185029Spjd				 * l2arc read.  The SCL_L2ARC lock will be
3309185029Spjd				 * released by l2arc_read_done().
3310251478Sdelphij				 * Issue a null zio if the underlying buffer
3311251478Sdelphij				 * was squashed to zero size by compression.
3312185029Spjd				 */
3313251478Sdelphij				if (hdr->b_l2hdr->b_compress ==
3314251478Sdelphij				    ZIO_COMPRESS_EMPTY) {
3315251478Sdelphij					rzio = zio_null(pio, spa, vd,
3316251478Sdelphij					    l2arc_read_done, cb,
3317251478Sdelphij					    zio_flags | ZIO_FLAG_DONT_CACHE |
3318251478Sdelphij					    ZIO_FLAG_CANFAIL |
3319251478Sdelphij					    ZIO_FLAG_DONT_PROPAGATE |
3320251478Sdelphij					    ZIO_FLAG_DONT_RETRY);
3321251478Sdelphij				} else {
3322251478Sdelphij					rzio = zio_read_phys(pio, vd, addr,
3323251478Sdelphij					    hdr->b_l2hdr->b_asize,
3324251478Sdelphij					    buf->b_data, ZIO_CHECKSUM_OFF,
3325251478Sdelphij					    l2arc_read_done, cb, priority,
3326251478Sdelphij					    zio_flags | ZIO_FLAG_DONT_CACHE |
3327251478Sdelphij					    ZIO_FLAG_CANFAIL |
3328251478Sdelphij					    ZIO_FLAG_DONT_PROPAGATE |
3329251478Sdelphij					    ZIO_FLAG_DONT_RETRY, B_FALSE);
3330251478Sdelphij				}
3331185029Spjd				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3332185029Spjd				    zio_t *, rzio);
3333251478Sdelphij				ARCSTAT_INCR(arcstat_l2_read_bytes,
3334251478Sdelphij				    hdr->b_l2hdr->b_asize);
3335185029Spjd
3336185029Spjd				if (*arc_flags & ARC_NOWAIT) {
3337185029Spjd					zio_nowait(rzio);
3338185029Spjd					return (0);
3339185029Spjd				}
3340185029Spjd
3341185029Spjd				ASSERT(*arc_flags & ARC_WAIT);
3342185029Spjd				if (zio_wait(rzio) == 0)
3343185029Spjd					return (0);
3344185029Spjd
3345185029Spjd				/* l2arc read error; goto zio_read() */
3346185029Spjd			} else {
3347185029Spjd				DTRACE_PROBE1(l2arc__miss,
3348185029Spjd				    arc_buf_hdr_t *, hdr);
3349185029Spjd				ARCSTAT_BUMP(arcstat_l2_misses);
3350185029Spjd				if (HDR_L2_WRITING(hdr))
3351185029Spjd					ARCSTAT_BUMP(arcstat_l2_rw_clash);
3352185029Spjd				spa_config_exit(spa, SCL_L2ARC, vd);
3353185029Spjd			}
3354208373Smm		} else {
3355208373Smm			if (vd != NULL)
3356208373Smm				spa_config_exit(spa, SCL_L2ARC, vd);
3357208373Smm			if (l2arc_ndev != 0) {
3358208373Smm				DTRACE_PROBE1(l2arc__miss,
3359208373Smm				    arc_buf_hdr_t *, hdr);
3360208373Smm				ARCSTAT_BUMP(arcstat_l2_misses);
3361208373Smm			}
3362185029Spjd		}
3363185029Spjd
3364168404Spjd		rzio = zio_read(pio, spa, bp, buf->b_data, size,
3365185029Spjd		    arc_read_done, buf, priority, zio_flags, zb);
3366168404Spjd
3367168404Spjd		if (*arc_flags & ARC_WAIT)
3368168404Spjd			return (zio_wait(rzio));
3369168404Spjd
3370168404Spjd		ASSERT(*arc_flags & ARC_NOWAIT);
3371168404Spjd		zio_nowait(rzio);
3372168404Spjd	}
3373168404Spjd	return (0);
3374168404Spjd}
3375168404Spjd
3376168404Spjdvoid
3377168404Spjdarc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3378168404Spjd{
3379168404Spjd	ASSERT(buf->b_hdr != NULL);
3380168404Spjd	ASSERT(buf->b_hdr->b_state != arc_anon);
3381168404Spjd	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3382219089Spjd	ASSERT(buf->b_efunc == NULL);
3383219089Spjd	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3384219089Spjd
3385168404Spjd	buf->b_efunc = func;
3386168404Spjd	buf->b_private = private;
3387168404Spjd}
3388168404Spjd
3389168404Spjd/*
3390251520Sdelphij * Notify the arc that a block was freed, and thus will never be used again.
3391251520Sdelphij */
3392251520Sdelphijvoid
3393251520Sdelphijarc_freed(spa_t *spa, const blkptr_t *bp)
3394251520Sdelphij{
3395251520Sdelphij	arc_buf_hdr_t *hdr;
3396251520Sdelphij	kmutex_t *hash_lock;
3397251520Sdelphij	uint64_t guid = spa_load_guid(spa);
3398251520Sdelphij
3399251520Sdelphij	hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3400251520Sdelphij	    &hash_lock);
3401251520Sdelphij	if (hdr == NULL)
3402251520Sdelphij		return;
3403251520Sdelphij	if (HDR_BUF_AVAILABLE(hdr)) {
3404251520Sdelphij		arc_buf_t *buf = hdr->b_buf;
3405251520Sdelphij		add_reference(hdr, hash_lock, FTAG);
3406251520Sdelphij		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3407251520Sdelphij		mutex_exit(hash_lock);
3408251520Sdelphij
3409251520Sdelphij		arc_release(buf, FTAG);
3410251520Sdelphij		(void) arc_buf_remove_ref(buf, FTAG);
3411251520Sdelphij	} else {
3412251520Sdelphij		mutex_exit(hash_lock);
3413251520Sdelphij	}
3414251520Sdelphij
3415251520Sdelphij}
3416251520Sdelphij
3417251520Sdelphij/*
3418168404Spjd * This is used by the DMU to let the ARC know that a buffer is
3419168404Spjd * being evicted, so the ARC should clean up.  If this arc buf
3420168404Spjd * is not yet in the evicted state, it will be put there.
3421168404Spjd */
3422168404Spjdint
3423168404Spjdarc_buf_evict(arc_buf_t *buf)
3424168404Spjd{
3425168404Spjd	arc_buf_hdr_t *hdr;
3426168404Spjd	kmutex_t *hash_lock;
3427168404Spjd	arc_buf_t **bufp;
3428205231Skmacy	list_t *list, *evicted_list;
3429205231Skmacy	kmutex_t *lock, *evicted_lock;
3430206796Spjd
3431219089Spjd	mutex_enter(&buf->b_evict_lock);
3432168404Spjd	hdr = buf->b_hdr;
3433168404Spjd	if (hdr == NULL) {
3434168404Spjd		/*
3435168404Spjd		 * We are in arc_do_user_evicts().
3436168404Spjd		 */
3437168404Spjd		ASSERT(buf->b_data == NULL);
3438219089Spjd		mutex_exit(&buf->b_evict_lock);
3439168404Spjd		return (0);
3440185029Spjd	} else if (buf->b_data == NULL) {
3441185029Spjd		arc_buf_t copy = *buf; /* structure assignment */
3442185029Spjd		/*
3443185029Spjd		 * We are on the eviction list; process this buffer now
3444185029Spjd		 * but let arc_do_user_evicts() do the reaping.
3445185029Spjd		 */
3446185029Spjd		buf->b_efunc = NULL;
3447219089Spjd		mutex_exit(&buf->b_evict_lock);
3448185029Spjd		VERIFY(copy.b_efunc(&copy) == 0);
3449185029Spjd		return (1);
3450168404Spjd	}
3451168404Spjd	hash_lock = HDR_LOCK(hdr);
3452168404Spjd	mutex_enter(hash_lock);
3453219089Spjd	hdr = buf->b_hdr;
3454219089Spjd	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3455168404Spjd
3456168404Spjd	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3457168404Spjd	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3458168404Spjd
3459168404Spjd	/*
3460168404Spjd	 * Pull this buffer off of the hdr
3461168404Spjd	 */
3462168404Spjd	bufp = &hdr->b_buf;
3463168404Spjd	while (*bufp != buf)
3464168404Spjd		bufp = &(*bufp)->b_next;
3465168404Spjd	*bufp = buf->b_next;
3466168404Spjd
3467168404Spjd	ASSERT(buf->b_data != NULL);
3468168404Spjd	arc_buf_destroy(buf, FALSE, FALSE);
3469168404Spjd
3470168404Spjd	if (hdr->b_datacnt == 0) {
3471168404Spjd		arc_state_t *old_state = hdr->b_state;
3472168404Spjd		arc_state_t *evicted_state;
3473168404Spjd
3474219089Spjd		ASSERT(hdr->b_buf == NULL);
3475168404Spjd		ASSERT(refcount_is_zero(&hdr->b_refcnt));
3476168404Spjd
3477168404Spjd		evicted_state =
3478168404Spjd		    (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3479168404Spjd
3480205231Skmacy		get_buf_info(hdr, old_state, &list, &lock);
3481205231Skmacy		get_buf_info(hdr, evicted_state, &evicted_list, &evicted_lock);
3482205231Skmacy		mutex_enter(lock);
3483205231Skmacy		mutex_enter(evicted_lock);
3484168404Spjd
3485168404Spjd		arc_change_state(evicted_state, hdr, hash_lock);
3486168404Spjd		ASSERT(HDR_IN_HASH_TABLE(hdr));
3487185029Spjd		hdr->b_flags |= ARC_IN_HASH_TABLE;
3488185029Spjd		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3489168404Spjd
3490205231Skmacy		mutex_exit(evicted_lock);
3491205231Skmacy		mutex_exit(lock);
3492168404Spjd	}
3493168404Spjd	mutex_exit(hash_lock);
3494219089Spjd	mutex_exit(&buf->b_evict_lock);
3495168404Spjd
3496168404Spjd	VERIFY(buf->b_efunc(buf) == 0);
3497168404Spjd	buf->b_efunc = NULL;
3498168404Spjd	buf->b_private = NULL;
3499168404Spjd	buf->b_hdr = NULL;
3500219089Spjd	buf->b_next = NULL;
3501168404Spjd	kmem_cache_free(buf_cache, buf);
3502168404Spjd	return (1);
3503168404Spjd}
3504168404Spjd
3505168404Spjd/*
3506251629Sdelphij * Release this buffer from the cache, making it an anonymous buffer.  This
3507251629Sdelphij * must be done after a read and prior to modifying the buffer contents.
3508168404Spjd * If the buffer has more than one reference, we must make
3509185029Spjd * a new hdr for the buffer.
3510168404Spjd */
3511168404Spjdvoid
3512168404Spjdarc_release(arc_buf_t *buf, void *tag)
3513168404Spjd{
3514185029Spjd	arc_buf_hdr_t *hdr;
3515219089Spjd	kmutex_t *hash_lock = NULL;
3516185029Spjd	l2arc_buf_hdr_t *l2hdr;
3517185029Spjd	uint64_t buf_size;
3518168404Spjd
3519219089Spjd	/*
3520219089Spjd	 * It would be nice to assert that if it's DMU metadata (level >
3521219089Spjd	 * 0 || it's the dnode file), then it must be syncing context.
3522219089Spjd	 * But we don't know that information at this level.
3523219089Spjd	 */
3524219089Spjd
3525219089Spjd	mutex_enter(&buf->b_evict_lock);
3526185029Spjd	hdr = buf->b_hdr;
3527185029Spjd
3528168404Spjd	/* this buffer is not on any list */
3529168404Spjd	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3530168404Spjd
3531168404Spjd	if (hdr->b_state == arc_anon) {
3532168404Spjd		/* this buffer is already released */
3533168404Spjd		ASSERT(buf->b_efunc == NULL);
3534208373Smm	} else {
3535208373Smm		hash_lock = HDR_LOCK(hdr);
3536208373Smm		mutex_enter(hash_lock);
3537219089Spjd		hdr = buf->b_hdr;
3538219089Spjd		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3539168404Spjd	}
3540168404Spjd
3541185029Spjd	l2hdr = hdr->b_l2hdr;
3542185029Spjd	if (l2hdr) {
3543185029Spjd		mutex_enter(&l2arc_buflist_mtx);
3544185029Spjd		hdr->b_l2hdr = NULL;
3545185029Spjd	}
3546247187Smm	buf_size = hdr->b_size;
3547185029Spjd
3548168404Spjd	/*
3549168404Spjd	 * Do we have more than one buf?
3550168404Spjd	 */
3551185029Spjd	if (hdr->b_datacnt > 1) {
3552168404Spjd		arc_buf_hdr_t *nhdr;
3553168404Spjd		arc_buf_t **bufp;
3554168404Spjd		uint64_t blksz = hdr->b_size;
3555209962Smm		uint64_t spa = hdr->b_spa;
3556168404Spjd		arc_buf_contents_t type = hdr->b_type;
3557185029Spjd		uint32_t flags = hdr->b_flags;
3558168404Spjd
3559185029Spjd		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3560168404Spjd		/*
3561219089Spjd		 * Pull the data off of this hdr and attach it to
3562219089Spjd		 * a new anonymous hdr.
3563168404Spjd		 */
3564168404Spjd		(void) remove_reference(hdr, hash_lock, tag);
3565168404Spjd		bufp = &hdr->b_buf;
3566168404Spjd		while (*bufp != buf)
3567168404Spjd			bufp = &(*bufp)->b_next;
3568219089Spjd		*bufp = buf->b_next;
3569168404Spjd		buf->b_next = NULL;
3570168404Spjd
3571168404Spjd		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3572168404Spjd		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3573168404Spjd		if (refcount_is_zero(&hdr->b_refcnt)) {
3574185029Spjd			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3575185029Spjd			ASSERT3U(*size, >=, hdr->b_size);
3576185029Spjd			atomic_add_64(size, -hdr->b_size);
3577168404Spjd		}
3578242845Sdelphij
3579242845Sdelphij		/*
3580242845Sdelphij		 * We're releasing a duplicate user data buffer, update
3581242845Sdelphij		 * our statistics accordingly.
3582242845Sdelphij		 */
3583242845Sdelphij		if (hdr->b_type == ARC_BUFC_DATA) {
3584242845Sdelphij			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3585242845Sdelphij			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3586242845Sdelphij			    -hdr->b_size);
3587242845Sdelphij		}
3588168404Spjd		hdr->b_datacnt -= 1;
3589168404Spjd		arc_cksum_verify(buf);
3590240133Smm#ifdef illumos
3591240133Smm		arc_buf_unwatch(buf);
3592240133Smm#endif /* illumos */
3593168404Spjd
3594168404Spjd		mutex_exit(hash_lock);
3595168404Spjd
3596185029Spjd		nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3597168404Spjd		nhdr->b_size = blksz;
3598168404Spjd		nhdr->b_spa = spa;
3599168404Spjd		nhdr->b_type = type;
3600168404Spjd		nhdr->b_buf = buf;
3601168404Spjd		nhdr->b_state = arc_anon;
3602168404Spjd		nhdr->b_arc_access = 0;
3603185029Spjd		nhdr->b_flags = flags & ARC_L2_WRITING;
3604185029Spjd		nhdr->b_l2hdr = NULL;
3605168404Spjd		nhdr->b_datacnt = 1;
3606168404Spjd		nhdr->b_freeze_cksum = NULL;
3607168404Spjd		(void) refcount_add(&nhdr->b_refcnt, tag);
3608168404Spjd		buf->b_hdr = nhdr;
3609219089Spjd		mutex_exit(&buf->b_evict_lock);
3610168404Spjd		atomic_add_64(&arc_anon->arcs_size, blksz);
3611168404Spjd	} else {
3612219089Spjd		mutex_exit(&buf->b_evict_lock);
3613168404Spjd		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3614168404Spjd		ASSERT(!list_link_active(&hdr->b_arc_node));
3615168404Spjd		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3616219089Spjd		if (hdr->b_state != arc_anon)
3617219089Spjd			arc_change_state(arc_anon, hdr, hash_lock);
3618168404Spjd		hdr->b_arc_access = 0;
3619219089Spjd		if (hash_lock)
3620219089Spjd			mutex_exit(hash_lock);
3621185029Spjd
3622219089Spjd		buf_discard_identity(hdr);
3623168404Spjd		arc_buf_thaw(buf);
3624168404Spjd	}
3625168404Spjd	buf->b_efunc = NULL;
3626168404Spjd	buf->b_private = NULL;
3627185029Spjd
3628185029Spjd	if (l2hdr) {
3629251478Sdelphij		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3630248572Ssmh		trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
3631248574Ssmh		    hdr->b_size, 0);
3632185029Spjd		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3633185029Spjd		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3634185029Spjd		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3635185029Spjd		mutex_exit(&l2arc_buflist_mtx);
3636185029Spjd	}
3637168404Spjd}
3638168404Spjd
3639168404Spjdint
3640168404Spjdarc_released(arc_buf_t *buf)
3641168404Spjd{
3642185029Spjd	int released;
3643185029Spjd
3644219089Spjd	mutex_enter(&buf->b_evict_lock);
3645185029Spjd	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3646219089Spjd	mutex_exit(&buf->b_evict_lock);
3647185029Spjd	return (released);
3648168404Spjd}
3649168404Spjd
3650168404Spjdint
3651168404Spjdarc_has_callback(arc_buf_t *buf)
3652168404Spjd{
3653185029Spjd	int callback;
3654185029Spjd
3655219089Spjd	mutex_enter(&buf->b_evict_lock);
3656185029Spjd	callback = (buf->b_efunc != NULL);
3657219089Spjd	mutex_exit(&buf->b_evict_lock);
3658185029Spjd	return (callback);
3659168404Spjd}
3660168404Spjd
3661168404Spjd#ifdef ZFS_DEBUG
3662168404Spjdint
3663168404Spjdarc_referenced(arc_buf_t *buf)
3664168404Spjd{
3665185029Spjd	int referenced;
3666185029Spjd
3667219089Spjd	mutex_enter(&buf->b_evict_lock);
3668185029Spjd	referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3669219089Spjd	mutex_exit(&buf->b_evict_lock);
3670185029Spjd	return (referenced);
3671168404Spjd}
3672168404Spjd#endif
3673168404Spjd
3674168404Spjdstatic void
3675168404Spjdarc_write_ready(zio_t *zio)
3676168404Spjd{
3677168404Spjd	arc_write_callback_t *callback = zio->io_private;
3678168404Spjd	arc_buf_t *buf = callback->awcb_buf;
3679185029Spjd	arc_buf_hdr_t *hdr = buf->b_hdr;
3680168404Spjd
3681185029Spjd	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3682185029Spjd	callback->awcb_ready(zio, buf, callback->awcb_private);
3683185029Spjd
3684185029Spjd	/*
3685185029Spjd	 * If the IO is already in progress, then this is a re-write
3686185029Spjd	 * attempt, so we need to thaw and re-compute the cksum.
3687185029Spjd	 * It is the responsibility of the callback to handle the
3688185029Spjd	 * accounting for any re-write attempt.
3689185029Spjd	 */
3690185029Spjd	if (HDR_IO_IN_PROGRESS(hdr)) {
3691185029Spjd		mutex_enter(&hdr->b_freeze_lock);
3692185029Spjd		if (hdr->b_freeze_cksum != NULL) {
3693185029Spjd			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3694185029Spjd			hdr->b_freeze_cksum = NULL;
3695185029Spjd		}
3696185029Spjd		mutex_exit(&hdr->b_freeze_lock);
3697168404Spjd	}
3698185029Spjd	arc_cksum_compute(buf, B_FALSE);
3699185029Spjd	hdr->b_flags |= ARC_IO_IN_PROGRESS;
3700168404Spjd}
3701168404Spjd
3702168404Spjdstatic void
3703168404Spjdarc_write_done(zio_t *zio)
3704168404Spjd{
3705168404Spjd	arc_write_callback_t *callback = zio->io_private;
3706168404Spjd	arc_buf_t *buf = callback->awcb_buf;
3707168404Spjd	arc_buf_hdr_t *hdr = buf->b_hdr;
3708168404Spjd
3709219089Spjd	ASSERT(hdr->b_acb == NULL);
3710168404Spjd
3711219089Spjd	if (zio->io_error == 0) {
3712219089Spjd		hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3713219089Spjd		hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3714219089Spjd		hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3715219089Spjd	} else {
3716219089Spjd		ASSERT(BUF_EMPTY(hdr));
3717219089Spjd	}
3718219089Spjd
3719168404Spjd	/*
3720168404Spjd	 * If the block to be written was all-zero, we may have
3721168404Spjd	 * compressed it away.  In this case no write was performed
3722219089Spjd	 * so there will be no dva/birth/checksum.  The buffer must
3723219089Spjd	 * therefore remain anonymous (and uncached).
3724168404Spjd	 */
3725168404Spjd	if (!BUF_EMPTY(hdr)) {
3726168404Spjd		arc_buf_hdr_t *exists;
3727168404Spjd		kmutex_t *hash_lock;
3728168404Spjd
3729219089Spjd		ASSERT(zio->io_error == 0);
3730219089Spjd
3731168404Spjd		arc_cksum_verify(buf);
3732168404Spjd
3733168404Spjd		exists = buf_hash_insert(hdr, &hash_lock);
3734168404Spjd		if (exists) {
3735168404Spjd			/*
3736168404Spjd			 * This can only happen if we overwrite for
3737168404Spjd			 * sync-to-convergence, because we remove
3738168404Spjd			 * buffers from the hash table when we arc_free().
3739168404Spjd			 */
3740219089Spjd			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3741219089Spjd				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3742219089Spjd					panic("bad overwrite, hdr=%p exists=%p",
3743219089Spjd					    (void *)hdr, (void *)exists);
3744219089Spjd				ASSERT(refcount_is_zero(&exists->b_refcnt));
3745219089Spjd				arc_change_state(arc_anon, exists, hash_lock);
3746219089Spjd				mutex_exit(hash_lock);
3747219089Spjd				arc_hdr_destroy(exists);
3748219089Spjd				exists = buf_hash_insert(hdr, &hash_lock);
3749219089Spjd				ASSERT3P(exists, ==, NULL);
3750243524Smm			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3751243524Smm				/* nopwrite */
3752243524Smm				ASSERT(zio->io_prop.zp_nopwrite);
3753243524Smm				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3754243524Smm					panic("bad nopwrite, hdr=%p exists=%p",
3755243524Smm					    (void *)hdr, (void *)exists);
3756219089Spjd			} else {
3757219089Spjd				/* Dedup */
3758219089Spjd				ASSERT(hdr->b_datacnt == 1);
3759219089Spjd				ASSERT(hdr->b_state == arc_anon);
3760219089Spjd				ASSERT(BP_GET_DEDUP(zio->io_bp));
3761219089Spjd				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3762219089Spjd			}
3763168404Spjd		}
3764168404Spjd		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3765185029Spjd		/* if it's not anon, we are doing a scrub */
3766219089Spjd		if (!exists && hdr->b_state == arc_anon)
3767185029Spjd			arc_access(hdr, hash_lock);
3768168404Spjd		mutex_exit(hash_lock);
3769168404Spjd	} else {
3770168404Spjd		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3771168404Spjd	}
3772168404Spjd
3773219089Spjd	ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3774219089Spjd	callback->awcb_done(zio, buf, callback->awcb_private);
3775168404Spjd
3776168404Spjd	kmem_free(callback, sizeof (arc_write_callback_t));
3777168404Spjd}
3778168404Spjd
3779168404Spjdzio_t *
3780219089Spjdarc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3781251478Sdelphij    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3782251478Sdelphij    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
3783251478Sdelphij    void *private, int priority, int zio_flags, const zbookmark_t *zb)
3784168404Spjd{
3785168404Spjd	arc_buf_hdr_t *hdr = buf->b_hdr;
3786168404Spjd	arc_write_callback_t *callback;
3787185029Spjd	zio_t *zio;
3788168404Spjd
3789185029Spjd	ASSERT(ready != NULL);
3790219089Spjd	ASSERT(done != NULL);
3791168404Spjd	ASSERT(!HDR_IO_ERROR(hdr));
3792168404Spjd	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3793219089Spjd	ASSERT(hdr->b_acb == NULL);
3794185029Spjd	if (l2arc)
3795185029Spjd		hdr->b_flags |= ARC_L2CACHE;
3796251478Sdelphij	if (l2arc_compress)
3797251478Sdelphij		hdr->b_flags |= ARC_L2COMPRESS;
3798168404Spjd	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3799168404Spjd	callback->awcb_ready = ready;
3800168404Spjd	callback->awcb_done = done;
3801168404Spjd	callback->awcb_private = private;
3802168404Spjd	callback->awcb_buf = buf;
3803168404Spjd
3804219089Spjd	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3805185029Spjd	    arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3806185029Spjd
3807168404Spjd	return (zio);
3808168404Spjd}
3809168404Spjd
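/*
 * Informal note on the write path above: arc_write() hands
 * arc_write_ready() and arc_write_done() to the child zio.  The ready
 * callback thaws a re-written buffer, recomputes its checksum and marks
 * the header ARC_IO_IN_PROGRESS; the done callback records the new
 * dva/birth, inserts the header into the hash table (handling
 * sync-to-convergence rewrites, nopwrite and dedup collisions) and
 * finally invokes the caller's awcb_done function.
 */
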
3810185029Spjdstatic int
3811209962Smmarc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
3812185029Spjd{
3813185029Spjd#ifdef _KERNEL
3814219089Spjd	uint64_t available_memory =
3815219089Spjd	    ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count);
3816185029Spjd	static uint64_t page_load = 0;
3817185029Spjd	static uint64_t last_txg = 0;
3818185029Spjd
3819219089Spjd#ifdef sun
3820185029Spjd#if defined(__i386)
3821185029Spjd	available_memory =
3822185029Spjd	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3823185029Spjd#endif
3824219089Spjd#endif	/* sun */
3825185029Spjd	if (available_memory >= zfs_write_limit_max)
3826185029Spjd		return (0);
3827185029Spjd
3828185029Spjd	if (txg > last_txg) {
3829185029Spjd		last_txg = txg;
3830185029Spjd		page_load = 0;
3831185029Spjd	}
3832185029Spjd	/*
3833185029Spjd	 * If we are in pageout, we know that memory is already tight,
3834185029Spjd	 * the arc is already going to be evicting, so we just want to
3835185029Spjd	 * continue to let page writes occur as quickly as possible.
3836185029Spjd	 */
3837185029Spjd	if (curproc == pageproc) {
3838185029Spjd		if (page_load > available_memory / 4)
3839249195Smm			return (SET_ERROR(ERESTART));
3840185029Spjd		/* Note: reserve is inflated, so we deflate */
3841185029Spjd		page_load += reserve / 8;
3842185029Spjd		return (0);
3843185029Spjd	} else if (page_load > 0 && arc_reclaim_needed()) {
3844185029Spjd		/* memory is low, delay before restarting */
3845185029Spjd		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3846249195Smm		return (SET_ERROR(EAGAIN));
3847185029Spjd	}
3848185029Spjd	page_load = 0;
3849185029Spjd
3850185029Spjd	if (arc_size > arc_c_min) {
3851185029Spjd		uint64_t evictable_memory =
3852185029Spjd		    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3853185029Spjd		    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3854185029Spjd		    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3855185029Spjd		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3856185029Spjd		available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3857185029Spjd	}
3858185029Spjd
3859185029Spjd	if (inflight_data > available_memory / 4) {
3860185029Spjd		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3861249195Smm		return (SET_ERROR(ERESTART));
3862185029Spjd	}
3863185029Spjd#endif
3864185029Spjd	return (0);
3865185029Spjd}
3866185029Spjd
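/*
 * Informal note on arc_memory_throttle() above: it returns 0 when the
 * write may proceed, ERESTART when the caller should back off and retry
 * (pageout has queued too much, or in-flight dirty data exceeds a
 * quarter of the memory deemed available), and EAGAIN when memory is
 * low and the caller should simply delay before retrying.
 */
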
3867168404Spjdvoid
3868185029Spjdarc_tempreserve_clear(uint64_t reserve)
3869168404Spjd{
3870185029Spjd	atomic_add_64(&arc_tempreserve, -reserve);
3871168404Spjd	ASSERT((int64_t)arc_tempreserve >= 0);
3872168404Spjd}
3873168404Spjd
3874168404Spjdint
3875185029Spjdarc_tempreserve_space(uint64_t reserve, uint64_t txg)
3876168404Spjd{
3877185029Spjd	int error;
3878209962Smm	uint64_t anon_size;
3879185029Spjd
3880168404Spjd#ifdef ZFS_DEBUG
3881168404Spjd	/*
3882168404Spjd	 * Once in a while, fail for no reason.  Everything should cope.
3883168404Spjd	 */
3884168404Spjd	if (spa_get_random(10000) == 0) {
3885168404Spjd		dprintf("forcing random failure\n");
3886249195Smm		return (SET_ERROR(ERESTART));
3887168404Spjd	}
3888168404Spjd#endif
3889185029Spjd	if (reserve > arc_c/4 && !arc_no_grow)
3890185029Spjd		arc_c = MIN(arc_c_max, reserve * 4);
3891185029Spjd	if (reserve > arc_c)
3892249195Smm		return (SET_ERROR(ENOMEM));
3893168404Spjd
3894168404Spjd	/*
3895209962Smm	 * Don't count loaned bufs as in flight dirty data to prevent long
3896209962Smm	 * network delays from blocking transactions that are ready to be
3897209962Smm	 * assigned to a txg.
3898209962Smm	 */
3899209962Smm	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3900209962Smm
3901209962Smm	/*
3902185029Spjd	 * Writes will, almost always, require additional memory allocations
3903185029Spjd	 * in order to compress/encrypt/etc the data.  We therefore need to
3904185029Spjd	 * make sure that there is sufficient available memory for this.
3905185029Spjd	 */
3906209962Smm	if (error = arc_memory_throttle(reserve, anon_size, txg))
3907185029Spjd		return (error);
3908185029Spjd
3909185029Spjd	/*
3910168404Spjd	 * Throttle writes when the amount of dirty data in the cache
3911168404Spjd	 * gets too large.  We try to keep the cache less than half full
3912168404Spjd	 * of dirty blocks so that our sync times don't grow too large.
3913168404Spjd	 * Note: if two requests come in concurrently, we might let them
3914168404Spjd	 * both succeed, when one of them should fail.  Not a huge deal.
3915168404Spjd	 */
3916209962Smm
3917209962Smm	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3918209962Smm	    anon_size > arc_c / 4) {
3919185029Spjd		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3920185029Spjd		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3921185029Spjd		    arc_tempreserve>>10,
3922185029Spjd		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3923185029Spjd		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3924185029Spjd		    reserve>>10, arc_c>>10);
3925249195Smm		return (SET_ERROR(ERESTART));
3926168404Spjd	}
3927185029Spjd	atomic_add_64(&arc_tempreserve, reserve);
3928168404Spjd	return (0);
3929168404Spjd}
3930168404Spjd
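/*
 * Illustrative arithmetic for the throttle above (numbers chosen
 * arbitrarily, not taken from this file): with arc_c = 1GB the
 * reservation fails with ERESTART once anonymous (dirty) data exceeds
 * 256MB (arc_c / 4) and reserve + arc_tempreserve + anon_size together
 * exceed 512MB (arc_c / 2).
 */
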
3931168582Spjdstatic kmutex_t arc_lowmem_lock;
3932168404Spjd#ifdef _KERNEL
3933168566Spjdstatic eventhandler_tag arc_event_lowmem = NULL;
3934168404Spjd
3935168404Spjdstatic void
3936168566Spjdarc_lowmem(void *arg __unused, int howto __unused)
3937168404Spjd{
3938168404Spjd
3939168566Spjd	/* Serialize access via arc_lowmem_lock. */
3940168566Spjd	mutex_enter(&arc_lowmem_lock);
3941219089Spjd	mutex_enter(&arc_reclaim_thr_lock);
3942185029Spjd	needfree = 1;
3943168404Spjd	cv_signal(&arc_reclaim_thr_cv);
3944241773Savg
3945241773Savg	/*
3946241773Savg	 * It is unsafe to block here in arbitrary threads, because we can come
3947241773Savg	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
3948241773Savg	 * with the ARC reclaim thread.
3949241773Savg	 */
3950241773Savg	if (curproc == pageproc) {
3951241773Savg		while (needfree)
3952241773Savg			msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
3953241773Savg	}
3954219089Spjd	mutex_exit(&arc_reclaim_thr_lock);
3955168566Spjd	mutex_exit(&arc_lowmem_lock);
3956168404Spjd}
3957168404Spjd#endif
3958168404Spjd
3959168404Spjdvoid
3960168404Spjdarc_init(void)
3961168404Spjd{
3962219089Spjd	int i, prefetch_tunable_set = 0;
3963205231Skmacy
3964168404Spjd	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3965168404Spjd	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3966168566Spjd	mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
3967168404Spjd
3968168404Spjd	/* Convert seconds to clock ticks */
3969168404Spjd	arc_min_prefetch_lifespan = 1 * hz;
3970168404Spjd
3971168404Spjd	/* Start out with 1/8 of all memory */
3972168566Spjd	arc_c = kmem_size() / 8;
3973219089Spjd
3974219089Spjd#ifdef sun
3975192360Skmacy#ifdef _KERNEL
3976192360Skmacy	/*
3977192360Skmacy	 * On architectures where the physical memory can be larger
3978192360Skmacy	 * than the addressable space (intel in 32-bit mode), we may
3979192360Skmacy	 * need to limit the cache to 1/8 of VM size.
3980192360Skmacy	 */
3981192360Skmacy	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3982192360Skmacy#endif
3983219089Spjd#endif	/* sun */
3984168566Spjd	/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
3985168566Spjd	arc_c_min = MAX(arc_c / 4, 64<<18);
3986168566Spjd	/* set max to 5/8 of all memory, or all but 1GB, whichever is more */
3987168404Spjd	if (arc_c * 8 >= 1<<30)
3988168404Spjd		arc_c_max = (arc_c * 8) - (1<<30);
3989168404Spjd	else
3990168404Spjd		arc_c_max = arc_c_min;
3991175633Spjd	arc_c_max = MAX(arc_c * 5, arc_c_max);
3992219089Spjd
3993168481Spjd#ifdef _KERNEL
3994168404Spjd	/*
3995168404Spjd	 * Allow the tunables to override our calculations if they are
3996168404Spjd	 * reasonable (i.e. over 16MB)
3997168404Spjd	 */
3998219089Spjd	if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size())
3999168404Spjd		arc_c_max = zfs_arc_max;
4000219089Spjd	if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max)
4001168404Spjd		arc_c_min = zfs_arc_min;
4002168481Spjd#endif
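	/*
	 * On FreeBSD the zfs_arc_max/zfs_arc_min overrides above are
	 * typically supplied through the vfs.zfs.arc_max and
	 * vfs.zfs.arc_min loader tunables; that mapping lives in the
	 * platform glue outside this file.
	 */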
4003219089Spjd
4004168404Spjd	arc_c = arc_c_max;
4005168404Spjd	arc_p = (arc_c >> 1);
4006168404Spjd
4007185029Spjd	/* limit meta-data to 1/4 of the arc capacity */
4008185029Spjd	arc_meta_limit = arc_c_max / 4;
4009185029Spjd
4010185029Spjd	/* Allow the tunable to override if it is reasonable */
4011185029Spjd	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4012185029Spjd		arc_meta_limit = zfs_arc_meta_limit;
4013185029Spjd
4014185029Spjd	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
4015185029Spjd		arc_c_min = arc_meta_limit / 2;
4016185029Spjd
4017208373Smm	if (zfs_arc_grow_retry > 0)
4018208373Smm		arc_grow_retry = zfs_arc_grow_retry;
4019208373Smm
4020208373Smm	if (zfs_arc_shrink_shift > 0)
4021208373Smm		arc_shrink_shift = zfs_arc_shrink_shift;
4022208373Smm
4023208373Smm	if (zfs_arc_p_min_shift > 0)
4024208373Smm		arc_p_min_shift = zfs_arc_p_min_shift;
4025208373Smm
4026168404Spjd	/* if kmem_flags are set, lets try to use less memory */
4027168404Spjd	if (kmem_debugging())
4028168404Spjd		arc_c = arc_c / 2;
4029168404Spjd	if (arc_c < arc_c_min)
4030168404Spjd		arc_c = arc_c_min;
4031168404Spjd
4032168473Spjd	zfs_arc_min = arc_c_min;
4033168473Spjd	zfs_arc_max = arc_c_max;
4034168473Spjd
4035168404Spjd	arc_anon = &ARC_anon;
4036168404Spjd	arc_mru = &ARC_mru;
4037168404Spjd	arc_mru_ghost = &ARC_mru_ghost;
4038168404Spjd	arc_mfu = &ARC_mfu;
4039168404Spjd	arc_mfu_ghost = &ARC_mfu_ghost;
4040185029Spjd	arc_l2c_only = &ARC_l2c_only;
4041168404Spjd	arc_size = 0;
4042168404Spjd
4043205231Skmacy	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4044205231Skmacy		mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
4045205231Skmacy		    NULL, MUTEX_DEFAULT, NULL);
4046205231Skmacy		mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
4047205231Skmacy		    NULL, MUTEX_DEFAULT, NULL);
4048205231Skmacy		mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
4049205231Skmacy		    NULL, MUTEX_DEFAULT, NULL);
4050205231Skmacy		mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
4051205231Skmacy		    NULL, MUTEX_DEFAULT, NULL);
4052205231Skmacy		mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
4053205231Skmacy		    NULL, MUTEX_DEFAULT, NULL);
4054205231Skmacy		mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
4055205231Skmacy		    NULL, MUTEX_DEFAULT, NULL);
4056206796Spjd
4057205231Skmacy		list_create(&arc_mru->arcs_lists[i],
4058205231Skmacy		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4059205231Skmacy		list_create(&arc_mru_ghost->arcs_lists[i],
4060205231Skmacy		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4061205231Skmacy		list_create(&arc_mfu->arcs_lists[i],
4062205231Skmacy		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4063205231Skmacy		list_create(&arc_mfu_ghost->arcs_lists[i],
4064205231Skmacy		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4067205231Skmacy		list_create(&arc_l2c_only->arcs_lists[i],
4068205231Skmacy		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4069205231Skmacy	}
4070168404Spjd
4071168404Spjd	buf_init();
4072168404Spjd
4073168404Spjd	arc_thread_exit = 0;
4074168404Spjd	arc_eviction_list = NULL;
4075168404Spjd	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4076168404Spjd	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4077168404Spjd
4078168404Spjd	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4079168404Spjd	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4080168404Spjd
4081168404Spjd	if (arc_ksp != NULL) {
4082168404Spjd		arc_ksp->ks_data = &arc_stats;
4083168404Spjd		kstat_install(arc_ksp);
4084168404Spjd	}
4085168404Spjd
4086168404Spjd	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4087168404Spjd	    TS_RUN, minclsyspri);
4088168404Spjd
4089168404Spjd#ifdef _KERNEL
4090168566Spjd	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
4091168404Spjd	    EVENTHANDLER_PRI_FIRST);
4092168404Spjd#endif
4093168404Spjd
4094168404Spjd	arc_dead = FALSE;
4095185029Spjd	arc_warm = B_FALSE;
4096168566Spjd
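	/*
	 * Editor's summary of the write-throttle setup below: if no explicit
	 * zfs_write_limit_max was set, derive it from physical memory using
	 * zfs_write_limit_shift; otherwise honor the explicit limit and
	 * disable the shift-based scaling.
	 */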
4097185029Spjd	if (zfs_write_limit_max == 0)
4098185029Spjd		zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
4099185029Spjd	else
4100185029Spjd		zfs_write_limit_shift = 0;
4101185029Spjd	mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
4102185029Spjd
4103168566Spjd#ifdef _KERNEL
4104194043Skmacy	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
4105193953Skmacy		prefetch_tunable_set = 1;
4106206796Spjd
4107193878Skmacy#ifdef __i386__
4108193953Skmacy	if (prefetch_tunable_set == 0) {
4109196863Strasz		printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
4110196863Strasz		    "-- to enable,\n");
4111196863Strasz		printf("            add \"vfs.zfs.prefetch_disable=0\" "
4112196863Strasz		    "to /boot/loader.conf.\n");
4113219089Spjd		zfs_prefetch_disable = 1;
4114193878Skmacy	}
4115206796Spjd#else
4116193878Skmacy	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
4117193953Skmacy	    prefetch_tunable_set == 0) {
4118196863Strasz		printf("ZFS NOTICE: Prefetch is disabled by default if less "
4119196941Strasz		    "than 4GB of RAM is present;\n"
4120196863Strasz		    "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
4121196863Strasz		    "to /boot/loader.conf.\n");
4122219089Spjd		zfs_prefetch_disable = 1;
4123193878Skmacy	}
4124206796Spjd#endif
4125175633Spjd	/* Warn about ZFS memory and address space requirements. */
4126168696Spjd	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
4127168987Sbmah		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
4128168987Sbmah		    "expect unstable behavior.\n");
4129175633Spjd	}
4130175633Spjd	if (kmem_size() < 512 * (1 << 20)) {
4131173419Spjd		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
4132168987Sbmah		    "expect unstable behavior.\n");
4133185029Spjd		printf("             Consider tuning vm.kmem_size and "
4134173419Spjd		    "vm.kmem_size_max\n");
4135185029Spjd		printf("             in /boot/loader.conf.\n");
4136168566Spjd	}
4137168566Spjd#endif
4138168404Spjd}
4139168404Spjd
4140168404Spjdvoid
4141168404Spjdarc_fini(void)
4142168404Spjd{
4143205231Skmacy	int i;
4144206796Spjd
4145168404Spjd	mutex_enter(&arc_reclaim_thr_lock);
4146168404Spjd	arc_thread_exit = 1;
4147168404Spjd	cv_signal(&arc_reclaim_thr_cv);
4148168404Spjd	while (arc_thread_exit != 0)
4149168404Spjd		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4150168404Spjd	mutex_exit(&arc_reclaim_thr_lock);
4151168404Spjd
4152185029Spjd	arc_flush(NULL);
4153168404Spjd
4154168404Spjd	arc_dead = TRUE;
4155168404Spjd
4156168404Spjd	if (arc_ksp != NULL) {
4157168404Spjd		kstat_delete(arc_ksp);
4158168404Spjd		arc_ksp = NULL;
4159168404Spjd	}
4160168404Spjd
4161168404Spjd	mutex_destroy(&arc_eviction_mtx);
4162168404Spjd	mutex_destroy(&arc_reclaim_thr_lock);
4163168404Spjd	cv_destroy(&arc_reclaim_thr_cv);
4164168404Spjd
4165205231Skmacy	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4166205231Skmacy		list_destroy(&arc_mru->arcs_lists[i]);
4167205231Skmacy		list_destroy(&arc_mru_ghost->arcs_lists[i]);
4168205231Skmacy		list_destroy(&arc_mfu->arcs_lists[i]);
4169205231Skmacy		list_destroy(&arc_mfu_ghost->arcs_lists[i]);
4170206795Spjd		list_destroy(&arc_l2c_only->arcs_lists[i]);
4171168404Spjd
4172205231Skmacy		mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
4173205231Skmacy		mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
4174205231Skmacy		mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
4175205231Skmacy		mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
4176205231Skmacy		mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
4177206795Spjd		mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
4178205231Skmacy	}
4179206796Spjd
4180185029Spjd	mutex_destroy(&zfs_write_limit_lock);
4181185029Spjd
4182168404Spjd	buf_fini();
4183168404Spjd
4184209962Smm	ASSERT(arc_loaned_bytes == 0);
4185209962Smm
4186168582Spjd	mutex_destroy(&arc_lowmem_lock);
4187168404Spjd#ifdef _KERNEL
4188168566Spjd	if (arc_event_lowmem != NULL)
4189168566Spjd		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
4190168404Spjd#endif
4191168404Spjd}
4192185029Spjd
4193185029Spjd/*
4194185029Spjd * Level 2 ARC
4195185029Spjd *
4196185029Spjd * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
4197185029Spjd * It uses dedicated storage devices to hold cached data, which are populated
4198185029Spjd * using large infrequent writes.  The main role of this cache is to boost
4199185029Spjd * the performance of random read workloads.  The intended L2ARC devices
4200185029Spjd * include short-stroked disks, solid state disks, and other media with
4201185029Spjd * substantially faster read latency than disk.
4202185029Spjd *
4203185029Spjd *                 +-----------------------+
4204185029Spjd *                 |         ARC           |
4205185029Spjd *                 +-----------------------+
4206185029Spjd *                    |         ^     ^
4207185029Spjd *                    |         |     |
4208185029Spjd *      l2arc_feed_thread()    arc_read()
4209185029Spjd *                    |         |     |
4210185029Spjd *                    |  l2arc read   |
4211185029Spjd *                    V         |     |
4212185029Spjd *               +---------------+    |
4213185029Spjd *               |     L2ARC     |    |
4214185029Spjd *               +---------------+    |
4215185029Spjd *                   |    ^           |
4216185029Spjd *          l2arc_write() |           |
4217185029Spjd *                   |    |           |
4218185029Spjd *                   V    |           |
4219185029Spjd *                 +-------+      +-------+
4220185029Spjd *                 | vdev  |      | vdev  |
4221185029Spjd *                 | cache |      | cache |
4222185029Spjd *                 +-------+      +-------+
4223185029Spjd *                 +=========+     .-----.
4224185029Spjd *                 :  L2ARC  :    |-_____-|
4225185029Spjd *                 : devices :    | Disks |
4226185029Spjd *                 +=========+    `-_____-'
4227185029Spjd *
4228185029Spjd * Read requests are satisfied from the following sources, in order:
4229185029Spjd *
4230185029Spjd *	1) ARC
4231185029Spjd *	2) vdev cache of L2ARC devices
4232185029Spjd *	3) L2ARC devices
4233185029Spjd *	4) vdev cache of disks
4234185029Spjd *	5) disks
4235185029Spjd *
4236185029Spjd * Some L2ARC device types exhibit extremely slow write performance.
4237185029Spjd * To accommodate for this there are some significant differences between
4238185029Spjd * To accommodate this, there are some significant differences between
4239185029Spjd *
4240185029Spjd * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
4241185029Spjd * the ARC behave as usual, freeing buffers and placing headers on ghost
4242185029Spjd * lists.  The ARC does not send buffers to the L2ARC during eviction as
4243185029Spjd * this would add inflated write latencies for all ARC memory pressure.
4244185029Spjd *
4245185029Spjd * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4246185029Spjd * It does this by periodically scanning buffers from the eviction-end of
4247185029Spjd * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4248251478Sdelphij * not already there. It scans until a headroom of buffers is satisfied,
4249251478Sdelphij * which itself is a buffer for ARC eviction. If a compressible buffer is
4250251478Sdelphij * found during scanning and selected for writing to an L2ARC device, we
4251251478Sdelphij * temporarily boost scanning headroom during the next scan cycle to make
4252251478Sdelphij * sure we adapt to compression effects (which might significantly reduce
4253251478Sdelphij * the data volume we write to L2ARC). The thread that does this is
4254185029Spjd * l2arc_feed_thread(), illustrated below; example sizes are included to
4255185029Spjd * provide a better sense of ratio than this diagram:
4256185029Spjd *
4257185029Spjd *	       head -->                        tail
4258185029Spjd *	        +---------------------+----------+
4259185029Spjd *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
4260185029Spjd *	        +---------------------+----------+   |   o L2ARC eligible
4261185029Spjd *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
4262185029Spjd *	        +---------------------+----------+   |
4263185029Spjd *	             15.9 Gbytes      ^ 32 Mbytes    |
4264185029Spjd *	                           headroom          |
4265185029Spjd *	                                      l2arc_feed_thread()
4266185029Spjd *	                                             |
4267185029Spjd *	                 l2arc write hand <--[oooo]--'
4268185029Spjd *	                         |           8 Mbyte
4269185029Spjd *	                         |          write max
4270185029Spjd *	                         V
4271185029Spjd *		  +==============================+
4272185029Spjd *	L2ARC dev |####|#|###|###|    |####| ... |
4273185029Spjd *	          +==============================+
4274185029Spjd *	                     32 Gbytes
4275185029Spjd *
4276185029Spjd * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4277185029Spjd * evicted, then the L2ARC has cached a buffer much sooner than it probably
4278185029Spjd * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
4279185029Spjd * safe to say that this is an uncommon case, since buffers at the end of
4280185029Spjd * the ARC lists have moved there due to inactivity.
4281185029Spjd *
4282185029Spjd * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4283185029Spjd * then the L2ARC simply misses copying some buffers.  This serves as a
4284185029Spjd * pressure valve to prevent heavy read workloads from both stalling the ARC
4285185029Spjd * with waits and clogging the L2ARC with writes.  This also helps prevent
4286185029Spjd * the potential for the L2ARC to churn if it attempts to cache content too
4287185029Spjd * quickly, such as during backups of the entire pool.
4288185029Spjd *
4289185029Spjd * 5. After system boot and before the ARC has filled main memory, there are
4290185029Spjd * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4291185029Spjd * lists can remain mostly static.  Instead of searching from tail of these
4292185029Spjd * lists as pictured, the l2arc_feed_thread() will search from the list heads
4293185029Spjd * for eligible buffers, greatly increasing its chance of finding them.
4294185029Spjd *
4295185029Spjd * The L2ARC device write speed is also boosted during this time so that
4296185029Spjd * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
4297185029Spjd * there are no L2ARC reads, and no fear of degrading read performance
4298185029Spjd * through increased writes.
4299185029Spjd *
4300185029Spjd * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4301185029Spjd * the vdev queue can aggregate them into larger and fewer writes.  Each
4302185029Spjd * device is written to in a rotor fashion, sweeping writes through
4303185029Spjd * available space then repeating.
4304185029Spjd *
4305185029Spjd * 7. The L2ARC does not store dirty content.  It never needs to flush
4306185029Spjd * write buffers back to disk based storage.
4307185029Spjd *
4308185029Spjd * 8. If an ARC buffer is written (and dirtied) which also exists in the
4309185029Spjd * L2ARC, the now stale L2ARC buffer is immediately dropped.
4310185029Spjd *
4311185029Spjd * The performance of the L2ARC can be tweaked by a number of tunables, which
4312185029Spjd * may be necessary for different workloads:
4313185029Spjd *
4314185029Spjd *	l2arc_write_max		max write bytes per interval
4315185029Spjd *	l2arc_write_boost	extra write bytes during device warmup
4316185029Spjd *	l2arc_noprefetch	skip caching prefetched buffers
4317185029Spjd *	l2arc_headroom		number of max device writes to precache
4318251478Sdelphij *	l2arc_headroom_boost	when we find compressed buffers during ARC
4319251478Sdelphij *				scanning, we multiply headroom by this
4320251478Sdelphij *				percentage factor for the next scan cycle,
4321251478Sdelphij *				since more compressed buffers are likely to
4322251478Sdelphij *				be present
4323185029Spjd *	l2arc_feed_secs		seconds between L2ARC writing
4324185029Spjd *
4325185029Spjd * Tunables may be removed or added as future performance improvements are
4326185029Spjd * integrated, and also may become zpool properties.
4327208373Smm *
4328208373Smm * There are three key functions that control how the L2ARC warms up:
4329208373Smm *
4330208373Smm *	l2arc_write_eligible()	check if a buffer is eligible to cache
4331208373Smm *	l2arc_write_size()	calculate how much to write
4332208373Smm *	l2arc_write_interval()	calculate sleep delay between writes
4333208373Smm *
4334208373Smm * These three functions determine what to write, how much, and how quickly
4335208373Smm * to send writes.
4336185029Spjd */
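
/*
 * Editor's illustration (assuming the stock defaults of l2arc_write_max =
 * 8MB and l2arc_write_boost = 8MB): while arc_warm is still B_FALSE,
 * l2arc_write_size() below returns 16MB per feed interval; l2arc_evict()
 * clears that much space ahead of the device write hand before
 * l2arc_write_buffers() copies up to 16MB of eligible tail buffers.
 */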
4337185029Spjd
4338208373Smmstatic boolean_t
4339209962Smml2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4340208373Smm{
4341208373Smm	/*
4342208373Smm	 * A buffer is *not* eligible for the L2ARC if it:
4343208373Smm	 * 1. belongs to a different spa.
4344208373Smm	 * 2. is already cached on the L2ARC.
4345208373Smm	 * 3. has an I/O in progress (it may be an incomplete read).
4346208373Smm	 * 4. is flagged not eligible (zfs property).
4347208373Smm	 * 4. is flagged not eligible (the "secondarycache" zfs property).
4348209962Smm	if (ab->b_spa != spa_guid) {
4349208373Smm		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
4350208373Smm		return (B_FALSE);
4351208373Smm	}
4352208373Smm	if (ab->b_l2hdr != NULL) {
4353208373Smm		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
4354208373Smm		return (B_FALSE);
4355208373Smm	}
4356208373Smm	if (HDR_IO_IN_PROGRESS(ab)) {
4357208373Smm		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
4358208373Smm		return (B_FALSE);
4359208373Smm	}
4360208373Smm	if (!HDR_L2CACHE(ab)) {
4361208373Smm		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
4362208373Smm		return (B_FALSE);
4363208373Smm	}
4364208373Smm
4365208373Smm	return (B_TRUE);
4366208373Smm}
4367208373Smm
4368208373Smmstatic uint64_t
4369251478Sdelphijl2arc_write_size(void)
4370208373Smm{
4371208373Smm	uint64_t size;
4372208373Smm
4373251478Sdelphij	/*
4374251478Sdelphij	 * Make sure our globals have meaningful values in case the user
4375251478Sdelphij	 * altered them.
4376251478Sdelphij	 */
4377251478Sdelphij	size = l2arc_write_max;
4378251478Sdelphij	if (size == 0) {
4379251478Sdelphij		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4380251478Sdelphij		    "be greater than zero, resetting it to the default (%d)",
4381251478Sdelphij		    L2ARC_WRITE_SIZE);
4382251478Sdelphij		size = l2arc_write_max = L2ARC_WRITE_SIZE;
4383251478Sdelphij	}
4384208373Smm
4385208373Smm	if (arc_warm == B_FALSE)
4386251478Sdelphij		size += l2arc_write_boost;
4387208373Smm
4388208373Smm	return (size);
4390208373Smm}
4391208373Smm
4392208373Smmstatic clock_t
4393208373Smml2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4394208373Smm{
4395219089Spjd	clock_t interval, next, now;
4396208373Smm
4397208373Smm	/*
4398208373Smm	 * If the ARC lists are busy, increase our write rate; if the
4399208373Smm	 * lists are stale, idle back.  This is achieved by checking
4400208373Smm	 * how much we previously wrote - if it was more than half of
4401208373Smm	 * what we wanted, schedule the next write much sooner.
4402208373Smm	 */
4403208373Smm	if (l2arc_feed_again && wrote > (wanted / 2))
4404208373Smm		interval = (hz * l2arc_feed_min_ms) / 1000;
4405208373Smm	else
4406208373Smm		interval = hz * l2arc_feed_secs;
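	/*
	 * Editor's note (illustrative, assuming the default tunables of
	 * l2arc_feed_secs = 1 and l2arc_feed_min_ms = 200): a busy cycle
	 * schedules the next write roughly 200ms out, an idle one roughly
	 * one second out.
	 */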
4407208373Smm
4408219089Spjd	now = ddi_get_lbolt();
4409219089Spjd	next = MAX(now, MIN(now + interval, began + interval));
4410208373Smm
4411208373Smm	return (next);
4412208373Smm}
4413208373Smm
4414185029Spjdstatic void
4415185029Spjdl2arc_hdr_stat_add(void)
4416185029Spjd{
4417185029Spjd	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4418185029Spjd	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4419185029Spjd}
4420185029Spjd
4421185029Spjdstatic void
4422185029Spjdl2arc_hdr_stat_remove(void)
4423185029Spjd{
4424185029Spjd	ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4425185029Spjd	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4426185029Spjd}
4427185029Spjd
4428185029Spjd/*
4429185029Spjd * Cycle through L2ARC devices.  This is how L2ARC load balances.
4430185029Spjd * If a device is returned, this also returns holding the spa config lock.
4431185029Spjd */
4432185029Spjdstatic l2arc_dev_t *
4433185029Spjdl2arc_dev_get_next(void)
4434185029Spjd{
4435185029Spjd	l2arc_dev_t *first, *next = NULL;
4436185029Spjd
4437185029Spjd	/*
4438185029Spjd	 * Lock out the removal of spas (spa_namespace_lock), then removal
4439185029Spjd	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4440185029Spjd	 * both locks will be dropped and a spa config lock held instead.
4441185029Spjd	 */
4442185029Spjd	mutex_enter(&spa_namespace_lock);
4443185029Spjd	mutex_enter(&l2arc_dev_mtx);
4444185029Spjd
4445185029Spjd	/* if there are no vdevs, there is nothing to do */
4446185029Spjd	if (l2arc_ndev == 0)
4447185029Spjd		goto out;
4448185029Spjd
4449185029Spjd	first = NULL;
4450185029Spjd	next = l2arc_dev_last;
4451185029Spjd	do {
4452185029Spjd		/* loop around the list looking for a non-faulted vdev */
4453185029Spjd		if (next == NULL) {
4454185029Spjd			next = list_head(l2arc_dev_list);
4455185029Spjd		} else {
4456185029Spjd			next = list_next(l2arc_dev_list, next);
4457185029Spjd			if (next == NULL)
4458185029Spjd				next = list_head(l2arc_dev_list);
4459185029Spjd		}
4460185029Spjd
4461185029Spjd		/* if we have come back to the start, bail out */
4462185029Spjd		if (first == NULL)
4463185029Spjd			first = next;
4464185029Spjd		else if (next == first)
4465185029Spjd			break;
4466185029Spjd
4467185029Spjd	} while (vdev_is_dead(next->l2ad_vdev));
4468185029Spjd
4469185029Spjd	/* if we were unable to find any usable vdevs, return NULL */
4470185029Spjd	if (vdev_is_dead(next->l2ad_vdev))
4471185029Spjd		next = NULL;
4472185029Spjd
4473185029Spjd	l2arc_dev_last = next;
4474185029Spjd
4475185029Spjdout:
4476185029Spjd	mutex_exit(&l2arc_dev_mtx);
4477185029Spjd
4478185029Spjd	/*
4479185029Spjd	 * Grab the config lock to prevent the 'next' device from being
4480185029Spjd	 * removed while we are writing to it.
4481185029Spjd	 */
4482185029Spjd	if (next != NULL)
4483185029Spjd		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4484185029Spjd	mutex_exit(&spa_namespace_lock);
4485185029Spjd
4486185029Spjd	return (next);
4487185029Spjd}
4488185029Spjd
4489185029Spjd/*
4490185029Spjd * Free buffers that were tagged for destruction.
4491185029Spjd */
4492185029Spjdstatic void
4493185029Spjdl2arc_do_free_on_write(void)
4494185029Spjd{
4495185029Spjd	list_t *buflist;
4496185029Spjd	l2arc_data_free_t *df, *df_prev;
4497185029Spjd
4498185029Spjd	mutex_enter(&l2arc_free_on_write_mtx);
4499185029Spjd	buflist = l2arc_free_on_write;
4500185029Spjd
4501185029Spjd	for (df = list_tail(buflist); df; df = df_prev) {
4502185029Spjd		df_prev = list_prev(buflist, df);
4503185029Spjd		ASSERT(df->l2df_data != NULL);
4504185029Spjd		ASSERT(df->l2df_func != NULL);
4505185029Spjd		df->l2df_func(df->l2df_data, df->l2df_size);
4506185029Spjd		list_remove(buflist, df);
4507185029Spjd		kmem_free(df, sizeof (l2arc_data_free_t));
4508185029Spjd	}
4509185029Spjd
4510185029Spjd	mutex_exit(&l2arc_free_on_write_mtx);
4511185029Spjd}
4512185029Spjd
4513185029Spjd/*
4514185029Spjd * A write to a cache device has completed.  Update all headers to allow
4515185029Spjd * reads from these buffers to begin.
4516185029Spjd */
4517185029Spjdstatic void
4518185029Spjdl2arc_write_done(zio_t *zio)
4519185029Spjd{
4520185029Spjd	l2arc_write_callback_t *cb;
4521185029Spjd	l2arc_dev_t *dev;
4522185029Spjd	list_t *buflist;
4523185029Spjd	arc_buf_hdr_t *head, *ab, *ab_prev;
4524185029Spjd	l2arc_buf_hdr_t *abl2;
4525185029Spjd	kmutex_t *hash_lock;
4526185029Spjd
4527185029Spjd	cb = zio->io_private;
4528185029Spjd	ASSERT(cb != NULL);
4529185029Spjd	dev = cb->l2wcb_dev;
4530185029Spjd	ASSERT(dev != NULL);
4531185029Spjd	head = cb->l2wcb_head;
4532185029Spjd	ASSERT(head != NULL);
4533185029Spjd	buflist = dev->l2ad_buflist;
4534185029Spjd	ASSERT(buflist != NULL);
4535185029Spjd	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4536185029Spjd	    l2arc_write_callback_t *, cb);
4537185029Spjd
4538185029Spjd	if (zio->io_error != 0)
4539185029Spjd		ARCSTAT_BUMP(arcstat_l2_writes_error);
4540185029Spjd
4541185029Spjd	mutex_enter(&l2arc_buflist_mtx);
4542185029Spjd
4543185029Spjd	/*
4544185029Spjd	 * All writes completed, or an error was hit.
4545185029Spjd	 */
4546185029Spjd	for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4547185029Spjd		ab_prev = list_prev(buflist, ab);
4548185029Spjd
4549185029Spjd		hash_lock = HDR_LOCK(ab);
4550185029Spjd		if (!mutex_tryenter(hash_lock)) {
4551185029Spjd			/*
4552185029Spjd			 * This buffer misses out.  It may be in a stage
4553185029Spjd			 * of eviction.  Its ARC_L2_WRITING flag will be
4554185029Spjd			 * left set, denying reads to this buffer.
4555185029Spjd			 */
4556185029Spjd			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4557185029Spjd			continue;
4558185029Spjd		}
4559185029Spjd
4560251478Sdelphij		abl2 = ab->b_l2hdr;
4561251478Sdelphij
4562251478Sdelphij		/*
4563251478Sdelphij		 * Release the temporary compressed buffer as soon as possible.
4564251478Sdelphij		 */
4565251478Sdelphij		if (abl2->b_compress != ZIO_COMPRESS_OFF)
4566251478Sdelphij			l2arc_release_cdata_buf(ab);
4567251478Sdelphij
4568185029Spjd		if (zio->io_error != 0) {
4569185029Spjd			/*
4570185029Spjd			 * Error - drop L2ARC entry.
4571185029Spjd			 */
4572185029Spjd			list_remove(buflist, ab);
4573251478Sdelphij			ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4574185029Spjd			ab->b_l2hdr = NULL;
4575248572Ssmh			trim_map_free(abl2->b_dev->l2ad_vdev, abl2->b_daddr,
4576248574Ssmh			    ab->b_size, 0);
4577185029Spjd			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4578185029Spjd			ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4579185029Spjd		}
4580185029Spjd
4581185029Spjd		/*
4582185029Spjd		 * Allow ARC to begin reads to this L2ARC entry.
4583185029Spjd		 */
4584185029Spjd		ab->b_flags &= ~ARC_L2_WRITING;
4585185029Spjd
4586185029Spjd		mutex_exit(hash_lock);
4587185029Spjd	}
4588185029Spjd
4589185029Spjd	atomic_inc_64(&l2arc_writes_done);
4590185029Spjd	list_remove(buflist, head);
4591185029Spjd	kmem_cache_free(hdr_cache, head);
4592185029Spjd	mutex_exit(&l2arc_buflist_mtx);
4593185029Spjd
4594185029Spjd	l2arc_do_free_on_write();
4595185029Spjd
4596185029Spjd	kmem_free(cb, sizeof (l2arc_write_callback_t));
4597185029Spjd}
4598185029Spjd
4599185029Spjd/*
4600185029Spjd * A read to a cache device completed.  Validate buffer contents before
4601185029Spjd * handing over to the regular ARC routines.
4602185029Spjd */
4603185029Spjdstatic void
4604185029Spjdl2arc_read_done(zio_t *zio)
4605185029Spjd{
4606185029Spjd	l2arc_read_callback_t *cb;
4607185029Spjd	arc_buf_hdr_t *hdr;
4608185029Spjd	arc_buf_t *buf;
4609185029Spjd	kmutex_t *hash_lock;
4610185029Spjd	int equal;
4611185029Spjd
4612185029Spjd	ASSERT(zio->io_vd != NULL);
4613185029Spjd	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4614185029Spjd
4615185029Spjd	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4616185029Spjd
4617185029Spjd	cb = zio->io_private;
4618185029Spjd	ASSERT(cb != NULL);
4619185029Spjd	buf = cb->l2rcb_buf;
4620185029Spjd	ASSERT(buf != NULL);
4621185029Spjd
4622219089Spjd	hash_lock = HDR_LOCK(buf->b_hdr);
4623185029Spjd	mutex_enter(hash_lock);
4624219089Spjd	hdr = buf->b_hdr;
4625219089Spjd	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4626185029Spjd
4627185029Spjd	/*
4628251478Sdelphij	 * If the buffer was compressed, decompress it first.
4629251478Sdelphij	 */
4630251478Sdelphij	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4631251478Sdelphij		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4632251478Sdelphij	ASSERT(zio->io_data != NULL);
4633251478Sdelphij
4634251478Sdelphij	/*
4635185029Spjd	 * Check this survived the L2ARC journey.
4636185029Spjd	 */
4637185029Spjd	equal = arc_cksum_equal(buf);
4638185029Spjd	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4639185029Spjd		mutex_exit(hash_lock);
4640185029Spjd		zio->io_private = buf;
4641185029Spjd		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
4642185029Spjd		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
4643185029Spjd		arc_read_done(zio);
4644185029Spjd	} else {
4645185029Spjd		mutex_exit(hash_lock);
4646185029Spjd		/*
4647185029Spjd		 * Buffer didn't survive caching.  Increment stats and
4648185029Spjd		 * reissue to the original storage device.
4649185029Spjd		 */
4650185029Spjd		if (zio->io_error != 0) {
4651185029Spjd			ARCSTAT_BUMP(arcstat_l2_io_error);
4652185029Spjd		} else {
4653249195Smm			zio->io_error = SET_ERROR(EIO);
4654185029Spjd		}
4655185029Spjd		if (!equal)
4656185029Spjd			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4657185029Spjd
4658185029Spjd		/*
4659185029Spjd		 * If there's no waiter, issue an async i/o to the primary
4660185029Spjd		 * storage now.  If there *is* a waiter, the caller must
4661185029Spjd		 * issue the i/o in a context where it's OK to block.
4662185029Spjd		 */
4663209962Smm		if (zio->io_waiter == NULL) {
4664209962Smm			zio_t *pio = zio_unique_parent(zio);
4665209962Smm
4666209962Smm			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4667209962Smm
4668209962Smm			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4669185029Spjd			    buf->b_data, zio->io_size, arc_read_done, buf,
4670185029Spjd			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4671209962Smm		}
4672185029Spjd	}
4673185029Spjd
4674185029Spjd	kmem_free(cb, sizeof (l2arc_read_callback_t));
4675185029Spjd}
4676185029Spjd
4677185029Spjd/*
4678185029Spjd * This is the list priority from which the L2ARC will search for pages to
4679185029Spjd * cache.  This is used within loops (0..3) to cycle through lists in the
4680185029Spjd * desired order.  This order can have a significant effect on cache
4681185029Spjd * performance.
4682185029Spjd *
4683185029Spjd * Currently the metadata lists are hit first, MFU then MRU, followed by
4684185029Spjd * the data lists.  This function returns a locked list, and also returns
4685185029Spjd * the lock pointer.
4686185029Spjd */
4687185029Spjdstatic list_t *
4688185029Spjdl2arc_list_locked(int list_num, kmutex_t **lock)
4689185029Spjd{
4690247187Smm	list_t *list = NULL;
4691205231Skmacy	int idx;
4692185029Spjd
4693206796Spjd	ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
4694206796Spjd
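	/*
	 * Editor's summary of the branches below: list_num selects, in order,
	 * the MFU metadata lists, the MRU metadata lists, the MFU data lists
	 * and finally the MRU data lists.
	 */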
4695205231Skmacy	if (list_num < ARC_BUFC_NUMMETADATALISTS) {
4696205231Skmacy		idx = list_num;
4697205231Skmacy		list = &arc_mfu->arcs_lists[idx];
4698205231Skmacy		*lock = ARCS_LOCK(arc_mfu, idx);
4699206796Spjd	} else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
4700205231Skmacy		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4701205231Skmacy		list = &arc_mru->arcs_lists[idx];
4702205231Skmacy		*lock = ARCS_LOCK(arc_mru, idx);
4703206796Spjd	} else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
4704205231Skmacy		ARC_BUFC_NUMDATALISTS)) {
4705205231Skmacy		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4706205231Skmacy		list = &arc_mfu->arcs_lists[idx];
4707205231Skmacy		*lock = ARCS_LOCK(arc_mfu, idx);
4708205231Skmacy	} else {
4709205231Skmacy		idx = list_num - ARC_BUFC_NUMLISTS;
4710205231Skmacy		list = &arc_mru->arcs_lists[idx];
4711205231Skmacy		*lock = ARCS_LOCK(arc_mru, idx);
4712185029Spjd	}
4713185029Spjd
4714185029Spjd	ASSERT(!(MUTEX_HELD(*lock)));
4715185029Spjd	mutex_enter(*lock);
4716185029Spjd	return (list);
4717185029Spjd}
4718185029Spjd
4719185029Spjd/*
4720185029Spjd * Evict buffers from the device write hand to the distance specified in
4721185029Spjd * bytes.  This distance may span populated buffers, it may span nothing.
4722185029Spjd * This is clearing a region on the L2ARC device ready for writing.
4723185029Spjd * If the 'all' boolean is set, every buffer is evicted.
4724185029Spjd */
4725185029Spjdstatic void
4726185029Spjdl2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4727185029Spjd{
4728185029Spjd	list_t *buflist;
4729185029Spjd	l2arc_buf_hdr_t *abl2;
4730185029Spjd	arc_buf_hdr_t *ab, *ab_prev;
4731185029Spjd	kmutex_t *hash_lock;
4732185029Spjd	uint64_t taddr;
4733185029Spjd
4734185029Spjd	buflist = dev->l2ad_buflist;
4735185029Spjd
4736185029Spjd	if (buflist == NULL)
4737185029Spjd		return;
4738185029Spjd
4739185029Spjd	if (!all && dev->l2ad_first) {
4740185029Spjd		/*
4741185029Spjd		 * This is the first sweep through the device.  There is
4742185029Spjd		 * nothing to evict.
4743185029Spjd		 */
4744185029Spjd		return;
4745185029Spjd	}
4746185029Spjd
4747185029Spjd	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4748185029Spjd		/*
4749185029Spjd		 * When nearing the end of the device, evict to the end
4750185029Spjd		 * before the device write hand jumps to the start.
4751185029Spjd		 */
4752185029Spjd		taddr = dev->l2ad_end;
4753185029Spjd	} else {
4754185029Spjd		taddr = dev->l2ad_hand + distance;
4755185029Spjd	}
4756185029Spjd	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4757185029Spjd	    uint64_t, taddr, boolean_t, all);
4758185029Spjd
4759185029Spjdtop:
4760185029Spjd	mutex_enter(&l2arc_buflist_mtx);
4761185029Spjd	for (ab = list_tail(buflist); ab; ab = ab_prev) {
4762185029Spjd		ab_prev = list_prev(buflist, ab);
4763185029Spjd
4764185029Spjd		hash_lock = HDR_LOCK(ab);
4765185029Spjd		if (!mutex_tryenter(hash_lock)) {
4766185029Spjd			/*
4767185029Spjd			 * Missed the hash lock.  Retry.
4768185029Spjd			 */
4769185029Spjd			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4770185029Spjd			mutex_exit(&l2arc_buflist_mtx);
4771185029Spjd			mutex_enter(hash_lock);
4772185029Spjd			mutex_exit(hash_lock);
4773185029Spjd			goto top;
4774185029Spjd		}
4775185029Spjd
4776185029Spjd		if (HDR_L2_WRITE_HEAD(ab)) {
4777185029Spjd			/*
4778185029Spjd			 * We hit a write head node.  Leave it for
4779185029Spjd			 * l2arc_write_done().
4780185029Spjd			 */
4781185029Spjd			list_remove(buflist, ab);
4782185029Spjd			mutex_exit(hash_lock);
4783185029Spjd			continue;
4784185029Spjd		}
4785185029Spjd
4786185029Spjd		if (!all && ab->b_l2hdr != NULL &&
4787185029Spjd		    (ab->b_l2hdr->b_daddr > taddr ||
4788185029Spjd		    ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4789185029Spjd			/*
4790185029Spjd			 * We've evicted to the target address,
4791185029Spjd			 * or the end of the device.
4792185029Spjd			 */
4793185029Spjd			mutex_exit(hash_lock);
4794185029Spjd			break;
4795185029Spjd		}
4796185029Spjd
4797185029Spjd		if (HDR_FREE_IN_PROGRESS(ab)) {
4798185029Spjd			/*
4799185029Spjd			 * Already on the path to destruction.
4800185029Spjd			 */
4801185029Spjd			mutex_exit(hash_lock);
4802185029Spjd			continue;
4803185029Spjd		}
4804185029Spjd
4805185029Spjd		if (ab->b_state == arc_l2c_only) {
4806185029Spjd			ASSERT(!HDR_L2_READING(ab));
4807185029Spjd			/*
4808185029Spjd			 * This doesn't exist in the ARC.  Destroy.
4809185029Spjd			 * arc_hdr_destroy() will call list_remove()
4810185029Spjd			 * and decrement arcstat_l2_size.
4811185029Spjd			 */
4812185029Spjd			arc_change_state(arc_anon, ab, hash_lock);
4813185029Spjd			arc_hdr_destroy(ab);
4814185029Spjd		} else {
4815185029Spjd			/*
4816185029Spjd			 * Invalidate issued or about to be issued
4817185029Spjd			 * reads, since we may be about to write
4818185029Spjd			 * over this location.
4819185029Spjd			 */
4820185029Spjd			if (HDR_L2_READING(ab)) {
4821185029Spjd				ARCSTAT_BUMP(arcstat_l2_evict_reading);
4822185029Spjd				ab->b_flags |= ARC_L2_EVICTED;
4823185029Spjd			}
4824185029Spjd
4825185029Spjd			/*
4826185029Spjd			 * Tell ARC this no longer exists in L2ARC.
4827185029Spjd			 */
4828185029Spjd			if (ab->b_l2hdr != NULL) {
4829185029Spjd				abl2 = ab->b_l2hdr;
4830251478Sdelphij				ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4831185029Spjd				ab->b_l2hdr = NULL;
4832185029Spjd				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4833185029Spjd				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4834185029Spjd			}
4835185029Spjd			list_remove(buflist, ab);
4836185029Spjd
4837185029Spjd			/*
4838185029Spjd			 * This may have been leftover after a
4839185029Spjd			 * failed write.
4840185029Spjd			 */
4841185029Spjd			ab->b_flags &= ~ARC_L2_WRITING;
4842185029Spjd		}
4843185029Spjd		mutex_exit(hash_lock);
4844185029Spjd	}
4845185029Spjd	mutex_exit(&l2arc_buflist_mtx);
4846185029Spjd
4847219089Spjd	vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4848185029Spjd	dev->l2ad_evict = taddr;
4849185029Spjd}
4850185029Spjd
4851185029Spjd/*
4852185029Spjd * Find and write ARC buffers to the L2ARC device.
4853185029Spjd *
4854185029Spjd * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4855185029Spjd * for reading until they have completed writing.
4856251478Sdelphij * The headroom_boost is an in-out parameter used to maintain headroom boost
4857251478Sdelphij * state between calls to this function.
4858251478Sdelphij *
4859251478Sdelphij * Returns the number of bytes actually written (which may be smaller than
4860251478Sdelphij * the delta by which the device hand has changed due to alignment).
4861185029Spjd */
4862208373Smmstatic uint64_t
4863251478Sdelphijl2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4864251478Sdelphij    boolean_t *headroom_boost)
4865185029Spjd{
4866185029Spjd	arc_buf_hdr_t *ab, *ab_prev, *head;
4867185029Spjd	list_t *list;
4868251478Sdelphij	uint64_t write_asize, write_psize, write_sz, headroom,
4869251478Sdelphij	    buf_compress_minsz;
4870185029Spjd	void *buf_data;
4871251478Sdelphij	kmutex_t *list_lock;
4872251478Sdelphij	boolean_t full;
4873185029Spjd	l2arc_write_callback_t *cb;
4874185029Spjd	zio_t *pio, *wzio;
4875228103Smm	uint64_t guid = spa_load_guid(spa);
4876251478Sdelphij	const boolean_t do_headroom_boost = *headroom_boost;
4877185029Spjd	int try;
4878185029Spjd
4879185029Spjd	ASSERT(dev->l2ad_vdev != NULL);
4880185029Spjd
4881251478Sdelphij	/* Lower the flag now, we might want to raise it again later. */
4882251478Sdelphij	*headroom_boost = B_FALSE;
4883251478Sdelphij
4884185029Spjd	pio = NULL;
4885251478Sdelphij	write_sz = write_asize = write_psize = 0;
4886185029Spjd	full = B_FALSE;
4887185029Spjd	head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4888185029Spjd	head->b_flags |= ARC_L2_WRITE_HEAD;
4889185029Spjd
4890205231Skmacy	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
4891185029Spjd	/*
4892251478Sdelphij	 * We will want to try to compress buffers that are at least 2x the
4893251478Sdelphij	 * device sector size.
4894251478Sdelphij	 */
4895251478Sdelphij	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
4896251478Sdelphij
4897251478Sdelphij	/*
4898185029Spjd	 * Copy buffers for L2ARC writing.
4899185029Spjd	 */
4900185029Spjd	mutex_enter(&l2arc_buflist_mtx);
4901206796Spjd	for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
4902251478Sdelphij		uint64_t passed_sz = 0;
4903251478Sdelphij
4904185029Spjd		list = l2arc_list_locked(try, &list_lock);
4905205231Skmacy		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
4906185029Spjd
4907185029Spjd		/*
4908185029Spjd		 * L2ARC fast warmup.
4909185029Spjd		 *
4910185029Spjd		 * Until the ARC is warm and starts to evict, read from the
4911185029Spjd		 * head of the ARC lists rather than the tail.
4912185029Spjd		 */
4913185029Spjd		if (arc_warm == B_FALSE)
4914185029Spjd			ab = list_head(list);
4915185029Spjd		else
4916185029Spjd			ab = list_tail(list);
4917206796Spjd		if (ab == NULL)
4918205231Skmacy			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
4919185029Spjd
4920251478Sdelphij		headroom = target_sz * l2arc_headroom;
4921251478Sdelphij		if (do_headroom_boost)
4922251478Sdelphij			headroom = (headroom * l2arc_headroom_boost) / 100;
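		/*
		 * Editor's illustration (assuming the stock defaults of
		 * l2arc_headroom = 2 and l2arc_headroom_boost = 200): a 16MB
		 * target yields a 32MB scan headroom per list, doubled to
		 * 64MB when the previous cycle found compressible buffers.
		 */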
4923251478Sdelphij
4924185029Spjd		for (; ab; ab = ab_prev) {
4925251478Sdelphij			l2arc_buf_hdr_t *l2hdr;
4926251478Sdelphij			kmutex_t *hash_lock;
4927251478Sdelphij			uint64_t buf_sz;
4928251478Sdelphij
4929185029Spjd			if (arc_warm == B_FALSE)
4930185029Spjd				ab_prev = list_next(list, ab);
4931185029Spjd			else
4932185029Spjd				ab_prev = list_prev(list, ab);
4933205231Skmacy			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size);
4934206796Spjd
4935185029Spjd			hash_lock = HDR_LOCK(ab);
4936251478Sdelphij			if (!mutex_tryenter(hash_lock)) {
4937205231Skmacy				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
4938185029Spjd				/*
4939185029Spjd				 * Skip this buffer rather than waiting.
4940185029Spjd				 */
4941185029Spjd				continue;
4942185029Spjd			}
4943185029Spjd
4944185029Spjd			passed_sz += ab->b_size;
4945185029Spjd			if (passed_sz > headroom) {
4946185029Spjd				/*
4947185029Spjd				 * Searched too far.
4948185029Spjd				 */
4949185029Spjd				mutex_exit(hash_lock);
4950205231Skmacy				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
4951185029Spjd				break;
4952185029Spjd			}
4953185029Spjd
4954209962Smm			if (!l2arc_write_eligible(guid, ab)) {
4955185029Spjd				mutex_exit(hash_lock);
4956185029Spjd				continue;
4957185029Spjd			}
4958185029Spjd
4959185029Spjd			if ((write_sz + ab->b_size) > target_sz) {
4960185029Spjd				full = B_TRUE;
4961185029Spjd				mutex_exit(hash_lock);
4962205231Skmacy				ARCSTAT_BUMP(arcstat_l2_write_full);
4963185029Spjd				break;
4964185029Spjd			}
4965185029Spjd
4966185029Spjd			if (pio == NULL) {
4967185029Spjd				/*
4968185029Spjd				 * Insert a dummy header on the buflist so
4969185029Spjd				 * l2arc_write_done() can find where the
4970185029Spjd				 * write buffers begin without searching.
4971185029Spjd				 */
4972185029Spjd				list_insert_head(dev->l2ad_buflist, head);
4973185029Spjd
4974185029Spjd				cb = kmem_alloc(
4975185029Spjd				    sizeof (l2arc_write_callback_t), KM_SLEEP);
4976185029Spjd				cb->l2wcb_dev = dev;
4977185029Spjd				cb->l2wcb_head = head;
4978185029Spjd				pio = zio_root(spa, l2arc_write_done, cb,
4979185029Spjd				    ZIO_FLAG_CANFAIL);
4980205231Skmacy				ARCSTAT_BUMP(arcstat_l2_write_pios);
4981185029Spjd			}
4982185029Spjd
4983185029Spjd			/*
4984185029Spjd			 * Create and add a new L2ARC header.
4985185029Spjd			 */
4986251478Sdelphij			l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4987251478Sdelphij			l2hdr->b_dev = dev;
4988251478Sdelphij			ab->b_flags |= ARC_L2_WRITING;
4989185029Spjd
4990251478Sdelphij			/*
4991251478Sdelphij			 * Temporarily stash the data buffer in b_tmp_cdata.
4992251478Sdelphij			 * The subsequent write step will pick it up from
4993251478Sdelphij			 * there. This is because can't access ab->b_buf
4994251478Sdelphij			 * there. This is because we can't access ab->b_buf
4995251478Sdelphij			 * can't access without holding the ARC list locks
4996251478Sdelphij			 * (which we want to avoid during compression/writing).
4997251478Sdelphij			 */
4998251478Sdelphij			l2hdr->b_compress = ZIO_COMPRESS_OFF;
4999251478Sdelphij			l2hdr->b_asize = ab->b_size;
5000251478Sdelphij			l2hdr->b_tmp_cdata = ab->b_buf->b_data;
5001251478Sdelphij
5002185029Spjd			buf_sz = ab->b_size;
5003251478Sdelphij			ab->b_l2hdr = l2hdr;
5004185029Spjd
5005251478Sdelphij			list_insert_head(dev->l2ad_buflist, ab);
5006251478Sdelphij
5007185029Spjd			/*
5008185029Spjd			 * Compute and store the buffer cksum before
5009185029Spjd			 * writing.  On debug the cksum is verified first.
5010185029Spjd			 */
5011185029Spjd			arc_cksum_verify(ab->b_buf);
5012185029Spjd			arc_cksum_compute(ab->b_buf, B_TRUE);
5013185029Spjd
5014185029Spjd			mutex_exit(hash_lock);
5015185029Spjd
5016251478Sdelphij			write_sz += buf_sz;
5017251478Sdelphij		}
5018251478Sdelphij
5019251478Sdelphij		mutex_exit(list_lock);
5020251478Sdelphij
5021251478Sdelphij		if (full == B_TRUE)
5022251478Sdelphij			break;
5023251478Sdelphij	}
5024251478Sdelphij
5025251478Sdelphij	/* No buffers selected for writing? */
5026251478Sdelphij	if (pio == NULL) {
5027251478Sdelphij		ASSERT0(write_sz);
5028251478Sdelphij		mutex_exit(&l2arc_buflist_mtx);
5029251478Sdelphij		kmem_cache_free(hdr_cache, head);
5030251478Sdelphij		return (0);
5031251478Sdelphij	}
5032251478Sdelphij
5033251478Sdelphij	/*
5034251478Sdelphij	 * Now start writing the buffers. We're starting at the write head
5035251478Sdelphij	 * and work backwards, retracing the course of the buffer selector
5036251478Sdelphij	 * loop above.
5037251478Sdelphij	 */
5038251478Sdelphij	for (ab = list_prev(dev->l2ad_buflist, head); ab;
5039251478Sdelphij	    ab = list_prev(dev->l2ad_buflist, ab)) {
5040251478Sdelphij		l2arc_buf_hdr_t *l2hdr;
5041251478Sdelphij		uint64_t buf_sz;
5042251478Sdelphij
5043251478Sdelphij		/*
5044251478Sdelphij		 * We shouldn't need to lock the buffer here, since we flagged
5045251478Sdelphij		 * it as ARC_L2_WRITING in the previous step, but we must take
5046251478Sdelphij		 * care to only access its L2 cache parameters. In particular,
5047251478Sdelphij		 * ab->b_buf may be invalid by now due to ARC eviction.
5048251478Sdelphij		 */
5049251478Sdelphij		l2hdr = ab->b_l2hdr;
5050251478Sdelphij		l2hdr->b_daddr = dev->l2ad_hand;
5051251478Sdelphij
5052251478Sdelphij		if ((ab->b_flags & ARC_L2COMPRESS) &&
5053251478Sdelphij		    l2hdr->b_asize >= buf_compress_minsz) {
5054251478Sdelphij			if (l2arc_compress_buf(l2hdr)) {
5055251478Sdelphij				/*
5056251478Sdelphij				 * If compression succeeded, enable headroom
5057251478Sdelphij				 * boost on the next scan cycle.
5058251478Sdelphij				 */
5059251478Sdelphij				*headroom_boost = B_TRUE;
5060251478Sdelphij			}
5061251478Sdelphij		}
5062251478Sdelphij
5063251478Sdelphij		/*
5064251478Sdelphij		 * Pick up the buffer data we had previously stashed away
5065251478Sdelphij		 * (and now potentially also compressed).
5066251478Sdelphij		 */
5067251478Sdelphij		buf_data = l2hdr->b_tmp_cdata;
5068251478Sdelphij		buf_sz = l2hdr->b_asize;
5069251478Sdelphij
5070251478Sdelphij		/* Compression may have squashed the buffer to zero length. */
5071251478Sdelphij		if (buf_sz != 0) {
5072251478Sdelphij			uint64_t buf_p_sz;
5073251478Sdelphij
5074185029Spjd			wzio = zio_write_phys(pio, dev->l2ad_vdev,
5075185029Spjd			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5076185029Spjd			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5077185029Spjd			    ZIO_FLAG_CANFAIL, B_FALSE);
5078185029Spjd
5079185029Spjd			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5080185029Spjd			    zio_t *, wzio);
5081185029Spjd			(void) zio_nowait(wzio);
5082185029Spjd
5083251478Sdelphij			write_asize += buf_sz;
5084185029Spjd			/*
5085185029Spjd			 * Keep the clock hand suitably device-aligned.
5086185029Spjd			 */
5087251478Sdelphij			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5088251478Sdelphij			write_psize += buf_p_sz;
5089251478Sdelphij			dev->l2ad_hand += buf_p_sz;
5090185029Spjd		}
5091251478Sdelphij	}
5092185029Spjd
5093185029Spjd	mutex_exit(&l2arc_buflist_mtx);
5094185029Spjd
5095251478Sdelphij	ASSERT3U(write_asize, <=, target_sz);
5096185029Spjd	ARCSTAT_BUMP(arcstat_l2_writes_sent);
5097251478Sdelphij	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5098185029Spjd	ARCSTAT_INCR(arcstat_l2_size, write_sz);
5099251478Sdelphij	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
5100251478Sdelphij	vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
5101185029Spjd
5102185029Spjd	/*
5103185029Spjd	 * Bump device hand to the device start if it is approaching the end.
5104185029Spjd	 * l2arc_evict() will already have evicted ahead for this case.
5105185029Spjd	 */
5106185029Spjd	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
5107219089Spjd		vdev_space_update(dev->l2ad_vdev,
5108219089Spjd		    dev->l2ad_end - dev->l2ad_hand, 0, 0);
5109185029Spjd		dev->l2ad_hand = dev->l2ad_start;
5110185029Spjd		dev->l2ad_evict = dev->l2ad_start;
5111185029Spjd		dev->l2ad_first = B_FALSE;
5112185029Spjd	}
5113185029Spjd
5114208373Smm	dev->l2ad_writing = B_TRUE;
5115185029Spjd	(void) zio_wait(pio);
5116208373Smm	dev->l2ad_writing = B_FALSE;
5117208373Smm
5118251478Sdelphij	return (write_asize);
5119185029Spjd}
5120185029Spjd
5121185029Spjd/*
5122251478Sdelphij * Compresses an L2ARC buffer.
5123251478Sdelphij * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
5124251478Sdelphij * size in l2hdr->b_asize. This routine tries to compress the data and
5125251478Sdelphij * depending on the compression result there are three possible outcomes:
5126251478Sdelphij * *) The buffer was incompressible. The original l2hdr contents were left
5127251478Sdelphij *    untouched and are ready for writing to an L2 device.
5128251478Sdelphij * *) The buffer was all-zeros, so there is no need to write it to an L2
5129251478Sdelphij *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
5130251478Sdelphij *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
5131251478Sdelphij * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
5132251478Sdelphij *    data buffer which holds the compressed data to be written, and b_asize
5133251478Sdelphij *    tells us how much data there is. b_compress is set to the appropriate
5134251478Sdelphij *    compression algorithm. Once writing is done, invoke
5135251478Sdelphij *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
5136251478Sdelphij *
5137251478Sdelphij * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
5138251478Sdelphij * buffer was incompressible).
5139251478Sdelphij */
5140251478Sdelphijstatic boolean_t
5141251478Sdelphijl2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
5142251478Sdelphij{
5143251478Sdelphij	void *cdata;
5144251478Sdelphij	size_t csize, len;
5145251478Sdelphij
5146251478Sdelphij	ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
5147251478Sdelphij	ASSERT(l2hdr->b_tmp_cdata != NULL);
5148251478Sdelphij
5149251478Sdelphij	len = l2hdr->b_asize;
5150251478Sdelphij	cdata = zio_data_buf_alloc(len);
5151251478Sdelphij	csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
5152251478Sdelphij	    cdata, l2hdr->b_asize);
5153251478Sdelphij
5154251478Sdelphij	if (csize == 0) {
5155251478Sdelphij		/* zero block, indicate that there's nothing to write */
5156251478Sdelphij		zio_data_buf_free(cdata, len);
5157251478Sdelphij		l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
5158251478Sdelphij		l2hdr->b_asize = 0;
5159251478Sdelphij		l2hdr->b_tmp_cdata = NULL;
5160251478Sdelphij		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
5161251478Sdelphij		return (B_TRUE);
5162251478Sdelphij	} else if (csize > 0 && csize < len) {
5163251478Sdelphij		/*
5164251478Sdelphij		 * Compression succeeded, we'll keep the cdata around for
5165251478Sdelphij		 * writing and release it afterwards.
5166251478Sdelphij		 */
5167251478Sdelphij		l2hdr->b_compress = ZIO_COMPRESS_LZ4;
5168251478Sdelphij		l2hdr->b_asize = csize;
5169251478Sdelphij		l2hdr->b_tmp_cdata = cdata;
5170251478Sdelphij		ARCSTAT_BUMP(arcstat_l2_compress_successes);
5171251478Sdelphij		return (B_TRUE);
5172251478Sdelphij	} else {
5173251478Sdelphij		/*
5174251478Sdelphij		 * Compression failed, release the compressed buffer.
5175251478Sdelphij		 * l2hdr will be left unmodified.
5176251478Sdelphij		 */
5177251478Sdelphij		zio_data_buf_free(cdata, len);
5178251478Sdelphij		ARCSTAT_BUMP(arcstat_l2_compress_failures);
5179251478Sdelphij		return (B_FALSE);
5180251478Sdelphij	}
5181251478Sdelphij}
5182251478Sdelphij
5183251478Sdelphij/*
5184251478Sdelphij * Decompresses a zio read back from an l2arc device. On success, the
5185251478Sdelphij * underlying zio's io_data buffer is overwritten by the uncompressed
5186251478Sdelphij * version. On decompression error (corrupt compressed stream), the
5187251478Sdelphij * zio->io_error value is set to signal an I/O error.
5188251478Sdelphij *
5189251478Sdelphij * Please note that the compressed data stream is not checksummed, so
5190251478Sdelphij * if the underlying device is experiencing data corruption, we may feed
5191251478Sdelphij * corrupt data to the decompressor, so the decompressor needs to be
5192251478Sdelphij * able to handle this situation (LZ4 does).
5193251478Sdelphij */
5194251478Sdelphijstatic void
5195251478Sdelphijl2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
5196251478Sdelphij{
5197251478Sdelphij	ASSERT(L2ARC_IS_VALID_COMPRESS(c));
5198251478Sdelphij
5199251478Sdelphij	if (zio->io_error != 0) {
5200251478Sdelphij		/*
5201251478Sdelphij		 * An io error has occured, just restore the original io
5202251478Sdelphij		 * An io error has occurred, just restore the original io
5203251478Sdelphij		 */
5204251478Sdelphij		zio->io_orig_size = zio->io_size = hdr->b_size;
5205251478Sdelphij		return;
5206251478Sdelphij	}
5207251478Sdelphij
5208251478Sdelphij	if (c == ZIO_COMPRESS_EMPTY) {
5209251478Sdelphij		/*
5210251478Sdelphij		 * An empty buffer results in a null zio, which means we
5211251478Sdelphij		 * need to fill its io_data after we're done restoring the
5212251478Sdelphij		 * buffer's contents.
5213251478Sdelphij		 */
5214251478Sdelphij		ASSERT(hdr->b_buf != NULL);
5215251478Sdelphij		bzero(hdr->b_buf->b_data, hdr->b_size);
5216251478Sdelphij		zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
5217251478Sdelphij	} else {
5218251478Sdelphij		ASSERT(zio->io_data != NULL);
5219251478Sdelphij		/*
5220251478Sdelphij		 * We copy the compressed data from the start of the arc buffer
5221251478Sdelphij		 * (the zio_read will have pulled in only what we need, the
5222251478Sdelphij		 * rest is garbage which we will overwrite at decompression)
5223251478Sdelphij		 * and then decompress back to the ARC data buffer. This way we
5224251478Sdelphij		 * can minimize copying by simply decompressing back over the
5225251478Sdelphij		 * original compressed data (rather than decompressing to an
5226251478Sdelphij		 * aux buffer and then copying back the uncompressed buffer,
5227251478Sdelphij		 * which is likely to be much larger).
5228251478Sdelphij		 */
5229251478Sdelphij		uint64_t csize;
5230251478Sdelphij		void *cdata;
5231251478Sdelphij
5232251478Sdelphij		csize = zio->io_size;
5233251478Sdelphij		cdata = zio_data_buf_alloc(csize);
5234251478Sdelphij		bcopy(zio->io_data, cdata, csize);
5235251478Sdelphij		if (zio_decompress_data(c, cdata, zio->io_data, csize,
5236251478Sdelphij		    hdr->b_size) != 0)
5237251478Sdelphij			zio->io_error = EIO;
5238251478Sdelphij		zio_data_buf_free(cdata, csize);
5239251478Sdelphij	}
5240251478Sdelphij
5241251478Sdelphij	/* Restore the expected uncompressed IO size. */
5242251478Sdelphij	zio->io_orig_size = zio->io_size = hdr->b_size;
5243251478Sdelphij}
5244251478Sdelphij
5245251478Sdelphij/*
5246251478Sdelphij * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5247251478Sdelphij * This buffer serves as a temporary holder of compressed data while
5248251478Sdelphij * the buffer entry is being written to an l2arc device. Once that is
5249251478Sdelphij * done, we can dispose of it.
5250251478Sdelphij */
5251251478Sdelphijstatic void
5252251478Sdelphijl2arc_release_cdata_buf(arc_buf_hdr_t *ab)
5253251478Sdelphij{
5254251478Sdelphij	l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
5255251478Sdelphij
5256251478Sdelphij	if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
5257251478Sdelphij		/*
5258251478Sdelphij		 * If the data was compressed, then we've allocated a
5259251478Sdelphij		 * temporary buffer for it, so now we need to release it.
5260251478Sdelphij		 */
5261251478Sdelphij		ASSERT(l2hdr->b_tmp_cdata != NULL);
5262251478Sdelphij		zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
5263251478Sdelphij	}
5264251478Sdelphij	l2hdr->b_tmp_cdata = NULL;
5265251478Sdelphij}
5266251478Sdelphij
5267251478Sdelphij/*
5268185029Spjd * This thread feeds the L2ARC at regular intervals.  This is the beating
5269185029Spjd * heart of the L2ARC.
5270185029Spjd */
5271185029Spjdstatic void
5272185029Spjdl2arc_feed_thread(void *dummy __unused)
5273185029Spjd{
5274185029Spjd	callb_cpr_t cpr;
5275185029Spjd	l2arc_dev_t *dev;
5276185029Spjd	spa_t *spa;
5277208373Smm	uint64_t size, wrote;
5278219089Spjd	clock_t begin, next = ddi_get_lbolt();
5279251478Sdelphij	boolean_t headroom_boost = B_FALSE;
5280185029Spjd
5281185029Spjd	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5282185029Spjd
5283185029Spjd	mutex_enter(&l2arc_feed_thr_lock);
5284185029Spjd
5285185029Spjd	while (l2arc_thread_exit == 0) {
5286185029Spjd		CALLB_CPR_SAFE_BEGIN(&cpr);
5287185029Spjd		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
5288219089Spjd		    next - ddi_get_lbolt());
5289185029Spjd		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
5290219089Spjd		next = ddi_get_lbolt() + hz;
5291185029Spjd
5292185029Spjd		/*
5293185029Spjd		 * Quick check for L2ARC devices.
5294185029Spjd		 */
5295185029Spjd		mutex_enter(&l2arc_dev_mtx);
5296185029Spjd		if (l2arc_ndev == 0) {
5297185029Spjd			mutex_exit(&l2arc_dev_mtx);
5298185029Spjd			continue;
5299185029Spjd		}
5300185029Spjd		mutex_exit(&l2arc_dev_mtx);
5301219089Spjd		begin = ddi_get_lbolt();
5302185029Spjd
5303185029Spjd		/*
5304185029Spjd		 * This selects the next l2arc device to write to, and in
5305185029Spjd		 * doing so the next spa to feed from: dev->l2ad_spa.   This
5306185029Spjd		 * will return NULL if there are now no l2arc devices or if
5307185029Spjd		 * they are all faulted.
5308185029Spjd		 *
5309185029Spjd		 * If a device is returned, its spa's config lock is also
5310185029Spjd		 * held to prevent device removal.  l2arc_dev_get_next()
5311185029Spjd		 * will grab and release l2arc_dev_mtx.
5312185029Spjd		 */
5313185029Spjd		if ((dev = l2arc_dev_get_next()) == NULL)
5314185029Spjd			continue;
5315185029Spjd
5316185029Spjd		spa = dev->l2ad_spa;
5317185029Spjd		ASSERT(spa != NULL);
5318185029Spjd
5319185029Spjd		/*
5320219089Spjd		 * If the pool is read-only then force the feed thread to
5321219089Spjd		 * sleep a little longer.
5322219089Spjd		 */
5323219089Spjd		if (!spa_writeable(spa)) {
5324219089Spjd			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5325219089Spjd			spa_config_exit(spa, SCL_L2ARC, dev);
5326219089Spjd			continue;
5327219089Spjd		}
5328219089Spjd
5329219089Spjd		/*
5330185029Spjd		 * Avoid contributing to memory pressure.
5331185029Spjd		 */
5332185029Spjd		if (arc_reclaim_needed()) {
5333185029Spjd			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5334185029Spjd			spa_config_exit(spa, SCL_L2ARC, dev);
5335185029Spjd			continue;
5336185029Spjd		}
5337185029Spjd
5338185029Spjd		ARCSTAT_BUMP(arcstat_l2_feeds);
5339185029Spjd
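		/*
		 * Determine the target number of bytes to write to the
		 * device during this feed cycle.
		 */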
5340251478Sdelphij		size = l2arc_write_size();
5341185029Spjd
5342185029Spjd		/*
5343185029Spjd		 * Evict L2ARC buffers that will be overwritten.
5344185029Spjd		 */
5345185029Spjd		l2arc_evict(dev, size, B_FALSE);
5346185029Spjd
5347185029Spjd		/*
5348185029Spjd		 * Write ARC buffers.
5349185029Spjd		 */
5350251478Sdelphij		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5351208373Smm
5352208373Smm		/*
5353208373Smm		 * Calculate interval between writes.
5354208373Smm		 */
5355208373Smm		next = l2arc_write_interval(begin, size, wrote);
5356185029Spjd		spa_config_exit(spa, SCL_L2ARC, dev);
5357185029Spjd	}
5358185029Spjd
5359185029Spjd	l2arc_thread_exit = 0;
5360185029Spjd	cv_broadcast(&l2arc_feed_thr_cv);
5361185029Spjd	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
5362185029Spjd	thread_exit();
5363185029Spjd}
5364185029Spjd
5365185029Spjdboolean_t
5366185029Spjdl2arc_vdev_present(vdev_t *vd)
5367185029Spjd{
5368185029Spjd	l2arc_dev_t *dev;
5369185029Spjd
5370185029Spjd	mutex_enter(&l2arc_dev_mtx);
5371185029Spjd	for (dev = list_head(l2arc_dev_list); dev != NULL;
5372185029Spjd	    dev = list_next(l2arc_dev_list, dev)) {
5373185029Spjd		if (dev->l2ad_vdev == vd)
5374185029Spjd			break;
5375185029Spjd	}
5376185029Spjd	mutex_exit(&l2arc_dev_mtx);
5377185029Spjd
5378185029Spjd	return (dev != NULL);
5379185029Spjd}
5380185029Spjd
5381185029Spjd/*
5382185029Spjd * Add a vdev for use by the L2ARC.  By this point the spa has already
5383185029Spjd * validated the vdev and opened it.
5384185029Spjd */
5385185029Spjdvoid
5386219089Spjdl2arc_add_vdev(spa_t *spa, vdev_t *vd)
5387185029Spjd{
5388185029Spjd	l2arc_dev_t *adddev;
5389185029Spjd
5390185029Spjd	ASSERT(!l2arc_vdev_present(vd));
5391185029Spjd
5392185029Spjd	/*
5393185029Spjd	 * Create a new l2arc device entry.
5394185029Spjd	 */
5395185029Spjd	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5396185029Spjd	adddev->l2ad_spa = spa;
5397185029Spjd	adddev->l2ad_vdev = vd;
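	/*
	 * The device's usable range starts past the leading vdev labels
	 * (VDEV_LABEL_START_SIZE); both the write hand and the eviction
	 * hand begin there.
	 */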
5398219089Spjd	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5399219089Spjd	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5400185029Spjd	adddev->l2ad_hand = adddev->l2ad_start;
5401185029Spjd	adddev->l2ad_evict = adddev->l2ad_start;
5402185029Spjd	adddev->l2ad_first = B_TRUE;
5403208373Smm	adddev->l2ad_writing = B_FALSE;
5404185029Spjd
5405185029Spjd	/*
5406185029Spjd	 * This is a list of all ARC buffers that are still valid on the
5407185029Spjd	 * device.
5408185029Spjd	 */
5409185029Spjd	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5410185029Spjd	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5411185029Spjd	    offsetof(arc_buf_hdr_t, b_l2node));
5412185029Spjd
5413219089Spjd	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5414185029Spjd
5415185029Spjd	/*
5416185029Spjd	 * Add device to global list
5417185029Spjd	 */
5418185029Spjd	mutex_enter(&l2arc_dev_mtx);
5419185029Spjd	list_insert_head(l2arc_dev_list, adddev);
5420185029Spjd	atomic_inc_64(&l2arc_ndev);
5421185029Spjd	mutex_exit(&l2arc_dev_mtx);
5422185029Spjd}
5423185029Spjd
5424185029Spjd/*
5425185029Spjd * Remove a vdev from the L2ARC.
5426185029Spjd */
5427185029Spjdvoid
5428185029Spjdl2arc_remove_vdev(vdev_t *vd)
5429185029Spjd{
5430185029Spjd	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5431185029Spjd
5432185029Spjd	/*
5433185029Spjd	 * Find the device by vdev
5434185029Spjd	 */
5435185029Spjd	mutex_enter(&l2arc_dev_mtx);
5436185029Spjd	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5437185029Spjd		nextdev = list_next(l2arc_dev_list, dev);
5438185029Spjd		if (vd == dev->l2ad_vdev) {
5439185029Spjd			remdev = dev;
5440185029Spjd			break;
5441185029Spjd		}
5442185029Spjd	}
5443185029Spjd	ASSERT(remdev != NULL);
5444185029Spjd
5445185029Spjd	/*
5446185029Spjd	 * Remove device from global list
5447185029Spjd	 */
5448185029Spjd	list_remove(l2arc_dev_list, remdev);
5449185029Spjd	l2arc_dev_last = NULL;		/* may have been invalidated */
5450185029Spjd	atomic_dec_64(&l2arc_ndev);
5451185029Spjd	mutex_exit(&l2arc_dev_mtx);
5452185029Spjd
5453185029Spjd	/*
5454185029Spjd	 * Clear all buflists and ARC references.  L2ARC device flush.
5455185029Spjd	 */
5456185029Spjd	l2arc_evict(remdev, 0, B_TRUE);
5457185029Spjd	list_destroy(remdev->l2ad_buflist);
5458185029Spjd	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5459185029Spjd	kmem_free(remdev, sizeof (l2arc_dev_t));
5460185029Spjd}
5461185029Spjd
5462185029Spjdvoid
5463185029Spjdl2arc_init(void)
5464185029Spjd{
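	/*
	 * Set up the global L2ARC state: counters, the feed thread's lock
	 * and CV, the device list, and the free-on-write list.
	 */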
5465185029Spjd	l2arc_thread_exit = 0;
5466185029Spjd	l2arc_ndev = 0;
5467185029Spjd	l2arc_writes_sent = 0;
5468185029Spjd	l2arc_writes_done = 0;
5469185029Spjd
5470185029Spjd	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5471185029Spjd	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5472185029Spjd	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5473185029Spjd	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5474185029Spjd	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5475185029Spjd
5476185029Spjd	l2arc_dev_list = &L2ARC_dev_list;
5477185029Spjd	l2arc_free_on_write = &L2ARC_free_on_write;
5478185029Spjd	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5479185029Spjd	    offsetof(l2arc_dev_t, l2ad_node));
5480185029Spjd	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5481185029Spjd	    offsetof(l2arc_data_free_t, l2df_list_node));
5482185029Spjd}
5483185029Spjd
5484185029Spjdvoid
5485185029Spjdl2arc_fini(void)
5486185029Spjd{
5487185029Spjd	/*
5488185029Spjd	 * This is called from dmu_fini(), which is called from spa_fini().
5489185029Spjd	 * Because of this, we can assume that all l2arc devices have
5490185029Spjd	 * already been removed when the pools themselves were removed.
5491185029Spjd	 */
5492185029Spjd
5493185029Spjd	l2arc_do_free_on_write();
5494185029Spjd
5495185029Spjd	mutex_destroy(&l2arc_feed_thr_lock);
5496185029Spjd	cv_destroy(&l2arc_feed_thr_cv);
5497185029Spjd	mutex_destroy(&l2arc_dev_mtx);
5498185029Spjd	mutex_destroy(&l2arc_buflist_mtx);
5499185029Spjd	mutex_destroy(&l2arc_free_on_write_mtx);
5500185029Spjd
5501185029Spjd	list_destroy(l2arc_dev_list);
5502185029Spjd	list_destroy(l2arc_free_on_write);
5503185029Spjd}
5504185029Spjd
5505185029Spjdvoid
5506185029Spjdl2arc_start(void)
5507185029Spjd{
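	/* There is nothing to feed if the pools were opened read-only. */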
5508209962Smm	if (!(spa_mode_global & FWRITE))
5509185029Spjd		return;
5510185029Spjd
5511185029Spjd	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5512185029Spjd	    TS_RUN, minclsyspri);
5513185029Spjd}
5514185029Spjd
5515185029Spjdvoid
5516185029Spjdl2arc_stop(void)
5517185029Spjd{
5518209962Smm	if (!(spa_mode_global & FWRITE))
5519185029Spjd		return;
5520185029Spjd
5521185029Spjd	mutex_enter(&l2arc_feed_thr_lock);
5522185029Spjd	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
5523185029Spjd	l2arc_thread_exit = 1;
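	/*
	 * Wait for the feed thread to acknowledge the exit request; it
	 * clears l2arc_thread_exit and broadcasts on the CV on its way out.
	 */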
5524185029Spjd	while (l2arc_thread_exit != 0)
5525185029Spjd		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5526185029Spjd	mutex_exit(&l2arc_feed_thr_lock);
5527185029Spjd}
5528