arc.c revision 205264
1134758Sstefanf/*
2134758Sstefanf * CDDL HEADER START
3134758Sstefanf *
4134758Sstefanf * The contents of this file are subject to the terms of the
5134758Sstefanf * Common Development and Distribution License (the "License").
6134758Sstefanf * You may not use this file except in compliance with the License.
7134758Sstefanf *
8134758Sstefanf * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9134758Sstefanf * or http://www.opensolaris.org/os/licensing.
10134758Sstefanf * See the License for the specific language governing permissions
11134758Sstefanf * and limitations under the License.
12134758Sstefanf *
13134758Sstefanf * When distributing Covered Code, include this CDDL HEADER in each
14134758Sstefanf * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15134758Sstefanf * If applicable, add the following below this CDDL HEADER, with the
16134758Sstefanf * fields enclosed by brackets "[]" replaced with your own identifying
17134758Sstefanf * information: Portions Copyright [yyyy] [name of copyright owner]
18134758Sstefanf *
19134758Sstefanf * CDDL HEADER END
20134758Sstefanf */
21134758Sstefanf/*
22134758Sstefanf * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23134758Sstefanf * Use is subject to license terms.
24134758Sstefanf */
25134758Sstefanf
26134758Sstefanf/*
27134758Sstefanf * DVA-based Adjustable Replacement Cache
28134758Sstefanf *
29134758Sstefanf * While much of the theory of operation used here is
30134758Sstefanf * based on the self-tuning, low overhead replacement cache
31134758Sstefanf * presented by Megiddo and Modha at FAST 2003, there are some
32134758Sstefanf * significant differences:
33134758Sstefanf *
34134758Sstefanf * 1. The Megiddo and Modha model assumes any page is evictable.
35134758Sstefanf * Pages in its cache cannot be "locked" into memory.  This makes
36134758Sstefanf * the eviction algorithm simple: evict the last page in the list.
37134758Sstefanf * This also make the performance characteristics easy to reason
38134758Sstefanf * about.  Our cache is not so simple.  At any given moment, some
39134758Sstefanf * subset of the blocks in the cache are un-evictable because we
40134758Sstefanf * have handed out a reference to them.  Blocks are only evictable
41134758Sstefanf * when there are no external references active.  This makes
42137587Snik * eviction far more problematic:  we choose to evict the evictable
43137587Snik * blocks that are the "lowest" in the list.
44134758Sstefanf *
45134758Sstefanf * There are times when it is not possible to evict the requested
46134758Sstefanf * space.  In these circumstances we are unable to adjust the cache
47134758Sstefanf * size.  To prevent the cache growing unbounded at these times we
48134758Sstefanf * implement a "cache throttle" that slows the flow of new data
49134758Sstefanf * into the cache until we can make space available.
50134758Sstefanf *
51134758Sstefanf * 2. The Megiddo and Modha model assumes a fixed cache size.
52134758Sstefanf * Pages are evicted when the cache is full and there is a cache
53134758Sstefanf * miss.  Our model has a variable sized cache.  It grows with
54134758Sstefanf * high use, but also tries to react to memory pressure from the
55134758Sstefanf * operating system: decreasing its size when system memory is
56134758Sstefanf * tight.
57134758Sstefanf *
58134758Sstefanf * 3. The Megiddo and Modha model assumes a fixed page size. All
59134758Sstefanf * elements of the cache are therefor exactly the same size.  So
60134758Sstefanf * when adjusting the cache size following a cache miss, its simply
61134758Sstefanf * a matter of choosing a single page to evict.  In our model, we
62134758Sstefanf * have variable sized cache blocks (rangeing from 512 bytes to
63134758Sstefanf * 128K bytes).  We therefor choose a set of blocks to evict to make
64134758Sstefanf * space for a cache miss that approximates as closely as possible
65134758Sstefanf * the space used by the new block.
66134758Sstefanf *
67134758Sstefanf * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
68134758Sstefanf * by N. Megiddo & D. Modha, FAST 2003
69134758Sstefanf */
70134758Sstefanf
71134758Sstefanf/*
72134758Sstefanf * The locking model:
73134758Sstefanf *
74134758Sstefanf * A new reference to a cache buffer can be obtained in two
75134758Sstefanf * ways: 1) via a hash table lookup using the DVA as a key,
76134758Sstefanf * or 2) via one of the ARC lists.  The arc_read() interface
77134758Sstefanf * uses method 1, while the internal arc algorithms for
78134758Sstefanf * adjusting the cache use method 2.  We therefor provide two
79134758Sstefanf * types of locks: 1) the hash table lock array, and 2) the
80134758Sstefanf * arc list locks.
81134758Sstefanf *
82134758Sstefanf * Buffers do not have their own mutexs, rather they rely on the
83134758Sstefanf * hash table mutexs for the bulk of their protection (i.e. most
84134758Sstefanf * fields in the arc_buf_hdr_t are protected by these mutexs).
85134758Sstefanf *
86134758Sstefanf * buf_hash_find() returns the appropriate mutex (held) when it
87134758Sstefanf * locates the requested buffer in the hash table.  It returns
88134758Sstefanf * NULL for the mutex if the buffer was not in the table.
89134758Sstefanf *
90134758Sstefanf * buf_hash_remove() expects the appropriate hash mutex to be
91134758Sstefanf * already held before it is invoked.
92134758Sstefanf *
93134758Sstefanf * Each arc state also has a mutex which is used to protect the
94134758Sstefanf * buffer list associated with the state.  When attempting to
95134758Sstefanf * obtain a hash table lock while holding an arc list lock you
96134758Sstefanf * must use: mutex_tryenter() to avoid deadlock.  Also note that
97134758Sstefanf * the active state mutex must be held before the ghost state mutex.
98134758Sstefanf *
99134758Sstefanf * Arc buffers may have an associated eviction callback function.
100134758Sstefanf * This function will be invoked prior to removing the buffer (e.g.
101134758Sstefanf * in arc_do_user_evicts()).  Note however that the data associated
102134758Sstefanf * with the buffer may be evicted prior to the callback.  The callback
103134758Sstefanf * must be made with *no locks held* (to prevent deadlock).  Additionally,
104134758Sstefanf * the users of callbacks must ensure that their private data is
105134758Sstefanf * protected from simultaneous callbacks from arc_buf_evict()
106134758Sstefanf * and arc_do_user_evicts().
107134758Sstefanf *
108134758Sstefanf * Note that the majority of the performance stats are manipulated
109134758Sstefanf * with atomic operations.
110134758Sstefanf *
111134758Sstefanf * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
112134758Sstefanf *
113134758Sstefanf *	- L2ARC buflist creation
114134758Sstefanf *	- L2ARC buflist eviction
115134758Sstefanf *	- L2ARC write completion, which walks L2ARC buflists
116134758Sstefanf *	- ARC header destruction, as it removes from L2ARC buflists
117134758Sstefanf *	- ARC header release, as it removes from L2ARC buflists
118134758Sstefanf */
119134758Sstefanf
120134758Sstefanf#include <sys/spa.h>
121134758Sstefanf#include <sys/zio.h>
122134758Sstefanf#include <sys/zio_checksum.h>
123134758Sstefanf#include <sys/zfs_context.h>
124134758Sstefanf#include <sys/arc.h>
125134758Sstefanf#include <sys/refcount.h>
126134758Sstefanf#include <sys/vdev.h>
127134758Sstefanf#ifdef _KERNEL
128134758Sstefanf#include <sys/dnlc.h>
129134758Sstefanf#endif
130134758Sstefanf#include <sys/callb.h>
131134758Sstefanf#include <sys/kstat.h>
132134758Sstefanf#include <sys/sdt.h>
133134758Sstefanf
134134758Sstefanf#include <sys/ktr.h>
135134758Sstefanf#include <vm/vm_pageout.h>
136134758Sstefanf
137134758Sstefanfstatic kmutex_t		arc_reclaim_thr_lock;
138134758Sstefanfstatic kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
139134758Sstefanfstatic uint8_t		arc_thread_exit;
140134758Sstefanf
141134758Sstefanfextern int zfs_write_limit_shift;
142134758Sstefanfextern uint64_t zfs_write_limit_max;
143134758Sstefanfextern kmutex_t zfs_write_limit_lock;
144134758Sstefanf
145134758Sstefanf#define	ARC_REDUCE_DNLC_PERCENT	3
146134758Sstefanfuint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
147134758Sstefanf
148134758Sstefanftypedef enum arc_reclaim_strategy {
149134758Sstefanf	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
150134758Sstefanf	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
151134758Sstefanf} arc_reclaim_strategy_t;
152134758Sstefanf
153134758Sstefanf/* number of seconds before growing cache again */
154134758Sstefanfstatic int		arc_grow_retry = 60;
155134758Sstefanf
156134758Sstefanf/*
157134758Sstefanf * minimum lifespan of a prefetch block in clock ticks
158134758Sstefanf * (initialized in arc_init())
159134758Sstefanf */
160134758Sstefanfstatic int		arc_min_prefetch_lifespan;
161134758Sstefanf
162134758Sstefanfextern int zfs_prefetch_disable;
163134758Sstefanfstatic int arc_dead;
164134758Sstefanf
165134758Sstefanf/*
166134758Sstefanf * The arc has filled available memory and has now warmed up.
167134758Sstefanf */
168134758Sstefanfstatic boolean_t arc_warm;
169134758Sstefanf
170134758Sstefanf/*
171134758Sstefanf * These tunables are for performance analysis.
172134758Sstefanf */
173134758Sstefanfuint64_t zfs_arc_max;
174134758Sstefanfuint64_t zfs_arc_min;
175134758Sstefanfuint64_t zfs_arc_meta_limit = 0;
176134758Sstefanfint zfs_mdcomp_disable = 0;
177134758Sstefanf
178134758SstefanfTUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
179134758SstefanfTUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
180134758SstefanfTUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
181134758SstefanfTUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
182134758SstefanfSYSCTL_DECL(_vfs_zfs);
183134758SstefanfSYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
184134758Sstefanf    "Maximum ARC size");
185134758SstefanfSYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
186134758Sstefanf    "Minimum ARC size");
187134758SstefanfSYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
188134758Sstefanf    &zfs_mdcomp_disable, 0, "Disable metadata compression");
189134758Sstefanf
190134758Sstefanf#ifdef ZIO_USE_UMA
191134758Sstefanfextern kmem_cache_t	*zio_buf_cache[];
192134758Sstefanfextern kmem_cache_t	*zio_data_buf_cache[];
193134758Sstefanf#endif
194134758Sstefanf
195134758Sstefanf/*
196134758Sstefanf * Note that buffers can be in one of 6 states:
197134758Sstefanf *	ARC_anon	- anonymous (discussed below)
198134758Sstefanf *	ARC_mru		- recently used, currently cached
199134758Sstefanf *	ARC_mru_ghost	- recentely used, no longer in cache
200134758Sstefanf *	ARC_mfu		- frequently used, currently cached
201134758Sstefanf *	ARC_mfu_ghost	- frequently used, no longer in cache
202134758Sstefanf *	ARC_l2c_only	- exists in L2ARC but not other states
203134758Sstefanf * When there are no active references to the buffer, they are
204134758Sstefanf * are linked onto a list in one of these arc states.  These are
205134758Sstefanf * the only buffers that can be evicted or deleted.  Within each
206134758Sstefanf * state there are multiple lists, one for meta-data and one for
207134758Sstefanf * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
208134758Sstefanf * etc.) is tracked separately so that it can be managed more
209134758Sstefanf * explicitly: favored over data, limited explicitly.
210134758Sstefanf *
211134758Sstefanf * Anonymous buffers are buffers that are not associated with
212134758Sstefanf * a DVA.  These are buffers that hold dirty block copies
213134758Sstefanf * before they are written to stable storage.  By definition,
214134758Sstefanf * they are "ref'd" and are considered part of arc_mru
215134758Sstefanf * that cannot be freed.  Generally, they will aquire a DVA
216134758Sstefanf * as they are written and migrate onto the arc_mru list.
217134758Sstefanf *
218134758Sstefanf * The ARC_l2c_only state is for buffers that are in the second
219134758Sstefanf * level ARC but no longer in any of the ARC_m* lists.  The second
220134758Sstefanf * level ARC itself may also contain buffers that are in any of
221134758Sstefanf * the ARC_m* states - meaning that a buffer can exist in two
222134758Sstefanf * places.  The reason for the ARC_l2c_only state is to keep the
223134758Sstefanf * buffer header in the hash table, so that reads that hit the
224134758Sstefanf * second level ARC benefit from these fast lookups.
225134758Sstefanf */
226134758Sstefanf
227134758Sstefanf#define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
228134758Sstefanfstruct arcs_lock {
229134758Sstefanf	kmutex_t	arcs_lock;
230134758Sstefanf#ifdef _KERNEL
231134758Sstefanf	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
232134758Sstefanf#endif
233134758Sstefanf};
234134758Sstefanf
235134758Sstefanf/*
236134758Sstefanf * must be power of two for mask use to work
237134758Sstefanf *
238134758Sstefanf */
239134758Sstefanf#define ARC_BUFC_NUMDATALISTS		16
240134758Sstefanf#define ARC_BUFC_NUMMETADATALISTS	16
241134758Sstefanf#define ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS+ARC_BUFC_NUMDATALISTS)
242134758Sstefanf
243134758Sstefanftypedef struct arc_state {
244134758Sstefanf	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
245134758Sstefanf	uint64_t arcs_size;	/* total amount of data in this state */
246134758Sstefanf	list_t	arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
247134758Sstefanf	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
248134758Sstefanf} arc_state_t;
249134758Sstefanf
250134758Sstefanf#define ARCS_LOCK(s, i) &((s)->arcs_locks[(i)].arcs_lock)
251134758Sstefanf
252134758Sstefanf/* The 6 states: */
253134758Sstefanfstatic arc_state_t ARC_anon;
254134758Sstefanfstatic arc_state_t ARC_mru;
255134758Sstefanfstatic arc_state_t ARC_mru_ghost;
256134758Sstefanfstatic arc_state_t ARC_mfu;
257134758Sstefanfstatic arc_state_t ARC_mfu_ghost;
258134758Sstefanfstatic arc_state_t ARC_l2c_only;
259134758Sstefanf
260134758Sstefanftypedef struct arc_stats {
261134758Sstefanf	kstat_named_t arcstat_hits;
262134758Sstefanf	kstat_named_t arcstat_misses;
263134758Sstefanf	kstat_named_t arcstat_demand_data_hits;
264134758Sstefanf	kstat_named_t arcstat_demand_data_misses;
265134758Sstefanf	kstat_named_t arcstat_demand_metadata_hits;
266134758Sstefanf	kstat_named_t arcstat_demand_metadata_misses;
267134758Sstefanf	kstat_named_t arcstat_prefetch_data_hits;
268134758Sstefanf	kstat_named_t arcstat_prefetch_data_misses;
269134758Sstefanf	kstat_named_t arcstat_prefetch_metadata_hits;
270134758Sstefanf	kstat_named_t arcstat_prefetch_metadata_misses;
271134758Sstefanf	kstat_named_t arcstat_mru_hits;
272134758Sstefanf	kstat_named_t arcstat_mru_ghost_hits;
273134758Sstefanf	kstat_named_t arcstat_mfu_hits;
274134758Sstefanf	kstat_named_t arcstat_mfu_ghost_hits;
275134758Sstefanf	kstat_named_t arcstat_allocated;
276134758Sstefanf	kstat_named_t arcstat_deleted;
277134758Sstefanf	kstat_named_t arcstat_stolen;
278134758Sstefanf	kstat_named_t arcstat_recycle_miss;
279134758Sstefanf	kstat_named_t arcstat_mutex_miss;
280134758Sstefanf	kstat_named_t arcstat_evict_skip;
281134758Sstefanf	kstat_named_t arcstat_hash_elements;
282134758Sstefanf	kstat_named_t arcstat_hash_elements_max;
283134758Sstefanf	kstat_named_t arcstat_hash_collisions;
284134758Sstefanf	kstat_named_t arcstat_hash_chains;
285134758Sstefanf	kstat_named_t arcstat_hash_chain_max;
286134758Sstefanf	kstat_named_t arcstat_p;
287134758Sstefanf	kstat_named_t arcstat_c;
288134758Sstefanf	kstat_named_t arcstat_c_min;
289134758Sstefanf	kstat_named_t arcstat_c_max;
290134758Sstefanf	kstat_named_t arcstat_size;
291134758Sstefanf	kstat_named_t arcstat_hdr_size;
292134758Sstefanf	kstat_named_t arcstat_l2_hits;
293134758Sstefanf	kstat_named_t arcstat_l2_misses;
294134758Sstefanf	kstat_named_t arcstat_l2_feeds;
295134758Sstefanf	kstat_named_t arcstat_l2_rw_clash;
296134758Sstefanf	kstat_named_t arcstat_l2_writes_sent;
297134758Sstefanf	kstat_named_t arcstat_l2_writes_done;
298134758Sstefanf	kstat_named_t arcstat_l2_writes_error;
299134758Sstefanf	kstat_named_t arcstat_l2_writes_hdr_miss;
300134758Sstefanf	kstat_named_t arcstat_l2_evict_lock_retry;
301134758Sstefanf	kstat_named_t arcstat_l2_evict_reading;
302134758Sstefanf	kstat_named_t arcstat_l2_free_on_write;
303134758Sstefanf	kstat_named_t arcstat_l2_abort_lowmem;
304134758Sstefanf	kstat_named_t arcstat_l2_cksum_bad;
305134758Sstefanf	kstat_named_t arcstat_l2_io_error;
306134758Sstefanf	kstat_named_t arcstat_l2_size;
307134758Sstefanf	kstat_named_t arcstat_l2_hdr_size;
308134758Sstefanf	kstat_named_t arcstat_memory_throttle_count;
309134758Sstefanf	kstat_named_t arcstat_l2_write_trylock_fail;
310134758Sstefanf	kstat_named_t arcstat_l2_write_in_l2;
311134758Sstefanf	kstat_named_t arcstat_l2_write_passed_headroom;
312134758Sstefanf	kstat_named_t arcstat_l2_write_spa_mismatch;
313134758Sstefanf	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
314134758Sstefanf	kstat_named_t arcstat_l2_write_not_cacheable;
315134758Sstefanf	kstat_named_t arcstat_l2_write_full;
316134758Sstefanf	kstat_named_t arcstat_l2_write_buffer_iter;
317134758Sstefanf	kstat_named_t arcstat_l2_write_pios;
318134758Sstefanf	kstat_named_t arcstat_l2_write_bytes_written;
319134758Sstefanf	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
320134758Sstefanf	kstat_named_t arcstat_l2_write_buffer_list_iter;
321134758Sstefanf	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
322134758Sstefanf} arc_stats_t;
323134758Sstefanf
324134758Sstefanfstatic arc_stats_t arc_stats = {
325134758Sstefanf	{ "hits",			KSTAT_DATA_UINT64 },
326134758Sstefanf	{ "misses",			KSTAT_DATA_UINT64 },
327134758Sstefanf	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
328134758Sstefanf	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
329134758Sstefanf	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
330134758Sstefanf	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
331134758Sstefanf	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
332134758Sstefanf	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
333134758Sstefanf	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
334134758Sstefanf	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
335134758Sstefanf	{ "mru_hits",			KSTAT_DATA_UINT64 },
336134758Sstefanf	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
337134758Sstefanf	{ "mfu_hits",			KSTAT_DATA_UINT64 },
338134758Sstefanf	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
339134758Sstefanf	{ "allocated",			KSTAT_DATA_UINT64 },
340134758Sstefanf	{ "deleted",			KSTAT_DATA_UINT64 },
341134758Sstefanf	{ "stolen",			KSTAT_DATA_UINT64 },
342134758Sstefanf	{ "recycle_miss",		KSTAT_DATA_UINT64 },
343134758Sstefanf	{ "mutex_miss",			KSTAT_DATA_UINT64 },
344134758Sstefanf	{ "evict_skip",			KSTAT_DATA_UINT64 },
345134758Sstefanf	{ "hash_elements",		KSTAT_DATA_UINT64 },
346134758Sstefanf	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
347134758Sstefanf	{ "hash_collisions",		KSTAT_DATA_UINT64 },
348134758Sstefanf	{ "hash_chains",		KSTAT_DATA_UINT64 },
349134758Sstefanf	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
350134758Sstefanf	{ "p",				KSTAT_DATA_UINT64 },
351134758Sstefanf	{ "c",				KSTAT_DATA_UINT64 },
352134758Sstefanf	{ "c_min",			KSTAT_DATA_UINT64 },
353134758Sstefanf	{ "c_max",			KSTAT_DATA_UINT64 },
354134758Sstefanf	{ "size",			KSTAT_DATA_UINT64 },
355134758Sstefanf	{ "hdr_size",			KSTAT_DATA_UINT64 },
356134758Sstefanf	{ "l2_hits",			KSTAT_DATA_UINT64 },
357134758Sstefanf	{ "l2_misses",			KSTAT_DATA_UINT64 },
358134758Sstefanf	{ "l2_feeds",			KSTAT_DATA_UINT64 },
359134758Sstefanf	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
360134758Sstefanf	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
361134758Sstefanf	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
362134758Sstefanf	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
363134758Sstefanf	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
364134758Sstefanf	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
365134758Sstefanf	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
366134758Sstefanf	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
367134758Sstefanf	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
368134758Sstefanf	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
369134758Sstefanf	{ "l2_io_error",		KSTAT_DATA_UINT64 },
370134758Sstefanf	{ "l2_size",			KSTAT_DATA_UINT64 },
371134758Sstefanf	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
372134758Sstefanf	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
373134758Sstefanf	{ "l2_write_trylock_fail", 	KSTAT_DATA_UINT64 },
374134758Sstefanf	{ "l2_write_in_l2", 		KSTAT_DATA_UINT64 },
375134758Sstefanf	{ "l2_write_passed_headroom", 	KSTAT_DATA_UINT64 },
376134758Sstefanf	{ "l2_write_spa_mismatch", 	KSTAT_DATA_UINT64 },
377134758Sstefanf	{ "l2_write_io_in_progress", 	KSTAT_DATA_UINT64 },
378134758Sstefanf	{ "l2_write_not_cacheable", 	KSTAT_DATA_UINT64 },
379134758Sstefanf	{ "l2_write_full", 		KSTAT_DATA_UINT64 },
380134758Sstefanf	{ "l2_write_buffer_iter", 	KSTAT_DATA_UINT64 },
381134758Sstefanf	{ "l2_write_pios", 		KSTAT_DATA_UINT64 },
382134758Sstefanf	{ "l2_write_bytes_written", 		KSTAT_DATA_UINT64 },
383134758Sstefanf	{ "l2_write_buffer_bytes_scanned", 	KSTAT_DATA_UINT64 },
384134758Sstefanf	{ "l2_write_buffer_list_iter", 	KSTAT_DATA_UINT64 },
385134758Sstefanf	{ "l2_write_buffer_list_null_iter", 	KSTAT_DATA_UINT64 }
386134758Sstefanf};
387134758Sstefanf
388134758Sstefanf#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
389134758Sstefanf
390134758Sstefanf#define	ARCSTAT_INCR(stat, val) \
391134758Sstefanf	atomic_add_64(&arc_stats.stat.value.ui64, (val));
392134758Sstefanf
393134758Sstefanf#define	ARCSTAT_BUMP(stat) 	ARCSTAT_INCR(stat, 1)
394134758Sstefanf#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
395137587Snik
396134758Sstefanf#define	ARCSTAT_MAX(stat, val) {					\
397134758Sstefanf	uint64_t m;							\
398134758Sstefanf	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
399137587Snik	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
400137587Snik		continue;						\
401134758Sstefanf}
402134758Sstefanf
403134758Sstefanf#define	ARCSTAT_MAXSTAT(stat) \
404134758Sstefanf	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
405134758Sstefanf
406137587Snik/*
407137587Snik * We define a macro to allow ARC hits/misses to be easily broken down by
408134758Sstefanf * two separate conditions, giving a total of four different subtypes for
409134758Sstefanf * each of hits and misses (so eight statistics total).
410134758Sstefanf */
411134758Sstefanf#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
412134758Sstefanf	if (cond1) {							\
413134758Sstefanf		if (cond2) {						\
414134758Sstefanf			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
415134758Sstefanf		} else {						\
416134758Sstefanf			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
417134758Sstefanf		}							\
418134758Sstefanf	} else {							\
419134758Sstefanf		if (cond2) {						\
420134758Sstefanf			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
421134758Sstefanf		} else {						\
422134758Sstefanf			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
423134758Sstefanf		}							\
424134758Sstefanf	}
425134758Sstefanf
426134758Sstefanfkstat_t			*arc_ksp;
427134758Sstefanfstatic arc_state_t 	*arc_anon;
428134758Sstefanfstatic arc_state_t	*arc_mru;
429134758Sstefanfstatic arc_state_t	*arc_mru_ghost;
430134758Sstefanfstatic arc_state_t	*arc_mfu;
431134758Sstefanfstatic arc_state_t	*arc_mfu_ghost;
432134758Sstefanfstatic arc_state_t	*arc_l2c_only;
433134758Sstefanf
434134758Sstefanf/*
435134758Sstefanf * There are several ARC variables that are critical to export as kstats --
436134758Sstefanf * but we don't want to have to grovel around in the kstat whenever we wish to
437134758Sstefanf * manipulate them.  For these variables, we therefore define them to be in
438134758Sstefanf * terms of the statistic variable.  This assures that we are not introducing
439134758Sstefanf * the possibility of inconsistency by having shadow copies of the variables,
440134758Sstefanf * while still allowing the code to be readable.
441134758Sstefanf */
442134758Sstefanf#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
443134758Sstefanf#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
444134758Sstefanf#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
445134758Sstefanf#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
446134758Sstefanf#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
447134758Sstefanf
448134758Sstefanfstatic int		arc_no_grow;	/* Don't try to grow cache size */
449134758Sstefanfstatic uint64_t		arc_tempreserve;
450134758Sstefanfstatic uint64_t		arc_meta_used;
451134758Sstefanfstatic uint64_t		arc_meta_limit;
452134758Sstefanfstatic uint64_t		arc_meta_max = 0;
453134758SstefanfSYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RDTUN,
454134758Sstefanf    &arc_meta_used, 0, "ARC metadata used");
455134758SstefanfSYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RDTUN,
456134758Sstefanf    &arc_meta_limit, 0, "ARC metadata limit");
457134758Sstefanf
458134758Sstefanftypedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
459134758Sstefanf
460134758Sstefanftypedef struct arc_callback arc_callback_t;
461134758Sstefanf
462134758Sstefanfstruct arc_callback {
463134758Sstefanf	void			*acb_private;
464134758Sstefanf	arc_done_func_t		*acb_done;
465134758Sstefanf	arc_buf_t		*acb_buf;
466134758Sstefanf	zio_t			*acb_zio_dummy;
467134758Sstefanf	arc_callback_t		*acb_next;
468134758Sstefanf};
469134758Sstefanf
470134758Sstefanftypedef struct arc_write_callback arc_write_callback_t;
471134758Sstefanf
472134758Sstefanfstruct arc_write_callback {
473134758Sstefanf	void		*awcb_private;
474134758Sstefanf	arc_done_func_t	*awcb_ready;
475134758Sstefanf	arc_done_func_t	*awcb_done;
476134758Sstefanf	arc_buf_t	*awcb_buf;
477134758Sstefanf};
478134758Sstefanf
479134758Sstefanfstruct arc_buf_hdr {
480134758Sstefanf	/* protected by hash lock */
481134758Sstefanf	dva_t			b_dva;
482134758Sstefanf	uint64_t		b_birth;
483134758Sstefanf	uint64_t		b_cksum0;
484134758Sstefanf
485134758Sstefanf	kmutex_t		b_freeze_lock;
486134758Sstefanf	zio_cksum_t		*b_freeze_cksum;
487134758Sstefanf
488134758Sstefanf	arc_buf_hdr_t		*b_hash_next;
489134758Sstefanf	arc_buf_t		*b_buf;
490134758Sstefanf	uint32_t		b_flags;
491134758Sstefanf	uint32_t		b_datacnt;
492134758Sstefanf
493134758Sstefanf	arc_callback_t		*b_acb;
494134758Sstefanf	kcondvar_t		b_cv;
495134758Sstefanf
496134758Sstefanf	/* immutable */
497134758Sstefanf	arc_buf_contents_t	b_type;
498134758Sstefanf	uint64_t		b_size;
499134758Sstefanf	spa_t			*b_spa;
500134758Sstefanf
501134758Sstefanf	/* protected by arc state mutex */
502134758Sstefanf	arc_state_t		*b_state;
503134758Sstefanf	list_node_t		b_arc_node;
504134758Sstefanf
505134758Sstefanf	/* updated atomically */
506134758Sstefanf	clock_t			b_arc_access;
507134758Sstefanf
508134758Sstefanf	/* self protecting */
509134758Sstefanf	refcount_t		b_refcnt;
510134758Sstefanf
511134758Sstefanf	l2arc_buf_hdr_t		*b_l2hdr;
512134758Sstefanf	list_node_t		b_l2node;
513134758Sstefanf};
514134758Sstefanf
515134758Sstefanfstatic arc_buf_t *arc_eviction_list;
516134758Sstefanfstatic kmutex_t arc_eviction_mtx;
517134758Sstefanfstatic arc_buf_hdr_t arc_eviction_hdr;
518134758Sstefanfstatic void arc_get_data_buf(arc_buf_t *buf);
519134758Sstefanfstatic void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
520134758Sstefanfstatic int arc_evict_needed(arc_buf_contents_t type);
521134758Sstefanfstatic void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes);
522134758Sstefanf
523134758Sstefanf#define	GHOST_STATE(state)	\
524134758Sstefanf	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
525134758Sstefanf	(state) == arc_l2c_only)
526134758Sstefanf
527134758Sstefanf/*
528134758Sstefanf * Private ARC flags.  These flags are private ARC only flags that will show up
529134758Sstefanf * in b_flags in the arc_hdr_buf_t.  Some flags are publicly declared, and can
530134758Sstefanf * be passed in as arc_flags in things like arc_read.  However, these flags
531134758Sstefanf * should never be passed and should only be set by ARC code.  When adding new
532134758Sstefanf * public flags, make sure not to smash the private ones.
533134758Sstefanf */
534134758Sstefanf
535134758Sstefanf#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
536134758Sstefanf#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
537134758Sstefanf#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
538134758Sstefanf#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
539134758Sstefanf#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
540134758Sstefanf#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
541134758Sstefanf#define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
542134758Sstefanf#define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
543134758Sstefanf#define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
544134758Sstefanf#define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */
545134758Sstefanf#define	ARC_STORED		(1 << 19)	/* has been store()d to */
546134758Sstefanf
547134758Sstefanf#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
548134758Sstefanf#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
549134758Sstefanf#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
550134758Sstefanf#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
551134758Sstefanf#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
552134758Sstefanf#define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
553134758Sstefanf#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_L2CACHE)
554134758Sstefanf#define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
555134758Sstefanf				    (hdr)->b_l2hdr != NULL)
556134758Sstefanf#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
557134758Sstefanf#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
558134758Sstefanf#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)
559134758Sstefanf
560134758Sstefanf/*
561134758Sstefanf * Other sizes
562134758Sstefanf */
563134758Sstefanf
564134758Sstefanf#define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
565134758Sstefanf#define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
566134758Sstefanf
567134758Sstefanf/*
568134758Sstefanf * Hash table routines
569134758Sstefanf */
570134758Sstefanf
571134758Sstefanf#define	HT_LOCK_PAD	CACHE_LINE_SIZE
572134758Sstefanf
573134758Sstefanfstruct ht_lock {
574134758Sstefanf	kmutex_t	ht_lock;
575134758Sstefanf#ifdef _KERNEL
576134758Sstefanf	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
577134758Sstefanf#endif
578134758Sstefanf};
579134758Sstefanf
580134758Sstefanf#define	BUF_LOCKS 256
581134758Sstefanftypedef struct buf_hash_table {
582134758Sstefanf	uint64_t ht_mask;
583134758Sstefanf	arc_buf_hdr_t **ht_table;
584134758Sstefanf	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
585134758Sstefanf} buf_hash_table_t;
586134758Sstefanf
587134758Sstefanfstatic buf_hash_table_t buf_hash_table;
588134758Sstefanf
589134758Sstefanf#define	BUF_HASH_INDEX(spa, dva, birth) \
590134758Sstefanf	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
591134758Sstefanf#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
592134758Sstefanf#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
593134758Sstefanf#define	HDR_LOCK(buf) \
594134758Sstefanf	(BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
595134758Sstefanf
596134758Sstefanfuint64_t zfs_crc64_table[256];
597134758Sstefanf
598134758Sstefanf#ifdef ZIO_USE_UMA
599134758Sstefanfextern kmem_cache_t	*zio_buf_cache[];
600134758Sstefanfextern kmem_cache_t	*zio_data_buf_cache[];
601134758Sstefanf#endif
602134758Sstefanf
603134758Sstefanf/*
604134758Sstefanf * Level 2 ARC
605134758Sstefanf */
606134758Sstefanf
607134758Sstefanf#define	L2ARC_WRITE_SIZE	(64 * 1024 * 1024)	/* initial write max */
608134758Sstefanf#define	L2ARC_HEADROOM		128		/* num of writes */
609134758Sstefanf#define	L2ARC_FEED_SECS		1		/* caching interval */
610134758Sstefanf#define	L2ARC_FEED_SECS_SHIFT	1		/* caching interval shift */
611134758Sstefanf
612134758Sstefanf#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
613134758Sstefanf#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
614134758Sstefanf
615134758Sstefanf/*
616134758Sstefanf * L2ARC Performance Tunables
617134758Sstefanf */
618134758Sstefanfuint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
619134758Sstefanfuint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
620134758Sstefanfuint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
621134758Sstefanfuint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
622uint64_t l2arc_feed_secs_shift = L2ARC_FEED_SECS_SHIFT;	/* interval seconds shift */
623boolean_t l2arc_noprefetch = B_FALSE;		/* don't cache prefetch bufs */
624
625
626SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
627    &l2arc_write_max, 0, "max write size");
628SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
629    &l2arc_write_boost, 0, "extra write during warmup");
630SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
631    &l2arc_headroom, 0, "number of dev writes");
632SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
633    &l2arc_feed_secs, 0, "interval seconds");
634SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs_shift, CTLFLAG_RW,
635    &l2arc_feed_secs_shift, 0, "power of 2 division of feed seconds");
636
637SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
638    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
639
640
641SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
642    &ARC_anon.arcs_size, 0, "size of anonymous state");
643SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
644    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
645SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
646    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");
647
648SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
649    &ARC_mru.arcs_size, 0, "size of mru state");
650SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
651    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
652SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
653    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
654
655SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
656    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
657SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
658    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
659    "size of metadata in mru ghost state");
660SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
661    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
662    "size of data in mru ghost state");
663
664SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
665    &ARC_mfu.arcs_size, 0, "size of mfu state");
666SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
667    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
668SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
669    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
670
671SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
672    &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
673SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
674    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
675    "size of metadata in mfu ghost state");
676SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
677    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
678    "size of data in mfu ghost state");
679
680SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
681    &ARC_l2c_only.arcs_size, 0, "size of mru state");
682
683/*
684 * L2ARC Internals
685 */
686typedef struct l2arc_dev {
687	vdev_t			*l2ad_vdev;	/* vdev */
688	spa_t			*l2ad_spa;	/* spa */
689	uint64_t		l2ad_hand;	/* next write location */
690	uint64_t		l2ad_write;	/* desired write size, bytes */
691	uint64_t		l2ad_boost;	/* warmup write boost, bytes */
692	uint64_t		l2ad_start;	/* first addr on device */
693	uint64_t		l2ad_end;	/* last addr on device */
694	uint64_t		l2ad_evict;	/* last addr eviction reached */
695	boolean_t		l2ad_first;	/* first sweep through */
696	list_t			*l2ad_buflist;	/* buffer list */
697	list_node_t		l2ad_node;	/* device list node */
698} l2arc_dev_t;
699
700static list_t L2ARC_dev_list;			/* device list */
701static list_t *l2arc_dev_list;			/* device list pointer */
702static kmutex_t l2arc_dev_mtx;			/* device list mutex */
703static l2arc_dev_t *l2arc_dev_last;		/* last device used */
704static kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
705static list_t L2ARC_free_on_write;		/* free after write buf list */
706static list_t *l2arc_free_on_write;		/* free after write list ptr */
707static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
708static uint64_t l2arc_ndev;			/* number of devices */
709
710typedef struct l2arc_read_callback {
711	arc_buf_t	*l2rcb_buf;		/* read buffer */
712	spa_t		*l2rcb_spa;		/* spa */
713	blkptr_t	l2rcb_bp;		/* original blkptr */
714	zbookmark_t	l2rcb_zb;		/* original bookmark */
715	int		l2rcb_flags;		/* original flags */
716} l2arc_read_callback_t;
717
718typedef struct l2arc_write_callback {
719	l2arc_dev_t	*l2wcb_dev;		/* device info */
720	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
721} l2arc_write_callback_t;
722
723struct l2arc_buf_hdr {
724	/* protected by arc_buf_hdr  mutex */
725	l2arc_dev_t	*b_dev;			/* L2ARC device */
726	daddr_t		b_daddr;		/* disk address, offset byte */
727};
728
729typedef struct l2arc_data_free {
730	/* protected by l2arc_free_on_write_mtx */
731	void		*l2df_data;
732	size_t		l2df_size;
733	void		(*l2df_func)(void *, size_t);
734	list_node_t	l2df_list_node;
735} l2arc_data_free_t;
736
737static kmutex_t l2arc_feed_thr_lock;
738static kcondvar_t l2arc_feed_thr_cv;
739static uint8_t l2arc_thread_exit;
740
741static void l2arc_read_done(zio_t *zio);
742static void l2arc_hdr_stat_add(void);
743static void l2arc_hdr_stat_remove(void);
744
745static uint64_t
746buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth)
747{
748	uintptr_t spav = (uintptr_t)spa;
749	uint8_t *vdva = (uint8_t *)dva;
750	uint64_t crc = -1ULL;
751	int i;
752
753	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
754
755	for (i = 0; i < sizeof (dva_t); i++)
756		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
757
758	crc ^= (spav>>8) ^ birth;
759
760	return (crc);
761}
762
763#define	BUF_EMPTY(buf)						\
764	((buf)->b_dva.dva_word[0] == 0 &&			\
765	(buf)->b_dva.dva_word[1] == 0 &&			\
766	(buf)->b_birth == 0)
767
768#define	BUF_EQUAL(spa, dva, birth, buf)				\
769	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
770	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
771	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
772
773static arc_buf_hdr_t *
774buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
775{
776	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
777	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
778	arc_buf_hdr_t *buf;
779
780	mutex_enter(hash_lock);
781	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
782	    buf = buf->b_hash_next) {
783		if (BUF_EQUAL(spa, dva, birth, buf)) {
784			*lockp = hash_lock;
785			return (buf);
786		}
787	}
788	mutex_exit(hash_lock);
789	*lockp = NULL;
790	return (NULL);
791}
792
793/*
794 * Insert an entry into the hash table.  If there is already an element
795 * equal to elem in the hash table, then the already existing element
796 * will be returned and the new element will not be inserted.
797 * Otherwise returns NULL.
798 */
799static arc_buf_hdr_t *
800buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
801{
802	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
803	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
804	arc_buf_hdr_t *fbuf;
805	uint32_t i;
806
807	ASSERT(!HDR_IN_HASH_TABLE(buf));
808	*lockp = hash_lock;
809	mutex_enter(hash_lock);
810	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
811	    fbuf = fbuf->b_hash_next, i++) {
812		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
813			return (fbuf);
814	}
815
816	buf->b_hash_next = buf_hash_table.ht_table[idx];
817	buf_hash_table.ht_table[idx] = buf;
818	buf->b_flags |= ARC_IN_HASH_TABLE;
819
820	/* collect some hash table performance data */
821	if (i > 0) {
822		ARCSTAT_BUMP(arcstat_hash_collisions);
823		if (i == 1)
824			ARCSTAT_BUMP(arcstat_hash_chains);
825
826		ARCSTAT_MAX(arcstat_hash_chain_max, i);
827	}
828
829	ARCSTAT_BUMP(arcstat_hash_elements);
830	ARCSTAT_MAXSTAT(arcstat_hash_elements);
831
832	return (NULL);
833}
834
835static void
836buf_hash_remove(arc_buf_hdr_t *buf)
837{
838	arc_buf_hdr_t *fbuf, **bufp;
839	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
840
841	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
842	ASSERT(HDR_IN_HASH_TABLE(buf));
843
844	bufp = &buf_hash_table.ht_table[idx];
845	while ((fbuf = *bufp) != buf) {
846		ASSERT(fbuf != NULL);
847		bufp = &fbuf->b_hash_next;
848	}
849	*bufp = buf->b_hash_next;
850	buf->b_hash_next = NULL;
851	buf->b_flags &= ~ARC_IN_HASH_TABLE;
852
853	/* collect some hash table performance data */
854	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
855
856	if (buf_hash_table.ht_table[idx] &&
857	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
858		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
859}
860
861/*
862 * Global data structures and functions for the buf kmem cache.
863 */
864static kmem_cache_t *hdr_cache;
865static kmem_cache_t *buf_cache;
866
867static void
868buf_fini(void)
869{
870	int i;
871
872	kmem_free(buf_hash_table.ht_table,
873	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
874	for (i = 0; i < BUF_LOCKS; i++)
875		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
876	kmem_cache_destroy(hdr_cache);
877	kmem_cache_destroy(buf_cache);
878}
879
880/*
881 * Constructor callback - called when the cache is empty
882 * and a new buf is requested.
883 */
884/* ARGSUSED */
885static int
886hdr_cons(void *vbuf, void *unused, int kmflag)
887{
888	arc_buf_hdr_t *buf = vbuf;
889
890	bzero(buf, sizeof (arc_buf_hdr_t));
891	refcount_create(&buf->b_refcnt);
892	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
893	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
894
895	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
896	return (0);
897}
898
899/* ARGSUSED */
900static int
901buf_cons(void *vbuf, void *unused, int kmflag)
902{
903	arc_buf_t *buf = vbuf;
904
905	bzero(buf, sizeof (arc_buf_t));
906	rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
907	return (0);
908}
909
910/*
911 * Destructor callback - called when a cached buf is
912 * no longer required.
913 */
914/* ARGSUSED */
915static void
916hdr_dest(void *vbuf, void *unused)
917{
918	arc_buf_hdr_t *buf = vbuf;
919
920	refcount_destroy(&buf->b_refcnt);
921	cv_destroy(&buf->b_cv);
922	mutex_destroy(&buf->b_freeze_lock);
923
924	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
925}
926
927/* ARGSUSED */
928static void
929buf_dest(void *vbuf, void *unused)
930{
931	arc_buf_t *buf = vbuf;
932
933	rw_destroy(&buf->b_lock);
934}
935
936/*
937 * Reclaim callback -- invoked when memory is low.
938 */
939/* ARGSUSED */
940static void
941hdr_recl(void *unused)
942{
943	dprintf("hdr_recl called\n");
944	/*
945	 * umem calls the reclaim func when we destroy the buf cache,
946	 * which is after we do arc_fini().
947	 */
948	if (!arc_dead)
949		cv_signal(&arc_reclaim_thr_cv);
950}
951
952static void
953buf_init(void)
954{
955	uint64_t *ct;
956	uint64_t hsize = 1ULL << 12;
957	int i, j;
958
959	/*
960	 * The hash table is big enough to fill all of physical memory
961	 * with an average 64K block size.  The table will take up
962	 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
963	 */
964	while (hsize * 65536 < (uint64_t)physmem * PAGESIZE)
965		hsize <<= 1;
966retry:
967	buf_hash_table.ht_mask = hsize - 1;
968	buf_hash_table.ht_table =
969	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
970	if (buf_hash_table.ht_table == NULL) {
971		ASSERT(hsize > (1ULL << 8));
972		hsize >>= 1;
973		goto retry;
974	}
975
976	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
977	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
978	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
979	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
980
981	for (i = 0; i < 256; i++)
982		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
983			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
984
985	for (i = 0; i < BUF_LOCKS; i++) {
986		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
987		    NULL, MUTEX_DEFAULT, NULL);
988	}
989}
990
991#define	ARC_MINTIME	(hz>>4) /* 62 ms */
992
993static void
994arc_cksum_verify(arc_buf_t *buf)
995{
996	zio_cksum_t zc;
997
998	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
999		return;
1000
1001	mutex_enter(&buf->b_hdr->b_freeze_lock);
1002	if (buf->b_hdr->b_freeze_cksum == NULL ||
1003	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
1004		mutex_exit(&buf->b_hdr->b_freeze_lock);
1005		return;
1006	}
1007	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1008	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1009		panic("buffer modified while frozen!");
1010	mutex_exit(&buf->b_hdr->b_freeze_lock);
1011}
1012
1013static int
1014arc_cksum_equal(arc_buf_t *buf)
1015{
1016	zio_cksum_t zc;
1017	int equal;
1018
1019	mutex_enter(&buf->b_hdr->b_freeze_lock);
1020	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1021	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1022	mutex_exit(&buf->b_hdr->b_freeze_lock);
1023
1024	return (equal);
1025}
1026
1027static void
1028arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1029{
1030	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1031		return;
1032
1033	mutex_enter(&buf->b_hdr->b_freeze_lock);
1034	if (buf->b_hdr->b_freeze_cksum != NULL) {
1035		mutex_exit(&buf->b_hdr->b_freeze_lock);
1036		return;
1037	}
1038	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1039	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1040	    buf->b_hdr->b_freeze_cksum);
1041	mutex_exit(&buf->b_hdr->b_freeze_lock);
1042}
1043
1044void
1045arc_buf_thaw(arc_buf_t *buf)
1046{
1047	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1048		if (buf->b_hdr->b_state != arc_anon)
1049			panic("modifying non-anon buffer!");
1050		if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1051			panic("modifying buffer while i/o in progress!");
1052		arc_cksum_verify(buf);
1053	}
1054
1055	mutex_enter(&buf->b_hdr->b_freeze_lock);
1056	if (buf->b_hdr->b_freeze_cksum != NULL) {
1057		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1058		buf->b_hdr->b_freeze_cksum = NULL;
1059	}
1060	mutex_exit(&buf->b_hdr->b_freeze_lock);
1061}
1062
1063void
1064arc_buf_freeze(arc_buf_t *buf)
1065{
1066	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1067		return;
1068
1069	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1070	    buf->b_hdr->b_state == arc_anon);
1071	arc_cksum_compute(buf, B_FALSE);
1072}
1073
1074static void
1075get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock)
1076{
1077	uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth);
1078
1079	if (ab->b_type == ARC_BUFC_METADATA)
1080		buf_hashid &= (ARC_BUFC_NUMMETADATALISTS-1);
1081	else {
1082		buf_hashid &= (ARC_BUFC_NUMDATALISTS-1);
1083		buf_hashid += ARC_BUFC_NUMMETADATALISTS;
1084	}
1085
1086	*list = &state->arcs_lists[buf_hashid];
1087	*lock = ARCS_LOCK(state, buf_hashid);
1088}
1089
1090
1091static void
1092add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1093{
1094
1095	ASSERT(MUTEX_HELD(hash_lock));
1096
1097	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1098	    (ab->b_state != arc_anon)) {
1099		list_t *list;
1100		kmutex_t *lock;
1101		uint64_t delta = ab->b_size * ab->b_datacnt;
1102		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1103
1104		get_buf_info(ab, ab->b_state, &list, &lock);
1105		ASSERT(!MUTEX_HELD(lock));
1106		mutex_enter(lock);
1107		ASSERT(list_link_active(&ab->b_arc_node));
1108		list_remove(list, ab);
1109		mutex_exit(lock);
1110
1111		if (GHOST_STATE(ab->b_state)) {
1112			ASSERT3U(ab->b_datacnt, ==, 0);
1113			ASSERT3P(ab->b_buf, ==, NULL);
1114			delta = ab->b_size;
1115		}
1116		ASSERT(delta > 0);
1117		ASSERT3U(*size, >=, delta);
1118		atomic_add_64(size, -delta);
1119		/* remove the prefetch flag if we get a reference */
1120		if (ab->b_flags & ARC_PREFETCH)
1121			ab->b_flags &= ~ARC_PREFETCH;
1122	}
1123}
1124
1125static int
1126remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1127{
1128	int cnt;
1129	arc_state_t *state = ab->b_state;
1130
1131	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1132	ASSERT(!GHOST_STATE(state));
1133
1134	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1135	    (state != arc_anon)) {
1136		uint64_t *size = &state->arcs_lsize[ab->b_type];
1137		list_t *list;
1138		kmutex_t *lock;
1139
1140		get_buf_info(ab, state, &list, &lock);
1141
1142		ASSERT(!MUTEX_HELD(lock));
1143		mutex_enter(lock);
1144		ASSERT(!list_link_active(&ab->b_arc_node));
1145		list_insert_head(list, ab);
1146		mutex_exit(lock);
1147
1148		ASSERT(ab->b_datacnt > 0);
1149		atomic_add_64(size, ab->b_size * ab->b_datacnt);
1150	}
1151	return (cnt);
1152}
1153
1154/*
1155 * Move the supplied buffer to the indicated state.  The mutex
1156 * for the buffer must be held by the caller.
1157 */
1158static void
1159arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1160{
1161	arc_state_t *old_state = ab->b_state;
1162	int64_t refcnt = refcount_count(&ab->b_refcnt);
1163	uint64_t from_delta, to_delta;
1164	list_t *list;
1165	kmutex_t *lock;
1166
1167	ASSERT(MUTEX_HELD(hash_lock));
1168	ASSERT(new_state != old_state);
1169	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1170	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1171
1172	from_delta = to_delta = ab->b_datacnt * ab->b_size;
1173
1174	/*
1175	 * If this buffer is evictable, transfer it from the
1176	 * old state list to the new state list.
1177	 */
1178	if (refcnt == 0) {
1179		if (old_state != arc_anon) {
1180			int use_mutex;
1181			uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1182
1183			get_buf_info(ab, old_state, &list, &lock);
1184			use_mutex = !MUTEX_HELD(lock);
1185
1186			if (use_mutex)
1187				mutex_enter(lock);
1188
1189			ASSERT(list_link_active(&ab->b_arc_node));
1190			list_remove(list, ab);
1191
1192			/*
1193			 * If prefetching out of the ghost cache,
1194			 * we will have a non-null datacnt.
1195			 */
1196			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1197				/* ghost elements have a ghost size */
1198				ASSERT(ab->b_buf == NULL);
1199				from_delta = ab->b_size;
1200			}
1201			ASSERT3U(*size, >=, from_delta);
1202			atomic_add_64(size, -from_delta);
1203
1204			if (use_mutex)
1205				mutex_exit(lock);
1206		}
1207		if (new_state != arc_anon) {
1208			int use_mutex;
1209			uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1210
1211			get_buf_info(ab, new_state, &list, &lock);
1212			use_mutex = !MUTEX_HELD(lock);
1213
1214
1215			if (use_mutex)
1216				mutex_enter(lock);
1217
1218			list_insert_head(list, ab);
1219
1220			/* ghost elements have a ghost size */
1221			if (GHOST_STATE(new_state)) {
1222				ASSERT(ab->b_datacnt == 0);
1223				ASSERT(ab->b_buf == NULL);
1224				to_delta = ab->b_size;
1225			}
1226			atomic_add_64(size, to_delta);
1227
1228			if (use_mutex)
1229				mutex_exit(lock);
1230		}
1231	}
1232
1233	ASSERT(!BUF_EMPTY(ab));
1234	if (new_state == arc_anon) {
1235		buf_hash_remove(ab);
1236	}
1237
1238	/* adjust state sizes */
1239	if (to_delta)
1240		atomic_add_64(&new_state->arcs_size, to_delta);
1241	if (from_delta) {
1242		ASSERT3U(old_state->arcs_size, >=, from_delta);
1243		atomic_add_64(&old_state->arcs_size, -from_delta);
1244	}
1245	ab->b_state = new_state;
1246
1247	/* adjust l2arc hdr stats */
1248	if (new_state == arc_l2c_only)
1249		l2arc_hdr_stat_add();
1250	else if (old_state == arc_l2c_only)
1251		l2arc_hdr_stat_remove();
1252}
1253
1254void
1255arc_space_consume(uint64_t space)
1256{
1257	atomic_add_64(&arc_meta_used, space);
1258	atomic_add_64(&arc_size, space);
1259}
1260
1261void
1262arc_space_return(uint64_t space)
1263{
1264	ASSERT(arc_meta_used >= space);
1265	if (arc_meta_max < arc_meta_used)
1266		arc_meta_max = arc_meta_used;
1267	atomic_add_64(&arc_meta_used, -space);
1268	ASSERT(arc_size >= space);
1269	atomic_add_64(&arc_size, -space);
1270}
1271
1272void *
1273arc_data_buf_alloc(uint64_t size)
1274{
1275	if (arc_evict_needed(ARC_BUFC_DATA))
1276		cv_signal(&arc_reclaim_thr_cv);
1277	atomic_add_64(&arc_size, size);
1278	return (zio_data_buf_alloc(size));
1279}
1280
1281void
1282arc_data_buf_free(void *buf, uint64_t size)
1283{
1284	zio_data_buf_free(buf, size);
1285	ASSERT(arc_size >= size);
1286	atomic_add_64(&arc_size, -size);
1287}
1288
1289arc_buf_t *
1290arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1291{
1292	arc_buf_hdr_t *hdr;
1293	arc_buf_t *buf;
1294
1295	ASSERT3U(size, >, 0);
1296	hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1297	ASSERT(BUF_EMPTY(hdr));
1298	hdr->b_size = size;
1299	hdr->b_type = type;
1300	hdr->b_spa = spa;
1301	hdr->b_state = arc_anon;
1302	hdr->b_arc_access = 0;
1303	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1304	buf->b_hdr = hdr;
1305	buf->b_data = NULL;
1306	buf->b_efunc = NULL;
1307	buf->b_private = NULL;
1308	buf->b_next = NULL;
1309	hdr->b_buf = buf;
1310	arc_get_data_buf(buf);
1311	hdr->b_datacnt = 1;
1312	hdr->b_flags = 0;
1313	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1314	(void) refcount_add(&hdr->b_refcnt, tag);
1315
1316	return (buf);
1317}
1318
1319static arc_buf_t *
1320arc_buf_clone(arc_buf_t *from)
1321{
1322	arc_buf_t *buf;
1323	arc_buf_hdr_t *hdr = from->b_hdr;
1324	uint64_t size = hdr->b_size;
1325
1326	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1327	buf->b_hdr = hdr;
1328	buf->b_data = NULL;
1329	buf->b_efunc = NULL;
1330	buf->b_private = NULL;
1331	buf->b_next = hdr->b_buf;
1332	hdr->b_buf = buf;
1333	arc_get_data_buf(buf);
1334	bcopy(from->b_data, buf->b_data, size);
1335	hdr->b_datacnt += 1;
1336	return (buf);
1337}
1338
1339void
1340arc_buf_add_ref(arc_buf_t *buf, void* tag)
1341{
1342	arc_buf_hdr_t *hdr;
1343	kmutex_t *hash_lock;
1344
1345	/*
1346	 * Check to see if this buffer is evicted.  Callers
1347	 * must verify b_data != NULL to know if the add_ref
1348	 * was successful.
1349	 */
1350	rw_enter(&buf->b_lock, RW_READER);
1351	if (buf->b_data == NULL) {
1352		rw_exit(&buf->b_lock);
1353		return;
1354	}
1355	hdr = buf->b_hdr;
1356	ASSERT(hdr != NULL);
1357	hash_lock = HDR_LOCK(hdr);
1358	mutex_enter(hash_lock);
1359	rw_exit(&buf->b_lock);
1360
1361	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1362	add_reference(hdr, hash_lock, tag);
1363	arc_access(hdr, hash_lock);
1364	mutex_exit(hash_lock);
1365	ARCSTAT_BUMP(arcstat_hits);
1366	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1367	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1368	    data, metadata, hits);
1369}
1370
1371/*
1372 * Free the arc data buffer.  If it is an l2arc write in progress,
1373 * the buffer is placed on l2arc_free_on_write to be freed later.
1374 */
1375static void
1376arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
1377    void *data, size_t size)
1378{
1379	if (HDR_L2_WRITING(hdr)) {
1380		l2arc_data_free_t *df;
1381		df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1382		df->l2df_data = data;
1383		df->l2df_size = size;
1384		df->l2df_func = free_func;
1385		mutex_enter(&l2arc_free_on_write_mtx);
1386		list_insert_head(l2arc_free_on_write, df);
1387		mutex_exit(&l2arc_free_on_write_mtx);
1388		ARCSTAT_BUMP(arcstat_l2_free_on_write);
1389	} else {
1390		free_func(data, size);
1391	}
1392}
1393
1394static void
1395arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1396{
1397	arc_buf_t **bufp;
1398
1399	/* free up data associated with the buf */
1400	if (buf->b_data) {
1401		arc_state_t *state = buf->b_hdr->b_state;
1402		uint64_t size = buf->b_hdr->b_size;
1403		arc_buf_contents_t type = buf->b_hdr->b_type;
1404
1405		arc_cksum_verify(buf);
1406		if (!recycle) {
1407			if (type == ARC_BUFC_METADATA) {
1408				arc_buf_data_free(buf->b_hdr, zio_buf_free,
1409				    buf->b_data, size);
1410				arc_space_return(size);
1411			} else {
1412				ASSERT(type == ARC_BUFC_DATA);
1413				arc_buf_data_free(buf->b_hdr,
1414				    zio_data_buf_free, buf->b_data, size);
1415				atomic_add_64(&arc_size, -size);
1416			}
1417		}
1418		if (list_link_active(&buf->b_hdr->b_arc_node)) {
1419			uint64_t *cnt = &state->arcs_lsize[type];
1420
1421			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1422			ASSERT(state != arc_anon);
1423
1424			ASSERT3U(*cnt, >=, size);
1425			atomic_add_64(cnt, -size);
1426		}
1427		ASSERT3U(state->arcs_size, >=, size);
1428		atomic_add_64(&state->arcs_size, -size);
1429		buf->b_data = NULL;
1430		ASSERT(buf->b_hdr->b_datacnt > 0);
1431		buf->b_hdr->b_datacnt -= 1;
1432	}
1433
1434	/* only remove the buf if requested */
1435	if (!all)
1436		return;
1437
1438	/* remove the buf from the hdr list */
1439	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1440		continue;
1441	*bufp = buf->b_next;
1442
1443	ASSERT(buf->b_efunc == NULL);
1444
1445	/* clean up the buf */
1446	buf->b_hdr = NULL;
1447	kmem_cache_free(buf_cache, buf);
1448}
1449
1450static void
1451arc_hdr_destroy(arc_buf_hdr_t *hdr)
1452{
1453	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1454	ASSERT3P(hdr->b_state, ==, arc_anon);
1455	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1456	ASSERT(!(hdr->b_flags & ARC_STORED));
1457
1458	if (hdr->b_l2hdr != NULL) {
1459		if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
1460			/*
1461			 * To prevent arc_free() and l2arc_evict() from
1462			 * attempting to free the same buffer at the same time,
1463			 * a FREE_IN_PROGRESS flag is given to arc_free() to
1464			 * give it priority.  l2arc_evict() can't destroy this
1465			 * header while we are waiting on l2arc_buflist_mtx.
1466			 *
1467			 * The hdr may be removed from l2ad_buflist before we
1468			 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1469			 */
1470			mutex_enter(&l2arc_buflist_mtx);
1471			if (hdr->b_l2hdr != NULL) {
1472				list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist,
1473				    hdr);
1474			}
1475			mutex_exit(&l2arc_buflist_mtx);
1476		} else {
1477			list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
1478		}
1479		ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1480		kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t));
1481		if (hdr->b_state == arc_l2c_only)
1482			l2arc_hdr_stat_remove();
1483		hdr->b_l2hdr = NULL;
1484	}
1485
1486	if (!BUF_EMPTY(hdr)) {
1487		ASSERT(!HDR_IN_HASH_TABLE(hdr));
1488		bzero(&hdr->b_dva, sizeof (dva_t));
1489		hdr->b_birth = 0;
1490		hdr->b_cksum0 = 0;
1491	}
1492	while (hdr->b_buf) {
1493		arc_buf_t *buf = hdr->b_buf;
1494
1495		if (buf->b_efunc) {
1496			mutex_enter(&arc_eviction_mtx);
1497			rw_enter(&buf->b_lock, RW_WRITER);
1498			ASSERT(buf->b_hdr != NULL);
1499			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1500			hdr->b_buf = buf->b_next;
1501			buf->b_hdr = &arc_eviction_hdr;
1502			buf->b_next = arc_eviction_list;
1503			arc_eviction_list = buf;
1504			rw_exit(&buf->b_lock);
1505			mutex_exit(&arc_eviction_mtx);
1506		} else {
1507			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1508		}
1509	}
1510	if (hdr->b_freeze_cksum != NULL) {
1511		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1512		hdr->b_freeze_cksum = NULL;
1513	}
1514
1515	ASSERT(!list_link_active(&hdr->b_arc_node));
1516	ASSERT3P(hdr->b_hash_next, ==, NULL);
1517	ASSERT3P(hdr->b_acb, ==, NULL);
1518	kmem_cache_free(hdr_cache, hdr);
1519}
1520
1521void
1522arc_buf_free(arc_buf_t *buf, void *tag)
1523{
1524	arc_buf_hdr_t *hdr = buf->b_hdr;
1525	int hashed = hdr->b_state != arc_anon;
1526
1527	ASSERT(buf->b_efunc == NULL);
1528	ASSERT(buf->b_data != NULL);
1529
1530	if (hashed) {
1531		kmutex_t *hash_lock = HDR_LOCK(hdr);
1532
1533		mutex_enter(hash_lock);
1534		(void) remove_reference(hdr, hash_lock, tag);
1535		if (hdr->b_datacnt > 1)
1536			arc_buf_destroy(buf, FALSE, TRUE);
1537		else
1538			hdr->b_flags |= ARC_BUF_AVAILABLE;
1539		mutex_exit(hash_lock);
1540	} else if (HDR_IO_IN_PROGRESS(hdr)) {
1541		int destroy_hdr;
1542		/*
1543		 * We are in the middle of an async write.  Don't destroy
1544		 * this buffer unless the write completes before we finish
1545		 * decrementing the reference count.
1546		 */
1547		mutex_enter(&arc_eviction_mtx);
1548		(void) remove_reference(hdr, NULL, tag);
1549		ASSERT(refcount_is_zero(&hdr->b_refcnt));
1550		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1551		mutex_exit(&arc_eviction_mtx);
1552		if (destroy_hdr)
1553			arc_hdr_destroy(hdr);
1554	} else {
1555		if (remove_reference(hdr, NULL, tag) > 0) {
1556			ASSERT(HDR_IO_ERROR(hdr));
1557			arc_buf_destroy(buf, FALSE, TRUE);
1558		} else {
1559			arc_hdr_destroy(hdr);
1560		}
1561	}
1562}
1563
1564int
1565arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1566{
1567	arc_buf_hdr_t *hdr = buf->b_hdr;
1568	kmutex_t *hash_lock = HDR_LOCK(hdr);
1569	int no_callback = (buf->b_efunc == NULL);
1570
1571	if (hdr->b_state == arc_anon) {
1572		arc_buf_free(buf, tag);
1573		return (no_callback);
1574	}
1575
1576	mutex_enter(hash_lock);
1577	ASSERT(hdr->b_state != arc_anon);
1578	ASSERT(buf->b_data != NULL);
1579
1580	(void) remove_reference(hdr, hash_lock, tag);
1581	if (hdr->b_datacnt > 1) {
1582		if (no_callback)
1583			arc_buf_destroy(buf, FALSE, TRUE);
1584	} else if (no_callback) {
1585		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1586		hdr->b_flags |= ARC_BUF_AVAILABLE;
1587	}
1588	ASSERT(no_callback || hdr->b_datacnt > 1 ||
1589	    refcount_is_zero(&hdr->b_refcnt));
1590	mutex_exit(hash_lock);
1591	return (no_callback);
1592}
1593
1594int
1595arc_buf_size(arc_buf_t *buf)
1596{
1597	return (buf->b_hdr->b_size);
1598}
1599
1600/*
1601 * Evict buffers from list until we've removed the specified number of
1602 * bytes.  Move the removed buffers to the appropriate evict state.
1603 * If the recycle flag is set, then attempt to "recycle" a buffer:
1604 * - look for a buffer to evict that is `bytes' long.
1605 * - return the data block from this buffer rather than freeing it.
1606 * This flag is used by callers that are trying to make space for a
1607 * new buffer in a full arc cache.
1608 *
1609 * This function makes a "best effort".  It skips over any buffers
1610 * it can't get a hash_lock on, and so may not catch all candidates.
1611 * It may also return without evicting as much space as requested.
1612 */
1613static void *
1614arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle,
1615    arc_buf_contents_t type)
1616{
1617	arc_state_t *evicted_state;
1618	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1619	int64_t bytes_remaining;
1620	arc_buf_hdr_t *ab, *ab_prev = NULL;
1621	list_t *evicted_list, *list, *evicted_list_start, *list_start;
1622	kmutex_t *lock, *evicted_lock;
1623	kmutex_t *hash_lock;
1624	boolean_t have_lock;
1625	void *stolen = NULL;
1626	static int evict_metadata_offset, evict_data_offset;
1627	int i, idx, offset, list_count, count;
1628
1629	ASSERT(state == arc_mru || state == arc_mfu);
1630
1631	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1632
1633	if (type == ARC_BUFC_METADATA) {
1634		offset = 0;
1635		list_count = ARC_BUFC_NUMMETADATALISTS;
1636		list_start = &state->arcs_lists[0];
1637		evicted_list_start = &evicted_state->arcs_lists[0];
1638		idx = evict_metadata_offset;
1639	} else {
1640		offset = ARC_BUFC_NUMMETADATALISTS;
1641
1642		list_start = &state->arcs_lists[offset];
1643		evicted_list_start = &evicted_state->arcs_lists[offset];
1644		list_count = ARC_BUFC_NUMDATALISTS;
1645		idx = evict_data_offset;
1646	}
1647	bytes_remaining = evicted_state->arcs_lsize[type];
1648	count = 0;
1649
1650evict_start:
1651	list = &list_start[idx];
1652	evicted_list = &evicted_list_start[idx];
1653	lock = ARCS_LOCK(state, (offset + idx));
1654	evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
1655
1656	mutex_enter(lock);
1657	mutex_enter(evicted_lock);
1658
1659	for (ab = list_tail(list); ab; ab = ab_prev) {
1660		ab_prev = list_prev(list, ab);
1661		bytes_remaining -= (ab->b_size * ab->b_datacnt);
1662		/* prefetch buffers have a minimum lifespan */
1663		if (HDR_IO_IN_PROGRESS(ab) ||
1664		    (spa && ab->b_spa != spa) ||
1665		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1666		    LBOLT - ab->b_arc_access < arc_min_prefetch_lifespan)) {
1667			skipped++;
1668			continue;
1669		}
1670		/* "lookahead" for better eviction candidate */
1671		if (recycle && ab->b_size != bytes &&
1672		    ab_prev && ab_prev->b_size == bytes)
1673			continue;
1674		hash_lock = HDR_LOCK(ab);
1675		have_lock = MUTEX_HELD(hash_lock);
1676		if (have_lock || mutex_tryenter(hash_lock)) {
1677			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
1678			ASSERT(ab->b_datacnt > 0);
1679			while (ab->b_buf) {
1680				arc_buf_t *buf = ab->b_buf;
1681				if (!rw_tryenter(&buf->b_lock, RW_WRITER)) {
1682					missed += 1;
1683					break;
1684				}
1685				if (buf->b_data) {
1686					bytes_evicted += ab->b_size;
1687					if (recycle && ab->b_type == type &&
1688					    ab->b_size == bytes &&
1689					    !HDR_L2_WRITING(ab)) {
1690						stolen = buf->b_data;
1691						recycle = FALSE;
1692					}
1693				}
1694				if (buf->b_efunc) {
1695					mutex_enter(&arc_eviction_mtx);
1696					arc_buf_destroy(buf,
1697					    buf->b_data == stolen, FALSE);
1698					ab->b_buf = buf->b_next;
1699					buf->b_hdr = &arc_eviction_hdr;
1700					buf->b_next = arc_eviction_list;
1701					arc_eviction_list = buf;
1702					mutex_exit(&arc_eviction_mtx);
1703					rw_exit(&buf->b_lock);
1704				} else {
1705					rw_exit(&buf->b_lock);
1706					arc_buf_destroy(buf,
1707					    buf->b_data == stolen, TRUE);
1708				}
1709			}
1710			if (ab->b_datacnt == 0) {
1711				arc_change_state(evicted_state, ab, hash_lock);
1712				ASSERT(HDR_IN_HASH_TABLE(ab));
1713				ab->b_flags |= ARC_IN_HASH_TABLE;
1714				ab->b_flags &= ~ARC_BUF_AVAILABLE;
1715				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1716			}
1717			if (!have_lock)
1718				mutex_exit(hash_lock);
1719			if (bytes >= 0 && bytes_evicted >= bytes)
1720				break;
1721			if (bytes_remaining > 0) {
1722				mutex_exit(evicted_lock);
1723				mutex_exit(lock);
1724				idx  = ((idx + 1)&(list_count-1));
1725				count++;
1726				goto evict_start;
1727			}
1728		} else {
1729			missed += 1;
1730		}
1731	}
1732
1733	mutex_exit(evicted_lock);
1734	mutex_exit(lock);
1735
1736	idx  = ((idx + 1)&(list_count-1));
1737	count++;
1738
1739	if (bytes_evicted < bytes) {
1740		if (count < list_count)
1741			goto evict_start;
1742		else
1743			dprintf("only evicted %lld bytes from %x",
1744			    (longlong_t)bytes_evicted, state);
1745	}
1746	if (type == ARC_BUFC_METADATA)
1747		evict_metadata_offset = idx;
1748	else
1749		evict_data_offset = idx;
1750
1751	if (skipped)
1752		ARCSTAT_INCR(arcstat_evict_skip, skipped);
1753
1754	if (missed)
1755		ARCSTAT_INCR(arcstat_mutex_miss, missed);
1756
1757	/*
1758	 * We have just evicted some date into the ghost state, make
1759	 * sure we also adjust the ghost state size if necessary.
1760	 */
1761	if (arc_no_grow &&
1762	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1763		int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1764		    arc_mru_ghost->arcs_size - arc_c;
1765
1766		if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1767			int64_t todelete =
1768			    MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1769			arc_evict_ghost(arc_mru_ghost, NULL, todelete);
1770		} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1771			int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1772			    arc_mru_ghost->arcs_size +
1773			    arc_mfu_ghost->arcs_size - arc_c);
1774			arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
1775		}
1776	}
1777	if (stolen)
1778		ARCSTAT_BUMP(arcstat_stolen);
1779
1780	return (stolen);
1781}
1782
1783/*
1784 * Remove buffers from list until we've removed the specified number of
1785 * bytes.  Destroy the buffers that are removed.
1786 */
1787static void
1788arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes)
1789{
1790	arc_buf_hdr_t *ab, *ab_prev;
1791	list_t *list, *list_start;
1792	kmutex_t *hash_lock, *lock;
1793	uint64_t bytes_deleted = 0;
1794	uint64_t bufs_skipped = 0;
1795	static int evict_offset;
1796	int list_count, idx = evict_offset;
1797	int offset, count = 0;
1798
1799	ASSERT(GHOST_STATE(state));
1800
1801	/*
1802	 * data lists come after metadata lists
1803	 */
1804	list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
1805	list_count = ARC_BUFC_NUMDATALISTS;
1806	offset = ARC_BUFC_NUMMETADATALISTS;
1807
1808evict_start:
1809	list = &list_start[idx];
1810	lock = ARCS_LOCK(state, idx + offset);
1811
1812	mutex_enter(lock);
1813	for (ab = list_tail(list); ab; ab = ab_prev) {
1814		ab_prev = list_prev(list, ab);
1815		if (spa && ab->b_spa != spa)
1816			continue;
1817		hash_lock = HDR_LOCK(ab);
1818		if (mutex_tryenter(hash_lock)) {
1819			ASSERT(!HDR_IO_IN_PROGRESS(ab));
1820			ASSERT(ab->b_buf == NULL);
1821			ARCSTAT_BUMP(arcstat_deleted);
1822			bytes_deleted += ab->b_size;
1823
1824			if (ab->b_l2hdr != NULL) {
1825				/*
1826				 * This buffer is cached on the 2nd Level ARC;
1827				 * don't destroy the header.
1828				 */
1829				arc_change_state(arc_l2c_only, ab, hash_lock);
1830				mutex_exit(hash_lock);
1831			} else {
1832				arc_change_state(arc_anon, ab, hash_lock);
1833				mutex_exit(hash_lock);
1834				arc_hdr_destroy(ab);
1835			}
1836
1837			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1838			if (bytes >= 0 && bytes_deleted >= bytes)
1839				break;
1840		} else {
1841			if (bytes < 0) {
1842				/*
1843				 * we're draining the ARC, retry
1844				 */
1845				mutex_exit(lock);
1846				mutex_enter(hash_lock);
1847				mutex_exit(hash_lock);
1848				goto evict_start;
1849			}
1850			bufs_skipped += 1;
1851		}
1852	}
1853	mutex_exit(lock);
1854	idx  = ((idx + 1)&(ARC_BUFC_NUMDATALISTS-1));
1855	count++;
1856
1857	if (count < list_count)
1858		goto evict_start;
1859
1860	evict_offset = idx;
1861	if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
1862	    (bytes < 0 || bytes_deleted < bytes)) {
1863		list_start = &state->arcs_lists[0];
1864		list_count = ARC_BUFC_NUMMETADATALISTS;
1865		offset = count = 0;
1866		goto evict_start;
1867	}
1868
1869	if (bufs_skipped) {
1870		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1871		ASSERT(bytes >= 0);
1872	}
1873
1874	if (bytes_deleted < bytes)
1875		dprintf("only deleted %lld bytes from %p",
1876		    (longlong_t)bytes_deleted, state);
1877}
1878
1879static void
1880arc_adjust(void)
1881{
1882	int64_t top_sz, mru_over, arc_over, todelete;
1883
1884	top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used;
1885
1886	if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
1887		int64_t toevict =
1888		    MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p);
1889		(void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA);
1890		top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
1891	}
1892
1893	if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1894		int64_t toevict =
1895		    MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p);
1896		(void) arc_evict(arc_mru, NULL, toevict, FALSE,
1897		    ARC_BUFC_METADATA);
1898		top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
1899	}
1900
1901	mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c;
1902
1903	if (mru_over > 0) {
1904		if (arc_mru_ghost->arcs_size > 0) {
1905			todelete = MIN(arc_mru_ghost->arcs_size, mru_over);
1906			arc_evict_ghost(arc_mru_ghost, NULL, todelete);
1907		}
1908	}
1909
1910	if ((arc_over = arc_size - arc_c) > 0) {
1911		int64_t tbl_over;
1912
1913		if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
1914			int64_t toevict =
1915			    MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over);
1916			(void) arc_evict(arc_mfu, NULL, toevict, FALSE,
1917			    ARC_BUFC_DATA);
1918			arc_over = arc_size - arc_c;
1919		}
1920
1921		if (arc_over > 0 &&
1922		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1923			int64_t toevict =
1924			    MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA],
1925			    arc_over);
1926			(void) arc_evict(arc_mfu, NULL, toevict, FALSE,
1927			    ARC_BUFC_METADATA);
1928		}
1929
1930		tbl_over = arc_size + arc_mru_ghost->arcs_size +
1931		    arc_mfu_ghost->arcs_size - arc_c * 2;
1932
1933		if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) {
1934			todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over);
1935			arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
1936		}
1937	}
1938}
1939
1940static void
1941arc_do_user_evicts(void)
1942{
1943	static arc_buf_t *tmp_arc_eviction_list;
1944
1945	/*
1946	 * Move list over to avoid LOR
1947	 */
1948restart:
1949	mutex_enter(&arc_eviction_mtx);
1950	tmp_arc_eviction_list = arc_eviction_list;
1951	arc_eviction_list = NULL;
1952	mutex_exit(&arc_eviction_mtx);
1953
1954	while (tmp_arc_eviction_list != NULL) {
1955		arc_buf_t *buf = tmp_arc_eviction_list;
1956		tmp_arc_eviction_list = buf->b_next;
1957		rw_enter(&buf->b_lock, RW_WRITER);
1958		buf->b_hdr = NULL;
1959		rw_exit(&buf->b_lock);
1960
1961		if (buf->b_efunc != NULL)
1962			VERIFY(buf->b_efunc(buf) == 0);
1963
1964		buf->b_efunc = NULL;
1965		buf->b_private = NULL;
1966		kmem_cache_free(buf_cache, buf);
1967	}
1968
1969	if (arc_eviction_list != NULL)
1970		goto restart;
1971}
1972
1973/*
1974 * Flush all *evictable* data from the cache for the given spa.
1975 * NOTE: this will not touch "active" (i.e. referenced) data.
1976 */
1977void
1978arc_flush(spa_t *spa)
1979{
1980	while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
1981		(void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA);
1982		if (spa)
1983			break;
1984	}
1985	while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
1986		(void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA);
1987		if (spa)
1988			break;
1989	}
1990	while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
1991		(void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA);
1992		if (spa)
1993			break;
1994	}
1995	while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
1996		(void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA);
1997		if (spa)
1998			break;
1999	}
2000
2001	arc_evict_ghost(arc_mru_ghost, spa, -1);
2002	arc_evict_ghost(arc_mfu_ghost, spa, -1);
2003
2004	mutex_enter(&arc_reclaim_thr_lock);
2005	arc_do_user_evicts();
2006	mutex_exit(&arc_reclaim_thr_lock);
2007	ASSERT(spa || arc_eviction_list == NULL);
2008}
2009
2010int arc_shrink_shift = 5;		/* log2(fraction of arc to reclaim) */
2011
2012void
2013arc_shrink(void)
2014{
2015	if (arc_c > arc_c_min) {
2016		uint64_t to_free;
2017
2018#ifdef _KERNEL
2019		to_free = arc_c >> arc_shrink_shift;
2020#else
2021		to_free = arc_c >> arc_shrink_shift;
2022#endif
2023		if (arc_c > arc_c_min + to_free)
2024			atomic_add_64(&arc_c, -to_free);
2025		else
2026			arc_c = arc_c_min;
2027
2028		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2029		if (arc_c > arc_size)
2030			arc_c = MAX(arc_size, arc_c_min);
2031		if (arc_p > arc_c)
2032			arc_p = (arc_c >> 1);
2033		ASSERT(arc_c >= arc_c_min);
2034		ASSERT((int64_t)arc_p >= 0);
2035	}
2036
2037	if (arc_size > arc_c)
2038		arc_adjust();
2039}
2040
2041static int needfree = 0;
2042
2043static int
2044arc_reclaim_needed(void)
2045{
2046#if 0
2047	uint64_t extra;
2048#endif
2049
2050#ifdef _KERNEL
2051	if (needfree)
2052		return (1);
2053	if (arc_size > arc_c_max)
2054		return (1);
2055	if (arc_size <= arc_c_min)
2056		return (0);
2057
2058	/*
2059	 * If pages are needed or we're within 2048 pages
2060	 * of needing to page need to reclaim
2061	 */
2062	if (vm_pages_needed || (vm_paging_target() > -2048))
2063		return (1);
2064
2065#if 0
2066	/*
2067	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2068	 */
2069	extra = desfree;
2070
2071	/*
2072	 * check that we're out of range of the pageout scanner.  It starts to
2073	 * schedule paging if freemem is less than lotsfree and needfree.
2074	 * lotsfree is the high-water mark for pageout, and needfree is the
2075	 * number of needed free pages.  We add extra pages here to make sure
2076	 * the scanner doesn't start up while we're freeing memory.
2077	 */
2078	if (freemem < lotsfree + needfree + extra)
2079		return (1);
2080
2081	/*
2082	 * check to make sure that swapfs has enough space so that anon
2083	 * reservations can still succeed. anon_resvmem() checks that the
2084	 * availrmem is greater than swapfs_minfree, and the number of reserved
2085	 * swap pages.  We also add a bit of extra here just to prevent
2086	 * circumstances from getting really dire.
2087	 */
2088	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2089		return (1);
2090
2091#if defined(__i386)
2092	/*
2093	 * If we're on an i386 platform, it's possible that we'll exhaust the
2094	 * kernel heap space before we ever run out of available physical
2095	 * memory.  Most checks of the size of the heap_area compare against
2096	 * tune.t_minarmem, which is the minimum available real memory that we
2097	 * can have in the system.  However, this is generally fixed at 25 pages
2098	 * which is so low that it's useless.  In this comparison, we seek to
2099	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2100	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
2101	 * free)
2102	 */
2103	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
2104	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
2105		return (1);
2106#endif
2107#else
2108	if (kmem_used() > (kmem_size() * 3) / 4)
2109		return (1);
2110#endif
2111
2112#else
2113	if (spa_get_random(100) == 0)
2114		return (1);
2115#endif
2116	return (0);
2117}
2118
2119static void
2120arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2121{
2122#ifdef ZIO_USE_UMA
2123	size_t			i;
2124	kmem_cache_t		*prev_cache = NULL;
2125	kmem_cache_t		*prev_data_cache = NULL;
2126#endif
2127
2128#ifdef _KERNEL
2129	if (arc_meta_used >= arc_meta_limit) {
2130		/*
2131		 * We are exceeding our meta-data cache limit.
2132		 * Purge some DNLC entries to release holds on meta-data.
2133		 */
2134		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2135	}
2136#if defined(__i386)
2137	/*
2138	 * Reclaim unused memory from all kmem caches.
2139	 */
2140	kmem_reap();
2141#endif
2142#endif
2143
2144	/*
2145	 * An aggressive reclamation will shrink the cache size as well as
2146	 * reap free buffers from the arc kmem caches.
2147	 */
2148	if (strat == ARC_RECLAIM_AGGR)
2149		arc_shrink();
2150
2151#ifdef ZIO_USE_UMA
2152	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2153		if (zio_buf_cache[i] != prev_cache) {
2154			prev_cache = zio_buf_cache[i];
2155			kmem_cache_reap_now(zio_buf_cache[i]);
2156		}
2157		if (zio_data_buf_cache[i] != prev_data_cache) {
2158			prev_data_cache = zio_data_buf_cache[i];
2159			kmem_cache_reap_now(zio_data_buf_cache[i]);
2160		}
2161	}
2162#endif
2163	kmem_cache_reap_now(buf_cache);
2164	kmem_cache_reap_now(hdr_cache);
2165}
2166
2167static void
2168arc_reclaim_thread(void *dummy __unused)
2169{
2170	clock_t			growtime = 0;
2171	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
2172	callb_cpr_t		cpr;
2173
2174	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2175
2176	mutex_enter(&arc_reclaim_thr_lock);
2177	while (arc_thread_exit == 0) {
2178		if (arc_reclaim_needed()) {
2179
2180			if (arc_no_grow) {
2181				if (last_reclaim == ARC_RECLAIM_CONS) {
2182					last_reclaim = ARC_RECLAIM_AGGR;
2183				} else {
2184					last_reclaim = ARC_RECLAIM_CONS;
2185				}
2186			} else {
2187				arc_no_grow = TRUE;
2188				last_reclaim = ARC_RECLAIM_AGGR;
2189				membar_producer();
2190			}
2191
2192			/* reset the growth delay for every reclaim */
2193			growtime = LBOLT + (arc_grow_retry * hz);
2194
2195			if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
2196				/*
2197				 * If needfree is TRUE our vm_lowmem hook
2198				 * was called and in that case we must free some
2199				 * memory, so switch to aggressive mode.
2200				 */
2201				arc_no_grow = TRUE;
2202				last_reclaim = ARC_RECLAIM_AGGR;
2203			}
2204			arc_kmem_reap_now(last_reclaim);
2205			arc_warm = B_TRUE;
2206
2207		} else if (arc_no_grow && LBOLT >= growtime) {
2208			arc_no_grow = FALSE;
2209		}
2210
2211		if (needfree ||
2212		    (2 * arc_c < arc_size +
2213		    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size))
2214			arc_adjust();
2215
2216		if (arc_eviction_list != NULL)
2217			arc_do_user_evicts();
2218
2219		if (arc_reclaim_needed()) {
2220			needfree = 0;
2221#ifdef _KERNEL
2222			wakeup(&needfree);
2223#endif
2224		}
2225
2226		/* block until needed, or one second, whichever is shorter */
2227		CALLB_CPR_SAFE_BEGIN(&cpr);
2228		(void) cv_timedwait(&arc_reclaim_thr_cv,
2229		    &arc_reclaim_thr_lock, hz);
2230		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2231	}
2232
2233	arc_thread_exit = 0;
2234	cv_broadcast(&arc_reclaim_thr_cv);
2235	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
2236	thread_exit();
2237}
2238
2239/*
2240 * Adapt arc info given the number of bytes we are trying to add and
2241 * the state that we are comming from.  This function is only called
2242 * when we are adding new content to the cache.
2243 */
2244static void
2245arc_adapt(int bytes, arc_state_t *state)
2246{
2247	int mult;
2248
2249	if (state == arc_l2c_only)
2250		return;
2251
2252	ASSERT(bytes > 0);
2253	/*
2254	 * Adapt the target size of the MRU list:
2255	 *	- if we just hit in the MRU ghost list, then increase
2256	 *	  the target size of the MRU list.
2257	 *	- if we just hit in the MFU ghost list, then increase
2258	 *	  the target size of the MFU list by decreasing the
2259	 *	  target size of the MRU list.
2260	 */
2261	if (state == arc_mru_ghost) {
2262		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2263		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2264
2265		arc_p = MIN(arc_c, arc_p + bytes * mult);
2266	} else if (state == arc_mfu_ghost) {
2267		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2268		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2269
2270		arc_p = MAX(0, (int64_t)arc_p - bytes * mult);
2271	}
2272	ASSERT((int64_t)arc_p >= 0);
2273
2274	if (arc_reclaim_needed()) {
2275		cv_signal(&arc_reclaim_thr_cv);
2276		return;
2277	}
2278
2279	if (arc_no_grow)
2280		return;
2281
2282	if (arc_c >= arc_c_max)
2283		return;
2284
2285	/*
2286	 * If we're within (2 * maxblocksize) bytes of the target
2287	 * cache size, increment the target cache size
2288	 */
2289	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2290		atomic_add_64(&arc_c, (int64_t)bytes);
2291		if (arc_c > arc_c_max)
2292			arc_c = arc_c_max;
2293		else if (state == arc_anon)
2294			atomic_add_64(&arc_p, (int64_t)bytes);
2295		if (arc_p > arc_c)
2296			arc_p = arc_c;
2297	}
2298	ASSERT((int64_t)arc_p >= 0);
2299}
2300
2301/*
2302 * Check if the cache has reached its limits and eviction is required
2303 * prior to insert.
2304 */
2305static int
2306arc_evict_needed(arc_buf_contents_t type)
2307{
2308	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2309		return (1);
2310
2311#if 0
2312#ifdef _KERNEL
2313	/*
2314	 * If zio data pages are being allocated out of a separate heap segment,
2315	 * then enforce that the size of available vmem for this area remains
2316	 * above about 1/32nd free.
2317	 */
2318	if (type == ARC_BUFC_DATA && zio_arena != NULL &&
2319	    vmem_size(zio_arena, VMEM_FREE) <
2320	    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
2321		return (1);
2322#endif
2323#endif
2324
2325	if (arc_reclaim_needed())
2326		return (1);
2327
2328	return (arc_size > arc_c);
2329}
2330
2331/*
2332 * The buffer, supplied as the first argument, needs a data block.
2333 * So, if we are at cache max, determine which cache should be victimized.
2334 * We have the following cases:
2335 *
2336 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2337 * In this situation if we're out of space, but the resident size of the MFU is
2338 * under the limit, victimize the MFU cache to satisfy this insertion request.
2339 *
2340 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2341 * Here, we've used up all of the available space for the MRU, so we need to
2342 * evict from our own cache instead.  Evict from the set of resident MRU
2343 * entries.
2344 *
2345 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2346 * c minus p represents the MFU space in the cache, since p is the size of the
2347 * cache that is dedicated to the MRU.  In this situation there's still space on
2348 * the MFU side, so the MRU side needs to be victimized.
2349 *
2350 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2351 * MFU's resident set is consuming more space than it has been allotted.  In
2352 * this situation, we must victimize our own cache, the MFU, for this insertion.
2353 */
2354static void
2355arc_get_data_buf(arc_buf_t *buf)
2356{
2357	arc_state_t		*state = buf->b_hdr->b_state;
2358	uint64_t		size = buf->b_hdr->b_size;
2359	arc_buf_contents_t	type = buf->b_hdr->b_type;
2360
2361	arc_adapt(size, state);
2362
2363	/*
2364	 * We have not yet reached cache maximum size,
2365	 * just allocate a new buffer.
2366	 */
2367	if (!arc_evict_needed(type)) {
2368		if (type == ARC_BUFC_METADATA) {
2369			buf->b_data = zio_buf_alloc(size);
2370			arc_space_consume(size);
2371		} else {
2372			ASSERT(type == ARC_BUFC_DATA);
2373			buf->b_data = zio_data_buf_alloc(size);
2374			atomic_add_64(&arc_size, size);
2375		}
2376		goto out;
2377	}
2378
2379	/*
2380	 * If we are prefetching from the mfu ghost list, this buffer
2381	 * will end up on the mru list; so steal space from there.
2382	 */
2383	if (state == arc_mfu_ghost)
2384		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2385	else if (state == arc_mru_ghost)
2386		state = arc_mru;
2387
2388	if (state == arc_mru || state == arc_anon) {
2389		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2390		state = (arc_mfu->arcs_lsize[type] > 0 &&
2391		    arc_p > mru_used) ? arc_mfu : arc_mru;
2392	} else {
2393		/* MFU cases */
2394		uint64_t mfu_space = arc_c - arc_p;
2395		state =  (arc_mru->arcs_lsize[type] > 0 &&
2396		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2397	}
2398	if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2399		if (type == ARC_BUFC_METADATA) {
2400			buf->b_data = zio_buf_alloc(size);
2401			arc_space_consume(size);
2402		} else {
2403			ASSERT(type == ARC_BUFC_DATA);
2404			buf->b_data = zio_data_buf_alloc(size);
2405			atomic_add_64(&arc_size, size);
2406		}
2407		ARCSTAT_BUMP(arcstat_recycle_miss);
2408	}
2409	ASSERT(buf->b_data != NULL);
2410out:
2411	/*
2412	 * Update the state size.  Note that ghost states have a
2413	 * "ghost size" and so don't need to be updated.
2414	 */
2415	if (!GHOST_STATE(buf->b_hdr->b_state)) {
2416		arc_buf_hdr_t *hdr = buf->b_hdr;
2417
2418		atomic_add_64(&hdr->b_state->arcs_size, size);
2419		if (list_link_active(&hdr->b_arc_node)) {
2420			ASSERT(refcount_is_zero(&hdr->b_refcnt));
2421			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2422		}
2423		/*
2424		 * If we are growing the cache, and we are adding anonymous
2425		 * data, and we have outgrown arc_p, update arc_p
2426		 */
2427		if (arc_size < arc_c && hdr->b_state == arc_anon &&
2428		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2429			arc_p = MIN(arc_c, arc_p + size);
2430	}
2431	ARCSTAT_BUMP(arcstat_allocated);
2432}
2433
2434/*
2435 * This routine is called whenever a buffer is accessed.
2436 * NOTE: the hash lock is dropped in this function.
2437 */
2438static void
2439arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2440{
2441	ASSERT(MUTEX_HELD(hash_lock));
2442
2443	if (buf->b_state == arc_anon) {
2444		/*
2445		 * This buffer is not in the cache, and does not
2446		 * appear in our "ghost" list.  Add the new buffer
2447		 * to the MRU state.
2448		 */
2449
2450		ASSERT(buf->b_arc_access == 0);
2451		buf->b_arc_access = LBOLT;
2452		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2453		arc_change_state(arc_mru, buf, hash_lock);
2454
2455	} else if (buf->b_state == arc_mru) {
2456		/*
2457		 * If this buffer is here because of a prefetch, then either:
2458		 * - clear the flag if this is a "referencing" read
2459		 *   (any subsequent access will bump this into the MFU state).
2460		 * or
2461		 * - move the buffer to the head of the list if this is
2462		 *   another prefetch (to make it less likely to be evicted).
2463		 */
2464		if ((buf->b_flags & ARC_PREFETCH) != 0) {
2465			if (refcount_count(&buf->b_refcnt) == 0) {
2466				ASSERT(list_link_active(&buf->b_arc_node));
2467			} else {
2468				buf->b_flags &= ~ARC_PREFETCH;
2469				ARCSTAT_BUMP(arcstat_mru_hits);
2470			}
2471			buf->b_arc_access = LBOLT;
2472			return;
2473		}
2474
2475		/*
2476		 * This buffer has been "accessed" only once so far,
2477		 * but it is still in the cache. Move it to the MFU
2478		 * state.
2479		 */
2480		if (LBOLT > buf->b_arc_access + ARC_MINTIME) {
2481			/*
2482			 * More than 125ms have passed since we
2483			 * instantiated this buffer.  Move it to the
2484			 * most frequently used state.
2485			 */
2486			buf->b_arc_access = LBOLT;
2487			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2488			arc_change_state(arc_mfu, buf, hash_lock);
2489		}
2490		ARCSTAT_BUMP(arcstat_mru_hits);
2491	} else if (buf->b_state == arc_mru_ghost) {
2492		arc_state_t	*new_state;
2493		/*
2494		 * This buffer has been "accessed" recently, but
2495		 * was evicted from the cache.  Move it to the
2496		 * MFU state.
2497		 */
2498
2499		if (buf->b_flags & ARC_PREFETCH) {
2500			new_state = arc_mru;
2501			if (refcount_count(&buf->b_refcnt) > 0)
2502				buf->b_flags &= ~ARC_PREFETCH;
2503			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2504		} else {
2505			new_state = arc_mfu;
2506			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2507		}
2508
2509		buf->b_arc_access = LBOLT;
2510		arc_change_state(new_state, buf, hash_lock);
2511
2512		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2513	} else if (buf->b_state == arc_mfu) {
2514		/*
2515		 * This buffer has been accessed more than once and is
2516		 * still in the cache.  Keep it in the MFU state.
2517		 *
2518		 * NOTE: an add_reference() that occurred when we did
2519		 * the arc_read() will have kicked this off the list.
2520		 * If it was a prefetch, we will explicitly move it to
2521		 * the head of the list now.
2522		 */
2523		if ((buf->b_flags & ARC_PREFETCH) != 0) {
2524			ASSERT(refcount_count(&buf->b_refcnt) == 0);
2525			ASSERT(list_link_active(&buf->b_arc_node));
2526		}
2527		ARCSTAT_BUMP(arcstat_mfu_hits);
2528		buf->b_arc_access = LBOLT;
2529	} else if (buf->b_state == arc_mfu_ghost) {
2530		arc_state_t	*new_state = arc_mfu;
2531		/*
2532		 * This buffer has been accessed more than once but has
2533		 * been evicted from the cache.  Move it back to the
2534		 * MFU state.
2535		 */
2536
2537		if (buf->b_flags & ARC_PREFETCH) {
2538			/*
2539			 * This is a prefetch access...
2540			 * move this block back to the MRU state.
2541			 */
2542			ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
2543			new_state = arc_mru;
2544		}
2545
2546		buf->b_arc_access = LBOLT;
2547		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2548		arc_change_state(new_state, buf, hash_lock);
2549
2550		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2551	} else if (buf->b_state == arc_l2c_only) {
2552		/*
2553		 * This buffer is on the 2nd Level ARC.
2554		 */
2555
2556		buf->b_arc_access = LBOLT;
2557		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2558		arc_change_state(arc_mfu, buf, hash_lock);
2559	} else {
2560		ASSERT(!"invalid arc state");
2561	}
2562}
2563
2564/* a generic arc_done_func_t which you can use */
2565/* ARGSUSED */
2566void
2567arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2568{
2569	bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2570	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2571}
2572
2573/* a generic arc_done_func_t */
2574void
2575arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2576{
2577	arc_buf_t **bufp = arg;
2578	if (zio && zio->io_error) {
2579		VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2580		*bufp = NULL;
2581	} else {
2582		*bufp = buf;
2583	}
2584}
2585
2586static void
2587arc_read_done(zio_t *zio)
2588{
2589	arc_buf_hdr_t	*hdr, *found;
2590	arc_buf_t	*buf;
2591	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
2592	kmutex_t	*hash_lock;
2593	arc_callback_t	*callback_list, *acb;
2594	int		freeable = FALSE;
2595
2596	buf = zio->io_private;
2597	hdr = buf->b_hdr;
2598
2599	/*
2600	 * The hdr was inserted into hash-table and removed from lists
2601	 * prior to starting I/O.  We should find this header, since
2602	 * it's in the hash table, and it should be legit since it's
2603	 * not possible to evict it during the I/O.  The only possible
2604	 * reason for it not to be found is if we were freed during the
2605	 * read.
2606	 */
2607	found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
2608	    &hash_lock);
2609
2610	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2611	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2612	    (found == hdr && HDR_L2_READING(hdr)));
2613
2614	hdr->b_flags &= ~ARC_L2_EVICTED;
2615	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2616		hdr->b_flags &= ~ARC_L2CACHE;
2617#if 0
2618	else if ((hdr->b_flags & ARC_PREFETCH) == 0)
2619		hdr->b_flags |= ARC_L2CACHE;
2620#endif
2621	/* byteswap if necessary */
2622	callback_list = hdr->b_acb;
2623	ASSERT(callback_list != NULL);
2624	if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
2625		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2626		    byteswap_uint64_array :
2627		    dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
2628		func(buf->b_data, hdr->b_size);
2629	}
2630
2631	arc_cksum_compute(buf, B_FALSE);
2632
2633	/* create copies of the data buffer for the callers */
2634	abuf = buf;
2635	for (acb = callback_list; acb; acb = acb->acb_next) {
2636		if (acb->acb_done) {
2637			if (abuf == NULL)
2638				abuf = arc_buf_clone(buf);
2639			acb->acb_buf = abuf;
2640			abuf = NULL;
2641		}
2642	}
2643	hdr->b_acb = NULL;
2644	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2645	ASSERT(!HDR_BUF_AVAILABLE(hdr));
2646	if (abuf == buf)
2647		hdr->b_flags |= ARC_BUF_AVAILABLE;
2648
2649	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2650
2651	if (zio->io_error != 0) {
2652		hdr->b_flags |= ARC_IO_ERROR;
2653		if (hdr->b_state != arc_anon)
2654			arc_change_state(arc_anon, hdr, hash_lock);
2655		if (HDR_IN_HASH_TABLE(hdr))
2656			buf_hash_remove(hdr);
2657		freeable = refcount_is_zero(&hdr->b_refcnt);
2658	}
2659
2660	/*
2661	 * Broadcast before we drop the hash_lock to avoid the possibility
2662	 * that the hdr (and hence the cv) might be freed before we get to
2663	 * the cv_broadcast().
2664	 */
2665	cv_broadcast(&hdr->b_cv);
2666
2667	if (hash_lock) {
2668		/*
2669		 * Only call arc_access on anonymous buffers.  This is because
2670		 * if we've issued an I/O for an evicted buffer, we've already
2671		 * called arc_access (to prevent any simultaneous readers from
2672		 * getting confused).
2673		 */
2674		if (zio->io_error == 0 && hdr->b_state == arc_anon)
2675			arc_access(hdr, hash_lock);
2676		mutex_exit(hash_lock);
2677	} else {
2678		/*
2679		 * This block was freed while we waited for the read to
2680		 * complete.  It has been removed from the hash table and
2681		 * moved to the anonymous state (so that it won't show up
2682		 * in the cache).
2683		 */
2684		ASSERT3P(hdr->b_state, ==, arc_anon);
2685		freeable = refcount_is_zero(&hdr->b_refcnt);
2686	}
2687
2688	/* execute each callback and free its structure */
2689	while ((acb = callback_list) != NULL) {
2690		if (acb->acb_done)
2691			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2692
2693		if (acb->acb_zio_dummy != NULL) {
2694			acb->acb_zio_dummy->io_error = zio->io_error;
2695			zio_nowait(acb->acb_zio_dummy);
2696		}
2697
2698		callback_list = acb->acb_next;
2699		kmem_free(acb, sizeof (arc_callback_t));
2700	}
2701
2702	if (freeable)
2703		arc_hdr_destroy(hdr);
2704}
2705
2706/*
2707 * "Read" the block block at the specified DVA (in bp) via the
2708 * cache.  If the block is found in the cache, invoke the provided
2709 * callback immediately and return.  Note that the `zio' parameter
2710 * in the callback will be NULL in this case, since no IO was
2711 * required.  If the block is not in the cache pass the read request
2712 * on to the spa with a substitute callback function, so that the
2713 * requested block will be added to the cache.
2714 *
2715 * If a read request arrives for a block that has a read in-progress,
2716 * either wait for the in-progress read to complete (and return the
2717 * results); or, if this is a read with a "done" func, add a record
2718 * to the read to invoke the "done" func when the read completes,
2719 * and return; or just return.
2720 *
2721 * arc_read_done() will invoke all the requested "done" functions
2722 * for readers of this block.
2723 *
2724 * Normal callers should use arc_read and pass the arc buffer and offset
2725 * for the bp.  But if you know you don't need locking, you can use
2726 * arc_read_bp.
2727 */
2728int
2729arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
2730    arc_done_func_t *done, void *private, int priority, int zio_flags,
2731    uint32_t *arc_flags, const zbookmark_t *zb)
2732{
2733	int err;
2734	arc_buf_hdr_t *hdr = pbuf->b_hdr;
2735
2736	ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
2737	ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
2738	rw_enter(&pbuf->b_lock, RW_READER);
2739
2740	err = arc_read_nolock(pio, spa, bp, done, private, priority,
2741	    zio_flags, arc_flags, zb);
2742
2743	ASSERT3P(hdr, ==, pbuf->b_hdr);
2744	rw_exit(&pbuf->b_lock);
2745	return (err);
2746}
2747
2748int
2749arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
2750    arc_done_func_t *done, void *private, int priority, int zio_flags,
2751    uint32_t *arc_flags, const zbookmark_t *zb)
2752{
2753	arc_buf_hdr_t *hdr;
2754	arc_buf_t *buf;
2755	kmutex_t *hash_lock;
2756	zio_t *rzio;
2757
2758top:
2759	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
2760	if (hdr && hdr->b_datacnt > 0) {
2761
2762		*arc_flags |= ARC_CACHED;
2763
2764		if (HDR_IO_IN_PROGRESS(hdr)) {
2765
2766			if (*arc_flags & ARC_WAIT) {
2767				cv_wait(&hdr->b_cv, hash_lock);
2768				mutex_exit(hash_lock);
2769				goto top;
2770			}
2771			ASSERT(*arc_flags & ARC_NOWAIT);
2772
2773			if (done) {
2774				arc_callback_t	*acb = NULL;
2775
2776				acb = kmem_zalloc(sizeof (arc_callback_t),
2777				    KM_SLEEP);
2778				acb->acb_done = done;
2779				acb->acb_private = private;
2780				if (pio != NULL)
2781					acb->acb_zio_dummy = zio_null(pio,
2782					    spa, NULL, NULL, zio_flags);
2783
2784				ASSERT(acb->acb_done != NULL);
2785				acb->acb_next = hdr->b_acb;
2786				hdr->b_acb = acb;
2787				add_reference(hdr, hash_lock, private);
2788				mutex_exit(hash_lock);
2789				return (0);
2790			}
2791			mutex_exit(hash_lock);
2792			return (0);
2793		}
2794
2795		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2796
2797		if (done) {
2798			add_reference(hdr, hash_lock, private);
2799			/*
2800			 * If this block is already in use, create a new
2801			 * copy of the data so that we will be guaranteed
2802			 * that arc_release() will always succeed.
2803			 */
2804			buf = hdr->b_buf;
2805			ASSERT(buf);
2806			ASSERT(buf->b_data);
2807			if (HDR_BUF_AVAILABLE(hdr)) {
2808				ASSERT(buf->b_efunc == NULL);
2809				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2810			} else {
2811				buf = arc_buf_clone(buf);
2812			}
2813		} else if (*arc_flags & ARC_PREFETCH &&
2814		    refcount_count(&hdr->b_refcnt) == 0) {
2815			hdr->b_flags |= ARC_PREFETCH;
2816		}
2817		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2818		arc_access(hdr, hash_lock);
2819		if (*arc_flags & ARC_L2CACHE)
2820			hdr->b_flags |= ARC_L2CACHE;
2821		mutex_exit(hash_lock);
2822		ARCSTAT_BUMP(arcstat_hits);
2823		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2824		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2825		    data, metadata, hits);
2826
2827		if (done)
2828			done(NULL, buf, private);
2829	} else {
2830		uint64_t size = BP_GET_LSIZE(bp);
2831		arc_callback_t	*acb;
2832		vdev_t *vd = NULL;
2833		daddr_t addr;
2834
2835		if (hdr == NULL) {
2836			/* this block is not in the cache */
2837			arc_buf_hdr_t	*exists;
2838			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2839			buf = arc_buf_alloc(spa, size, private, type);
2840			hdr = buf->b_hdr;
2841			hdr->b_dva = *BP_IDENTITY(bp);
2842			hdr->b_birth = bp->blk_birth;
2843			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2844			exists = buf_hash_insert(hdr, &hash_lock);
2845			if (exists) {
2846				/* somebody beat us to the hash insert */
2847				mutex_exit(hash_lock);
2848				bzero(&hdr->b_dva, sizeof (dva_t));
2849				hdr->b_birth = 0;
2850				hdr->b_cksum0 = 0;
2851				(void) arc_buf_remove_ref(buf, private);
2852				goto top; /* restart the IO request */
2853			}
2854			/* if this is a prefetch, we don't have a reference */
2855			if (*arc_flags & ARC_PREFETCH) {
2856				(void) remove_reference(hdr, hash_lock,
2857				    private);
2858				hdr->b_flags |= ARC_PREFETCH;
2859			}
2860			if (*arc_flags & ARC_L2CACHE)
2861				hdr->b_flags |= ARC_L2CACHE;
2862			if (BP_GET_LEVEL(bp) > 0)
2863				hdr->b_flags |= ARC_INDIRECT;
2864		} else {
2865			/* this block is in the ghost cache */
2866			ASSERT(GHOST_STATE(hdr->b_state));
2867			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2868			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
2869			ASSERT(hdr->b_buf == NULL);
2870
2871			/* if this is a prefetch, we don't have a reference */
2872			if (*arc_flags & ARC_PREFETCH)
2873				hdr->b_flags |= ARC_PREFETCH;
2874			else
2875				add_reference(hdr, hash_lock, private);
2876			if (*arc_flags & ARC_L2CACHE)
2877				hdr->b_flags |= ARC_L2CACHE;
2878			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2879			buf->b_hdr = hdr;
2880			buf->b_data = NULL;
2881			buf->b_efunc = NULL;
2882			buf->b_private = NULL;
2883			buf->b_next = NULL;
2884			hdr->b_buf = buf;
2885			arc_get_data_buf(buf);
2886			ASSERT(hdr->b_datacnt == 0);
2887			hdr->b_datacnt = 1;
2888
2889		}
2890
2891		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
2892		acb->acb_done = done;
2893		acb->acb_private = private;
2894
2895		ASSERT(hdr->b_acb == NULL);
2896		hdr->b_acb = acb;
2897		hdr->b_flags |= ARC_IO_IN_PROGRESS;
2898
2899		/*
2900		 * If the buffer has been evicted, migrate it to a present state
2901		 * before issuing the I/O.  Once we drop the hash-table lock,
2902		 * the header will be marked as I/O in progress and have an
2903		 * attached buffer.  At this point, anybody who finds this
2904		 * buffer ought to notice that it's legit but has a pending I/O.
2905		 */
2906
2907		if (GHOST_STATE(hdr->b_state))
2908			arc_access(hdr, hash_lock);
2909
2910		if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
2911		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
2912			addr = hdr->b_l2hdr->b_daddr;
2913			/*
2914			 * Lock out device removal.
2915			 */
2916			if (vdev_is_dead(vd) ||
2917			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
2918				vd = NULL;
2919		}
2920
2921		mutex_exit(hash_lock);
2922
2923		ASSERT3U(hdr->b_size, ==, size);
2924		DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
2925		    zbookmark_t *, zb);
2926		ARCSTAT_BUMP(arcstat_misses);
2927		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2928		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2929		    data, metadata, misses);
2930
2931		if (vd != NULL) {
2932			/*
2933			 * Read from the L2ARC if the following are true:
2934			 * 1. The L2ARC vdev was previously cached.
2935			 * 2. This buffer still has L2ARC metadata.
2936			 * 3. This buffer isn't currently writing to the L2ARC.
2937			 * 4. The L2ARC entry wasn't evicted, which may
2938			 *    also have invalidated the vdev.
2939			 */
2940			if (hdr->b_l2hdr != NULL &&
2941			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) {
2942				l2arc_read_callback_t *cb;
2943
2944				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
2945				ARCSTAT_BUMP(arcstat_l2_hits);
2946
2947				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
2948				    KM_SLEEP);
2949				cb->l2rcb_buf = buf;
2950				cb->l2rcb_spa = spa;
2951				cb->l2rcb_bp = *bp;
2952				cb->l2rcb_zb = *zb;
2953				cb->l2rcb_flags = zio_flags;
2954
2955				/*
2956				 * l2arc read.  The SCL_L2ARC lock will be
2957				 * released by l2arc_read_done().
2958				 */
2959				rzio = zio_read_phys(pio, vd, addr, size,
2960				    buf->b_data, ZIO_CHECKSUM_OFF,
2961				    l2arc_read_done, cb, priority, zio_flags |
2962				    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
2963				    ZIO_FLAG_DONT_PROPAGATE |
2964				    ZIO_FLAG_DONT_RETRY, B_FALSE);
2965				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
2966				    zio_t *, rzio);
2967
2968				if (*arc_flags & ARC_NOWAIT) {
2969					zio_nowait(rzio);
2970					return (0);
2971				}
2972
2973				ASSERT(*arc_flags & ARC_WAIT);
2974				if (zio_wait(rzio) == 0)
2975					return (0);
2976
2977				/* l2arc read error; goto zio_read() */
2978			} else {
2979				DTRACE_PROBE1(l2arc__miss,
2980				    arc_buf_hdr_t *, hdr);
2981				ARCSTAT_BUMP(arcstat_l2_misses);
2982				if (HDR_L2_WRITING(hdr))
2983					ARCSTAT_BUMP(arcstat_l2_rw_clash);
2984				spa_config_exit(spa, SCL_L2ARC, vd);
2985			}
2986		}
2987
2988		rzio = zio_read(pio, spa, bp, buf->b_data, size,
2989		    arc_read_done, buf, priority, zio_flags, zb);
2990
2991		if (*arc_flags & ARC_WAIT)
2992			return (zio_wait(rzio));
2993
2994		ASSERT(*arc_flags & ARC_NOWAIT);
2995		zio_nowait(rzio);
2996	}
2997	return (0);
2998}
2999
3000/*
3001 * arc_read() variant to support pool traversal.  If the block is already
3002 * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
3003 * The idea is that we don't want pool traversal filling up memory, but
3004 * if the ARC already has the data anyway, we shouldn't pay for the I/O.
3005 */
3006int
3007arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
3008{
3009	arc_buf_hdr_t *hdr;
3010	kmutex_t *hash_mtx;
3011	int rc = 0;
3012
3013	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
3014
3015	if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
3016		arc_buf_t *buf = hdr->b_buf;
3017
3018		ASSERT(buf);
3019		while (buf->b_data == NULL) {
3020			buf = buf->b_next;
3021			ASSERT(buf);
3022		}
3023		bcopy(buf->b_data, data, hdr->b_size);
3024	} else {
3025		rc = ENOENT;
3026	}
3027
3028	if (hash_mtx)
3029		mutex_exit(hash_mtx);
3030
3031	return (rc);
3032}
3033
3034void
3035arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3036{
3037	ASSERT(buf->b_hdr != NULL);
3038	ASSERT(buf->b_hdr->b_state != arc_anon);
3039	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3040	buf->b_efunc = func;
3041	buf->b_private = private;
3042}
3043
3044/*
3045 * This is used by the DMU to let the ARC know that a buffer is
3046 * being evicted, so the ARC should clean up.  If this arc buf
3047 * is not yet in the evicted state, it will be put there.
3048 */
3049int
3050arc_buf_evict(arc_buf_t *buf)
3051{
3052	arc_buf_hdr_t *hdr;
3053	kmutex_t *hash_lock;
3054	arc_buf_t **bufp;
3055	list_t *list, *evicted_list;
3056	kmutex_t *lock, *evicted_lock;
3057
3058	rw_enter(&buf->b_lock, RW_WRITER);
3059	hdr = buf->b_hdr;
3060	if (hdr == NULL) {
3061		/*
3062		 * We are in arc_do_user_evicts().
3063		 */
3064		ASSERT(buf->b_data == NULL);
3065		rw_exit(&buf->b_lock);
3066		return (0);
3067	} else if (buf->b_data == NULL) {
3068		arc_buf_t copy = *buf; /* structure assignment */
3069		/*
3070		 * We are on the eviction list; process this buffer now
3071		 * but let arc_do_user_evicts() do the reaping.
3072		 */
3073		buf->b_efunc = NULL;
3074		rw_exit(&buf->b_lock);
3075		VERIFY(copy.b_efunc(&copy) == 0);
3076		return (1);
3077	}
3078	hash_lock = HDR_LOCK(hdr);
3079	mutex_enter(hash_lock);
3080
3081	ASSERT(buf->b_hdr == hdr);
3082	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3083	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3084
3085	/*
3086	 * Pull this buffer off of the hdr
3087	 */
3088	bufp = &hdr->b_buf;
3089	while (*bufp != buf)
3090		bufp = &(*bufp)->b_next;
3091	*bufp = buf->b_next;
3092
3093	ASSERT(buf->b_data != NULL);
3094	arc_buf_destroy(buf, FALSE, FALSE);
3095
3096	if (hdr->b_datacnt == 0) {
3097		arc_state_t *old_state = hdr->b_state;
3098		arc_state_t *evicted_state;
3099
3100		ASSERT(refcount_is_zero(&hdr->b_refcnt));
3101
3102		evicted_state =
3103		    (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3104
3105		get_buf_info(hdr, old_state, &list, &lock);
3106		get_buf_info(hdr, evicted_state, &evicted_list, &evicted_lock);
3107		mutex_enter(lock);
3108		mutex_enter(evicted_lock);
3109
3110		arc_change_state(evicted_state, hdr, hash_lock);
3111		ASSERT(HDR_IN_HASH_TABLE(hdr));
3112		hdr->b_flags |= ARC_IN_HASH_TABLE;
3113		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3114
3115		mutex_exit(evicted_lock);
3116		mutex_exit(lock);
3117	}
3118	mutex_exit(hash_lock);
3119	rw_exit(&buf->b_lock);
3120
3121	VERIFY(buf->b_efunc(buf) == 0);
3122	buf->b_efunc = NULL;
3123	buf->b_private = NULL;
3124	buf->b_hdr = NULL;
3125	kmem_cache_free(buf_cache, buf);
3126	return (1);
3127}
3128
3129/*
3130 * Release this buffer from the cache.  This must be done
3131 * after a read and prior to modifying the buffer contents.
3132 * If the buffer has more than one reference, we must make
3133 * a new hdr for the buffer.
3134 */
3135void
3136arc_release(arc_buf_t *buf, void *tag)
3137{
3138	arc_buf_hdr_t *hdr;
3139	kmutex_t *hash_lock;
3140	l2arc_buf_hdr_t *l2hdr;
3141	uint64_t buf_size;
3142
3143	rw_enter(&buf->b_lock, RW_WRITER);
3144	hdr = buf->b_hdr;
3145
3146	/* this buffer is not on any list */
3147	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3148	ASSERT(!(hdr->b_flags & ARC_STORED));
3149
3150	if (hdr->b_state == arc_anon) {
3151		/* this buffer is already released */
3152		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
3153		ASSERT(BUF_EMPTY(hdr));
3154		ASSERT(buf->b_efunc == NULL);
3155		arc_buf_thaw(buf);
3156		rw_exit(&buf->b_lock);
3157		return;
3158	}
3159
3160	hash_lock = HDR_LOCK(hdr);
3161	mutex_enter(hash_lock);
3162
3163	l2hdr = hdr->b_l2hdr;
3164	if (l2hdr) {
3165		mutex_enter(&l2arc_buflist_mtx);
3166		hdr->b_l2hdr = NULL;
3167		buf_size = hdr->b_size;
3168	}
3169
3170	/*
3171	 * Do we have more than one buf?
3172	 */
3173	if (hdr->b_datacnt > 1) {
3174		arc_buf_hdr_t *nhdr;
3175		arc_buf_t **bufp;
3176		uint64_t blksz = hdr->b_size;
3177		spa_t *spa = hdr->b_spa;
3178		arc_buf_contents_t type = hdr->b_type;
3179		uint32_t flags = hdr->b_flags;
3180
3181		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3182		/*
3183		 * Pull the data off of this buf and attach it to
3184		 * a new anonymous buf.
3185		 */
3186		(void) remove_reference(hdr, hash_lock, tag);
3187		bufp = &hdr->b_buf;
3188		while (*bufp != buf)
3189			bufp = &(*bufp)->b_next;
3190		*bufp = (*bufp)->b_next;
3191		buf->b_next = NULL;
3192
3193		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3194		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3195		if (refcount_is_zero(&hdr->b_refcnt)) {
3196			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3197			ASSERT3U(*size, >=, hdr->b_size);
3198			atomic_add_64(size, -hdr->b_size);
3199		}
3200		hdr->b_datacnt -= 1;
3201		arc_cksum_verify(buf);
3202
3203		mutex_exit(hash_lock);
3204
3205		nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3206		nhdr->b_size = blksz;
3207		nhdr->b_spa = spa;
3208		nhdr->b_type = type;
3209		nhdr->b_buf = buf;
3210		nhdr->b_state = arc_anon;
3211		nhdr->b_arc_access = 0;
3212		nhdr->b_flags = flags & ARC_L2_WRITING;
3213		nhdr->b_l2hdr = NULL;
3214		nhdr->b_datacnt = 1;
3215		nhdr->b_freeze_cksum = NULL;
3216		(void) refcount_add(&nhdr->b_refcnt, tag);
3217		buf->b_hdr = nhdr;
3218		rw_exit(&buf->b_lock);
3219		atomic_add_64(&arc_anon->arcs_size, blksz);
3220	} else {
3221		rw_exit(&buf->b_lock);
3222		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3223		ASSERT(!list_link_active(&hdr->b_arc_node));
3224		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3225		arc_change_state(arc_anon, hdr, hash_lock);
3226		hdr->b_arc_access = 0;
3227		mutex_exit(hash_lock);
3228
3229		bzero(&hdr->b_dva, sizeof (dva_t));
3230		hdr->b_birth = 0;
3231		hdr->b_cksum0 = 0;
3232		arc_buf_thaw(buf);
3233	}
3234	buf->b_efunc = NULL;
3235	buf->b_private = NULL;
3236
3237	if (l2hdr) {
3238		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3239		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3240		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3241		mutex_exit(&l2arc_buflist_mtx);
3242	}
3243}
3244
3245int
3246arc_released(arc_buf_t *buf)
3247{
3248	int released;
3249
3250	rw_enter(&buf->b_lock, RW_READER);
3251	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3252	rw_exit(&buf->b_lock);
3253	return (released);
3254}
3255
3256int
3257arc_has_callback(arc_buf_t *buf)
3258{
3259	int callback;
3260
3261	rw_enter(&buf->b_lock, RW_READER);
3262	callback = (buf->b_efunc != NULL);
3263	rw_exit(&buf->b_lock);
3264	return (callback);
3265}
3266
3267#ifdef ZFS_DEBUG
3268int
3269arc_referenced(arc_buf_t *buf)
3270{
3271	int referenced;
3272
3273	rw_enter(&buf->b_lock, RW_READER);
3274	referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3275	rw_exit(&buf->b_lock);
3276	return (referenced);
3277}
3278#endif
3279
3280static void
3281arc_write_ready(zio_t *zio)
3282{
3283	arc_write_callback_t *callback = zio->io_private;
3284	arc_buf_t *buf = callback->awcb_buf;
3285	arc_buf_hdr_t *hdr = buf->b_hdr;
3286
3287	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3288	callback->awcb_ready(zio, buf, callback->awcb_private);
3289
3290	/*
3291	 * If the IO is already in progress, then this is a re-write
3292	 * attempt, so we need to thaw and re-compute the cksum.
3293	 * It is the responsibility of the callback to handle the
3294	 * accounting for any re-write attempt.
3295	 */
3296	if (HDR_IO_IN_PROGRESS(hdr)) {
3297		mutex_enter(&hdr->b_freeze_lock);
3298		if (hdr->b_freeze_cksum != NULL) {
3299			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3300			hdr->b_freeze_cksum = NULL;
3301		}
3302		mutex_exit(&hdr->b_freeze_lock);
3303	}
3304	arc_cksum_compute(buf, B_FALSE);
3305	hdr->b_flags |= ARC_IO_IN_PROGRESS;
3306}
3307
3308static void
3309arc_write_done(zio_t *zio)
3310{
3311	arc_write_callback_t *callback = zio->io_private;
3312	arc_buf_t *buf = callback->awcb_buf;
3313	arc_buf_hdr_t *hdr = buf->b_hdr;
3314
3315	hdr->b_acb = NULL;
3316
3317	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3318	hdr->b_birth = zio->io_bp->blk_birth;
3319	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3320	/*
3321	 * If the block to be written was all-zero, we may have
3322	 * compressed it away.  In this case no write was performed
3323	 * so there will be no dva/birth-date/checksum.  The buffer
3324	 * must therefor remain anonymous (and uncached).
3325	 */
3326	if (!BUF_EMPTY(hdr)) {
3327		arc_buf_hdr_t *exists;
3328		kmutex_t *hash_lock;
3329
3330		arc_cksum_verify(buf);
3331
3332		exists = buf_hash_insert(hdr, &hash_lock);
3333		if (exists) {
3334			/*
3335			 * This can only happen if we overwrite for
3336			 * sync-to-convergence, because we remove
3337			 * buffers from the hash table when we arc_free().
3338			 */
3339			ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE);
3340			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
3341			    BP_IDENTITY(zio->io_bp)));
3342			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
3343			    zio->io_bp->blk_birth);
3344
3345			ASSERT(refcount_is_zero(&exists->b_refcnt));
3346			arc_change_state(arc_anon, exists, hash_lock);
3347			mutex_exit(hash_lock);
3348			arc_hdr_destroy(exists);
3349			exists = buf_hash_insert(hdr, &hash_lock);
3350			ASSERT3P(exists, ==, NULL);
3351		}
3352		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3353		/* if it's not anon, we are doing a scrub */
3354		if (hdr->b_state == arc_anon)
3355			arc_access(hdr, hash_lock);
3356		mutex_exit(hash_lock);
3357	} else if (callback->awcb_done == NULL) {
3358		int destroy_hdr;
3359		/*
3360		 * This is an anonymous buffer with no user callback,
3361		 * destroy it if there are no active references.
3362		 */
3363		mutex_enter(&arc_eviction_mtx);
3364		destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
3365		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3366		mutex_exit(&arc_eviction_mtx);
3367		if (destroy_hdr)
3368			arc_hdr_destroy(hdr);
3369	} else {
3370		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3371	}
3372	hdr->b_flags &= ~ARC_STORED;
3373
3374	if (callback->awcb_done) {
3375		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3376		callback->awcb_done(zio, buf, callback->awcb_private);
3377	}
3378
3379	kmem_free(callback, sizeof (arc_write_callback_t));
3380}
3381
3382static void
3383write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp)
3384{
3385	boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata);
3386
3387	/* Determine checksum setting */
3388	if (ismd) {
3389		/*
3390		 * Metadata always gets checksummed.  If the data
3391		 * checksum is multi-bit correctable, and it's not a
3392		 * ZBT-style checksum, then it's suitable for metadata
3393		 * as well.  Otherwise, the metadata checksum defaults
3394		 * to fletcher4.
3395		 */
3396		if (zio_checksum_table[wp->wp_oschecksum].ci_correctable &&
3397		    !zio_checksum_table[wp->wp_oschecksum].ci_zbt)
3398			zp->zp_checksum = wp->wp_oschecksum;
3399		else
3400			zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4;
3401	} else {
3402		zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum,
3403		    wp->wp_oschecksum);
3404	}
3405
3406	/* Determine compression setting */
3407	if (ismd) {
3408		/*
3409		 * XXX -- we should design a compression algorithm
3410		 * that specializes in arrays of bps.
3411		 */
3412		zp->zp_compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
3413		    ZIO_COMPRESS_LZJB;
3414	} else {
3415		zp->zp_compress = zio_compress_select(wp->wp_dncompress,
3416		    wp->wp_oscompress);
3417	}
3418
3419	zp->zp_type = wp->wp_type;
3420	zp->zp_level = wp->wp_level;
3421	zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa));
3422}
3423
3424zio_t *
3425arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
3426    boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
3427    arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
3428    int zio_flags, const zbookmark_t *zb)
3429{
3430	arc_buf_hdr_t *hdr = buf->b_hdr;
3431	arc_write_callback_t *callback;
3432	zio_t *zio;
3433	zio_prop_t zp;
3434
3435	ASSERT(ready != NULL);
3436	ASSERT(!HDR_IO_ERROR(hdr));
3437	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3438	ASSERT(hdr->b_acb == 0);
3439	if (l2arc)
3440		hdr->b_flags |= ARC_L2CACHE;
3441	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3442	callback->awcb_ready = ready;
3443	callback->awcb_done = done;
3444	callback->awcb_private = private;
3445	callback->awcb_buf = buf;
3446
3447	write_policy(spa, wp, &zp);
3448	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp,
3449	    arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3450
3451	return (zio);
3452}
3453
3454int
3455arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
3456    zio_done_func_t *done, void *private, uint32_t arc_flags)
3457{
3458	arc_buf_hdr_t *ab;
3459	kmutex_t *hash_lock;
3460	zio_t	*zio;
3461
3462	/*
3463	 * If this buffer is in the cache, release it, so it
3464	 * can be re-used.
3465	 */
3466	ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
3467	if (ab != NULL) {
3468		/*
3469		 * The checksum of blocks to free is not always
3470		 * preserved (eg. on the deadlist).  However, if it is
3471		 * nonzero, it should match what we have in the cache.
3472		 */
3473		ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
3474		    bp->blk_cksum.zc_word[0] == ab->b_cksum0 ||
3475		    bp->blk_fill == BLK_FILL_ALREADY_FREED);
3476
3477		if (ab->b_state != arc_anon)
3478			arc_change_state(arc_anon, ab, hash_lock);
3479		if (HDR_IO_IN_PROGRESS(ab)) {
3480			/*
3481			 * This should only happen when we prefetch.
3482			 */
3483			ASSERT(ab->b_flags & ARC_PREFETCH);
3484			ASSERT3U(ab->b_datacnt, ==, 1);
3485			ab->b_flags |= ARC_FREED_IN_READ;
3486			if (HDR_IN_HASH_TABLE(ab))
3487				buf_hash_remove(ab);
3488			ab->b_arc_access = 0;
3489			bzero(&ab->b_dva, sizeof (dva_t));
3490			ab->b_birth = 0;
3491			ab->b_cksum0 = 0;
3492			ab->b_buf->b_efunc = NULL;
3493			ab->b_buf->b_private = NULL;
3494			mutex_exit(hash_lock);
3495		} else if (refcount_is_zero(&ab->b_refcnt)) {
3496			ab->b_flags |= ARC_FREE_IN_PROGRESS;
3497			mutex_exit(hash_lock);
3498			arc_hdr_destroy(ab);
3499			ARCSTAT_BUMP(arcstat_deleted);
3500		} else {
3501			/*
3502			 * We still have an active reference on this
3503			 * buffer.  This can happen, e.g., from
3504			 * dbuf_unoverride().
3505			 */
3506			ASSERT(!HDR_IN_HASH_TABLE(ab));
3507			ab->b_arc_access = 0;
3508			bzero(&ab->b_dva, sizeof (dva_t));
3509			ab->b_birth = 0;
3510			ab->b_cksum0 = 0;
3511			ab->b_buf->b_efunc = NULL;
3512			ab->b_buf->b_private = NULL;
3513			mutex_exit(hash_lock);
3514		}
3515	}
3516
3517	zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED);
3518
3519	if (arc_flags & ARC_WAIT)
3520		return (zio_wait(zio));
3521
3522	ASSERT(arc_flags & ARC_NOWAIT);
3523	zio_nowait(zio);
3524
3525	return (0);
3526}
3527
3528static int
3529arc_memory_throttle(uint64_t reserve, uint64_t txg)
3530{
3531#ifdef _KERNEL
3532	uint64_t inflight_data = arc_anon->arcs_size;
3533	uint64_t available_memory = ptoa((uintmax_t)cnt.v_free_count);
3534	static uint64_t page_load = 0;
3535	static uint64_t last_txg = 0;
3536
3537#if 0
3538#if defined(__i386)
3539	available_memory =
3540	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3541#endif
3542#endif
3543	if (available_memory >= zfs_write_limit_max)
3544		return (0);
3545
3546	if (txg > last_txg) {
3547		last_txg = txg;
3548		page_load = 0;
3549	}
3550	/*
3551	 * If we are in pageout, we know that memory is already tight,
3552	 * the arc is already going to be evicting, so we just want to
3553	 * continue to let page writes occur as quickly as possible.
3554	 */
3555	if (curproc == pageproc) {
3556		if (page_load > available_memory / 4)
3557			return (ERESTART);
3558		/* Note: reserve is inflated, so we deflate */
3559		page_load += reserve / 8;
3560		return (0);
3561	} else if (page_load > 0 && arc_reclaim_needed()) {
3562		/* memory is low, delay before restarting */
3563		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3564		return (EAGAIN);
3565	}
3566	page_load = 0;
3567
3568	if (arc_size > arc_c_min) {
3569		uint64_t evictable_memory =
3570		    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3571		    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3572		    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3573		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3574		available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3575	}
3576
3577	if (inflight_data > available_memory / 4) {
3578		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3579		return (ERESTART);
3580	}
3581#endif
3582	return (0);
3583}
3584
3585void
3586arc_tempreserve_clear(uint64_t reserve)
3587{
3588	atomic_add_64(&arc_tempreserve, -reserve);
3589	ASSERT((int64_t)arc_tempreserve >= 0);
3590}
3591
3592int
3593arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3594{
3595	int error;
3596
3597#ifdef ZFS_DEBUG
3598	/*
3599	 * Once in a while, fail for no reason.  Everything should cope.
3600	 */
3601	if (spa_get_random(10000) == 0) {
3602		dprintf("forcing random failure\n");
3603		return (ERESTART);
3604	}
3605#endif
3606	if (reserve > arc_c/4 && !arc_no_grow)
3607		arc_c = MIN(arc_c_max, reserve * 4);
3608	if (reserve > arc_c)
3609		return (ENOMEM);
3610
3611	/*
3612	 * Writes will, almost always, require additional memory allocations
3613	 * in order to compress/encrypt/etc the data.  We therefor need to
3614	 * make sure that there is sufficient available memory for this.
3615	 */
3616	if (error = arc_memory_throttle(reserve, txg))
3617		return (error);
3618
3619	/*
3620	 * Throttle writes when the amount of dirty data in the cache
3621	 * gets too large.  We try to keep the cache less than half full
3622	 * of dirty blocks so that our sync times don't grow too large.
3623	 * Note: if two requests come in concurrently, we might let them
3624	 * both succeed, when one of them should fail.  Not a huge deal.
3625	 */
3626	if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
3627	    arc_anon->arcs_size > arc_c / 4) {
3628		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3629		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3630		    arc_tempreserve>>10,
3631		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3632		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3633		    reserve>>10, arc_c>>10);
3634		return (ERESTART);
3635	}
3636	atomic_add_64(&arc_tempreserve, reserve);
3637	return (0);
3638}
3639
3640static kmutex_t arc_lowmem_lock;
3641#ifdef _KERNEL
3642static eventhandler_tag arc_event_lowmem = NULL;
3643
3644static void
3645arc_lowmem(void *arg __unused, int howto __unused)
3646{
3647
3648	/* Serialize access via arc_lowmem_lock. */
3649	mutex_enter(&arc_lowmem_lock);
3650	needfree = 1;
3651	cv_signal(&arc_reclaim_thr_cv);
3652	while (needfree)
3653		tsleep(&needfree, 0, "zfs:lowmem", hz / 5);
3654	mutex_exit(&arc_lowmem_lock);
3655}
3656#endif
3657
3658void
3659arc_init(void)
3660{
3661	int prefetch_tunable_set = 0;
3662	int i;
3663
3664	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3665	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3666	mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
3667
3668	/* Convert seconds to clock ticks */
3669	arc_min_prefetch_lifespan = 1 * hz;
3670
3671	/* Start out with 1/8 of all memory */
3672	arc_c = kmem_size() / 8;
3673#if 0
3674#ifdef _KERNEL
3675	/*
3676	 * On architectures where the physical memory can be larger
3677	 * than the addressable space (intel in 32-bit mode), we may
3678	 * need to limit the cache to 1/8 of VM size.
3679	 */
3680	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3681#endif
3682#endif
3683	/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
3684	arc_c_min = MAX(arc_c / 4, 64<<18);
3685	/* set max to 1/2 of all memory, or all but 1GB, whichever is more */
3686	if (arc_c * 8 >= 1<<30)
3687		arc_c_max = (arc_c * 8) - (1<<30);
3688	else
3689		arc_c_max = arc_c_min;
3690	arc_c_max = MAX(arc_c * 5, arc_c_max);
3691#ifdef _KERNEL
3692	/*
3693	 * Allow the tunables to override our calculations if they are
3694	 * reasonable (ie. over 16MB)
3695	 */
3696	if (zfs_arc_max >= 64<<18 && zfs_arc_max < kmem_size())
3697		arc_c_max = zfs_arc_max;
3698	if (zfs_arc_min >= 64<<18 && zfs_arc_min <= arc_c_max)
3699		arc_c_min = zfs_arc_min;
3700#endif
3701	arc_c = arc_c_max;
3702	arc_p = (arc_c >> 1);
3703
3704	/* limit meta-data to 1/4 of the arc capacity */
3705	arc_meta_limit = arc_c_max / 4;
3706
3707	/* Allow the tunable to override if it is reasonable */
3708	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3709		arc_meta_limit = zfs_arc_meta_limit;
3710
3711	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3712		arc_c_min = arc_meta_limit / 2;
3713
3714	/* if kmem_flags are set, lets try to use less memory */
3715	if (kmem_debugging())
3716		arc_c = arc_c / 2;
3717	if (arc_c < arc_c_min)
3718		arc_c = arc_c_min;
3719
3720	zfs_arc_min = arc_c_min;
3721	zfs_arc_max = arc_c_max;
3722
3723	arc_anon = &ARC_anon;
3724	arc_mru = &ARC_mru;
3725	arc_mru_ghost = &ARC_mru_ghost;
3726	arc_mfu = &ARC_mfu;
3727	arc_mfu_ghost = &ARC_mfu_ghost;
3728	arc_l2c_only = &ARC_l2c_only;
3729	arc_size = 0;
3730
3731	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
3732
3733		mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
3734		    NULL, MUTEX_DEFAULT, NULL);
3735		mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
3736		    NULL, MUTEX_DEFAULT, NULL);
3737		mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
3738		    NULL, MUTEX_DEFAULT, NULL);
3739		mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
3740		    NULL, MUTEX_DEFAULT, NULL);
3741		mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
3742		    NULL, MUTEX_DEFAULT, NULL);
3743		mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
3744		    NULL, MUTEX_DEFAULT, NULL);
3745
3746		list_create(&arc_mru->arcs_lists[i],
3747		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3748		list_create(&arc_mru_ghost->arcs_lists[i],
3749		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3750		list_create(&arc_mfu->arcs_lists[i],
3751		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3752		list_create(&arc_mfu_ghost->arcs_lists[i],
3753		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3754		list_create(&arc_mfu_ghost->arcs_lists[i],
3755		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3756		list_create(&arc_l2c_only->arcs_lists[i],
3757		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3758	}
3759
3760	buf_init();
3761
3762	arc_thread_exit = 0;
3763	arc_eviction_list = NULL;
3764	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3765	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3766
3767	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3768	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3769
3770	if (arc_ksp != NULL) {
3771		arc_ksp->ks_data = &arc_stats;
3772		kstat_install(arc_ksp);
3773	}
3774
3775	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3776	    TS_RUN, minclsyspri);
3777
3778#ifdef _KERNEL
3779	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
3780	    EVENTHANDLER_PRI_FIRST);
3781#endif
3782
3783	arc_dead = FALSE;
3784	arc_warm = B_FALSE;
3785
3786	if (zfs_write_limit_max == 0)
3787		zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3788	else
3789		zfs_write_limit_shift = 0;
3790	mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3791
3792#ifdef _KERNEL
3793	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
3794		prefetch_tunable_set = 1;
3795
3796#ifdef __i386__
3797	if (prefetch_tunable_set == 0) {
3798		printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
3799		    "-- to enable,\n");
3800		printf("            add \"vfs.zfs.prefetch_disable=0\" "
3801		    "to /boot/loader.conf.\n");
3802		zfs_prefetch_disable=1;
3803	}
3804#else
3805	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
3806	    prefetch_tunable_set == 0) {
3807		printf("ZFS NOTICE: Prefetch is disabled by default if less "
3808		    "than 4GB of RAM is present;\n"
3809		    "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
3810		    "to /boot/loader.conf.\n");
3811		zfs_prefetch_disable=1;
3812	}
3813#endif
3814	/* Warn about ZFS memory and address space requirements. */
3815	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
3816		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
3817		    "expect unstable behavior.\n");
3818	}
3819	if (kmem_size() < 512 * (1 << 20)) {
3820		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
3821		    "expect unstable behavior.\n");
3822		printf("             Consider tuning vm.kmem_size and "
3823		    "vm.kmem_size_max\n");
3824		printf("             in /boot/loader.conf.\n");
3825	}
3826#endif
3827}
3828
3829void
3830arc_fini(void)
3831{
3832	int i;
3833
3834	mutex_enter(&arc_reclaim_thr_lock);
3835	arc_thread_exit = 1;
3836	cv_signal(&arc_reclaim_thr_cv);
3837	while (arc_thread_exit != 0)
3838		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3839	mutex_exit(&arc_reclaim_thr_lock);
3840
3841	arc_flush(NULL);
3842
3843	arc_dead = TRUE;
3844
3845	if (arc_ksp != NULL) {
3846		kstat_delete(arc_ksp);
3847		arc_ksp = NULL;
3848	}
3849
3850	mutex_destroy(&arc_eviction_mtx);
3851	mutex_destroy(&arc_reclaim_thr_lock);
3852	cv_destroy(&arc_reclaim_thr_cv);
3853
3854	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
3855		list_destroy(&arc_mru->arcs_lists[i]);
3856		list_destroy(&arc_mru_ghost->arcs_lists[i]);
3857		list_destroy(&arc_mfu->arcs_lists[i]);
3858		list_destroy(&arc_mfu_ghost->arcs_lists[i]);
3859
3860		mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
3861		mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
3862		mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
3863		mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
3864		mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
3865	}
3866
3867	mutex_destroy(&zfs_write_limit_lock);
3868
3869	buf_fini();
3870
3871	mutex_destroy(&arc_lowmem_lock);
3872#ifdef _KERNEL
3873	if (arc_event_lowmem != NULL)
3874		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
3875#endif
3876}
3877
3878/*
3879 * Level 2 ARC
3880 *
3881 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3882 * It uses dedicated storage devices to hold cached data, which are populated
3883 * using large infrequent writes.  The main role of this cache is to boost
3884 * the performance of random read workloads.  The intended L2ARC devices
3885 * include short-stroked disks, solid state disks, and other media with
3886 * substantially faster read latency than disk.
3887 *
3888 *                 +-----------------------+
3889 *                 |         ARC           |
3890 *                 +-----------------------+
3891 *                    |         ^     ^
3892 *                    |         |     |
3893 *      l2arc_feed_thread()    arc_read()
3894 *                    |         |     |
3895 *                    |  l2arc read   |
3896 *                    V         |     |
3897 *               +---------------+    |
3898 *               |     L2ARC     |    |
3899 *               +---------------+    |
3900 *                   |    ^           |
3901 *          l2arc_write() |           |
3902 *                   |    |           |
3903 *                   V    |           |
3904 *                 +-------+      +-------+
3905 *                 | vdev  |      | vdev  |
3906 *                 | cache |      | cache |
3907 *                 +-------+      +-------+
3908 *                 +=========+     .-----.
3909 *                 :  L2ARC  :    |-_____-|
3910 *                 : devices :    | Disks |
3911 *                 +=========+    `-_____-'
3912 *
3913 * Read requests are satisfied from the following sources, in order:
3914 *
3915 *	1) ARC
3916 *	2) vdev cache of L2ARC devices
3917 *	3) L2ARC devices
3918 *	4) vdev cache of disks
3919 *	5) disks
3920 *
3921 * Some L2ARC device types exhibit extremely slow write performance.
3922 * To accommodate for this there are some significant differences between
3923 * the L2ARC and traditional cache design:
3924 *
3925 * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
3926 * the ARC behave as usual, freeing buffers and placing headers on ghost
3927 * lists.  The ARC does not send buffers to the L2ARC during eviction as
3928 * this would add inflated write latencies for all ARC memory pressure.
3929 *
3930 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
3931 * It does this by periodically scanning buffers from the eviction-end of
3932 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3933 * not already there.  It scans until a headroom of buffers is satisfied,
3934 * which itself is a buffer for ARC eviction.  The thread that does this is
3935 * l2arc_feed_thread(), illustrated below; example sizes are included to
3936 * provide a better sense of ratio than this diagram:
3937 *
3938 *	       head -->                        tail
3939 *	        +---------------------+----------+
3940 *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
3941 *	        +---------------------+----------+   |   o L2ARC eligible
3942 *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
3943 *	        +---------------------+----------+   |
3944 *	             15.9 Gbytes      ^ 32 Mbytes    |
3945 *	                           headroom          |
3946 *	                                      l2arc_feed_thread()
3947 *	                                             |
3948 *	                 l2arc write hand <--[oooo]--'
3949 *	                         |           8 Mbyte
3950 *	                         |          write max
3951 *	                         V
3952 *		  +==============================+
3953 *	L2ARC dev |####|#|###|###|    |####| ... |
3954 *	          +==============================+
3955 *	                     32 Gbytes
3956 *
3957 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
3958 * evicted, then the L2ARC has cached a buffer much sooner than it probably
3959 * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
3960 * safe to say that this is an uncommon case, since buffers at the end of
3961 * the ARC lists have moved there due to inactivity.
3962 *
3963 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
3964 * then the L2ARC simply misses copying some buffers.  This serves as a
3965 * pressure valve to prevent heavy read workloads from both stalling the ARC
3966 * with waits and clogging the L2ARC with writes.  This also helps prevent
3967 * the potential for the L2ARC to churn if it attempts to cache content too
3968 * quickly, such as during backups of the entire pool.
3969 *
3970 * 5. After system boot and before the ARC has filled main memory, there are
3971 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
3972 * lists can remain mostly static.  Instead of searching from tail of these
3973 * lists as pictured, the l2arc_feed_thread() will search from the list heads
3974 * for eligible buffers, greatly increasing its chance of finding them.
3975 *
3976 * The L2ARC device write speed is also boosted during this time so that
3977 * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
3978 * there are no L2ARC reads, and no fear of degrading read performance
3979 * through increased writes.
3980 *
3981 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
3982 * the vdev queue can aggregate them into larger and fewer writes.  Each
3983 * device is written to in a rotor fashion, sweeping writes through
3984 * available space then repeating.
3985 *
3986 * 7. The L2ARC does not store dirty content.  It never needs to flush
3987 * write buffers back to disk based storage.
3988 *
3989 * 8. If an ARC buffer is written (and dirtied) which also exists in the
3990 * L2ARC, the now stale L2ARC buffer is immediately dropped.
3991 *
3992 * The performance of the L2ARC can be tweaked by a number of tunables, which
3993 * may be necessary for different workloads:
3994 *
3995 *	l2arc_write_max		max write bytes per interval
3996 *	l2arc_write_boost	extra write bytes during device warmup
3997 *	l2arc_noprefetch	skip caching prefetched buffers
3998 *	l2arc_headroom		number of max device writes to precache
3999 *	l2arc_feed_secs		seconds between L2ARC writing
4000 *
4001 * Tunables may be removed or added as future performance improvements are
4002 * integrated, and also may become zpool properties.
4003 */
4004
4005static void
4006l2arc_hdr_stat_add(void)
4007{
4008	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4009	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4010}
4011
4012static void
4013l2arc_hdr_stat_remove(void)
4014{
4015	ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4016	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4017}
4018
4019/*
4020 * Cycle through L2ARC devices.  This is how L2ARC load balances.
4021 * If a device is returned, this also returns holding the spa config lock.
4022 */
4023static l2arc_dev_t *
4024l2arc_dev_get_next(void)
4025{
4026	l2arc_dev_t *first, *next = NULL;
4027
4028	/*
4029	 * Lock out the removal of spas (spa_namespace_lock), then removal
4030	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4031	 * both locks will be dropped and a spa config lock held instead.
4032	 */
4033	mutex_enter(&spa_namespace_lock);
4034	mutex_enter(&l2arc_dev_mtx);
4035
4036	/* if there are no vdevs, there is nothing to do */
4037	if (l2arc_ndev == 0)
4038		goto out;
4039
4040	first = NULL;
4041	next = l2arc_dev_last;
4042	do {
4043		/* loop around the list looking for a non-faulted vdev */
4044		if (next == NULL) {
4045			next = list_head(l2arc_dev_list);
4046		} else {
4047			next = list_next(l2arc_dev_list, next);
4048			if (next == NULL)
4049				next = list_head(l2arc_dev_list);
4050		}
4051
4052		/* if we have come back to the start, bail out */
4053		if (first == NULL)
4054			first = next;
4055		else if (next == first)
4056			break;
4057
4058	} while (vdev_is_dead(next->l2ad_vdev));
4059
4060	/* if we were unable to find any usable vdevs, return NULL */
4061	if (vdev_is_dead(next->l2ad_vdev))
4062		next = NULL;
4063
4064	l2arc_dev_last = next;
4065
4066out:
4067	mutex_exit(&l2arc_dev_mtx);
4068
4069	/*
4070	 * Grab the config lock to prevent the 'next' device from being
4071	 * removed while we are writing to it.
4072	 */
4073	if (next != NULL)
4074		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4075	mutex_exit(&spa_namespace_lock);
4076
4077	return (next);
4078}
4079
4080/*
4081 * Free buffers that were tagged for destruction.
4082 */
4083static void
4084l2arc_do_free_on_write()
4085{
4086	list_t *buflist;
4087	l2arc_data_free_t *df, *df_prev;
4088
4089	mutex_enter(&l2arc_free_on_write_mtx);
4090	buflist = l2arc_free_on_write;
4091
4092	for (df = list_tail(buflist); df; df = df_prev) {
4093		df_prev = list_prev(buflist, df);
4094		ASSERT(df->l2df_data != NULL);
4095		ASSERT(df->l2df_func != NULL);
4096		df->l2df_func(df->l2df_data, df->l2df_size);
4097		list_remove(buflist, df);
4098		kmem_free(df, sizeof (l2arc_data_free_t));
4099	}
4100
4101	mutex_exit(&l2arc_free_on_write_mtx);
4102}
4103
4104/*
4105 * A write to a cache device has completed.  Update all headers to allow
4106 * reads from these buffers to begin.
4107 */
4108static void
4109l2arc_write_done(zio_t *zio)
4110{
4111	l2arc_write_callback_t *cb;
4112	l2arc_dev_t *dev;
4113	list_t *buflist;
4114	arc_buf_hdr_t *head, *ab, *ab_prev;
4115	l2arc_buf_hdr_t *abl2;
4116	kmutex_t *hash_lock;
4117
4118	cb = zio->io_private;
4119	ASSERT(cb != NULL);
4120	dev = cb->l2wcb_dev;
4121	ASSERT(dev != NULL);
4122	head = cb->l2wcb_head;
4123	ASSERT(head != NULL);
4124	buflist = dev->l2ad_buflist;
4125	ASSERT(buflist != NULL);
4126	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4127	    l2arc_write_callback_t *, cb);
4128
4129	if (zio->io_error != 0)
4130		ARCSTAT_BUMP(arcstat_l2_writes_error);
4131
4132	mutex_enter(&l2arc_buflist_mtx);
4133
4134	/*
4135	 * All writes completed, or an error was hit.
4136	 */
4137	for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4138		ab_prev = list_prev(buflist, ab);
4139
4140		hash_lock = HDR_LOCK(ab);
4141		if (!mutex_tryenter(hash_lock)) {
4142			/*
4143			 * This buffer misses out.  It may be in a stage
4144			 * of eviction.  Its ARC_L2_WRITING flag will be
4145			 * left set, denying reads to this buffer.
4146			 */
4147			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4148			continue;
4149		}
4150
4151		if (zio->io_error != 0) {
4152			/*
4153			 * Error - drop L2ARC entry.
4154			 */
4155			list_remove(buflist, ab);
4156			abl2 = ab->b_l2hdr;
4157			ab->b_l2hdr = NULL;
4158			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4159			ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4160		}
4161
4162		/*
4163		 * Allow ARC to begin reads to this L2ARC entry.
4164		 */
4165		ab->b_flags &= ~ARC_L2_WRITING;
4166
4167		mutex_exit(hash_lock);
4168	}
4169
4170	atomic_inc_64(&l2arc_writes_done);
4171	list_remove(buflist, head);
4172	kmem_cache_free(hdr_cache, head);
4173	mutex_exit(&l2arc_buflist_mtx);
4174
4175	l2arc_do_free_on_write();
4176
4177	kmem_free(cb, sizeof (l2arc_write_callback_t));
4178}
4179
4180/*
4181 * A read to a cache device completed.  Validate buffer contents before
4182 * handing over to the regular ARC routines.
4183 */
4184static void
4185l2arc_read_done(zio_t *zio)
4186{
4187	l2arc_read_callback_t *cb;
4188	arc_buf_hdr_t *hdr;
4189	arc_buf_t *buf;
4190	kmutex_t *hash_lock;
4191	int equal;
4192
4193	ASSERT(zio->io_vd != NULL);
4194	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4195
4196	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4197
4198	cb = zio->io_private;
4199	ASSERT(cb != NULL);
4200	buf = cb->l2rcb_buf;
4201	ASSERT(buf != NULL);
4202	hdr = buf->b_hdr;
4203	ASSERT(hdr != NULL);
4204
4205	hash_lock = HDR_LOCK(hdr);
4206	mutex_enter(hash_lock);
4207
4208	/*
4209	 * Check this survived the L2ARC journey.
4210	 */
4211	equal = arc_cksum_equal(buf);
4212	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4213		mutex_exit(hash_lock);
4214		zio->io_private = buf;
4215		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
4216		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
4217		arc_read_done(zio);
4218	} else {
4219		mutex_exit(hash_lock);
4220		/*
4221		 * Buffer didn't survive caching.  Increment stats and
4222		 * reissue to the original storage device.
4223		 */
4224		if (zio->io_error != 0) {
4225			ARCSTAT_BUMP(arcstat_l2_io_error);
4226		} else {
4227			zio->io_error = EIO;
4228		}
4229		if (!equal)
4230			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4231
4232		/*
4233		 * If there's no waiter, issue an async i/o to the primary
4234		 * storage now.  If there *is* a waiter, the caller must
4235		 * issue the i/o in a context where it's OK to block.
4236		 */
4237		if (zio->io_waiter == NULL)
4238			zio_nowait(zio_read(zio->io_parent,
4239			    cb->l2rcb_spa, &cb->l2rcb_bp,
4240			    buf->b_data, zio->io_size, arc_read_done, buf,
4241			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4242	}
4243
4244	kmem_free(cb, sizeof (l2arc_read_callback_t));
4245}
4246
4247/*
4248 * This is the list priority from which the L2ARC will search for pages to
4249 * cache.  This is used within loops (0..3) to cycle through lists in the
4250 * desired order.  This order can have a significant effect on cache
4251 * performance.
4252 *
4253 * Currently the metadata lists are hit first, MFU then MRU, followed by
4254 * the data lists.  This function returns a locked list, and also returns
4255 * the lock pointer.
4256 */
4257static list_t *
4258l2arc_list_locked(int list_num, kmutex_t **lock)
4259{
4260	list_t *list;
4261	int idx;
4262
4263	ASSERT(list_num >= 0 && list_num < 2*ARC_BUFC_NUMLISTS);
4264
4265	if (list_num < ARC_BUFC_NUMMETADATALISTS) {
4266		idx = list_num;
4267		list = &arc_mfu->arcs_lists[idx];
4268		*lock = ARCS_LOCK(arc_mfu, idx);
4269	} else if (list_num < ARC_BUFC_NUMMETADATALISTS*2) {
4270		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4271		list = &arc_mru->arcs_lists[idx];
4272		*lock = ARCS_LOCK(arc_mru, idx);
4273	} else if (list_num < (ARC_BUFC_NUMMETADATALISTS*2 +
4274		ARC_BUFC_NUMDATALISTS)) {
4275		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4276		list = &arc_mfu->arcs_lists[idx];
4277		*lock = ARCS_LOCK(arc_mfu, idx);
4278	} else {
4279		idx = list_num - ARC_BUFC_NUMLISTS;
4280		list = &arc_mru->arcs_lists[idx];
4281		*lock = ARCS_LOCK(arc_mru, idx);
4282	}
4283
4284	CTR3(KTR_SPARE2, "list=%p list_num=%d idx=%d",
4285	    list, list_num, idx);
4286	ASSERT(!(MUTEX_HELD(*lock)));
4287	mutex_enter(*lock);
4288	return (list);
4289}
4290
4291/*
4292 * Evict buffers from the device write hand to the distance specified in
4293 * bytes.  This distance may span populated buffers, it may span nothing.
4294 * This is clearing a region on the L2ARC device ready for writing.
4295 * If the 'all' boolean is set, every buffer is evicted.
4296 */
4297static void
4298l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4299{
4300	list_t *buflist;
4301	l2arc_buf_hdr_t *abl2;
4302	arc_buf_hdr_t *ab, *ab_prev;
4303	kmutex_t *hash_lock;
4304	uint64_t taddr;
4305
4306	buflist = dev->l2ad_buflist;
4307
4308	if (buflist == NULL)
4309		return;
4310
4311	if (!all && dev->l2ad_first) {
4312		/*
4313		 * This is the first sweep through the device.  There is
4314		 * nothing to evict.
4315		 */
4316		return;
4317	}
4318
4319	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4320		/*
4321		 * When nearing the end of the device, evict to the end
4322		 * before the device write hand jumps to the start.
4323		 */
4324		taddr = dev->l2ad_end;
4325	} else {
4326		taddr = dev->l2ad_hand + distance;
4327	}
4328	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4329	    uint64_t, taddr, boolean_t, all);
4330
4331top:
4332	mutex_enter(&l2arc_buflist_mtx);
4333	for (ab = list_tail(buflist); ab; ab = ab_prev) {
4334		ab_prev = list_prev(buflist, ab);
4335
4336		hash_lock = HDR_LOCK(ab);
4337		if (!mutex_tryenter(hash_lock)) {
4338			/*
4339			 * Missed the hash lock.  Retry.
4340			 */
4341			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4342			mutex_exit(&l2arc_buflist_mtx);
4343			mutex_enter(hash_lock);
4344			mutex_exit(hash_lock);
4345			goto top;
4346		}
4347
4348		if (HDR_L2_WRITE_HEAD(ab)) {
4349			/*
4350			 * We hit a write head node.  Leave it for
4351			 * l2arc_write_done().
4352			 */
4353			list_remove(buflist, ab);
4354			mutex_exit(hash_lock);
4355			continue;
4356		}
4357
4358		if (!all && ab->b_l2hdr != NULL &&
4359		    (ab->b_l2hdr->b_daddr > taddr ||
4360		    ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4361			/*
4362			 * We've evicted to the target address,
4363			 * or the end of the device.
4364			 */
4365			mutex_exit(hash_lock);
4366			break;
4367		}
4368
4369		if (HDR_FREE_IN_PROGRESS(ab)) {
4370			/*
4371			 * Already on the path to destruction.
4372			 */
4373			mutex_exit(hash_lock);
4374			continue;
4375		}
4376
4377		if (ab->b_state == arc_l2c_only) {
4378			ASSERT(!HDR_L2_READING(ab));
4379			/*
4380			 * This doesn't exist in the ARC.  Destroy.
4381			 * arc_hdr_destroy() will call list_remove()
4382			 * and decrement arcstat_l2_size.
4383			 */
4384			arc_change_state(arc_anon, ab, hash_lock);
4385			arc_hdr_destroy(ab);
4386		} else {
4387			/*
4388			 * Invalidate issued or about to be issued
4389			 * reads, since we may be about to write
4390			 * over this location.
4391			 */
4392			if (HDR_L2_READING(ab)) {
4393				ARCSTAT_BUMP(arcstat_l2_evict_reading);
4394				ab->b_flags |= ARC_L2_EVICTED;
4395			}
4396
4397			/*
4398			 * Tell ARC this no longer exists in L2ARC.
4399			 */
4400			if (ab->b_l2hdr != NULL) {
4401				abl2 = ab->b_l2hdr;
4402				ab->b_l2hdr = NULL;
4403				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4404				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4405			}
4406			list_remove(buflist, ab);
4407
4408			/*
4409			 * This may have been leftover after a
4410			 * failed write.
4411			 */
4412			ab->b_flags &= ~ARC_L2_WRITING;
4413		}
4414		mutex_exit(hash_lock);
4415	}
4416	mutex_exit(&l2arc_buflist_mtx);
4417
4418	spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict));
4419	dev->l2ad_evict = taddr;
4420}
4421
4422/*
4423 * Find and write ARC buffers to the L2ARC device.
4424 *
4425 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4426 * for reading until they have completed writing.
4427 */
4428static void
4429l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
4430{
4431	arc_buf_hdr_t *ab, *ab_prev, *head;
4432	l2arc_buf_hdr_t *hdrl2;
4433	list_t *list;
4434	uint64_t passed_sz, write_sz, buf_sz, headroom;
4435	void *buf_data;
4436	kmutex_t *hash_lock, *list_lock;
4437	boolean_t have_lock, full;
4438	l2arc_write_callback_t *cb;
4439	zio_t *pio, *wzio;
4440	int try;
4441
4442	ASSERT(dev->l2ad_vdev != NULL);
4443
4444	pio = NULL;
4445	write_sz = 0;
4446	full = B_FALSE;
4447	head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4448	head->b_flags |= ARC_L2_WRITE_HEAD;
4449
4450	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
4451	/*
4452	 * Copy buffers for L2ARC writing.
4453	 */
4454	mutex_enter(&l2arc_buflist_mtx);
4455	for (try = 0; try < 2*ARC_BUFC_NUMLISTS; try++) {
4456		list = l2arc_list_locked(try, &list_lock);
4457		passed_sz = 0;
4458		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
4459
4460		/*
4461		 * L2ARC fast warmup.
4462		 *
4463		 * Until the ARC is warm and starts to evict, read from the
4464		 * head of the ARC lists rather than the tail.
4465		 */
4466		headroom = target_sz * l2arc_headroom;
4467		if (arc_warm == B_FALSE)
4468			ab = list_head(list);
4469		else
4470			ab = list_tail(list);
4471		if (ab == NULL) {
4472			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
4473		}
4474
4475		for (; ab; ab = ab_prev) {
4476			if (arc_warm == B_FALSE)
4477				ab_prev = list_next(list, ab);
4478			else
4479				ab_prev = list_prev(list, ab);
4480			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size);
4481
4482			hash_lock = HDR_LOCK(ab);
4483			have_lock = MUTEX_HELD(hash_lock);
4484			if (!have_lock && !mutex_tryenter(hash_lock)) {
4485				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
4486				/*
4487				 * Skip this buffer rather than waiting.
4488				 */
4489				continue;
4490			}
4491
4492			if (ab->b_l2hdr != NULL) {
4493				/*
4494				 * Already in L2ARC.
4495				 */
4496				mutex_exit(hash_lock);
4497				ARCSTAT_BUMP(arcstat_l2_write_in_l2);
4498				continue;
4499			}
4500
4501			passed_sz += ab->b_size;
4502			if (passed_sz > headroom) {
4503				/*
4504				 * Searched too far.
4505				 */
4506				mutex_exit(hash_lock);
4507				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
4508				break;
4509			}
4510
4511			if (ab->b_spa != spa) {
4512				mutex_exit(hash_lock);
4513				ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
4514				continue;
4515			}
4516
4517			if (HDR_IO_IN_PROGRESS(ab)) {
4518				mutex_exit(hash_lock);
4519				ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
4520				continue;
4521			}
4522			if (!HDR_L2CACHE(ab)) {
4523				mutex_exit(hash_lock);
4524				ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
4525				continue;
4526			}
4527			if ((write_sz + ab->b_size) > target_sz) {
4528				full = B_TRUE;
4529				mutex_exit(hash_lock);
4530				ARCSTAT_BUMP(arcstat_l2_write_full);
4531				break;
4532			}
4533
4534			if (ab->b_buf == NULL) {
4535				DTRACE_PROBE1(l2arc__buf__null, void *, ab);
4536				mutex_exit(hash_lock);
4537				continue;
4538			}
4539
4540			if (pio == NULL) {
4541				/*
4542				 * Insert a dummy header on the buflist so
4543				 * l2arc_write_done() can find where the
4544				 * write buffers begin without searching.
4545				 */
4546				list_insert_head(dev->l2ad_buflist, head);
4547
4548				cb = kmem_alloc(
4549				    sizeof (l2arc_write_callback_t), KM_SLEEP);
4550				cb->l2wcb_dev = dev;
4551				cb->l2wcb_head = head;
4552				pio = zio_root(spa, l2arc_write_done, cb,
4553				    ZIO_FLAG_CANFAIL);
4554				ARCSTAT_BUMP(arcstat_l2_write_pios);
4555			}
4556
4557			ARCSTAT_INCR(arcstat_l2_write_bytes_written, ab->b_size);
4558			/*
4559			 * Create and add a new L2ARC header.
4560			 */
4561			hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4562			hdrl2->b_dev = dev;
4563			hdrl2->b_daddr = dev->l2ad_hand;
4564
4565			ab->b_l2hdr = hdrl2;
4566			list_insert_head(dev->l2ad_buflist, ab);
4567			buf_data = ab->b_buf->b_data;
4568			buf_sz = ab->b_size;
4569
4570			/*
4571			 * Compute and store the buffer cksum before
4572			 * writing.  On debug the cksum is verified first.
4573			 */
4574			arc_cksum_verify(ab->b_buf);
4575			arc_cksum_compute(ab->b_buf, B_TRUE);
4576
4577			mutex_exit(hash_lock);
4578
4579			wzio = zio_write_phys(pio, dev->l2ad_vdev,
4580			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4581			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4582			    ZIO_FLAG_CANFAIL, B_FALSE);
4583
4584			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4585			    zio_t *, wzio);
4586			(void) zio_nowait(wzio);
4587
4588			/*
4589			 * Keep the clock hand suitably device-aligned.
4590			 */
4591			buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4592
4593			write_sz += buf_sz;
4594			dev->l2ad_hand += buf_sz;
4595		}
4596
4597		mutex_exit(list_lock);
4598
4599		if (full == B_TRUE)
4600			break;
4601	}
4602	mutex_exit(&l2arc_buflist_mtx);
4603
4604	if (pio == NULL) {
4605		ASSERT3U(write_sz, ==, 0);
4606		kmem_cache_free(hdr_cache, head);
4607		return;
4608	}
4609
4610	ASSERT3U(write_sz, <=, target_sz);
4611	ARCSTAT_BUMP(arcstat_l2_writes_sent);
4612	ARCSTAT_INCR(arcstat_l2_size, write_sz);
4613	spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz);
4614
4615	/*
4616	 * Bump device hand to the device start if it is approaching the end.
4617	 * l2arc_evict() will already have evicted ahead for this case.
4618	 */
4619	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4620		spa_l2cache_space_update(dev->l2ad_vdev, 0,
4621		    dev->l2ad_end - dev->l2ad_hand);
4622		dev->l2ad_hand = dev->l2ad_start;
4623		dev->l2ad_evict = dev->l2ad_start;
4624		dev->l2ad_first = B_FALSE;
4625	}
4626
4627	(void) zio_wait(pio);
4628}
4629
4630/*
4631 * This thread feeds the L2ARC at regular intervals.  This is the beating
4632 * heart of the L2ARC.
4633 */
4634static void
4635l2arc_feed_thread(void *dummy __unused)
4636{
4637	callb_cpr_t cpr;
4638	l2arc_dev_t *dev;
4639	spa_t *spa;
4640	uint64_t size;
4641
4642	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4643
4644	mutex_enter(&l2arc_feed_thr_lock);
4645
4646	while (l2arc_thread_exit == 0) {
4647		/*
4648		 * Pause for l2arc_feed_secs seconds between writes.
4649		 */
4650		CALLB_CPR_SAFE_BEGIN(&cpr);
4651		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
4652		    hz * l2arc_feed_secs >> l2arc_feed_secs_shift);
4653		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
4654
4655		/*
4656		 * Quick check for L2ARC devices.
4657		 */
4658		mutex_enter(&l2arc_dev_mtx);
4659		if (l2arc_ndev == 0) {
4660			mutex_exit(&l2arc_dev_mtx);
4661			continue;
4662		}
4663		mutex_exit(&l2arc_dev_mtx);
4664
4665		/*
4666		 * This selects the next l2arc device to write to, and in
4667		 * doing so the next spa to feed from: dev->l2ad_spa.   This
4668		 * will return NULL if there are now no l2arc devices or if
4669		 * they are all faulted.
4670		 *
4671		 * If a device is returned, its spa's config lock is also
4672		 * held to prevent device removal.  l2arc_dev_get_next()
4673		 * will grab and release l2arc_dev_mtx.
4674		 */
4675		if ((dev = l2arc_dev_get_next()) == NULL)
4676			continue;
4677
4678		spa = dev->l2ad_spa;
4679		ASSERT(spa != NULL);
4680
4681		/*
4682		 * Avoid contributing to memory pressure.
4683		 */
4684		if (arc_reclaim_needed()) {
4685			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
4686			spa_config_exit(spa, SCL_L2ARC, dev);
4687			continue;
4688		}
4689
4690		ARCSTAT_BUMP(arcstat_l2_feeds);
4691
4692		size = dev->l2ad_write;
4693		if (arc_warm == B_FALSE)
4694			size += dev->l2ad_boost;
4695
4696		/*
4697		 * Evict L2ARC buffers that will be overwritten.
4698		 */
4699		l2arc_evict(dev, size, B_FALSE);
4700
4701		/*
4702		 * Write ARC buffers.
4703		 */
4704		l2arc_write_buffers(spa, dev, size);
4705		spa_config_exit(spa, SCL_L2ARC, dev);
4706	}
4707
4708	l2arc_thread_exit = 0;
4709	cv_broadcast(&l2arc_feed_thr_cv);
4710	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
4711	thread_exit();
4712}
4713
4714boolean_t
4715l2arc_vdev_present(vdev_t *vd)
4716{
4717	l2arc_dev_t *dev;
4718
4719	mutex_enter(&l2arc_dev_mtx);
4720	for (dev = list_head(l2arc_dev_list); dev != NULL;
4721	    dev = list_next(l2arc_dev_list, dev)) {
4722		if (dev->l2ad_vdev == vd)
4723			break;
4724	}
4725	mutex_exit(&l2arc_dev_mtx);
4726
4727	return (dev != NULL);
4728}
4729
4730/*
4731 * Add a vdev for use by the L2ARC.  By this point the spa has already
4732 * validated the vdev and opened it.
4733 */
4734void
4735l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
4736{
4737	l2arc_dev_t *adddev;
4738
4739	ASSERT(!l2arc_vdev_present(vd));
4740
4741	/*
4742	 * Create a new l2arc device entry.
4743	 */
4744	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
4745	adddev->l2ad_spa = spa;
4746	adddev->l2ad_vdev = vd;
4747	adddev->l2ad_write = l2arc_write_max;
4748	adddev->l2ad_boost = l2arc_write_boost;
4749	adddev->l2ad_start = start;
4750	adddev->l2ad_end = end;
4751	adddev->l2ad_hand = adddev->l2ad_start;
4752	adddev->l2ad_evict = adddev->l2ad_start;
4753	adddev->l2ad_first = B_TRUE;
4754	ASSERT3U(adddev->l2ad_write, >, 0);
4755
4756	/*
4757	 * This is a list of all ARC buffers that are still valid on the
4758	 * device.
4759	 */
4760	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
4761	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
4762	    offsetof(arc_buf_hdr_t, b_l2node));
4763
4764	spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0);
4765
4766	/*
4767	 * Add device to global list
4768	 */
4769	mutex_enter(&l2arc_dev_mtx);
4770	list_insert_head(l2arc_dev_list, adddev);
4771	atomic_inc_64(&l2arc_ndev);
4772	mutex_exit(&l2arc_dev_mtx);
4773}
4774
4775/*
4776 * Remove a vdev from the L2ARC.
4777 */
4778void
4779l2arc_remove_vdev(vdev_t *vd)
4780{
4781	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
4782
4783	/*
4784	 * Find the device by vdev
4785	 */
4786	mutex_enter(&l2arc_dev_mtx);
4787	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
4788		nextdev = list_next(l2arc_dev_list, dev);
4789		if (vd == dev->l2ad_vdev) {
4790			remdev = dev;
4791			break;
4792		}
4793	}
4794	ASSERT(remdev != NULL);
4795
4796	/*
4797	 * Remove device from global list
4798	 */
4799	list_remove(l2arc_dev_list, remdev);
4800	l2arc_dev_last = NULL;		/* may have been invalidated */
4801	atomic_dec_64(&l2arc_ndev);
4802	mutex_exit(&l2arc_dev_mtx);
4803
4804	/*
4805	 * Clear all buflists and ARC references.  L2ARC device flush.
4806	 */
4807	l2arc_evict(remdev, 0, B_TRUE);
4808	list_destroy(remdev->l2ad_buflist);
4809	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
4810	kmem_free(remdev, sizeof (l2arc_dev_t));
4811}
4812
4813void
4814l2arc_init(void)
4815{
4816	l2arc_thread_exit = 0;
4817	l2arc_ndev = 0;
4818	l2arc_writes_sent = 0;
4819	l2arc_writes_done = 0;
4820
4821	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4822	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
4823	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
4824	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
4825	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
4826
4827	l2arc_dev_list = &L2ARC_dev_list;
4828	l2arc_free_on_write = &L2ARC_free_on_write;
4829	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
4830	    offsetof(l2arc_dev_t, l2ad_node));
4831	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
4832	    offsetof(l2arc_data_free_t, l2df_list_node));
4833}
4834
4835void
4836l2arc_fini(void)
4837{
4838	/*
4839	 * This is called from dmu_fini(), which is called from spa_fini();
4840	 * Because of this, we can assume that all l2arc devices have
4841	 * already been removed when the pools themselves were removed.
4842	 */
4843
4844	l2arc_do_free_on_write();
4845
4846	mutex_destroy(&l2arc_feed_thr_lock);
4847	cv_destroy(&l2arc_feed_thr_cv);
4848	mutex_destroy(&l2arc_dev_mtx);
4849	mutex_destroy(&l2arc_buflist_mtx);
4850	mutex_destroy(&l2arc_free_on_write_mtx);
4851
4852	list_destroy(l2arc_dev_list);
4853	list_destroy(l2arc_free_on_write);
4854}
4855
4856void
4857l2arc_start(void)
4858{
4859	if (!(spa_mode & FWRITE))
4860		return;
4861
4862	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
4863	    TS_RUN, minclsyspri);
4864}
4865
4866void
4867l2arc_stop(void)
4868{
4869	if (!(spa_mode & FWRITE))
4870		return;
4871
4872	mutex_enter(&l2arc_feed_thr_lock);
4873	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
4874	l2arc_thread_exit = 1;
4875	while (l2arc_thread_exit != 0)
4876		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
4877	mutex_exit(&l2arc_feed_thr_lock);
4878}
4879