/* arc.c, revision 193878 */
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory.  This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.  At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them.  Blocks are only evictable
 * when there are no external references active.  This makes
 * eviction far more problematic:  we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space.  In these circumstances we are unable to adjust the cache
 * size.  To prevent the cache from growing unbounded at these times
 * we implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss.  Our model has a variable sized cache.  It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 * elements of the cache are therefore exactly the same size.  So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).  We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.  For example, a miss on a 128K
 * block may be satisfied by evicting a single 128K buffer, or by
 * evicting several smaller buffers whose sizes add up to roughly 128K.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */
70
/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes; rather, they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).  Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_buf_evict()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */
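
/*
 * For example, arc_evict() below obtains a hash table lock while already
 * holding an arc list lock, using the mutex_tryenter() rule described
 * above (simplified sketch):
 *
 *	mutex_enter(&state->arcs_mtx);
 *	...
 *	hash_lock = HDR_LOCK(ab);
 *	if (MUTEX_HELD(hash_lock) || mutex_tryenter(hash_lock)) {
 *		... safe to evict or move this buffer ...
 *	} else {
 *		... skip this buffer; blocking here could deadlock ...
 *	}
 */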
119
120#include <sys/spa.h>
121#include <sys/zio.h>
122#include <sys/zio_checksum.h>
123#include <sys/zfs_context.h>
124#include <sys/arc.h>
125#include <sys/refcount.h>
126#include <sys/vdev.h>
127#ifdef _KERNEL
128#include <sys/dnlc.h>
129#endif
130#include <sys/callb.h>
131#include <sys/kstat.h>
132#include <sys/sdt.h>
133
134#include <vm/vm_pageout.h>
135
136static kmutex_t		arc_reclaim_thr_lock;
137static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
138static uint8_t		arc_thread_exit;
139
140extern int zfs_write_limit_shift;
141extern uint64_t zfs_write_limit_max;
142extern kmutex_t zfs_write_limit_lock;
143
144#define	ARC_REDUCE_DNLC_PERCENT	3
145uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
146
147typedef enum arc_reclaim_strategy {
148	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
149	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
150} arc_reclaim_strategy_t;
151
152/* number of seconds before growing cache again */
153static int		arc_grow_retry = 60;
154
155/*
156 * minimum lifespan of a prefetch block in clock ticks
157 * (initialized in arc_init())
158 */
159static int		arc_min_prefetch_lifespan;
160
extern int zfs_prefetch_disable;
163static int arc_dead;
164
165/*
166 * The arc has filled available memory and has now warmed up.
167 */
168static boolean_t arc_warm;
169
170/*
171 * These tunables are for performance analysis.
172 */
173uint64_t zfs_arc_max;
174uint64_t zfs_arc_min;
175uint64_t zfs_arc_meta_limit = 0;
176int zfs_mdcomp_disable = 0;
177
178TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
179TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
180TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
181TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
182SYSCTL_DECL(_vfs_zfs);
183SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
184    "Maximum ARC size");
185SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
186    "Minimum ARC size");
187SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
188    &zfs_mdcomp_disable, 0, "Disable metadata compression");
189
/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, it is
 * linked onto a list in one of these arc states.  These are
 * the only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */
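
/*
 * A typical buffer lifecycle, simplified: a dirty buffer starts out in
 * ARC_anon; once written it acquires a DVA and moves onto the arc_mru
 * list; a subsequent hit promotes it to arc_mfu; when its data is later
 * evicted, the header lingers in arc_mru_ghost or arc_mfu_ghost; and a
 * header whose data survives only on an L2ARC device ends up in
 * ARC_l2c_only.
 */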
221
222typedef struct arc_state {
223	list_t	arcs_list[ARC_BUFC_NUMTYPES];	/* list of evictable buffers */
224	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
225	uint64_t arcs_size;	/* total amount of data in this state */
226	kmutex_t arcs_mtx;
227} arc_state_t;
228
229/* The 6 states: */
230static arc_state_t ARC_anon;
231static arc_state_t ARC_mru;
232static arc_state_t ARC_mru_ghost;
233static arc_state_t ARC_mfu;
234static arc_state_t ARC_mfu_ghost;
235static arc_state_t ARC_l2c_only;
236
237typedef struct arc_stats {
238	kstat_named_t arcstat_hits;
239	kstat_named_t arcstat_misses;
240	kstat_named_t arcstat_demand_data_hits;
241	kstat_named_t arcstat_demand_data_misses;
242	kstat_named_t arcstat_demand_metadata_hits;
243	kstat_named_t arcstat_demand_metadata_misses;
244	kstat_named_t arcstat_prefetch_data_hits;
245	kstat_named_t arcstat_prefetch_data_misses;
246	kstat_named_t arcstat_prefetch_metadata_hits;
247	kstat_named_t arcstat_prefetch_metadata_misses;
248	kstat_named_t arcstat_mru_hits;
249	kstat_named_t arcstat_mru_ghost_hits;
250	kstat_named_t arcstat_mfu_hits;
251	kstat_named_t arcstat_mfu_ghost_hits;
252	kstat_named_t arcstat_deleted;
253	kstat_named_t arcstat_recycle_miss;
254	kstat_named_t arcstat_mutex_miss;
255	kstat_named_t arcstat_evict_skip;
256	kstat_named_t arcstat_hash_elements;
257	kstat_named_t arcstat_hash_elements_max;
258	kstat_named_t arcstat_hash_collisions;
259	kstat_named_t arcstat_hash_chains;
260	kstat_named_t arcstat_hash_chain_max;
261	kstat_named_t arcstat_p;
262	kstat_named_t arcstat_c;
263	kstat_named_t arcstat_c_min;
264	kstat_named_t arcstat_c_max;
265	kstat_named_t arcstat_size;
266	kstat_named_t arcstat_hdr_size;
267	kstat_named_t arcstat_l2_hits;
268	kstat_named_t arcstat_l2_misses;
269	kstat_named_t arcstat_l2_feeds;
270	kstat_named_t arcstat_l2_rw_clash;
271	kstat_named_t arcstat_l2_writes_sent;
272	kstat_named_t arcstat_l2_writes_done;
273	kstat_named_t arcstat_l2_writes_error;
274	kstat_named_t arcstat_l2_writes_hdr_miss;
275	kstat_named_t arcstat_l2_evict_lock_retry;
276	kstat_named_t arcstat_l2_evict_reading;
277	kstat_named_t arcstat_l2_free_on_write;
278	kstat_named_t arcstat_l2_abort_lowmem;
279	kstat_named_t arcstat_l2_cksum_bad;
280	kstat_named_t arcstat_l2_io_error;
281	kstat_named_t arcstat_l2_size;
282	kstat_named_t arcstat_l2_hdr_size;
283	kstat_named_t arcstat_memory_throttle_count;
284} arc_stats_t;
285
286static arc_stats_t arc_stats = {
287	{ "hits",			KSTAT_DATA_UINT64 },
288	{ "misses",			KSTAT_DATA_UINT64 },
289	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
290	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
291	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
292	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
293	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
294	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
295	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
296	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
297	{ "mru_hits",			KSTAT_DATA_UINT64 },
298	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
299	{ "mfu_hits",			KSTAT_DATA_UINT64 },
300	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
301	{ "deleted",			KSTAT_DATA_UINT64 },
302	{ "recycle_miss",		KSTAT_DATA_UINT64 },
303	{ "mutex_miss",			KSTAT_DATA_UINT64 },
304	{ "evict_skip",			KSTAT_DATA_UINT64 },
305	{ "hash_elements",		KSTAT_DATA_UINT64 },
306	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
307	{ "hash_collisions",		KSTAT_DATA_UINT64 },
308	{ "hash_chains",		KSTAT_DATA_UINT64 },
309	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
310	{ "p",				KSTAT_DATA_UINT64 },
311	{ "c",				KSTAT_DATA_UINT64 },
312	{ "c_min",			KSTAT_DATA_UINT64 },
313	{ "c_max",			KSTAT_DATA_UINT64 },
314	{ "size",			KSTAT_DATA_UINT64 },
315	{ "hdr_size",			KSTAT_DATA_UINT64 },
316	{ "l2_hits",			KSTAT_DATA_UINT64 },
317	{ "l2_misses",			KSTAT_DATA_UINT64 },
318	{ "l2_feeds",			KSTAT_DATA_UINT64 },
319	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
320	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
321	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
322	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
323	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
324	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
325	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
326	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
327	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
328	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
329	{ "l2_io_error",		KSTAT_DATA_UINT64 },
330	{ "l2_size",			KSTAT_DATA_UINT64 },
331	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
332	{ "memory_throttle_count",	KSTAT_DATA_UINT64 }
333};
334
335#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
336
337#define	ARCSTAT_INCR(stat, val) \
338	atomic_add_64(&arc_stats.stat.value.ui64, (val));
339
340#define	ARCSTAT_BUMP(stat) 	ARCSTAT_INCR(stat, 1)
341#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
342
343#define	ARCSTAT_MAX(stat, val) {					\
344	uint64_t m;							\
345	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
346	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
347		continue;						\
348}
349
350#define	ARCSTAT_MAXSTAT(stat) \
351	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
352
353/*
354 * We define a macro to allow ARC hits/misses to be easily broken down by
355 * two separate conditions, giving a total of four different subtypes for
356 * each of hits and misses (so eight statistics total).
357 */
358#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
359	if (cond1) {							\
360		if (cond2) {						\
361			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
362		} else {						\
363			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
364		}							\
365	} else {							\
366		if (cond2) {						\
367			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
368		} else {						\
369			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
370		}							\
371	}
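
/*
 * Example (this is how arc_buf_add_ref() below records a hit):
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 *
 * which bumps exactly one of the
 * arcstat_{demand,prefetch}_{data,metadata}_hits counters.
 */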
372
373kstat_t			*arc_ksp;
374static arc_state_t 	*arc_anon;
375static arc_state_t	*arc_mru;
376static arc_state_t	*arc_mru_ghost;
377static arc_state_t	*arc_mfu;
378static arc_state_t	*arc_mfu_ghost;
379static arc_state_t	*arc_l2c_only;
380
381/*
382 * There are several ARC variables that are critical to export as kstats --
383 * but we don't want to have to grovel around in the kstat whenever we wish to
384 * manipulate them.  For these variables, we therefore define them to be in
385 * terms of the statistic variable.  This assures that we are not introducing
386 * the possibility of inconsistency by having shadow copies of the variables,
387 * while still allowing the code to be readable.
388 */
389#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
390#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
391#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
392#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
393#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
394
395static int		arc_no_grow;	/* Don't try to grow cache size */
396static uint64_t		arc_tempreserve;
397static uint64_t		arc_meta_used;
398static uint64_t		arc_meta_limit;
399static uint64_t		arc_meta_max = 0;
400SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RDTUN,
401    &arc_meta_used, 0, "ARC metadata used");
402SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RDTUN,
403    &arc_meta_limit, 0, "ARC metadata limit");
404
405typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
406
407typedef struct arc_callback arc_callback_t;
408
409struct arc_callback {
410	void			*acb_private;
411	arc_done_func_t		*acb_done;
412	arc_buf_t		*acb_buf;
413	zio_t			*acb_zio_dummy;
414	arc_callback_t		*acb_next;
415};
416
417typedef struct arc_write_callback arc_write_callback_t;
418
419struct arc_write_callback {
420	void		*awcb_private;
421	arc_done_func_t	*awcb_ready;
422	arc_done_func_t	*awcb_done;
423	arc_buf_t	*awcb_buf;
424};
425
426struct arc_buf_hdr {
427	/* protected by hash lock */
428	dva_t			b_dva;
429	uint64_t		b_birth;
430	uint64_t		b_cksum0;
431
432	kmutex_t		b_freeze_lock;
433	zio_cksum_t		*b_freeze_cksum;
434
435	arc_buf_hdr_t		*b_hash_next;
436	arc_buf_t		*b_buf;
437	uint32_t		b_flags;
438	uint32_t		b_datacnt;
439
440	arc_callback_t		*b_acb;
441	kcondvar_t		b_cv;
442
443	/* immutable */
444	arc_buf_contents_t	b_type;
445	uint64_t		b_size;
446	spa_t			*b_spa;
447
448	/* protected by arc state mutex */
449	arc_state_t		*b_state;
450	list_node_t		b_arc_node;
451
452	/* updated atomically */
453	clock_t			b_arc_access;
454
455	/* self protecting */
456	refcount_t		b_refcnt;
457
458	l2arc_buf_hdr_t		*b_l2hdr;
459	list_node_t		b_l2node;
460};
461
462static arc_buf_t *arc_eviction_list;
463static kmutex_t arc_eviction_mtx;
464static arc_buf_hdr_t arc_eviction_hdr;
465static void arc_get_data_buf(arc_buf_t *buf);
466static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
467static int arc_evict_needed(arc_buf_contents_t type);
468static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes);
469
470#define	GHOST_STATE(state)	\
471	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
472	(state) == arc_l2c_only)
473
/*
 * Private ARC flags.  These are ARC-internal flags that show up in b_flags
 * in the arc_buf_hdr_t.  Some flags are publicly declared and can be passed
 * in as arc_flags in things like arc_read.  The private flags, however,
 * should never be passed in and should only be set by ARC code.  When adding
 * new public flags, make sure not to smash the private ones.
 */
481
482#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
483#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
484#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
485#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
486#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
487#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
488#define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
489#define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
490#define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
491#define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */
492#define	ARC_STORED		(1 << 19)	/* has been store()d to */
493
494#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
495#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
496#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
497#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
498#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
499#define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
500#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_L2CACHE)
501#define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
502				    (hdr)->b_l2hdr != NULL)
503#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
504#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
505#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)
506
507/*
508 * Other sizes
509 */
510
511#define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
512#define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
513
514/*
515 * Hash table routines
516 */
517
518#define	HT_LOCK_PAD	128
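
/*
 * Each ht_lock below is padded out to HT_LOCK_PAD bytes so that adjacent
 * locks in the ht_locks[] array land on separate cache lines; 128 bytes is
 * assumed to cover the common cache line sizes, avoiding false sharing
 * between the locks.
 */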
519
520struct ht_lock {
521	kmutex_t	ht_lock;
522#ifdef _KERNEL
523	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
524#endif
525};
526
527#define	BUF_LOCKS 256
528typedef struct buf_hash_table {
529	uint64_t ht_mask;
530	arc_buf_hdr_t **ht_table;
531	struct ht_lock ht_locks[BUF_LOCKS];
532} buf_hash_table_t;
533
534static buf_hash_table_t buf_hash_table;
535
536#define	BUF_HASH_INDEX(spa, dva, birth) \
537	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
538#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
539#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
540#define	HDR_LOCK(buf) \
541	(BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
542
543uint64_t zfs_crc64_table[256];
544
545/*
546 * Level 2 ARC
547 */
548
549#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
550#define	L2ARC_HEADROOM		4		/* num of writes */
551#define	L2ARC_FEED_SECS		1		/* caching interval */
552
553#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
554#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
555
556/*
557 * L2ARC Performance Tunables
558 */
559uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
560uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
561uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
562uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
563boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
564
565/*
566 * L2ARC Internals
567 */
568typedef struct l2arc_dev {
569	vdev_t			*l2ad_vdev;	/* vdev */
570	spa_t			*l2ad_spa;	/* spa */
571	uint64_t		l2ad_hand;	/* next write location */
572	uint64_t		l2ad_write;	/* desired write size, bytes */
573	uint64_t		l2ad_boost;	/* warmup write boost, bytes */
574	uint64_t		l2ad_start;	/* first addr on device */
575	uint64_t		l2ad_end;	/* last addr on device */
576	uint64_t		l2ad_evict;	/* last addr eviction reached */
577	boolean_t		l2ad_first;	/* first sweep through */
578	list_t			*l2ad_buflist;	/* buffer list */
579	list_node_t		l2ad_node;	/* device list node */
580} l2arc_dev_t;
581
582static list_t L2ARC_dev_list;			/* device list */
583static list_t *l2arc_dev_list;			/* device list pointer */
584static kmutex_t l2arc_dev_mtx;			/* device list mutex */
585static l2arc_dev_t *l2arc_dev_last;		/* last device used */
586static kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
587static list_t L2ARC_free_on_write;		/* free after write buf list */
588static list_t *l2arc_free_on_write;		/* free after write list ptr */
589static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
590static uint64_t l2arc_ndev;			/* number of devices */
591
592typedef struct l2arc_read_callback {
593	arc_buf_t	*l2rcb_buf;		/* read buffer */
594	spa_t		*l2rcb_spa;		/* spa */
595	blkptr_t	l2rcb_bp;		/* original blkptr */
596	zbookmark_t	l2rcb_zb;		/* original bookmark */
597	int		l2rcb_flags;		/* original flags */
598} l2arc_read_callback_t;
599
600typedef struct l2arc_write_callback {
601	l2arc_dev_t	*l2wcb_dev;		/* device info */
602	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
603} l2arc_write_callback_t;
604
605struct l2arc_buf_hdr {
606	/* protected by arc_buf_hdr  mutex */
607	l2arc_dev_t	*b_dev;			/* L2ARC device */
608	daddr_t		b_daddr;		/* disk address, offset byte */
609};
610
611typedef struct l2arc_data_free {
612	/* protected by l2arc_free_on_write_mtx */
613	void		*l2df_data;
614	size_t		l2df_size;
615	void		(*l2df_func)(void *, size_t);
616	list_node_t	l2df_list_node;
617} l2arc_data_free_t;
618
619static kmutex_t l2arc_feed_thr_lock;
620static kcondvar_t l2arc_feed_thr_cv;
621static uint8_t l2arc_thread_exit;
622
623static void l2arc_read_done(zio_t *zio);
624static void l2arc_hdr_stat_add(void);
625static void l2arc_hdr_stat_remove(void);
626
627static uint64_t
628buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth)
629{
630	uintptr_t spav = (uintptr_t)spa;
631	uint8_t *vdva = (uint8_t *)dva;
632	uint64_t crc = -1ULL;
633	int i;
634
635	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
636
637	for (i = 0; i < sizeof (dva_t); i++)
638		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
639
640	crc ^= (spav>>8) ^ birth;
641
642	return (crc);
643}
644
645#define	BUF_EMPTY(buf)						\
646	((buf)->b_dva.dva_word[0] == 0 &&			\
647	(buf)->b_dva.dva_word[1] == 0 &&			\
648	(buf)->b_birth == 0)
649
650#define	BUF_EQUAL(spa, dva, birth, buf)				\
651	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
652	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
653	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
654
655static arc_buf_hdr_t *
656buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
657{
658	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
659	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
660	arc_buf_hdr_t *buf;
661
662	mutex_enter(hash_lock);
663	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
664	    buf = buf->b_hash_next) {
665		if (BUF_EQUAL(spa, dva, birth, buf)) {
666			*lockp = hash_lock;
667			return (buf);
668		}
669	}
670	mutex_exit(hash_lock);
671	*lockp = NULL;
672	return (NULL);
673}
674
675/*
676 * Insert an entry into the hash table.  If there is already an element
677 * equal to elem in the hash table, then the already existing element
678 * will be returned and the new element will not be inserted.
679 * Otherwise returns NULL.
680 */
681static arc_buf_hdr_t *
682buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
683{
684	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
685	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
686	arc_buf_hdr_t *fbuf;
687	uint32_t i;
688
689	ASSERT(!HDR_IN_HASH_TABLE(buf));
690	*lockp = hash_lock;
691	mutex_enter(hash_lock);
692	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
693	    fbuf = fbuf->b_hash_next, i++) {
694		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
695			return (fbuf);
696	}
697
698	buf->b_hash_next = buf_hash_table.ht_table[idx];
699	buf_hash_table.ht_table[idx] = buf;
700	buf->b_flags |= ARC_IN_HASH_TABLE;
701
702	/* collect some hash table performance data */
703	if (i > 0) {
704		ARCSTAT_BUMP(arcstat_hash_collisions);
705		if (i == 1)
706			ARCSTAT_BUMP(arcstat_hash_chains);
707
708		ARCSTAT_MAX(arcstat_hash_chain_max, i);
709	}
710
711	ARCSTAT_BUMP(arcstat_hash_elements);
712	ARCSTAT_MAXSTAT(arcstat_hash_elements);
713
714	return (NULL);
715}
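
/*
 * Sketch of the pattern callers such as arc_read() use with
 * buf_hash_insert(): the return value signals a lost insertion race, and
 * the hash lock handed back through lockp is held either way:
 *
 *	if ((exists = buf_hash_insert(hdr, &hash_lock)) != NULL) {
 *		... somebody beat us to the hash insert; use `exists' ...
 *	}
 *	...
 *	mutex_exit(hash_lock);
 */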
716
717static void
718buf_hash_remove(arc_buf_hdr_t *buf)
719{
720	arc_buf_hdr_t *fbuf, **bufp;
721	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
722
723	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
724	ASSERT(HDR_IN_HASH_TABLE(buf));
725
726	bufp = &buf_hash_table.ht_table[idx];
727	while ((fbuf = *bufp) != buf) {
728		ASSERT(fbuf != NULL);
729		bufp = &fbuf->b_hash_next;
730	}
731	*bufp = buf->b_hash_next;
732	buf->b_hash_next = NULL;
733	buf->b_flags &= ~ARC_IN_HASH_TABLE;
734
735	/* collect some hash table performance data */
736	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
737
738	if (buf_hash_table.ht_table[idx] &&
739	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
740		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
741}
742
743/*
744 * Global data structures and functions for the buf kmem cache.
745 */
746static kmem_cache_t *hdr_cache;
747static kmem_cache_t *buf_cache;
748
749static void
750buf_fini(void)
751{
752	int i;
753
754	kmem_free(buf_hash_table.ht_table,
755	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
756	for (i = 0; i < BUF_LOCKS; i++)
757		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
758	kmem_cache_destroy(hdr_cache);
759	kmem_cache_destroy(buf_cache);
760}
761
762/*
763 * Constructor callback - called when the cache is empty
764 * and a new buf is requested.
765 */
766/* ARGSUSED */
767static int
768hdr_cons(void *vbuf, void *unused, int kmflag)
769{
770	arc_buf_hdr_t *buf = vbuf;
771
772	bzero(buf, sizeof (arc_buf_hdr_t));
773	refcount_create(&buf->b_refcnt);
774	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
775	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
776
777	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
778	return (0);
779}
780
781/* ARGSUSED */
782static int
783buf_cons(void *vbuf, void *unused, int kmflag)
784{
785	arc_buf_t *buf = vbuf;
786
787	bzero(buf, sizeof (arc_buf_t));
788	rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL);
789	return (0);
790}
791
792/*
793 * Destructor callback - called when a cached buf is
794 * no longer required.
795 */
796/* ARGSUSED */
797static void
798hdr_dest(void *vbuf, void *unused)
799{
800	arc_buf_hdr_t *buf = vbuf;
801
802	refcount_destroy(&buf->b_refcnt);
803	cv_destroy(&buf->b_cv);
804	mutex_destroy(&buf->b_freeze_lock);
805
806	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
807}
808
809/* ARGSUSED */
810static void
811buf_dest(void *vbuf, void *unused)
812{
813	arc_buf_t *buf = vbuf;
814
815	rw_destroy(&buf->b_lock);
816}
817
818/*
819 * Reclaim callback -- invoked when memory is low.
820 */
821/* ARGSUSED */
822static void
823hdr_recl(void *unused)
824{
825	dprintf("hdr_recl called\n");
826	/*
827	 * umem calls the reclaim func when we destroy the buf cache,
828	 * which is after we do arc_fini().
829	 */
830	if (!arc_dead)
831		cv_signal(&arc_reclaim_thr_cv);
832}
833
834static void
835buf_init(void)
836{
837	uint64_t *ct;
838	uint64_t hsize = 1ULL << 12;
839	int i, j;
840
	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 64K block size.  The table will take up
	 * totalmem*sizeof(void*)/64K (e.g. 128KB/GB with 8-byte pointers).
	 */
846	while (hsize * 65536 < (uint64_t)physmem * PAGESIZE)
847		hsize <<= 1;
848retry:
849	buf_hash_table.ht_mask = hsize - 1;
850	buf_hash_table.ht_table =
851	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
852	if (buf_hash_table.ht_table == NULL) {
853		ASSERT(hsize > (1ULL << 8));
854		hsize >>= 1;
855		goto retry;
856	}
857
858	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
859	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
860	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
861	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
862
863	for (i = 0; i < 256; i++)
864		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
865			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
866
867	for (i = 0; i < BUF_LOCKS; i++) {
868		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
869		    NULL, MUTEX_DEFAULT, NULL);
870	}
871}
872
873#define	ARC_MINTIME	(hz>>4) /* 62 ms */
874
875static void
876arc_cksum_verify(arc_buf_t *buf)
877{
878	zio_cksum_t zc;
879
880	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
881		return;
882
883	mutex_enter(&buf->b_hdr->b_freeze_lock);
884	if (buf->b_hdr->b_freeze_cksum == NULL ||
885	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
886		mutex_exit(&buf->b_hdr->b_freeze_lock);
887		return;
888	}
889	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
890	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
891		panic("buffer modified while frozen!");
892	mutex_exit(&buf->b_hdr->b_freeze_lock);
893}
894
895static int
896arc_cksum_equal(arc_buf_t *buf)
897{
898	zio_cksum_t zc;
899	int equal;
900
901	mutex_enter(&buf->b_hdr->b_freeze_lock);
902	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
903	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
904	mutex_exit(&buf->b_hdr->b_freeze_lock);
905
906	return (equal);
907}
908
909static void
910arc_cksum_compute(arc_buf_t *buf, boolean_t force)
911{
912	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
913		return;
914
915	mutex_enter(&buf->b_hdr->b_freeze_lock);
916	if (buf->b_hdr->b_freeze_cksum != NULL) {
917		mutex_exit(&buf->b_hdr->b_freeze_lock);
918		return;
919	}
920	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
921	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
922	    buf->b_hdr->b_freeze_cksum);
923	mutex_exit(&buf->b_hdr->b_freeze_lock);
924}
925
926void
927arc_buf_thaw(arc_buf_t *buf)
928{
929	if (zfs_flags & ZFS_DEBUG_MODIFY) {
930		if (buf->b_hdr->b_state != arc_anon)
931			panic("modifying non-anon buffer!");
932		if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
933			panic("modifying buffer while i/o in progress!");
934		arc_cksum_verify(buf);
935	}
936
937	mutex_enter(&buf->b_hdr->b_freeze_lock);
938	if (buf->b_hdr->b_freeze_cksum != NULL) {
939		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
940		buf->b_hdr->b_freeze_cksum = NULL;
941	}
942	mutex_exit(&buf->b_hdr->b_freeze_lock);
943}
944
945void
946arc_buf_freeze(arc_buf_t *buf)
947{
948	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
949		return;
950
951	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
952	    buf->b_hdr->b_state == arc_anon);
953	arc_cksum_compute(buf, B_FALSE);
954}
955
956static void
957add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
958{
959	ASSERT(MUTEX_HELD(hash_lock));
960
961	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
962	    (ab->b_state != arc_anon)) {
963		uint64_t delta = ab->b_size * ab->b_datacnt;
964		list_t *list = &ab->b_state->arcs_list[ab->b_type];
965		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
966
967		ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
968		mutex_enter(&ab->b_state->arcs_mtx);
969		ASSERT(list_link_active(&ab->b_arc_node));
970		list_remove(list, ab);
971		if (GHOST_STATE(ab->b_state)) {
972			ASSERT3U(ab->b_datacnt, ==, 0);
973			ASSERT3P(ab->b_buf, ==, NULL);
974			delta = ab->b_size;
975		}
976		ASSERT(delta > 0);
977		ASSERT3U(*size, >=, delta);
978		atomic_add_64(size, -delta);
979		mutex_exit(&ab->b_state->arcs_mtx);
980		/* remove the prefetch flag if we get a reference */
981		if (ab->b_flags & ARC_PREFETCH)
982			ab->b_flags &= ~ARC_PREFETCH;
983	}
984}
985
986static int
987remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
988{
989	int cnt;
990	arc_state_t *state = ab->b_state;
991
992	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
993	ASSERT(!GHOST_STATE(state));
994
995	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
996	    (state != arc_anon)) {
997		uint64_t *size = &state->arcs_lsize[ab->b_type];
998
999		ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1000		mutex_enter(&state->arcs_mtx);
1001		ASSERT(!list_link_active(&ab->b_arc_node));
1002		list_insert_head(&state->arcs_list[ab->b_type], ab);
1003		ASSERT(ab->b_datacnt > 0);
1004		atomic_add_64(size, ab->b_size * ab->b_datacnt);
1005		mutex_exit(&state->arcs_mtx);
1006	}
1007	return (cnt);
1008}
1009
1010/*
1011 * Move the supplied buffer to the indicated state.  The mutex
1012 * for the buffer must be held by the caller.
1013 */
1014static void
1015arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1016{
1017	arc_state_t *old_state = ab->b_state;
1018	int64_t refcnt = refcount_count(&ab->b_refcnt);
1019	uint64_t from_delta, to_delta;
1020
1021	ASSERT(MUTEX_HELD(hash_lock));
1022	ASSERT(new_state != old_state);
1023	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1024	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1025
1026	from_delta = to_delta = ab->b_datacnt * ab->b_size;
1027
1028	/*
1029	 * If this buffer is evictable, transfer it from the
1030	 * old state list to the new state list.
1031	 */
1032	if (refcnt == 0) {
1033		if (old_state != arc_anon) {
1034			int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1035			uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1036
1037			if (use_mutex)
1038				mutex_enter(&old_state->arcs_mtx);
1039
1040			ASSERT(list_link_active(&ab->b_arc_node));
1041			list_remove(&old_state->arcs_list[ab->b_type], ab);
1042
			/*
			 * If prefetching out of the ghost cache,
			 * we will have a non-zero datacnt.
			 */
1047			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1048				/* ghost elements have a ghost size */
1049				ASSERT(ab->b_buf == NULL);
1050				from_delta = ab->b_size;
1051			}
1052			ASSERT3U(*size, >=, from_delta);
1053			atomic_add_64(size, -from_delta);
1054
1055			if (use_mutex)
1056				mutex_exit(&old_state->arcs_mtx);
1057		}
1058		if (new_state != arc_anon) {
1059			int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1060			uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1061
1062			if (use_mutex)
1063				mutex_enter(&new_state->arcs_mtx);
1064
1065			list_insert_head(&new_state->arcs_list[ab->b_type], ab);
1066
1067			/* ghost elements have a ghost size */
1068			if (GHOST_STATE(new_state)) {
1069				ASSERT(ab->b_datacnt == 0);
1070				ASSERT(ab->b_buf == NULL);
1071				to_delta = ab->b_size;
1072			}
1073			atomic_add_64(size, to_delta);
1074
1075			if (use_mutex)
1076				mutex_exit(&new_state->arcs_mtx);
1077		}
1078	}
1079
1080	ASSERT(!BUF_EMPTY(ab));
1081	if (new_state == arc_anon) {
1082		buf_hash_remove(ab);
1083	}
1084
1085	/* adjust state sizes */
1086	if (to_delta)
1087		atomic_add_64(&new_state->arcs_size, to_delta);
1088	if (from_delta) {
1089		ASSERT3U(old_state->arcs_size, >=, from_delta);
1090		atomic_add_64(&old_state->arcs_size, -from_delta);
1091	}
1092	ab->b_state = new_state;
1093
1094	/* adjust l2arc hdr stats */
1095	if (new_state == arc_l2c_only)
1096		l2arc_hdr_stat_add();
1097	else if (old_state == arc_l2c_only)
1098		l2arc_hdr_stat_remove();
1099}
1100
1101void
1102arc_space_consume(uint64_t space)
1103{
1104	atomic_add_64(&arc_meta_used, space);
1105	atomic_add_64(&arc_size, space);
1106}
1107
1108void
1109arc_space_return(uint64_t space)
1110{
1111	ASSERT(arc_meta_used >= space);
1112	if (arc_meta_max < arc_meta_used)
1113		arc_meta_max = arc_meta_used;
1114	atomic_add_64(&arc_meta_used, -space);
1115	ASSERT(arc_size >= space);
1116	atomic_add_64(&arc_size, -space);
1117}
1118
1119void *
1120arc_data_buf_alloc(uint64_t size)
1121{
1122	if (arc_evict_needed(ARC_BUFC_DATA))
1123		cv_signal(&arc_reclaim_thr_cv);
1124	atomic_add_64(&arc_size, size);
1125	return (zio_data_buf_alloc(size));
1126}
1127
1128void
1129arc_data_buf_free(void *buf, uint64_t size)
1130{
1131	zio_data_buf_free(buf, size);
1132	ASSERT(arc_size >= size);
1133	atomic_add_64(&arc_size, -size);
1134}
1135
1136arc_buf_t *
1137arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1138{
1139	arc_buf_hdr_t *hdr;
1140	arc_buf_t *buf;
1141
1142	ASSERT3U(size, >, 0);
1143	hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1144	ASSERT(BUF_EMPTY(hdr));
1145	hdr->b_size = size;
1146	hdr->b_type = type;
1147	hdr->b_spa = spa;
1148	hdr->b_state = arc_anon;
1149	hdr->b_arc_access = 0;
1150	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1151	buf->b_hdr = hdr;
1152	buf->b_data = NULL;
1153	buf->b_efunc = NULL;
1154	buf->b_private = NULL;
1155	buf->b_next = NULL;
1156	hdr->b_buf = buf;
1157	arc_get_data_buf(buf);
1158	hdr->b_datacnt = 1;
1159	hdr->b_flags = 0;
1160	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1161	(void) refcount_add(&hdr->b_refcnt, tag);
1162
1163	return (buf);
1164}
1165
1166static arc_buf_t *
1167arc_buf_clone(arc_buf_t *from)
1168{
1169	arc_buf_t *buf;
1170	arc_buf_hdr_t *hdr = from->b_hdr;
1171	uint64_t size = hdr->b_size;
1172
1173	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1174	buf->b_hdr = hdr;
1175	buf->b_data = NULL;
1176	buf->b_efunc = NULL;
1177	buf->b_private = NULL;
1178	buf->b_next = hdr->b_buf;
1179	hdr->b_buf = buf;
1180	arc_get_data_buf(buf);
1181	bcopy(from->b_data, buf->b_data, size);
1182	hdr->b_datacnt += 1;
1183	return (buf);
1184}
1185
1186void
1187arc_buf_add_ref(arc_buf_t *buf, void* tag)
1188{
1189	arc_buf_hdr_t *hdr;
1190	kmutex_t *hash_lock;
1191
1192	/*
1193	 * Check to see if this buffer is evicted.  Callers
1194	 * must verify b_data != NULL to know if the add_ref
1195	 * was successful.
1196	 */
1197	rw_enter(&buf->b_lock, RW_READER);
1198	if (buf->b_data == NULL) {
1199		rw_exit(&buf->b_lock);
1200		return;
1201	}
1202	hdr = buf->b_hdr;
1203	ASSERT(hdr != NULL);
1204	hash_lock = HDR_LOCK(hdr);
1205	mutex_enter(hash_lock);
1206	rw_exit(&buf->b_lock);
1207
1208	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1209	add_reference(hdr, hash_lock, tag);
1210	arc_access(hdr, hash_lock);
1211	mutex_exit(hash_lock);
1212	ARCSTAT_BUMP(arcstat_hits);
1213	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1214	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1215	    data, metadata, hits);
1216}
1217
1218/*
1219 * Free the arc data buffer.  If it is an l2arc write in progress,
1220 * the buffer is placed on l2arc_free_on_write to be freed later.
1221 */
1222static void
1223arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
1224    void *data, size_t size)
1225{
1226	if (HDR_L2_WRITING(hdr)) {
1227		l2arc_data_free_t *df;
1228		df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1229		df->l2df_data = data;
1230		df->l2df_size = size;
1231		df->l2df_func = free_func;
1232		mutex_enter(&l2arc_free_on_write_mtx);
1233		list_insert_head(l2arc_free_on_write, df);
1234		mutex_exit(&l2arc_free_on_write_mtx);
1235		ARCSTAT_BUMP(arcstat_l2_free_on_write);
1236	} else {
1237		free_func(data, size);
1238	}
1239}
1240
1241static void
1242arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1243{
1244	arc_buf_t **bufp;
1245
1246	/* free up data associated with the buf */
1247	if (buf->b_data) {
1248		arc_state_t *state = buf->b_hdr->b_state;
1249		uint64_t size = buf->b_hdr->b_size;
1250		arc_buf_contents_t type = buf->b_hdr->b_type;
1251
1252		arc_cksum_verify(buf);
1253		if (!recycle) {
1254			if (type == ARC_BUFC_METADATA) {
1255				arc_buf_data_free(buf->b_hdr, zio_buf_free,
1256				    buf->b_data, size);
1257				arc_space_return(size);
1258			} else {
1259				ASSERT(type == ARC_BUFC_DATA);
1260				arc_buf_data_free(buf->b_hdr,
1261				    zio_data_buf_free, buf->b_data, size);
1262				atomic_add_64(&arc_size, -size);
1263			}
1264		}
1265		if (list_link_active(&buf->b_hdr->b_arc_node)) {
1266			uint64_t *cnt = &state->arcs_lsize[type];
1267
1268			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1269			ASSERT(state != arc_anon);
1270
1271			ASSERT3U(*cnt, >=, size);
1272			atomic_add_64(cnt, -size);
1273		}
1274		ASSERT3U(state->arcs_size, >=, size);
1275		atomic_add_64(&state->arcs_size, -size);
1276		buf->b_data = NULL;
1277		ASSERT(buf->b_hdr->b_datacnt > 0);
1278		buf->b_hdr->b_datacnt -= 1;
1279	}
1280
1281	/* only remove the buf if requested */
1282	if (!all)
1283		return;
1284
1285	/* remove the buf from the hdr list */
1286	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1287		continue;
1288	*bufp = buf->b_next;
1289
1290	ASSERT(buf->b_efunc == NULL);
1291
1292	/* clean up the buf */
1293	buf->b_hdr = NULL;
1294	kmem_cache_free(buf_cache, buf);
1295}
1296
1297static void
1298arc_hdr_destroy(arc_buf_hdr_t *hdr)
1299{
1300	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1301	ASSERT3P(hdr->b_state, ==, arc_anon);
1302	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1303	ASSERT(!(hdr->b_flags & ARC_STORED));
1304
1305	if (hdr->b_l2hdr != NULL) {
1306		if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
1307			/*
1308			 * To prevent arc_free() and l2arc_evict() from
1309			 * attempting to free the same buffer at the same time,
1310			 * a FREE_IN_PROGRESS flag is given to arc_free() to
1311			 * give it priority.  l2arc_evict() can't destroy this
1312			 * header while we are waiting on l2arc_buflist_mtx.
1313			 *
1314			 * The hdr may be removed from l2ad_buflist before we
1315			 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1316			 */
1317			mutex_enter(&l2arc_buflist_mtx);
1318			if (hdr->b_l2hdr != NULL) {
1319				list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist,
1320				    hdr);
1321			}
1322			mutex_exit(&l2arc_buflist_mtx);
1323		} else {
1324			list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
1325		}
1326		ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1327		kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t));
1328		if (hdr->b_state == arc_l2c_only)
1329			l2arc_hdr_stat_remove();
1330		hdr->b_l2hdr = NULL;
1331	}
1332
1333	if (!BUF_EMPTY(hdr)) {
1334		ASSERT(!HDR_IN_HASH_TABLE(hdr));
1335		bzero(&hdr->b_dva, sizeof (dva_t));
1336		hdr->b_birth = 0;
1337		hdr->b_cksum0 = 0;
1338	}
1339	while (hdr->b_buf) {
1340		arc_buf_t *buf = hdr->b_buf;
1341
1342		if (buf->b_efunc) {
1343			mutex_enter(&arc_eviction_mtx);
1344			rw_enter(&buf->b_lock, RW_WRITER);
1345			ASSERT(buf->b_hdr != NULL);
1346			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1347			hdr->b_buf = buf->b_next;
1348			buf->b_hdr = &arc_eviction_hdr;
1349			buf->b_next = arc_eviction_list;
1350			arc_eviction_list = buf;
1351			rw_exit(&buf->b_lock);
1352			mutex_exit(&arc_eviction_mtx);
1353		} else {
1354			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1355		}
1356	}
1357	if (hdr->b_freeze_cksum != NULL) {
1358		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1359		hdr->b_freeze_cksum = NULL;
1360	}
1361
1362	ASSERT(!list_link_active(&hdr->b_arc_node));
1363	ASSERT3P(hdr->b_hash_next, ==, NULL);
1364	ASSERT3P(hdr->b_acb, ==, NULL);
1365	kmem_cache_free(hdr_cache, hdr);
1366}
1367
1368void
1369arc_buf_free(arc_buf_t *buf, void *tag)
1370{
1371	arc_buf_hdr_t *hdr = buf->b_hdr;
1372	int hashed = hdr->b_state != arc_anon;
1373
1374	ASSERT(buf->b_efunc == NULL);
1375	ASSERT(buf->b_data != NULL);
1376
1377	if (hashed) {
1378		kmutex_t *hash_lock = HDR_LOCK(hdr);
1379
1380		mutex_enter(hash_lock);
1381		(void) remove_reference(hdr, hash_lock, tag);
1382		if (hdr->b_datacnt > 1)
1383			arc_buf_destroy(buf, FALSE, TRUE);
1384		else
1385			hdr->b_flags |= ARC_BUF_AVAILABLE;
1386		mutex_exit(hash_lock);
1387	} else if (HDR_IO_IN_PROGRESS(hdr)) {
1388		int destroy_hdr;
1389		/*
1390		 * We are in the middle of an async write.  Don't destroy
1391		 * this buffer unless the write completes before we finish
1392		 * decrementing the reference count.
1393		 */
1394		mutex_enter(&arc_eviction_mtx);
1395		(void) remove_reference(hdr, NULL, tag);
1396		ASSERT(refcount_is_zero(&hdr->b_refcnt));
1397		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1398		mutex_exit(&arc_eviction_mtx);
1399		if (destroy_hdr)
1400			arc_hdr_destroy(hdr);
1401	} else {
1402		if (remove_reference(hdr, NULL, tag) > 0) {
1403			ASSERT(HDR_IO_ERROR(hdr));
1404			arc_buf_destroy(buf, FALSE, TRUE);
1405		} else {
1406			arc_hdr_destroy(hdr);
1407		}
1408	}
1409}
1410
1411int
1412arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1413{
1414	arc_buf_hdr_t *hdr = buf->b_hdr;
1415	kmutex_t *hash_lock = HDR_LOCK(hdr);
1416	int no_callback = (buf->b_efunc == NULL);
1417
1418	if (hdr->b_state == arc_anon) {
1419		arc_buf_free(buf, tag);
1420		return (no_callback);
1421	}
1422
1423	mutex_enter(hash_lock);
1424	ASSERT(hdr->b_state != arc_anon);
1425	ASSERT(buf->b_data != NULL);
1426
1427	(void) remove_reference(hdr, hash_lock, tag);
1428	if (hdr->b_datacnt > 1) {
1429		if (no_callback)
1430			arc_buf_destroy(buf, FALSE, TRUE);
1431	} else if (no_callback) {
1432		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1433		hdr->b_flags |= ARC_BUF_AVAILABLE;
1434	}
1435	ASSERT(no_callback || hdr->b_datacnt > 1 ||
1436	    refcount_is_zero(&hdr->b_refcnt));
1437	mutex_exit(hash_lock);
1438	return (no_callback);
1439}
1440
1441int
1442arc_buf_size(arc_buf_t *buf)
1443{
1444	return (buf->b_hdr->b_size);
1445}
1446
1447/*
1448 * Evict buffers from list until we've removed the specified number of
1449 * bytes.  Move the removed buffers to the appropriate evict state.
1450 * If the recycle flag is set, then attempt to "recycle" a buffer:
1451 * - look for a buffer to evict that is `bytes' long.
1452 * - return the data block from this buffer rather than freeing it.
1453 * This flag is used by callers that are trying to make space for a
1454 * new buffer in a full arc cache.
1455 *
1456 * This function makes a "best effort".  It skips over any buffers
1457 * it can't get a hash_lock on, and so may not catch all candidates.
1458 * It may also return without evicting as much space as requested.
1459 */
1460static void *
1461arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle,
1462    arc_buf_contents_t type)
1463{
1464	arc_state_t *evicted_state;
1465	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1466	arc_buf_hdr_t *ab, *ab_prev = NULL;
1467	list_t *list = &state->arcs_list[type];
1468	kmutex_t *hash_lock;
1469	boolean_t have_lock;
1470	void *stolen = NULL;
1471
1472	ASSERT(state == arc_mru || state == arc_mfu);
1473
1474	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1475
1476	mutex_enter(&state->arcs_mtx);
1477	mutex_enter(&evicted_state->arcs_mtx);
1478
1479	for (ab = list_tail(list); ab; ab = ab_prev) {
1480		ab_prev = list_prev(list, ab);
1481		/* prefetch buffers have a minimum lifespan */
1482		if (HDR_IO_IN_PROGRESS(ab) ||
1483		    (spa && ab->b_spa != spa) ||
1484		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1485		    LBOLT - ab->b_arc_access < arc_min_prefetch_lifespan)) {
1486			skipped++;
1487			continue;
1488		}
1489		/* "lookahead" for better eviction candidate */
1490		if (recycle && ab->b_size != bytes &&
1491		    ab_prev && ab_prev->b_size == bytes)
1492			continue;
1493		hash_lock = HDR_LOCK(ab);
1494		have_lock = MUTEX_HELD(hash_lock);
1495		if (have_lock || mutex_tryenter(hash_lock)) {
1496			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
1497			ASSERT(ab->b_datacnt > 0);
1498			while (ab->b_buf) {
1499				arc_buf_t *buf = ab->b_buf;
1500				if (!rw_tryenter(&buf->b_lock, RW_WRITER)) {
1501					missed += 1;
1502					break;
1503				}
1504				if (buf->b_data) {
1505					bytes_evicted += ab->b_size;
1506					if (recycle && ab->b_type == type &&
1507					    ab->b_size == bytes &&
1508					    !HDR_L2_WRITING(ab)) {
1509						stolen = buf->b_data;
1510						recycle = FALSE;
1511					}
1512				}
1513				if (buf->b_efunc) {
1514					mutex_enter(&arc_eviction_mtx);
1515					arc_buf_destroy(buf,
1516					    buf->b_data == stolen, FALSE);
1517					ab->b_buf = buf->b_next;
1518					buf->b_hdr = &arc_eviction_hdr;
1519					buf->b_next = arc_eviction_list;
1520					arc_eviction_list = buf;
1521					mutex_exit(&arc_eviction_mtx);
1522					rw_exit(&buf->b_lock);
1523				} else {
1524					rw_exit(&buf->b_lock);
1525					arc_buf_destroy(buf,
1526					    buf->b_data == stolen, TRUE);
1527				}
1528			}
1529			if (ab->b_datacnt == 0) {
1530				arc_change_state(evicted_state, ab, hash_lock);
1531				ASSERT(HDR_IN_HASH_TABLE(ab));
1532				ab->b_flags |= ARC_IN_HASH_TABLE;
1533				ab->b_flags &= ~ARC_BUF_AVAILABLE;
1534				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1535			}
1536			if (!have_lock)
1537				mutex_exit(hash_lock);
1538			if (bytes >= 0 && bytes_evicted >= bytes)
1539				break;
1540		} else {
1541			missed += 1;
1542		}
1543	}
1544
1545	mutex_exit(&evicted_state->arcs_mtx);
1546	mutex_exit(&state->arcs_mtx);
1547
1548	if (bytes_evicted < bytes)
1549		dprintf("only evicted %lld bytes from %x",
1550		    (longlong_t)bytes_evicted, state);
1551
1552	if (skipped)
1553		ARCSTAT_INCR(arcstat_evict_skip, skipped);
1554
1555	if (missed)
1556		ARCSTAT_INCR(arcstat_mutex_miss, missed);
1557
	/*
	 * We have just evicted some data into the ghost state; make
	 * sure we also adjust the ghost state size if necessary.
	 */
1562	if (arc_no_grow &&
1563	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1564		int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1565		    arc_mru_ghost->arcs_size - arc_c;
1566
1567		if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1568			int64_t todelete =
1569			    MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1570			arc_evict_ghost(arc_mru_ghost, NULL, todelete);
1571		} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1572			int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1573			    arc_mru_ghost->arcs_size +
1574			    arc_mfu_ghost->arcs_size - arc_c);
1575			arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
1576		}
1577	}
1578
1579	return (stolen);
1580}
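
/*
 * Sketch of how the recycle path above is used: arc_get_data_buf() does
 * roughly the following when the cache is full and a new buffer of
 * `size' bytes is needed:
 *
 *	buf->b_data = arc_evict(state, NULL, size, TRUE, type);
 *	if (buf->b_data == NULL)
 *		... recycling failed; allocate a fresh block instead ...
 */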
1581
1582/*
1583 * Remove buffers from list until we've removed the specified number of
1584 * bytes.  Destroy the buffers that are removed.
1585 */
1586static void
1587arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes)
1588{
1589	arc_buf_hdr_t *ab, *ab_prev;
1590	list_t *list = &state->arcs_list[ARC_BUFC_DATA];
1591	kmutex_t *hash_lock;
1592	uint64_t bytes_deleted = 0;
1593	uint64_t bufs_skipped = 0;
1594
1595	ASSERT(GHOST_STATE(state));
1596top:
1597	mutex_enter(&state->arcs_mtx);
1598	for (ab = list_tail(list); ab; ab = ab_prev) {
1599		ab_prev = list_prev(list, ab);
1600		if (spa && ab->b_spa != spa)
1601			continue;
1602		hash_lock = HDR_LOCK(ab);
1603		if (mutex_tryenter(hash_lock)) {
1604			ASSERT(!HDR_IO_IN_PROGRESS(ab));
1605			ASSERT(ab->b_buf == NULL);
1606			ARCSTAT_BUMP(arcstat_deleted);
1607			bytes_deleted += ab->b_size;
1608
1609			if (ab->b_l2hdr != NULL) {
1610				/*
1611				 * This buffer is cached on the 2nd Level ARC;
1612				 * don't destroy the header.
1613				 */
1614				arc_change_state(arc_l2c_only, ab, hash_lock);
1615				mutex_exit(hash_lock);
1616			} else {
1617				arc_change_state(arc_anon, ab, hash_lock);
1618				mutex_exit(hash_lock);
1619				arc_hdr_destroy(ab);
1620			}
1621
1622			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1623			if (bytes >= 0 && bytes_deleted >= bytes)
1624				break;
1625		} else {
1626			if (bytes < 0) {
1627				mutex_exit(&state->arcs_mtx);
1628				mutex_enter(hash_lock);
1629				mutex_exit(hash_lock);
1630				goto top;
1631			}
1632			bufs_skipped += 1;
1633		}
1634	}
1635	mutex_exit(&state->arcs_mtx);
1636
1637	if (list == &state->arcs_list[ARC_BUFC_DATA] &&
1638	    (bytes < 0 || bytes_deleted < bytes)) {
1639		list = &state->arcs_list[ARC_BUFC_METADATA];
1640		goto top;
1641	}
1642
1643	if (bufs_skipped) {
1644		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1645		ASSERT(bytes >= 0);
1646	}
1647
1648	if (bytes_deleted < bytes)
1649		dprintf("only deleted %lld bytes from %p",
1650		    (longlong_t)bytes_deleted, state);
1651}
1652
1653static void
1654arc_adjust(void)
1655{
1656	int64_t top_sz, mru_over, arc_over, todelete;
1657
1658	top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used;
1659
1660	if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
1661		int64_t toevict =
1662		    MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p);
1663		(void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA);
1664		top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
1665	}
1666
1667	if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1668		int64_t toevict =
1669		    MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p);
1670		(void) arc_evict(arc_mru, NULL, toevict, FALSE,
1671		    ARC_BUFC_METADATA);
1672		top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
1673	}
1674
1675	mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c;
1676
1677	if (mru_over > 0) {
1678		if (arc_mru_ghost->arcs_size > 0) {
1679			todelete = MIN(arc_mru_ghost->arcs_size, mru_over);
1680			arc_evict_ghost(arc_mru_ghost, NULL, todelete);
1681		}
1682	}
1683
1684	if ((arc_over = arc_size - arc_c) > 0) {
1685		int64_t tbl_over;
1686
1687		if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
1688			int64_t toevict =
1689			    MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over);
1690			(void) arc_evict(arc_mfu, NULL, toevict, FALSE,
1691			    ARC_BUFC_DATA);
1692			arc_over = arc_size - arc_c;
1693		}
1694
1695		if (arc_over > 0 &&
1696		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
1697			int64_t toevict =
1698			    MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA],
1699			    arc_over);
1700			(void) arc_evict(arc_mfu, NULL, toevict, FALSE,
1701			    ARC_BUFC_METADATA);
1702		}
1703
1704		tbl_over = arc_size + arc_mru_ghost->arcs_size +
1705		    arc_mfu_ghost->arcs_size - arc_c * 2;
1706
1707		if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) {
1708			todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over);
1709			arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
1710		}
1711	}
1712}
1713
1714static void
1715arc_do_user_evicts(void)
1716{
1717	static arc_buf_t *tmp_arc_eviction_list;
1718
	/*
	 * Move the list aside to avoid a lock order reversal (LOR).
	 */
1722restart:
1723	mutex_enter(&arc_eviction_mtx);
1724	tmp_arc_eviction_list = arc_eviction_list;
1725	arc_eviction_list = NULL;
1726	mutex_exit(&arc_eviction_mtx);
1727
1728	while (tmp_arc_eviction_list != NULL) {
1729		arc_buf_t *buf = tmp_arc_eviction_list;
1730		tmp_arc_eviction_list = buf->b_next;
1731		rw_enter(&buf->b_lock, RW_WRITER);
1732		buf->b_hdr = NULL;
1733		rw_exit(&buf->b_lock);
1734
1735		if (buf->b_efunc != NULL)
1736			VERIFY(buf->b_efunc(buf) == 0);
1737
1738		buf->b_efunc = NULL;
1739		buf->b_private = NULL;
1740		kmem_cache_free(buf_cache, buf);
1741	}
1742
1743	if (arc_eviction_list != NULL)
1744		goto restart;
1745}
1746
1747/*
1748 * Flush all *evictable* data from the cache for the given spa.
1749 * NOTE: this will not touch "active" (i.e. referenced) data.
1750 */
1751void
1752arc_flush(spa_t *spa)
1753{
1754	while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) {
1755		(void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA);
1756		if (spa)
1757			break;
1758	}
1759	while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) {
1760		(void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA);
1761		if (spa)
1762			break;
1763	}
1764	while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) {
1765		(void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA);
1766		if (spa)
1767			break;
1768	}
1769	while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) {
1770		(void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA);
1771		if (spa)
1772			break;
1773	}
1774
1775	arc_evict_ghost(arc_mru_ghost, spa, -1);
1776	arc_evict_ghost(arc_mfu_ghost, spa, -1);
1777
1778	mutex_enter(&arc_reclaim_thr_lock);
1779	arc_do_user_evicts();
1780	mutex_exit(&arc_reclaim_thr_lock);
1781	ASSERT(spa || arc_eviction_list == NULL);
1782}
1783
1784int arc_shrink_shift = 5;		/* log2(fraction of arc to reclaim) */
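
/*
 * For example, with the default arc_shrink_shift of 5, each call to
 * arc_shrink() below tries to release arc_c >> 5, i.e. 1/32nd of the
 * current target size, and lowers arc_p by the same fraction of itself.
 */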
1785
1786void
1787arc_shrink(void)
1788{
1789	if (arc_c > arc_c_min) {
1790		uint64_t to_free;
1791
1792#ifdef _KERNEL
1793		to_free = arc_c >> arc_shrink_shift;
1794#else
1795		to_free = arc_c >> arc_shrink_shift;
1796#endif
1797		if (arc_c > arc_c_min + to_free)
1798			atomic_add_64(&arc_c, -to_free);
1799		else
1800			arc_c = arc_c_min;
1801
1802		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
1803		if (arc_c > arc_size)
1804			arc_c = MAX(arc_size, arc_c_min);
1805		if (arc_p > arc_c)
1806			arc_p = (arc_c >> 1);
1807		ASSERT(arc_c >= arc_c_min);
1808		ASSERT((int64_t)arc_p >= 0);
1809	}
1810
1811	if (arc_size > arc_c)
1812		arc_adjust();
1813}
1814
1815static int needfree = 0;
1816
1817static int
1818arc_reclaim_needed(void)
1819{
1820#if 0
1821	uint64_t extra;
1822#endif
1823
1824#ifdef _KERNEL
1825
1826	/*
1827	 * If pages are needed or we're within 2048 pages
1828	 * of needing to page, we need to reclaim.
1829	 */
1830	if (vm_pages_needed || (vm_paging_target() > -2048))
1831		return (1);
1832
1833	if (needfree)
1834		return (1);
1835
1836#if 0
1837	/*
1838	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
1839	 */
1840	extra = desfree;
1841
1842	/*
1843	 * check that we're out of range of the pageout scanner.  It starts to
1844	 * schedule paging if freemem is less than lotsfree and needfree.
1845	 * lotsfree is the high-water mark for pageout, and needfree is the
1846	 * number of needed free pages.  We add extra pages here to make sure
1847	 * the scanner doesn't start up while we're freeing memory.
1848	 */
1849	if (freemem < lotsfree + needfree + extra)
1850		return (1);
1851
1852	/*
1853	 * check to make sure that swapfs has enough space so that anon
1854	 * reservations can still succeed. anon_resvmem() checks that the
1855	 * availrmem is greater than swapfs_minfree, and the number of reserved
1856	 * swap pages.  We also add a bit of extra here just to prevent
1857	 * circumstances from getting really dire.
1858	 */
1859	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
1860		return (1);
1861
1862#if defined(__i386)
1863	/*
1864	 * If we're on an i386 platform, it's possible that we'll exhaust the
1865	 * kernel heap space before we ever run out of available physical
1866	 * memory.  Most checks of the size of the heap_area compare against
1867	 * tune.t_minarmem, which is the minimum available real memory that we
1868	 * can have in the system.  However, this is generally fixed at 25 pages
1869	 * which is so low that it's useless.  In this comparison, we seek to
1870	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
1871	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
1872	 * free)
1873	 */
1874	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
1875	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
1876		return (1);
1877#endif
1878#else
1879	if (kmem_used() > (kmem_size() * 3) / 4)
1880		return (1);
1881#endif
1882
1883#else
1884	if (spa_get_random(100) == 0)
1885		return (1);
1886#endif
1887	return (0);
1888}
1889
1890static void
1891arc_kmem_reap_now(arc_reclaim_strategy_t strat)
1892{
1893#ifdef ZIO_USE_UMA
1894	size_t			i;
1895	kmem_cache_t		*prev_cache = NULL;
1896	kmem_cache_t		*prev_data_cache = NULL;
1897	extern kmem_cache_t	*zio_buf_cache[];
1898	extern kmem_cache_t	*zio_data_buf_cache[];
1899#endif
1900
1901#ifdef _KERNEL
1902	if (arc_meta_used >= arc_meta_limit) {
1903		/*
1904		 * We are exceeding our meta-data cache limit.
1905		 * Purge some DNLC entries to release holds on meta-data.
1906		 */
1907		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
1908	}
1909#if defined(__i386)
1910	/*
1911	 * Reclaim unused memory from all kmem caches.
1912	 */
1913	kmem_reap();
1914#endif
1915#endif
1916
1917	/*
1918	 * An aggressive reclamation will shrink the cache size as well as
1919	 * reap free buffers from the arc kmem caches.
1920	 */
1921	if (strat == ARC_RECLAIM_AGGR)
1922		arc_shrink();
1923
1924#ifdef ZIO_USE_UMA
1925	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
1926		if (zio_buf_cache[i] != prev_cache) {
1927			prev_cache = zio_buf_cache[i];
1928			kmem_cache_reap_now(zio_buf_cache[i]);
1929		}
1930		if (zio_data_buf_cache[i] != prev_data_cache) {
1931			prev_data_cache = zio_data_buf_cache[i];
1932			kmem_cache_reap_now(zio_data_buf_cache[i]);
1933		}
1934	}
1935#endif
1936	kmem_cache_reap_now(buf_cache);
1937	kmem_cache_reap_now(hdr_cache);
1938}
1939
1940static void
1941arc_reclaim_thread(void *dummy __unused)
1942{
1943	clock_t			growtime = 0;
1944	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
1945	callb_cpr_t		cpr;
1946
1947	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
1948
1949	mutex_enter(&arc_reclaim_thr_lock);
1950	while (arc_thread_exit == 0) {
1951		if (arc_reclaim_needed()) {
1952
1953			if (arc_no_grow) {
1954				if (last_reclaim == ARC_RECLAIM_CONS) {
1955					last_reclaim = ARC_RECLAIM_AGGR;
1956				} else {
1957					last_reclaim = ARC_RECLAIM_CONS;
1958				}
1959			} else {
1960				arc_no_grow = TRUE;
1961				last_reclaim = ARC_RECLAIM_AGGR;
1962				membar_producer();
1963			}
1964
1965			/* reset the growth delay for every reclaim */
1966			growtime = LBOLT + (arc_grow_retry * hz);
1967
1968			if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
1969				/*
1970				 * If needfree is TRUE, our vm_lowmem hook was
1971				 * called; in that case we must free some memory,
1972				 * so switch to aggressive mode.
1973				 */
1974				arc_no_grow = TRUE;
1975				last_reclaim = ARC_RECLAIM_AGGR;
1976			}
1977			arc_kmem_reap_now(last_reclaim);
1978			arc_warm = B_TRUE;
1979
1980		} else if (arc_no_grow && LBOLT >= growtime) {
1981			arc_no_grow = FALSE;
1982		}
1983
1984		if (needfree ||
1985		    (2 * arc_c < arc_size +
1986		    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size))
1987			arc_adjust();
1988
1989		if (arc_eviction_list != NULL)
1990			arc_do_user_evicts();
1991
1992		if (arc_reclaim_needed()) {
1993			needfree = 0;
1994#ifdef _KERNEL
1995			wakeup(&needfree);
1996#endif
1997		}
1998
1999		/* block until needed, or one second, whichever is shorter */
2000		CALLB_CPR_SAFE_BEGIN(&cpr);
2001		(void) cv_timedwait(&arc_reclaim_thr_cv,
2002		    &arc_reclaim_thr_lock, hz);
2003		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2004	}
2005
2006	arc_thread_exit = 0;
2007	cv_broadcast(&arc_reclaim_thr_cv);
2008	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
2009	thread_exit();
2010}
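
/*
 * Summary of the loop above: while reclamation is needed the thread
 * disables cache growth and alternates between conservative and
 * aggressive reclaims on successive passes, jumping straight to
 * aggressive mode when the vm_lowmem hook has set needfree.  Growth is
 * re-enabled only after arc_grow_retry seconds pass without a reclaim.
 */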
2011
2012/*
2013 * Adapt arc info given the number of bytes we are trying to add and
2014 * the state that we are coming from.  This function is only called
2015 * when we are adding new content to the cache.
2016 */
2017static void
2018arc_adapt(int bytes, arc_state_t *state)
2019{
2020	int mult;
2021
2022	if (state == arc_l2c_only)
2023		return;
2024
2025	ASSERT(bytes > 0);
2026	/*
2027	 * Adapt the target size of the MRU list:
2028	 *	- if we just hit in the MRU ghost list, then increase
2029	 *	  the target size of the MRU list.
2030	 *	- if we just hit in the MFU ghost list, then increase
2031	 *	  the target size of the MFU list by decreasing the
2032	 *	  target size of the MRU list.
2033	 */
2034	if (state == arc_mru_ghost) {
2035		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2036		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2037
2038		arc_p = MIN(arc_c, arc_p + bytes * mult);
2039	} else if (state == arc_mfu_ghost) {
2040		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2041		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2042
2043		arc_p = MAX(0, (int64_t)arc_p - bytes * mult);
2044	}
2045	ASSERT((int64_t)arc_p >= 0);
2046
2047	if (arc_reclaim_needed()) {
2048		cv_signal(&arc_reclaim_thr_cv);
2049		return;
2050	}
2051
2052	if (arc_no_grow)
2053		return;
2054
2055	if (arc_c >= arc_c_max)
2056		return;
2057
2058	/*
2059	 * If we're within (2 * maxblocksize) bytes of the target
2060	 * cache size, increment the target cache size
2061	 */
2062	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2063		atomic_add_64(&arc_c, (int64_t)bytes);
2064		if (arc_c > arc_c_max)
2065			arc_c = arc_c_max;
2066		else if (state == arc_anon)
2067			atomic_add_64(&arc_p, (int64_t)bytes);
2068		if (arc_p > arc_c)
2069			arc_p = arc_c;
2070	}
2071	ASSERT((int64_t)arc_p >= 0);
2072}
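
/*
 * Worked example for arc_adapt() above, with hypothetical ghost-list
 * sizes: on a hit in the MRU ghost list while arc_mru_ghost holds 100MB
 * and arc_mfu_ghost holds 300MB, mult is 300 / 100 = 3, so arc_p grows
 * by 3 * bytes (capped at arc_c).  The smaller a ghost list is relative
 * to its counterpart, the larger the adjustment a hit in it produces.
 */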
2073
2074/*
2075 * Check if the cache has reached its limits and eviction is required
2076 * prior to insert.
2077 */
2078static int
2079arc_evict_needed(arc_buf_contents_t type)
2080{
2081	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2082		return (1);
2083
2084#if 0
2085#ifdef _KERNEL
2086	/*
2087	 * If zio data pages are being allocated out of a separate heap segment,
2088	 * then enforce that the size of available vmem for this area remains
2089	 * above about 1/32nd free.
2090	 */
2091	if (type == ARC_BUFC_DATA && zio_arena != NULL &&
2092	    vmem_size(zio_arena, VMEM_FREE) <
2093	    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
2094		return (1);
2095#endif
2096#endif
2097
2098	if (arc_reclaim_needed())
2099		return (1);
2100
2101	return (arc_size > arc_c);
2102}
2103
2104/*
2105 * The buffer, supplied as the first argument, needs a data block.
2106 * So, if we are at cache max, determine which cache should be victimized.
2107 * We have the following cases:
2108 *
2109 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2110 * In this situation if we're out of space, but the resident size of the MFU is
2111 * under the limit, victimize the MFU cache to satisfy this insertion request.
2112 *
2113 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2114 * Here, we've used up all of the available space for the MRU, so we need to
2115 * evict from our own cache instead.  Evict from the set of resident MRU
2116 * entries.
2117 *
2118 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2119 * c minus p represents the MFU space in the cache, since p is the size of the
2120 * cache that is dedicated to the MRU.  In this situation there's still space on
2121 * the MFU side, so the MRU side needs to be victimized.
2122 *
2123 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2124 * MFU's resident set is consuming more space than it has been allotted.  In
2125 * this situation, we must victimize our own cache, the MFU, for this insertion.
2126 */
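/*
 * For illustration (hypothetical numbers): an insert on the MRU side
 * when arc_p is 1GB but arc_anon + arc_mru already hold 1.2GB falls
 * under case 2 above, so the eviction is directed at arc_mru itself.
 * Had arc_p exceeded the MRU footprint (case 1), the MFU list would be
 * victimized instead, provided it holds evictable buffers of the
 * requested type.
 */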
2127static void
2128arc_get_data_buf(arc_buf_t *buf)
2129{
2130	arc_state_t		*state = buf->b_hdr->b_state;
2131	uint64_t		size = buf->b_hdr->b_size;
2132	arc_buf_contents_t	type = buf->b_hdr->b_type;
2133
2134	arc_adapt(size, state);
2135
2136	/*
2137	 * We have not yet reached cache maximum size,
2138	 * just allocate a new buffer.
2139	 */
2140	if (!arc_evict_needed(type)) {
2141		if (type == ARC_BUFC_METADATA) {
2142			buf->b_data = zio_buf_alloc(size);
2143			arc_space_consume(size);
2144		} else {
2145			ASSERT(type == ARC_BUFC_DATA);
2146			buf->b_data = zio_data_buf_alloc(size);
2147			atomic_add_64(&arc_size, size);
2148		}
2149		goto out;
2150	}
2151
2152	/*
2153	 * If we are prefetching from the mfu ghost list, this buffer
2154	 * will end up on the mru list; so steal space from there.
2155	 */
2156	if (state == arc_mfu_ghost)
2157		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2158	else if (state == arc_mru_ghost)
2159		state = arc_mru;
2160
2161	if (state == arc_mru || state == arc_anon) {
2162		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2163		state = (arc_mfu->arcs_lsize[type] > 0 &&
2164		    arc_p > mru_used) ? arc_mfu : arc_mru;
2165	} else {
2166		/* MFU cases */
2167		uint64_t mfu_space = arc_c - arc_p;
2168		state =  (arc_mru->arcs_lsize[type] > 0 &&
2169		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2170	}
2171	if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) {
2172		if (type == ARC_BUFC_METADATA) {
2173			buf->b_data = zio_buf_alloc(size);
2174			arc_space_consume(size);
2175		} else {
2176			ASSERT(type == ARC_BUFC_DATA);
2177			buf->b_data = zio_data_buf_alloc(size);
2178			atomic_add_64(&arc_size, size);
2179		}
2180		ARCSTAT_BUMP(arcstat_recycle_miss);
2181	}
2182	ASSERT(buf->b_data != NULL);
2183out:
2184	/*
2185	 * Update the state size.  Note that ghost states have a
2186	 * "ghost size" and so don't need to be updated.
2187	 */
2188	if (!GHOST_STATE(buf->b_hdr->b_state)) {
2189		arc_buf_hdr_t *hdr = buf->b_hdr;
2190
2191		atomic_add_64(&hdr->b_state->arcs_size, size);
2192		if (list_link_active(&hdr->b_arc_node)) {
2193			ASSERT(refcount_is_zero(&hdr->b_refcnt));
2194			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2195		}
2196		/*
2197		 * If we are growing the cache, and we are adding anonymous
2198		 * data, and we have outgrown arc_p, update arc_p
2199		 */
2200		if (arc_size < arc_c && hdr->b_state == arc_anon &&
2201		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2202			arc_p = MIN(arc_c, arc_p + size);
2203	}
2204}
2205
2206/*
2207 * This routine is called whenever a buffer is accessed.
2208 * NOTE: the hash lock is dropped in this function.
2209 */
2210static void
2211arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2212{
2213	ASSERT(MUTEX_HELD(hash_lock));
2214
2215	if (buf->b_state == arc_anon) {
2216		/*
2217		 * This buffer is not in the cache, and does not
2218		 * appear in our "ghost" list.  Add the new buffer
2219		 * to the MRU state.
2220		 */
2221
2222		ASSERT(buf->b_arc_access == 0);
2223		buf->b_arc_access = LBOLT;
2224		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2225		arc_change_state(arc_mru, buf, hash_lock);
2226
2227	} else if (buf->b_state == arc_mru) {
2228		/*
2229		 * If this buffer is here because of a prefetch, then either:
2230		 * - clear the flag if this is a "referencing" read
2231		 *   (any subsequent access will bump this into the MFU state).
2232		 * or
2233		 * - move the buffer to the head of the list if this is
2234		 *   another prefetch (to make it less likely to be evicted).
2235		 */
2236		if ((buf->b_flags & ARC_PREFETCH) != 0) {
2237			if (refcount_count(&buf->b_refcnt) == 0) {
2238				ASSERT(list_link_active(&buf->b_arc_node));
2239			} else {
2240				buf->b_flags &= ~ARC_PREFETCH;
2241				ARCSTAT_BUMP(arcstat_mru_hits);
2242			}
2243			buf->b_arc_access = LBOLT;
2244			return;
2245		}
2246
2247		/*
2248		 * This buffer has been "accessed" only once so far,
2249		 * but it is still in the cache. Move it to the MFU
2250		 * state.
2251		 */
2252		if (LBOLT > buf->b_arc_access + ARC_MINTIME) {
2253			/*
2254			 * More than 125ms have passed since we
2255			 * instantiated this buffer.  Move it to the
2256			 * most frequently used state.
2257			 */
2258			buf->b_arc_access = LBOLT;
2259			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2260			arc_change_state(arc_mfu, buf, hash_lock);
2261		}
2262		ARCSTAT_BUMP(arcstat_mru_hits);
2263	} else if (buf->b_state == arc_mru_ghost) {
2264		arc_state_t	*new_state;
2265		/*
2266		 * This buffer has been "accessed" recently, but
2267		 * was evicted from the cache.  Move it to the
2268		 * MFU state.
2269		 */
2270
2271		if (buf->b_flags & ARC_PREFETCH) {
2272			new_state = arc_mru;
2273			if (refcount_count(&buf->b_refcnt) > 0)
2274				buf->b_flags &= ~ARC_PREFETCH;
2275			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2276		} else {
2277			new_state = arc_mfu;
2278			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2279		}
2280
2281		buf->b_arc_access = LBOLT;
2282		arc_change_state(new_state, buf, hash_lock);
2283
2284		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2285	} else if (buf->b_state == arc_mfu) {
2286		/*
2287		 * This buffer has been accessed more than once and is
2288		 * still in the cache.  Keep it in the MFU state.
2289		 *
2290		 * NOTE: an add_reference() that occurred when we did
2291		 * the arc_read() will have kicked this off the list.
2292		 * If it was a prefetch, we will explicitly move it to
2293		 * the head of the list now.
2294		 */
2295		if ((buf->b_flags & ARC_PREFETCH) != 0) {
2296			ASSERT(refcount_count(&buf->b_refcnt) == 0);
2297			ASSERT(list_link_active(&buf->b_arc_node));
2298		}
2299		ARCSTAT_BUMP(arcstat_mfu_hits);
2300		buf->b_arc_access = LBOLT;
2301	} else if (buf->b_state == arc_mfu_ghost) {
2302		arc_state_t	*new_state = arc_mfu;
2303		/*
2304		 * This buffer has been accessed more than once but has
2305		 * been evicted from the cache.  Move it back to the
2306		 * MFU state.
2307		 */
2308
2309		if (buf->b_flags & ARC_PREFETCH) {
2310			/*
2311			 * This is a prefetch access...
2312			 * move this block back to the MRU state.
2313			 */
2314			ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
2315			new_state = arc_mru;
2316		}
2317
2318		buf->b_arc_access = LBOLT;
2319		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2320		arc_change_state(new_state, buf, hash_lock);
2321
2322		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2323	} else if (buf->b_state == arc_l2c_only) {
2324		/*
2325		 * This buffer is on the 2nd Level ARC.
2326		 */
2327
2328		buf->b_arc_access = LBOLT;
2329		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2330		arc_change_state(arc_mfu, buf, hash_lock);
2331	} else {
2332		ASSERT(!"invalid arc state");
2333	}
2334}
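
/*
 * Summary of the transitions performed above (derived from the code and
 * listed here for reference only):
 *
 *	anon       -> mru			on first insertion
 *	mru        -> mfu			on a second access after ARC_MINTIME
 *	mru ghost  -> mfu (mru if prefetch)	re-read after eviction
 *	mfu        -> mfu			stays frequently used
 *	mfu ghost  -> mfu (mru if prefetch)	re-read after eviction
 *	l2c only   -> mfu			brought back from the L2ARC
 */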
2335
2336/* a generic arc_done_func_t which you can use */
2337/* ARGSUSED */
2338void
2339arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2340{
2341	bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2342	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2343}
2344
2345/* a generic arc_done_func_t */
2346void
2347arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2348{
2349	arc_buf_t **bufp = arg;
2350	if (zio && zio->io_error) {
2351		VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2352		*bufp = NULL;
2353	} else {
2354		*bufp = buf;
2355	}
2356}
2357
2358static void
2359arc_read_done(zio_t *zio)
2360{
2361	arc_buf_hdr_t	*hdr, *found;
2362	arc_buf_t	*buf;
2363	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
2364	kmutex_t	*hash_lock;
2365	arc_callback_t	*callback_list, *acb;
2366	int		freeable = FALSE;
2367
2368	buf = zio->io_private;
2369	hdr = buf->b_hdr;
2370
2371	/*
2372	 * The hdr was inserted into hash-table and removed from lists
2373	 * The hdr was inserted into the hash table and removed from the lists
2374	 * it's in the hash table, and it should be legit since it's
2375	 * not possible to evict it during the I/O.  The only possible
2376	 * reason for it not to be found is if we were freed during the
2377	 * read.
2378	 */
2379	found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
2380	    &hash_lock);
2381
2382	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2383	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2384	    (found == hdr && HDR_L2_READING(hdr)));
2385
2386	hdr->b_flags &= ~ARC_L2_EVICTED;
2387	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2388		hdr->b_flags &= ~ARC_L2CACHE;
2389
2390	/* byteswap if necessary */
2391	callback_list = hdr->b_acb;
2392	ASSERT(callback_list != NULL);
2393	if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
2394		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2395		    byteswap_uint64_array :
2396		    dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
2397		func(buf->b_data, hdr->b_size);
2398	}
2399
2400	arc_cksum_compute(buf, B_FALSE);
2401
2402	/* create copies of the data buffer for the callers */
2403	abuf = buf;
2404	for (acb = callback_list; acb; acb = acb->acb_next) {
2405		if (acb->acb_done) {
2406			if (abuf == NULL)
2407				abuf = arc_buf_clone(buf);
2408			acb->acb_buf = abuf;
2409			abuf = NULL;
2410		}
2411	}
2412	hdr->b_acb = NULL;
2413	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2414	ASSERT(!HDR_BUF_AVAILABLE(hdr));
2415	if (abuf == buf)
2416		hdr->b_flags |= ARC_BUF_AVAILABLE;
2417
2418	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2419
2420	if (zio->io_error != 0) {
2421		hdr->b_flags |= ARC_IO_ERROR;
2422		if (hdr->b_state != arc_anon)
2423			arc_change_state(arc_anon, hdr, hash_lock);
2424		if (HDR_IN_HASH_TABLE(hdr))
2425			buf_hash_remove(hdr);
2426		freeable = refcount_is_zero(&hdr->b_refcnt);
2427	}
2428
2429	/*
2430	 * Broadcast before we drop the hash_lock to avoid the possibility
2431	 * that the hdr (and hence the cv) might be freed before we get to
2432	 * the cv_broadcast().
2433	 */
2434	cv_broadcast(&hdr->b_cv);
2435
2436	if (hash_lock) {
2437		/*
2438		 * Only call arc_access on anonymous buffers.  This is because
2439		 * if we've issued an I/O for an evicted buffer, we've already
2440		 * called arc_access (to prevent any simultaneous readers from
2441		 * getting confused).
2442		 */
2443		if (zio->io_error == 0 && hdr->b_state == arc_anon)
2444			arc_access(hdr, hash_lock);
2445		mutex_exit(hash_lock);
2446	} else {
2447		/*
2448		 * This block was freed while we waited for the read to
2449		 * complete.  It has been removed from the hash table and
2450		 * moved to the anonymous state (so that it won't show up
2451		 * in the cache).
2452		 */
2453		ASSERT3P(hdr->b_state, ==, arc_anon);
2454		freeable = refcount_is_zero(&hdr->b_refcnt);
2455	}
2456
2457	/* execute each callback and free its structure */
2458	while ((acb = callback_list) != NULL) {
2459		if (acb->acb_done)
2460			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2461
2462		if (acb->acb_zio_dummy != NULL) {
2463			acb->acb_zio_dummy->io_error = zio->io_error;
2464			zio_nowait(acb->acb_zio_dummy);
2465		}
2466
2467		callback_list = acb->acb_next;
2468		kmem_free(acb, sizeof (arc_callback_t));
2469	}
2470
2471	if (freeable)
2472		arc_hdr_destroy(hdr);
2473}
2474
2475/*
2476 * "Read" the block at the specified DVA (in bp) via the
2477 * cache.  If the block is found in the cache, invoke the provided
2478 * callback immediately and return.  Note that the `zio' parameter
2479 * in the callback will be NULL in this case, since no IO was
2480 * required.  If the block is not in the cache pass the read request
2481 * on to the spa with a substitute callback function, so that the
2482 * requested block will be added to the cache.
2483 *
2484 * If a read request arrives for a block that has a read in-progress,
2485 * either wait for the in-progress read to complete (and return the
2486 * results); or, if this is a read with a "done" func, add a record
2487 * to the read to invoke the "done" func when the read completes,
2488 * and return; or just return.
2489 *
2490 * arc_read_done() will invoke all the requested "done" functions
2491 * for readers of this block.
2492 *
2493 * Normal callers should use arc_read and pass the arc buffer and offset
2494 * for the bp.  But if you know you don't need locking, you can use
2495 * arc_read_bp.
2496 * arc_read_nolock().
2497int
2498arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
2499    arc_done_func_t *done, void *private, int priority, int zio_flags,
2500    uint32_t *arc_flags, const zbookmark_t *zb)
2501{
2502	int err;
2503	arc_buf_hdr_t *hdr = pbuf->b_hdr;
2504
2505	ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
2506	ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
2507	rw_enter(&pbuf->b_lock, RW_READER);
2508
2509	err = arc_read_nolock(pio, spa, bp, done, private, priority,
2510	    zio_flags, arc_flags, zb);
2511
2512	ASSERT3P(hdr, ==, pbuf->b_hdr);
2513	rw_exit(&pbuf->b_lock);
2514	return (err);
2515}
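
/*
 * Illustrative sketch (not compiled) of a synchronous cached read using
 * the interface above.  The spa, bp, pbuf and zb arguments are assumed
 * to come from a hypothetical caller; only the call shape, the ARC_WAIT
 * flag and the arc_getbuf_func/arc_buf_remove_ref pairing are meant to
 * mirror the code in this file.
 */
#if 0
static int
example_arc_read_sync(spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
    const zbookmark_t *zb)
{
	arc_buf_t *abuf = NULL;
	uint32_t aflags = ARC_WAIT;
	int err;

	/* Block until the block is cached or the read fails. */
	err = arc_read(NULL, spa, bp, pbuf, arc_getbuf_func, &abuf,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
	if (err == 0 && abuf != NULL) {
		/* ... consume abuf->b_data here ... */
		(void) arc_buf_remove_ref(abuf, &abuf);
	}
	return (err);
}
#endif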
2516
2517int
2518arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
2519    arc_done_func_t *done, void *private, int priority, int zio_flags,
2520    uint32_t *arc_flags, const zbookmark_t *zb)
2521{
2522	arc_buf_hdr_t *hdr;
2523	arc_buf_t *buf;
2524	kmutex_t *hash_lock;
2525	zio_t *rzio;
2526
2527top:
2528	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
2529	if (hdr && hdr->b_datacnt > 0) {
2530
2531		*arc_flags |= ARC_CACHED;
2532
2533		if (HDR_IO_IN_PROGRESS(hdr)) {
2534
2535			if (*arc_flags & ARC_WAIT) {
2536				cv_wait(&hdr->b_cv, hash_lock);
2537				mutex_exit(hash_lock);
2538				goto top;
2539			}
2540			ASSERT(*arc_flags & ARC_NOWAIT);
2541
2542			if (done) {
2543				arc_callback_t	*acb = NULL;
2544
2545				acb = kmem_zalloc(sizeof (arc_callback_t),
2546				    KM_SLEEP);
2547				acb->acb_done = done;
2548				acb->acb_private = private;
2549				if (pio != NULL)
2550					acb->acb_zio_dummy = zio_null(pio,
2551					    spa, NULL, NULL, zio_flags);
2552
2553				ASSERT(acb->acb_done != NULL);
2554				acb->acb_next = hdr->b_acb;
2555				hdr->b_acb = acb;
2556				add_reference(hdr, hash_lock, private);
2557				mutex_exit(hash_lock);
2558				return (0);
2559			}
2560			mutex_exit(hash_lock);
2561			return (0);
2562		}
2563
2564		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2565
2566		if (done) {
2567			add_reference(hdr, hash_lock, private);
2568			/*
2569			 * If this block is already in use, create a new
2570			 * copy of the data so that we will be guaranteed
2571			 * that arc_release() will always succeed.
2572			 */
2573			buf = hdr->b_buf;
2574			ASSERT(buf);
2575			ASSERT(buf->b_data);
2576			if (HDR_BUF_AVAILABLE(hdr)) {
2577				ASSERT(buf->b_efunc == NULL);
2578				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2579			} else {
2580				buf = arc_buf_clone(buf);
2581			}
2582		} else if (*arc_flags & ARC_PREFETCH &&
2583		    refcount_count(&hdr->b_refcnt) == 0) {
2584			hdr->b_flags |= ARC_PREFETCH;
2585		}
2586		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2587		arc_access(hdr, hash_lock);
2588		if (*arc_flags & ARC_L2CACHE)
2589			hdr->b_flags |= ARC_L2CACHE;
2590		mutex_exit(hash_lock);
2591		ARCSTAT_BUMP(arcstat_hits);
2592		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2593		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2594		    data, metadata, hits);
2595
2596		if (done)
2597			done(NULL, buf, private);
2598	} else {
2599		uint64_t size = BP_GET_LSIZE(bp);
2600		arc_callback_t	*acb;
2601		vdev_t *vd = NULL;
2602		daddr_t addr;
2603
2604		if (hdr == NULL) {
2605			/* this block is not in the cache */
2606			arc_buf_hdr_t	*exists;
2607			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2608			buf = arc_buf_alloc(spa, size, private, type);
2609			hdr = buf->b_hdr;
2610			hdr->b_dva = *BP_IDENTITY(bp);
2611			hdr->b_birth = bp->blk_birth;
2612			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2613			exists = buf_hash_insert(hdr, &hash_lock);
2614			if (exists) {
2615				/* somebody beat us to the hash insert */
2616				mutex_exit(hash_lock);
2617				bzero(&hdr->b_dva, sizeof (dva_t));
2618				hdr->b_birth = 0;
2619				hdr->b_cksum0 = 0;
2620				(void) arc_buf_remove_ref(buf, private);
2621				goto top; /* restart the IO request */
2622			}
2623			/* if this is a prefetch, we don't have a reference */
2624			if (*arc_flags & ARC_PREFETCH) {
2625				(void) remove_reference(hdr, hash_lock,
2626				    private);
2627				hdr->b_flags |= ARC_PREFETCH;
2628			}
2629			if (*arc_flags & ARC_L2CACHE)
2630				hdr->b_flags |= ARC_L2CACHE;
2631			if (BP_GET_LEVEL(bp) > 0)
2632				hdr->b_flags |= ARC_INDIRECT;
2633		} else {
2634			/* this block is in the ghost cache */
2635			ASSERT(GHOST_STATE(hdr->b_state));
2636			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2637			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
2638			ASSERT(hdr->b_buf == NULL);
2639
2640			/* if this is a prefetch, we don't have a reference */
2641			if (*arc_flags & ARC_PREFETCH)
2642				hdr->b_flags |= ARC_PREFETCH;
2643			else
2644				add_reference(hdr, hash_lock, private);
2645			if (*arc_flags & ARC_L2CACHE)
2646				hdr->b_flags |= ARC_L2CACHE;
2647			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2648			buf->b_hdr = hdr;
2649			buf->b_data = NULL;
2650			buf->b_efunc = NULL;
2651			buf->b_private = NULL;
2652			buf->b_next = NULL;
2653			hdr->b_buf = buf;
2654			arc_get_data_buf(buf);
2655			ASSERT(hdr->b_datacnt == 0);
2656			hdr->b_datacnt = 1;
2657
2658		}
2659
2660		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
2661		acb->acb_done = done;
2662		acb->acb_private = private;
2663
2664		ASSERT(hdr->b_acb == NULL);
2665		hdr->b_acb = acb;
2666		hdr->b_flags |= ARC_IO_IN_PROGRESS;
2667
2668		/*
2669		 * If the buffer has been evicted, migrate it to a present state
2670		 * before issuing the I/O.  Once we drop the hash-table lock,
2671		 * the header will be marked as I/O in progress and have an
2672		 * attached buffer.  At this point, anybody who finds this
2673		 * buffer ought to notice that it's legit but has a pending I/O.
2674		 */
2675
2676		if (GHOST_STATE(hdr->b_state))
2677			arc_access(hdr, hash_lock);
2678
2679		if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
2680		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
2681			addr = hdr->b_l2hdr->b_daddr;
2682			/*
2683			 * Lock out device removal.
2684			 */
2685			if (vdev_is_dead(vd) ||
2686			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
2687				vd = NULL;
2688		}
2689
2690		mutex_exit(hash_lock);
2691
2692		ASSERT3U(hdr->b_size, ==, size);
2693		DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
2694		    zbookmark_t *, zb);
2695		ARCSTAT_BUMP(arcstat_misses);
2696		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2697		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2698		    data, metadata, misses);
2699
2700		if (vd != NULL) {
2701			/*
2702			 * Read from the L2ARC if the following are true:
2703			 * 1. The L2ARC vdev was previously cached.
2704			 * 2. This buffer still has L2ARC metadata.
2705			 * 3. This buffer isn't currently writing to the L2ARC.
2706			 * 4. The L2ARC entry wasn't evicted, which may
2707			 *    also have invalidated the vdev.
2708			 */
2709			if (hdr->b_l2hdr != NULL &&
2710			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) {
2711				l2arc_read_callback_t *cb;
2712
2713				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
2714				ARCSTAT_BUMP(arcstat_l2_hits);
2715
2716				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
2717				    KM_SLEEP);
2718				cb->l2rcb_buf = buf;
2719				cb->l2rcb_spa = spa;
2720				cb->l2rcb_bp = *bp;
2721				cb->l2rcb_zb = *zb;
2722				cb->l2rcb_flags = zio_flags;
2723
2724				/*
2725				 * l2arc read.  The SCL_L2ARC lock will be
2726				 * released by l2arc_read_done().
2727				 */
2728				rzio = zio_read_phys(pio, vd, addr, size,
2729				    buf->b_data, ZIO_CHECKSUM_OFF,
2730				    l2arc_read_done, cb, priority, zio_flags |
2731				    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
2732				    ZIO_FLAG_DONT_PROPAGATE |
2733				    ZIO_FLAG_DONT_RETRY, B_FALSE);
2734				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
2735				    zio_t *, rzio);
2736
2737				if (*arc_flags & ARC_NOWAIT) {
2738					zio_nowait(rzio);
2739					return (0);
2740				}
2741
2742				ASSERT(*arc_flags & ARC_WAIT);
2743				if (zio_wait(rzio) == 0)
2744					return (0);
2745
2746				/* l2arc read error; goto zio_read() */
2747			} else {
2748				DTRACE_PROBE1(l2arc__miss,
2749				    arc_buf_hdr_t *, hdr);
2750				ARCSTAT_BUMP(arcstat_l2_misses);
2751				if (HDR_L2_WRITING(hdr))
2752					ARCSTAT_BUMP(arcstat_l2_rw_clash);
2753				spa_config_exit(spa, SCL_L2ARC, vd);
2754			}
2755		}
2756
2757		rzio = zio_read(pio, spa, bp, buf->b_data, size,
2758		    arc_read_done, buf, priority, zio_flags, zb);
2759
2760		if (*arc_flags & ARC_WAIT)
2761			return (zio_wait(rzio));
2762
2763		ASSERT(*arc_flags & ARC_NOWAIT);
2764		zio_nowait(rzio);
2765	}
2766	return (0);
2767}
2768
2769/*
2770 * arc_read() variant to support pool traversal.  If the block is already
2771 * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
2772 * The idea is that we don't want pool traversal filling up memory, but
2773 * if the ARC already has the data anyway, we shouldn't pay for the I/O.
2774 */
2775int
2776arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
2777{
2778	arc_buf_hdr_t *hdr;
2779	kmutex_t *hash_mtx;
2780	int rc = 0;
2781
2782	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
2783
2784	if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
2785		arc_buf_t *buf = hdr->b_buf;
2786
2787		ASSERT(buf);
2788		while (buf->b_data == NULL) {
2789			buf = buf->b_next;
2790			ASSERT(buf);
2791		}
2792		bcopy(buf->b_data, data, hdr->b_size);
2793	} else {
2794		rc = ENOENT;
2795	}
2796
2797	if (hash_mtx)
2798		mutex_exit(hash_mtx);
2799
2800	return (rc);
2801}
2802
2803void
2804arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
2805{
2806	ASSERT(buf->b_hdr != NULL);
2807	ASSERT(buf->b_hdr->b_state != arc_anon);
2808	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
2809	buf->b_efunc = func;
2810	buf->b_private = private;
2811}
2812
2813/*
2814 * This is used by the DMU to let the ARC know that a buffer is
2815 * being evicted, so the ARC should clean up.  If this arc buf
2816 * is not yet in the evicted state, it will be put there.
2817 */
2818int
2819arc_buf_evict(arc_buf_t *buf)
2820{
2821	arc_buf_hdr_t *hdr;
2822	kmutex_t *hash_lock;
2823	arc_buf_t **bufp;
2824
2825	rw_enter(&buf->b_lock, RW_WRITER);
2826	hdr = buf->b_hdr;
2827	if (hdr == NULL) {
2828		/*
2829		 * We are in arc_do_user_evicts().
2830		 */
2831		ASSERT(buf->b_data == NULL);
2832		rw_exit(&buf->b_lock);
2833		return (0);
2834	} else if (buf->b_data == NULL) {
2835		arc_buf_t copy = *buf; /* structure assignment */
2836		/*
2837		 * We are on the eviction list; process this buffer now
2838		 * but let arc_do_user_evicts() do the reaping.
2839		 */
2840		buf->b_efunc = NULL;
2841		rw_exit(&buf->b_lock);
2842		VERIFY(copy.b_efunc(&copy) == 0);
2843		return (1);
2844	}
2845	hash_lock = HDR_LOCK(hdr);
2846	mutex_enter(hash_lock);
2847
2848	ASSERT(buf->b_hdr == hdr);
2849	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
2850	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2851
2852	/*
2853	 * Pull this buffer off of the hdr
2854	 */
2855	bufp = &hdr->b_buf;
2856	while (*bufp != buf)
2857		bufp = &(*bufp)->b_next;
2858	*bufp = buf->b_next;
2859
2860	ASSERT(buf->b_data != NULL);
2861	arc_buf_destroy(buf, FALSE, FALSE);
2862
2863	if (hdr->b_datacnt == 0) {
2864		arc_state_t *old_state = hdr->b_state;
2865		arc_state_t *evicted_state;
2866
2867		ASSERT(refcount_is_zero(&hdr->b_refcnt));
2868
2869		evicted_state =
2870		    (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2871
2872		mutex_enter(&old_state->arcs_mtx);
2873		mutex_enter(&evicted_state->arcs_mtx);
2874
2875		arc_change_state(evicted_state, hdr, hash_lock);
2876		ASSERT(HDR_IN_HASH_TABLE(hdr));
2877		hdr->b_flags |= ARC_IN_HASH_TABLE;
2878		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2879
2880		mutex_exit(&evicted_state->arcs_mtx);
2881		mutex_exit(&old_state->arcs_mtx);
2882	}
2883	mutex_exit(hash_lock);
2884	rw_exit(&buf->b_lock);
2885
2886	VERIFY(buf->b_efunc(buf) == 0);
2887	buf->b_efunc = NULL;
2888	buf->b_private = NULL;
2889	buf->b_hdr = NULL;
2890	kmem_cache_free(buf_cache, buf);
2891	return (1);
2892}
2893
2894/*
2895 * Release this buffer from the cache.  This must be done
2896 * after a read and prior to modifying the buffer contents.
2897 * If the buffer has more than one reference, we must make
2898 * a new hdr for the buffer.
2899 */
2900void
2901arc_release(arc_buf_t *buf, void *tag)
2902{
2903	arc_buf_hdr_t *hdr;
2904	kmutex_t *hash_lock;
2905	l2arc_buf_hdr_t *l2hdr;
2906	uint64_t buf_size;
2907
2908	rw_enter(&buf->b_lock, RW_WRITER);
2909	hdr = buf->b_hdr;
2910
2911	/* this buffer is not on any list */
2912	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
2913	ASSERT(!(hdr->b_flags & ARC_STORED));
2914
2915	if (hdr->b_state == arc_anon) {
2916		/* this buffer is already released */
2917		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
2918		ASSERT(BUF_EMPTY(hdr));
2919		ASSERT(buf->b_efunc == NULL);
2920		arc_buf_thaw(buf);
2921		rw_exit(&buf->b_lock);
2922		return;
2923	}
2924
2925	hash_lock = HDR_LOCK(hdr);
2926	mutex_enter(hash_lock);
2927
2928	l2hdr = hdr->b_l2hdr;
2929	if (l2hdr) {
2930		mutex_enter(&l2arc_buflist_mtx);
2931		hdr->b_l2hdr = NULL;
2932		buf_size = hdr->b_size;
2933	}
2934
2935	/*
2936	 * Do we have more than one buf?
2937	 */
2938	if (hdr->b_datacnt > 1) {
2939		arc_buf_hdr_t *nhdr;
2940		arc_buf_t **bufp;
2941		uint64_t blksz = hdr->b_size;
2942		spa_t *spa = hdr->b_spa;
2943		arc_buf_contents_t type = hdr->b_type;
2944		uint32_t flags = hdr->b_flags;
2945
2946		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
2947		/*
2948		 * Pull the data off of this buf and attach it to
2949		 * a new anonymous buf.
2950		 */
2951		(void) remove_reference(hdr, hash_lock, tag);
2952		bufp = &hdr->b_buf;
2953		while (*bufp != buf)
2954			bufp = &(*bufp)->b_next;
2955		*bufp = (*bufp)->b_next;
2956		buf->b_next = NULL;
2957
2958		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
2959		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
2960		if (refcount_is_zero(&hdr->b_refcnt)) {
2961			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
2962			ASSERT3U(*size, >=, hdr->b_size);
2963			atomic_add_64(size, -hdr->b_size);
2964		}
2965		hdr->b_datacnt -= 1;
2966		arc_cksum_verify(buf);
2967
2968		mutex_exit(hash_lock);
2969
2970		nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
2971		nhdr->b_size = blksz;
2972		nhdr->b_spa = spa;
2973		nhdr->b_type = type;
2974		nhdr->b_buf = buf;
2975		nhdr->b_state = arc_anon;
2976		nhdr->b_arc_access = 0;
2977		nhdr->b_flags = flags & ARC_L2_WRITING;
2978		nhdr->b_l2hdr = NULL;
2979		nhdr->b_datacnt = 1;
2980		nhdr->b_freeze_cksum = NULL;
2981		(void) refcount_add(&nhdr->b_refcnt, tag);
2982		buf->b_hdr = nhdr;
2983		rw_exit(&buf->b_lock);
2984		atomic_add_64(&arc_anon->arcs_size, blksz);
2985	} else {
2986		rw_exit(&buf->b_lock);
2987		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
2988		ASSERT(!list_link_active(&hdr->b_arc_node));
2989		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2990		arc_change_state(arc_anon, hdr, hash_lock);
2991		hdr->b_arc_access = 0;
2992		mutex_exit(hash_lock);
2993
2994		bzero(&hdr->b_dva, sizeof (dva_t));
2995		hdr->b_birth = 0;
2996		hdr->b_cksum0 = 0;
2997		arc_buf_thaw(buf);
2998	}
2999	buf->b_efunc = NULL;
3000	buf->b_private = NULL;
3001
3002	if (l2hdr) {
3003		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3004		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3005		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3006		mutex_exit(&l2arc_buflist_mtx);
3007	}
3008}
3009
3010int
3011arc_released(arc_buf_t *buf)
3012{
3013	int released;
3014
3015	rw_enter(&buf->b_lock, RW_READER);
3016	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3017	rw_exit(&buf->b_lock);
3018	return (released);
3019}
3020
3021int
3022arc_has_callback(arc_buf_t *buf)
3023{
3024	int callback;
3025
3026	rw_enter(&buf->b_lock, RW_READER);
3027	callback = (buf->b_efunc != NULL);
3028	rw_exit(&buf->b_lock);
3029	return (callback);
3030}
3031
3032#ifdef ZFS_DEBUG
3033int
3034arc_referenced(arc_buf_t *buf)
3035{
3036	int referenced;
3037
3038	rw_enter(&buf->b_lock, RW_READER);
3039	referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3040	rw_exit(&buf->b_lock);
3041	return (referenced);
3042}
3043#endif
3044
3045static void
3046arc_write_ready(zio_t *zio)
3047{
3048	arc_write_callback_t *callback = zio->io_private;
3049	arc_buf_t *buf = callback->awcb_buf;
3050	arc_buf_hdr_t *hdr = buf->b_hdr;
3051
3052	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3053	callback->awcb_ready(zio, buf, callback->awcb_private);
3054
3055	/*
3056	 * If the IO is already in progress, then this is a re-write
3057	 * attempt, so we need to thaw and re-compute the cksum.
3058	 * It is the responsibility of the callback to handle the
3059	 * accounting for any re-write attempt.
3060	 */
3061	if (HDR_IO_IN_PROGRESS(hdr)) {
3062		mutex_enter(&hdr->b_freeze_lock);
3063		if (hdr->b_freeze_cksum != NULL) {
3064			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3065			hdr->b_freeze_cksum = NULL;
3066		}
3067		mutex_exit(&hdr->b_freeze_lock);
3068	}
3069	arc_cksum_compute(buf, B_FALSE);
3070	hdr->b_flags |= ARC_IO_IN_PROGRESS;
3071}
3072
3073static void
3074arc_write_done(zio_t *zio)
3075{
3076	arc_write_callback_t *callback = zio->io_private;
3077	arc_buf_t *buf = callback->awcb_buf;
3078	arc_buf_hdr_t *hdr = buf->b_hdr;
3079
3080	hdr->b_acb = NULL;
3081
3082	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3083	hdr->b_birth = zio->io_bp->blk_birth;
3084	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3085	/*
3086	 * If the block to be written was all-zero, we may have
3087	 * compressed it away.  In this case no write was performed
3088	 * so there will be no dva/birth-date/checksum.  The buffer
3089	 * must therefore remain anonymous (and uncached).
3090	 */
3091	if (!BUF_EMPTY(hdr)) {
3092		arc_buf_hdr_t *exists;
3093		kmutex_t *hash_lock;
3094
3095		arc_cksum_verify(buf);
3096
3097		exists = buf_hash_insert(hdr, &hash_lock);
3098		if (exists) {
3099			/*
3100			 * This can only happen if we overwrite for
3101			 * sync-to-convergence, because we remove
3102			 * buffers from the hash table when we arc_free().
3103			 */
3104			ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE);
3105			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
3106			    BP_IDENTITY(zio->io_bp)));
3107			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
3108			    zio->io_bp->blk_birth);
3109
3110			ASSERT(refcount_is_zero(&exists->b_refcnt));
3111			arc_change_state(arc_anon, exists, hash_lock);
3112			mutex_exit(hash_lock);
3113			arc_hdr_destroy(exists);
3114			exists = buf_hash_insert(hdr, &hash_lock);
3115			ASSERT3P(exists, ==, NULL);
3116		}
3117		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3118		/* if it's not anon, we are doing a scrub */
3119		if (hdr->b_state == arc_anon)
3120			arc_access(hdr, hash_lock);
3121		mutex_exit(hash_lock);
3122	} else if (callback->awcb_done == NULL) {
3123		int destroy_hdr;
3124		/*
3125		 * This is an anonymous buffer with no user callback,
3126		 * destroy it if there are no active references.
3127		 */
3128		mutex_enter(&arc_eviction_mtx);
3129		destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
3130		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3131		mutex_exit(&arc_eviction_mtx);
3132		if (destroy_hdr)
3133			arc_hdr_destroy(hdr);
3134	} else {
3135		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3136	}
3137	hdr->b_flags &= ~ARC_STORED;
3138
3139	if (callback->awcb_done) {
3140		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3141		callback->awcb_done(zio, buf, callback->awcb_private);
3142	}
3143
3144	kmem_free(callback, sizeof (arc_write_callback_t));
3145}
3146
3147static void
3148write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp)
3149{
3150	boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata);
3151
3152	/* Determine checksum setting */
3153	if (ismd) {
3154		/*
3155		 * Metadata always gets checksummed.  If the data
3156		 * checksum is multi-bit correctable, and it's not a
3157		 * ZBT-style checksum, then it's suitable for metadata
3158		 * as well.  Otherwise, the metadata checksum defaults
3159		 * to fletcher4.
3160		 */
3161		if (zio_checksum_table[wp->wp_oschecksum].ci_correctable &&
3162		    !zio_checksum_table[wp->wp_oschecksum].ci_zbt)
3163			zp->zp_checksum = wp->wp_oschecksum;
3164		else
3165			zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4;
3166	} else {
3167		zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum,
3168		    wp->wp_oschecksum);
3169	}
3170
3171	/* Determine compression setting */
3172	if (ismd) {
3173		/*
3174		 * XXX -- we should design a compression algorithm
3175		 * that specializes in arrays of bps.
3176		 */
3177		zp->zp_compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
3178		    ZIO_COMPRESS_LZJB;
3179	} else {
3180		zp->zp_compress = zio_compress_select(wp->wp_dncompress,
3181		    wp->wp_oscompress);
3182	}
3183
3184	zp->zp_type = wp->wp_type;
3185	zp->zp_level = wp->wp_level;
3186	zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa));
3187}
3188
3189zio_t *
3190arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
3191    boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
3192    arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
3193    int zio_flags, const zbookmark_t *zb)
3194{
3195	arc_buf_hdr_t *hdr = buf->b_hdr;
3196	arc_write_callback_t *callback;
3197	zio_t *zio;
3198	zio_prop_t zp;
3199
3200	ASSERT(ready != NULL);
3201	ASSERT(!HDR_IO_ERROR(hdr));
3202	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3203	ASSERT(hdr->b_acb == 0);
3204	if (l2arc)
3205		hdr->b_flags |= ARC_L2CACHE;
3206	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3207	callback->awcb_ready = ready;
3208	callback->awcb_done = done;
3209	callback->awcb_private = private;
3210	callback->awcb_buf = buf;
3211
3212	write_policy(spa, wp, &zp);
3213	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp,
3214	    arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3215
3216	return (zio);
3217}
3218
3219int
3220arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
3221    zio_done_func_t *done, void *private, uint32_t arc_flags)
3222{
3223	arc_buf_hdr_t *ab;
3224	kmutex_t *hash_lock;
3225	zio_t	*zio;
3226
3227	/*
3228	 * If this buffer is in the cache, release it, so it
3229	 * can be re-used.
3230	 */
3231	ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
3232	if (ab != NULL) {
3233		/*
3234		 * The checksum of blocks to free is not always
3235	 * preserved (e.g. on the deadlist).  However, if it is
3236		 * nonzero, it should match what we have in the cache.
3237		 */
3238		ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
3239		    bp->blk_cksum.zc_word[0] == ab->b_cksum0 ||
3240		    bp->blk_fill == BLK_FILL_ALREADY_FREED);
3241
3242		if (ab->b_state != arc_anon)
3243			arc_change_state(arc_anon, ab, hash_lock);
3244		if (HDR_IO_IN_PROGRESS(ab)) {
3245			/*
3246			 * This should only happen when we prefetch.
3247			 */
3248			ASSERT(ab->b_flags & ARC_PREFETCH);
3249			ASSERT3U(ab->b_datacnt, ==, 1);
3250			ab->b_flags |= ARC_FREED_IN_READ;
3251			if (HDR_IN_HASH_TABLE(ab))
3252				buf_hash_remove(ab);
3253			ab->b_arc_access = 0;
3254			bzero(&ab->b_dva, sizeof (dva_t));
3255			ab->b_birth = 0;
3256			ab->b_cksum0 = 0;
3257			ab->b_buf->b_efunc = NULL;
3258			ab->b_buf->b_private = NULL;
3259			mutex_exit(hash_lock);
3260		} else if (refcount_is_zero(&ab->b_refcnt)) {
3261			ab->b_flags |= ARC_FREE_IN_PROGRESS;
3262			mutex_exit(hash_lock);
3263			arc_hdr_destroy(ab);
3264			ARCSTAT_BUMP(arcstat_deleted);
3265		} else {
3266			/*
3267			 * We still have an active reference on this
3268			 * buffer.  This can happen, e.g., from
3269			 * dbuf_unoverride().
3270			 */
3271			ASSERT(!HDR_IN_HASH_TABLE(ab));
3272			ab->b_arc_access = 0;
3273			bzero(&ab->b_dva, sizeof (dva_t));
3274			ab->b_birth = 0;
3275			ab->b_cksum0 = 0;
3276			ab->b_buf->b_efunc = NULL;
3277			ab->b_buf->b_private = NULL;
3278			mutex_exit(hash_lock);
3279		}
3280	}
3281
3282	zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED);
3283
3284	if (arc_flags & ARC_WAIT)
3285		return (zio_wait(zio));
3286
3287	ASSERT(arc_flags & ARC_NOWAIT);
3288	zio_nowait(zio);
3289
3290	return (0);
3291}
3292
3293static int
3294arc_memory_throttle(uint64_t reserve, uint64_t txg)
3295{
3296#ifdef _KERNEL
3297	uint64_t inflight_data = arc_anon->arcs_size;
3298	uint64_t available_memory = ptoa((uintmax_t)cnt.v_free_count);
3299	static uint64_t page_load = 0;
3300	static uint64_t last_txg = 0;
3301
3302#if 0
3303#if defined(__i386)
3304	available_memory =
3305	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3306#endif
3307#endif
3308	if (available_memory >= zfs_write_limit_max)
3309		return (0);
3310
3311	if (txg > last_txg) {
3312		last_txg = txg;
3313		page_load = 0;
3314	}
3315	/*
3316	 * If we are in pageout, we know that memory is already tight,
3317	 * and the ARC is already evicting, so we just want to
3318	 * continue to let page writes occur as quickly as possible.
3319	 */
3320	if (curproc == pageproc) {
3321		if (page_load > available_memory / 4)
3322			return (ERESTART);
3323		/* Note: reserve is inflated, so we deflate */
3324		page_load += reserve / 8;
3325		return (0);
3326	} else if (page_load > 0 && arc_reclaim_needed()) {
3327		/* memory is low, delay before restarting */
3328		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3329		return (EAGAIN);
3330	}
3331	page_load = 0;
3332
3333	if (arc_size > arc_c_min) {
3334		uint64_t evictable_memory =
3335		    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3336		    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3337		    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3338		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3339		available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3340	}
3341
3342	if (inflight_data > available_memory / 4) {
3343		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3344		return (ERESTART);
3345	}
3346#endif
3347	return (0);
3348}
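
/*
 * For example (hypothetical figures): if only 400MB of memory is free,
 * that is below zfs_write_limit_max, and the cache holds no sizable
 * evictable surplus, then once anonymous (in-flight) dirty data exceeds
 * 100MB (a quarter of the available memory) new reservations are pushed
 * back with ERESTART until some of that data has been written out.
 */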
3349
3350void
3351arc_tempreserve_clear(uint64_t reserve)
3352{
3353	atomic_add_64(&arc_tempreserve, -reserve);
3354	ASSERT((int64_t)arc_tempreserve >= 0);
3355}
3356
3357int
3358arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3359{
3360	int error;
3361
3362#ifdef ZFS_DEBUG
3363	/*
3364	 * Once in a while, fail for no reason.  Everything should cope.
3365	 */
3366	if (spa_get_random(10000) == 0) {
3367		dprintf("forcing random failure\n");
3368		return (ERESTART);
3369	}
3370#endif
3371	if (reserve > arc_c/4 && !arc_no_grow)
3372		arc_c = MIN(arc_c_max, reserve * 4);
3373	if (reserve > arc_c)
3374		return (ENOMEM);
3375
3376	/*
3377	 * Writes will, almost always, require additional memory allocations
3378	 * in order to compress/encrypt/etc. the data.  We therefore need to
3379	 * make sure that there is sufficient available memory for this.
3380	 */
3381	if (error = arc_memory_throttle(reserve, txg))
3382		return (error);
3383
3384	/*
3385	 * Throttle writes when the amount of dirty data in the cache
3386	 * gets too large.  We try to keep the cache less than half full
3387	 * of dirty blocks so that our sync times don't grow too large.
3388	 * Note: if two requests come in concurrently, we might let them
3389	 * both succeed, when one of them should fail.  Not a huge deal.
3390	 */
3391	if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
3392	    arc_anon->arcs_size > arc_c / 4) {
3393		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3394		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3395		    arc_tempreserve>>10,
3396		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3397		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3398		    reserve>>10, arc_c>>10);
3399		return (ERESTART);
3400	}
3401	atomic_add_64(&arc_tempreserve, reserve);
3402	return (0);
3403}
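
/*
 * For example (hypothetical sizes): with arc_c at 4GB, the check above
 * refuses a new reservation with ERESTART once the reservation plus
 * arc_tempreserve plus anonymous data would exceed 2GB while anonymous
 * data alone already exceeds 1GB, keeping dirty data under roughly half
 * of the cache.
 */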
3404
3405static kmutex_t arc_lowmem_lock;
3406#ifdef _KERNEL
3407static eventhandler_tag arc_event_lowmem = NULL;
3408
3409static void
3410arc_lowmem(void *arg __unused, int howto __unused)
3411{
3412
3413	/* Serialize access via arc_lowmem_lock. */
3414	mutex_enter(&arc_lowmem_lock);
3415	needfree = 1;
3416	cv_signal(&arc_reclaim_thr_cv);
3417	while (needfree)
3418		tsleep(&needfree, 0, "zfs:lowmem", hz / 5);
3419	mutex_exit(&arc_lowmem_lock);
3420}
3421#endif
3422
3423void
3424arc_init(void)
3425{
3426	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3427	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3428	mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
3429
3430	/* Convert seconds to clock ticks */
3431	arc_min_prefetch_lifespan = 1 * hz;
3432
3433	/* Start out with 1/8 of all memory */
3434	arc_c = kmem_size() / 8;
3435#if 0
3436#ifdef _KERNEL
3437	/*
3438	 * On architectures where the physical memory can be larger
3439	 * than the addressable space (intel in 32-bit mode), we may
3440	 * need to limit the cache to 1/8 of VM size.
3441	 */
3442	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3443#endif
3444#endif
3445	/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
3446	arc_c_min = MAX(arc_c / 4, 64<<18);
3447	/* set max to 5/8 of all memory, or all but 1GB, whichever is more */
3448	if (arc_c * 8 >= 1<<30)
3449		arc_c_max = (arc_c * 8) - (1<<30);
3450	else
3451		arc_c_max = arc_c_min;
3452	arc_c_max = MAX(arc_c * 5, arc_c_max);
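	/*
	 * Worked example of the defaults above, assuming a hypothetical
	 * kmem_size() of 8GB: arc_c starts at 1GB, arc_c_min becomes
	 * MAX(256MB, 16MB) = 256MB, and arc_c_max becomes
	 * MAX(5GB, 8GB - 1GB) = 7GB, before any tunable overrides below.
	 */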
3453#ifdef _KERNEL
3454	/*
3455	 * Allow the tunables to override our calculations if they are
3456	 * reasonable (i.e. over 16MB)
3457	 */
3458	if (zfs_arc_max >= 64<<18 && zfs_arc_max < kmem_size())
3459		arc_c_max = zfs_arc_max;
3460	if (zfs_arc_min >= 64<<18 && zfs_arc_min <= arc_c_max)
3461		arc_c_min = zfs_arc_min;
3462#endif
3463	arc_c = arc_c_max;
3464	arc_p = (arc_c >> 1);
3465
3466	/* limit meta-data to 1/4 of the arc capacity */
3467	arc_meta_limit = arc_c_max / 4;
3468
3469	/* Allow the tunable to override if it is reasonable */
3470	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3471		arc_meta_limit = zfs_arc_meta_limit;
3472
3473	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3474		arc_c_min = arc_meta_limit / 2;
3475
3476	/* if kmem_flags are set, lets try to use less memory */
3477	if (kmem_debugging())
3478		arc_c = arc_c / 2;
3479	if (arc_c < arc_c_min)
3480		arc_c = arc_c_min;
3481
3482	zfs_arc_min = arc_c_min;
3483	zfs_arc_max = arc_c_max;
3484
3485	arc_anon = &ARC_anon;
3486	arc_mru = &ARC_mru;
3487	arc_mru_ghost = &ARC_mru_ghost;
3488	arc_mfu = &ARC_mfu;
3489	arc_mfu_ghost = &ARC_mfu_ghost;
3490	arc_l2c_only = &ARC_l2c_only;
3491	arc_size = 0;
3492
3493	mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3494	mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3495	mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3496	mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3497	mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3498	mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
3499
3500	list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
3501	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3502	list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
3503	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3504	list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
3505	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3506	list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
3507	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3508	list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
3509	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3510	list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
3511	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3512	list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
3513	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3514	list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
3515	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3516	list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
3517	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3518	list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
3519	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3520
3521	buf_init();
3522
3523	arc_thread_exit = 0;
3524	arc_eviction_list = NULL;
3525	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3526	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3527
3528	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3529	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3530
3531	if (arc_ksp != NULL) {
3532		arc_ksp->ks_data = &arc_stats;
3533		kstat_install(arc_ksp);
3534	}
3535
3536	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3537	    TS_RUN, minclsyspri);
3538
3539#ifdef _KERNEL
3540	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
3541	    EVENTHANDLER_PRI_FIRST);
3542#endif
3543
3544	arc_dead = FALSE;
3545	arc_warm = B_FALSE;
3546
3547	if (zfs_write_limit_max == 0)
3548		zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3549	else
3550		zfs_write_limit_shift = 0;
3551	mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3552
3553#ifdef _KERNEL
3554#ifdef __i386__
3555	if (zfs_prefetch_enable != 1) {
3556		printf("ZFS NOTICE: prefetch is disabled by default on i386"
3557		    " - set the prefetch enable tunable to change this.\n");
3558		zfs_prefetch_disable = 1;
3559	}
3560#endif
3561	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
3562	    (zfs_prefetch_enable != 1) && (zfs_prefetch_disable != 1)) {
3563		printf("ZFS NOTICE: system has less than 4GB of RAM and the prefetch"
3564		    " enable tunable is not set; disabling prefetch.\n");
3565		zfs_prefetch_disable = 1;
3566	}
3567	/* Warn about ZFS memory and address space requirements. */
3568	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
3569		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
3570		    "expect unstable behavior.\n");
3571	}
3572	if (kmem_size() < 512 * (1 << 20)) {
3573		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
3574		    "expect unstable behavior.\n");
3575		printf("             Consider tuning vm.kmem_size and "
3576		    "vm.kmem_size_max\n");
3577		printf("             in /boot/loader.conf.\n");
3578	}
3579#endif
3580}
3581
3582void
3583arc_fini(void)
3584{
3585
3586	mutex_enter(&arc_reclaim_thr_lock);
3587	arc_thread_exit = 1;
3588	cv_signal(&arc_reclaim_thr_cv);
3589	while (arc_thread_exit != 0)
3590		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3591	mutex_exit(&arc_reclaim_thr_lock);
3592
3593	arc_flush(NULL);
3594
3595	arc_dead = TRUE;
3596
3597	if (arc_ksp != NULL) {
3598		kstat_delete(arc_ksp);
3599		arc_ksp = NULL;
3600	}
3601
3602	mutex_destroy(&arc_eviction_mtx);
3603	mutex_destroy(&arc_reclaim_thr_lock);
3604	cv_destroy(&arc_reclaim_thr_cv);
3605
3606	list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
3607	list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
3608	list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
3609	list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
3610	list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
3611	list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
3612	list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
3613	list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
	list_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
	list_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
3614
3615	mutex_destroy(&arc_anon->arcs_mtx);
3616	mutex_destroy(&arc_mru->arcs_mtx);
3617	mutex_destroy(&arc_mru_ghost->arcs_mtx);
3618	mutex_destroy(&arc_mfu->arcs_mtx);
3619	mutex_destroy(&arc_mfu_ghost->arcs_mtx);
	mutex_destroy(&arc_l2c_only->arcs_mtx);
3620
3621	mutex_destroy(&zfs_write_limit_lock);
3622
3623	buf_fini();
3624
3625	mutex_destroy(&arc_lowmem_lock);
3626#ifdef _KERNEL
3627	if (arc_event_lowmem != NULL)
3628		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
3629#endif
3630}
3631
3632/*
3633 * Level 2 ARC
3634 *
3635 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3636 * It uses dedicated storage devices to hold cached data, which are populated
3637 * using large infrequent writes.  The main role of this cache is to boost
3638 * the performance of random read workloads.  The intended L2ARC devices
3639 * include short-stroked disks, solid state disks, and other media with
3640 * substantially faster read latency than disk.
3641 *
3642 *                 +-----------------------+
3643 *                 |         ARC           |
3644 *                 +-----------------------+
3645 *                    |         ^     ^
3646 *                    |         |     |
3647 *      l2arc_feed_thread()    arc_read()
3648 *                    |         |     |
3649 *                    |  l2arc read   |
3650 *                    V         |     |
3651 *               +---------------+    |
3652 *               |     L2ARC     |    |
3653 *               +---------------+    |
3654 *                   |    ^           |
3655 *          l2arc_write() |           |
3656 *                   |    |           |
3657 *                   V    |           |
3658 *                 +-------+      +-------+
3659 *                 | vdev  |      | vdev  |
3660 *                 | cache |      | cache |
3661 *                 +-------+      +-------+
3662 *                 +=========+     .-----.
3663 *                 :  L2ARC  :    |-_____-|
3664 *                 : devices :    | Disks |
3665 *                 +=========+    `-_____-'
3666 *
3667 * Read requests are satisfied from the following sources, in order:
3668 *
3669 *	1) ARC
3670 *	2) vdev cache of L2ARC devices
3671 *	3) L2ARC devices
3672 *	4) vdev cache of disks
3673 *	5) disks
3674 *
3675 * Some L2ARC device types exhibit extremely slow write performance.
3676 * To accommodate this, there are some significant differences between
3677 * the L2ARC and traditional cache design:
3678 *
3679 * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
3680 * the ARC behave as usual, freeing buffers and placing headers on ghost
3681 * lists.  The ARC does not send buffers to the L2ARC during eviction as
3682 * this would inflate write latencies under any ARC memory pressure.
3683 *
3684 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
3685 * It does this by periodically scanning buffers from the eviction-end of
3686 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
3687 * not already there.  It scans until a headroom of buffers is satisfied,
3688 * which itself is a buffer for ARC eviction.  The thread that does this is
3689 * l2arc_feed_thread(), illustrated below; example sizes are included to
3690 * l2arc_feed_thread(), illustrated below; example sizes are included to
3691 * give a better sense of the ratios than the diagram alone can convey:
3692 *	       head -->                        tail
3693 *	        +---------------------+----------+
3694 *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
3695 *	        +---------------------+----------+   |   o L2ARC eligible
3696 *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
3697 *	        +---------------------+----------+   |
3698 *	             15.9 Gbytes      ^ 32 Mbytes    |
3699 *	                           headroom          |
3700 *	                                      l2arc_feed_thread()
3701 *	                                             |
3702 *	                 l2arc write hand <--[oooo]--'
3703 *	                         |           8 Mbyte
3704 *	                         |          write max
3705 *	                         V
3706 *		  +==============================+
3707 *	L2ARC dev |####|#|###|###|    |####| ... |
3708 *	          +==============================+
3709 *	                     32 Gbytes
3710 *
3711 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
3712 * evicted, then the L2ARC has cached a buffer much sooner than it probably
3713 * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
3714 * safe to say that this is an uncommon case, since buffers at the end of
3715 * the ARC lists have moved there due to inactivity.
3716 *
3717 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
3718 * then the L2ARC simply misses copying some buffers.  This serves as a
3719 * pressure valve to prevent heavy read workloads from both stalling the ARC
3720 * with waits and clogging the L2ARC with writes.  This also helps prevent
3721 * the potential for the L2ARC to churn if it attempts to cache content too
3722 * quickly, such as during backups of the entire pool.
3723 *
3724 * 5. After system boot and before the ARC has filled main memory, there are
3725 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
3726 * lists can remain mostly static.  Instead of searching from the tail of
3727 * lists as pictured, the l2arc_feed_thread() will search from the list heads
3728 * for eligible buffers, greatly increasing its chance of finding them.
3729 *
3730 * The L2ARC device write speed is also boosted during this time so that
3731 * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
3732 * there are no L2ARC reads, and no fear of degrading read performance
3733 * through increased writes.
3734 *
3735 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
3736 * the vdev queue can aggregate them into larger and fewer writes.  Each
3737 * device is written to in a rotor fashion, sweeping writes through
3738 * available space then repeating.
3739 *
3740 * 7. The L2ARC does not store dirty content.  It never needs to flush
3741 * write buffers back to disk based storage.
3742 *
3743 * 8. If an ARC buffer is written (and dirtied) which also exists in the
3744 * L2ARC, the now stale L2ARC buffer is immediately dropped.
3745 *
3746 * The performance of the L2ARC can be tweaked by a number of tunables, which
3747 * may need adjusting for different workloads (see the sketch below):
3748 *
3749 *	l2arc_write_max		max write bytes per interval
3750 *	l2arc_write_boost	extra write bytes during device warmup
3751 *	l2arc_noprefetch	skip caching prefetched buffers
3752 *	l2arc_headroom		number of max device writes to precache
3753 *	l2arc_feed_secs		seconds between L2ARC writing
3754 *
3755 * Tunables may be removed or added as future performance improvements are
3756 * integrated, and also may become zpool properties.
3757 */
3758
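/*
 * The following is an illustrative sketch only (the helper name is ours and
 * nothing calls it): it shows how the tunables above combine into the size
 * of a single feed interval's write, mirroring the logic that
 * l2arc_feed_thread() and l2arc_write_buffers() implement below.
 */
static uint64_t
l2arc_write_size_sketch(l2arc_dev_t *dev)
{
	uint64_t size;

	/* Base bytes per interval: l2arc_write_max, captured at device add. */
	size = dev->l2ad_write;

	/* Boost writes while the ARC is still cold and nothing is evicted. */
	if (arc_warm == B_FALSE)
		size += dev->l2ad_boost;

	/*
	 * l2arc_write_buffers() then stops scanning each ARC list once it
	 * has passed size * l2arc_headroom bytes of candidate buffers.
	 */
	return (size);
}
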
3759static void
3760l2arc_hdr_stat_add(void)
3761{
3762	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
3763	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
3764}
3765
3766static void
3767l2arc_hdr_stat_remove(void)
3768{
3769	ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
3770	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
3771}
3772
3773/*
3774 * Cycle through L2ARC devices.  This is how L2ARC load balances.
3775 * If a device is returned, this function returns with the spa config lock held.
3776 */
3777static l2arc_dev_t *
3778l2arc_dev_get_next(void)
3779{
3780	l2arc_dev_t *first, *next = NULL;
3781
3782	/*
3783	 * Lock out the removal of spas (spa_namespace_lock), then removal
3784	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
3785	 * both locks will be dropped and a spa config lock held instead.
3786	 */
3787	mutex_enter(&spa_namespace_lock);
3788	mutex_enter(&l2arc_dev_mtx);
3789
3790	/* if there are no vdevs, there is nothing to do */
3791	if (l2arc_ndev == 0)
3792		goto out;
3793
3794	first = NULL;
3795	next = l2arc_dev_last;
3796	do {
3797		/* loop around the list looking for a non-faulted vdev */
3798		if (next == NULL) {
3799			next = list_head(l2arc_dev_list);
3800		} else {
3801			next = list_next(l2arc_dev_list, next);
3802			if (next == NULL)
3803				next = list_head(l2arc_dev_list);
3804		}
3805
3806		/* if we have come back to the start, bail out */
3807		if (first == NULL)
3808			first = next;
3809		else if (next == first)
3810			break;
3811
3812	} while (vdev_is_dead(next->l2ad_vdev));
3813
3814	/* if we were unable to find any usable vdevs, return NULL */
3815	if (vdev_is_dead(next->l2ad_vdev))
3816		next = NULL;
3817
3818	l2arc_dev_last = next;
3819
3820out:
3821	mutex_exit(&l2arc_dev_mtx);
3822
3823	/*
3824	 * Grab the config lock to prevent the 'next' device from being
3825	 * removed while we are writing to it.
3826	 */
3827	if (next != NULL)
3828		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
3829	mutex_exit(&spa_namespace_lock);
3830
3831	return (next);
3832}
3833
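/*
 * Sketch of the intended caller pattern, condensed from l2arc_feed_thread()
 * below (the helper name is ours and nothing calls it): a device returned by
 * l2arc_dev_get_next() comes back with its spa config lock held, and the
 * caller must drop that lock with spa_config_exit() when finished.
 */
static void
l2arc_dev_usage_sketch(void)
{
	l2arc_dev_t *dev;

	if ((dev = l2arc_dev_get_next()) == NULL)
		return;		/* no usable cache devices */

	/* ... evict ahead and write eligible ARC buffers to dev ... */

	spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev);
}
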
3834/*
3835 * Free buffers that were tagged for destruction.
3836 */
3837static void
3838l2arc_do_free_on_write(void)
3839{
3840	list_t *buflist;
3841	l2arc_data_free_t *df, *df_prev;
3842
3843	mutex_enter(&l2arc_free_on_write_mtx);
3844	buflist = l2arc_free_on_write;
3845
3846	for (df = list_tail(buflist); df; df = df_prev) {
3847		df_prev = list_prev(buflist, df);
3848		ASSERT(df->l2df_data != NULL);
3849		ASSERT(df->l2df_func != NULL);
3850		df->l2df_func(df->l2df_data, df->l2df_size);
3851		list_remove(buflist, df);
3852		kmem_free(df, sizeof (l2arc_data_free_t));
3853	}
3854
3855	mutex_exit(&l2arc_free_on_write_mtx);
3856}
3857
3858/*
3859 * A write to a cache device has completed.  Update all headers to allow
3860 * reads from these buffers to begin.
3861 */
3862static void
3863l2arc_write_done(zio_t *zio)
3864{
3865	l2arc_write_callback_t *cb;
3866	l2arc_dev_t *dev;
3867	list_t *buflist;
3868	arc_buf_hdr_t *head, *ab, *ab_prev;
3869	l2arc_buf_hdr_t *abl2;
3870	kmutex_t *hash_lock;
3871
3872	cb = zio->io_private;
3873	ASSERT(cb != NULL);
3874	dev = cb->l2wcb_dev;
3875	ASSERT(dev != NULL);
3876	head = cb->l2wcb_head;
3877	ASSERT(head != NULL);
3878	buflist = dev->l2ad_buflist;
3879	ASSERT(buflist != NULL);
3880	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
3881	    l2arc_write_callback_t *, cb);
3882
3883	if (zio->io_error != 0)
3884		ARCSTAT_BUMP(arcstat_l2_writes_error);
3885
3886	mutex_enter(&l2arc_buflist_mtx);
3887
3888	/*
3889	 * All writes completed, or an error was hit.
3890	 */
3891	for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
3892		ab_prev = list_prev(buflist, ab);
3893
3894		hash_lock = HDR_LOCK(ab);
3895		if (!mutex_tryenter(hash_lock)) {
3896			/*
3897			 * This buffer misses out.  It may be in a stage
3898			 * of eviction.  Its ARC_L2_WRITING flag will be
3899			 * left set, denying reads to this buffer.
3900			 */
3901			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
3902			continue;
3903		}
3904
3905		if (zio->io_error != 0) {
3906			/*
3907			 * Error - drop L2ARC entry.
3908			 */
3909			list_remove(buflist, ab);
3910			abl2 = ab->b_l2hdr;
3911			ab->b_l2hdr = NULL;
3912			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
3913			ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
3914		}
3915
3916		/*
3917		 * Allow ARC to begin reads to this L2ARC entry.
3918		 */
3919		ab->b_flags &= ~ARC_L2_WRITING;
3920
3921		mutex_exit(hash_lock);
3922	}
3923
3924	atomic_inc_64(&l2arc_writes_done);
3925	list_remove(buflist, head);
3926	kmem_cache_free(hdr_cache, head);
3927	mutex_exit(&l2arc_buflist_mtx);
3928
3929	l2arc_do_free_on_write();
3930
3931	kmem_free(cb, sizeof (l2arc_write_callback_t));
3932}
3933
3934/*
3935 * A read to a cache device has completed.  Validate buffer contents before
3936 * handing over to the regular ARC routines.
3937 */
3938static void
3939l2arc_read_done(zio_t *zio)
3940{
3941	l2arc_read_callback_t *cb;
3942	arc_buf_hdr_t *hdr;
3943	arc_buf_t *buf;
3944	kmutex_t *hash_lock;
3945	int equal;
3946
3947	ASSERT(zio->io_vd != NULL);
3948	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
3949
3950	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
3951
3952	cb = zio->io_private;
3953	ASSERT(cb != NULL);
3954	buf = cb->l2rcb_buf;
3955	ASSERT(buf != NULL);
3956	hdr = buf->b_hdr;
3957	ASSERT(hdr != NULL);
3958
3959	hash_lock = HDR_LOCK(hdr);
3960	mutex_enter(hash_lock);
3961
3962	/*
3963	 * Check that the buffer survived the L2ARC journey.
3964	 */
3965	equal = arc_cksum_equal(buf);
3966	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
3967		mutex_exit(hash_lock);
3968		zio->io_private = buf;
3969		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
3970		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
3971		arc_read_done(zio);
3972	} else {
3973		mutex_exit(hash_lock);
3974		/*
3975		 * Buffer didn't survive caching.  Increment stats and
3976		 * reissue to the original storage device.
3977		 */
3978		if (zio->io_error != 0) {
3979			ARCSTAT_BUMP(arcstat_l2_io_error);
3980		} else {
3981			zio->io_error = EIO;
3982		}
3983		if (!equal)
3984			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
3985
3986		/*
3987		 * If there's no waiter, issue an async i/o to the primary
3988		 * storage now.  If there *is* a waiter, the caller must
3989		 * issue the i/o in a context where it's OK to block.
3990		 */
3991		if (zio->io_waiter == NULL)
3992			zio_nowait(zio_read(zio->io_parent,
3993			    cb->l2rcb_spa, &cb->l2rcb_bp,
3994			    buf->b_data, zio->io_size, arc_read_done, buf,
3995			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
3996	}
3997
3998	kmem_free(cb, sizeof (l2arc_read_callback_t));
3999}
4000
4001/*
4002 * This is the list priority from which the L2ARC will search for pages to
4003 * cache.  This is used within loops (0..3) to cycle through lists in the
4004 * desired order.  This order can have a significant effect on cache
4005 * performance.
4006 *
4007 * Currently the metadata lists are hit first, MFU then MRU, followed by
4008 * the data lists.  This function returns a locked list, and also returns
4009 * the lock pointer.
4010 */
4011static list_t *
4012l2arc_list_locked(int list_num, kmutex_t **lock)
4013{
4014	list_t *list;
4015
4016	ASSERT(list_num >= 0 && list_num <= 3);
4017
4018	switch (list_num) {
4019	case 0:
4020		list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
4021		*lock = &arc_mfu->arcs_mtx;
4022		break;
4023	case 1:
4024		list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
4025		*lock = &arc_mru->arcs_mtx;
4026		break;
4027	case 2:
4028		list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
4029		*lock = &arc_mfu->arcs_mtx;
4030		break;
4031	case 3:
4032		list = &arc_mru->arcs_list[ARC_BUFC_DATA];
4033		*lock = &arc_mru->arcs_mtx;
4034		break;
4035	}
4036
4037	ASSERT(!(MUTEX_HELD(*lock)));
4038	mutex_enter(*lock);
4039	return (list);
4040}
4041
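/*
 * Condensed sketch of the caller pattern for l2arc_list_locked(), as used in
 * full by l2arc_write_buffers() below (the helper name is ours and nothing
 * calls it): walk the four lists in priority order, dropping each list lock
 * before moving on to the next.
 */
static void
l2arc_scan_lists_sketch(void)
{
	list_t *list;
	kmutex_t *list_lock;
	int try;

	for (try = 0; try <= 3; try++) {
		list = l2arc_list_locked(try, &list_lock);

		if (list_head(list) == NULL) {
			/* nothing on this list; move on to the next one */
			mutex_exit(list_lock);
			continue;
		}

		/* ... scan buffers from the list head or tail here ... */

		mutex_exit(list_lock);
	}
}
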
4042/*
4043 * Evict buffers from the device write hand to the distance specified in
4044 * bytes.  This distance may span populated buffers, or it may span nothing.
4045 * This clears a region of the L2ARC device so that it is ready for writing.
4046 * If the 'all' boolean is set, every buffer is evicted.
4047 */
4048static void
4049l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4050{
4051	list_t *buflist;
4052	l2arc_buf_hdr_t *abl2;
4053	arc_buf_hdr_t *ab, *ab_prev;
4054	kmutex_t *hash_lock;
4055	uint64_t taddr;
4056
4057	buflist = dev->l2ad_buflist;
4058
4059	if (buflist == NULL)
4060		return;
4061
4062	if (!all && dev->l2ad_first) {
4063		/*
4064		 * This is the first sweep through the device.  There is
4065		 * nothing to evict.
4066		 */
4067		return;
4068	}
4069
4070	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4071		/*
4072		 * When nearing the end of the device, evict to the end
4073		 * before the device write hand jumps to the start.
4074		 */
4075		taddr = dev->l2ad_end;
4076	} else {
4077		taddr = dev->l2ad_hand + distance;
4078	}
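	/*
	 * Worked example (a sketch with made-up numbers): with l2ad_start = 0,
	 * l2ad_end = 32GB and distance = 8MB, a hand at 16GB gives
	 * taddr = 16GB + 8MB, while a hand past 32GB - 16MB evicts to
	 * taddr = l2ad_end before the write hand wraps back to l2ad_start.
	 */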
4079	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4080	    uint64_t, taddr, boolean_t, all);
4081
4082top:
4083	mutex_enter(&l2arc_buflist_mtx);
4084	for (ab = list_tail(buflist); ab; ab = ab_prev) {
4085		ab_prev = list_prev(buflist, ab);
4086
4087		hash_lock = HDR_LOCK(ab);
4088		if (!mutex_tryenter(hash_lock)) {
4089			/*
4090			 * Missed the hash lock.  Retry.
4091			 */
4092			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4093			mutex_exit(&l2arc_buflist_mtx);
4094			mutex_enter(hash_lock);
4095			mutex_exit(hash_lock);
4096			goto top;
4097		}
4098
4099		if (HDR_L2_WRITE_HEAD(ab)) {
4100			/*
4101			 * We hit a write head node.  Leave it for
4102			 * l2arc_write_done().
4103			 */
4104			list_remove(buflist, ab);
4105			mutex_exit(hash_lock);
4106			continue;
4107		}
4108
4109		if (!all && ab->b_l2hdr != NULL &&
4110		    (ab->b_l2hdr->b_daddr > taddr ||
4111		    ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4112			/*
4113			 * We've evicted to the target address,
4114			 * or the end of the device.
4115			 */
4116			mutex_exit(hash_lock);
4117			break;
4118		}
4119
4120		if (HDR_FREE_IN_PROGRESS(ab)) {
4121			/*
4122			 * Already on the path to destruction.
4123			 */
4124			mutex_exit(hash_lock);
4125			continue;
4126		}
4127
4128		if (ab->b_state == arc_l2c_only) {
4129			ASSERT(!HDR_L2_READING(ab));
4130			/*
4131			 * This doesn't exist in the ARC.  Destroy.
4132			 * arc_hdr_destroy() will call list_remove()
4133			 * and decrement arcstat_l2_size.
4134			 */
4135			arc_change_state(arc_anon, ab, hash_lock);
4136			arc_hdr_destroy(ab);
4137		} else {
4138			/*
4139			 * Invalidate issued or about to be issued
4140			 * reads, since we may be about to write
4141			 * over this location.
4142			 */
4143			if (HDR_L2_READING(ab)) {
4144				ARCSTAT_BUMP(arcstat_l2_evict_reading);
4145				ab->b_flags |= ARC_L2_EVICTED;
4146			}
4147
4148			/*
4149			 * Tell ARC this no longer exists in L2ARC.
4150			 */
4151			if (ab->b_l2hdr != NULL) {
4152				abl2 = ab->b_l2hdr;
4153				ab->b_l2hdr = NULL;
4154				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4155				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4156			}
4157			list_remove(buflist, ab);
4158
4159			/*
4160			 * This may have been left over after a
4161			 * failed write.
4162			 */
4163			ab->b_flags &= ~ARC_L2_WRITING;
4164		}
4165		mutex_exit(hash_lock);
4166	}
4167	mutex_exit(&l2arc_buflist_mtx);
4168
4169	spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict));
4170	dev->l2ad_evict = taddr;
4171}
4172
4173/*
4174 * Find and write ARC buffers to the L2ARC device.
4175 *
4176 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4177 * for reading until they have completed writing.
4178 */
4179static void
4180l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
4181{
4182	arc_buf_hdr_t *ab, *ab_prev, *head;
4183	l2arc_buf_hdr_t *hdrl2;
4184	list_t *list;
4185	uint64_t passed_sz, write_sz, buf_sz, headroom;
4186	void *buf_data;
4187	kmutex_t *hash_lock, *list_lock;
4188	boolean_t have_lock, full;
4189	l2arc_write_callback_t *cb;
4190	zio_t *pio, *wzio;
4191	int try;
4192
4193	ASSERT(dev->l2ad_vdev != NULL);
4194
4195	pio = NULL;
4196	write_sz = 0;
4197	full = B_FALSE;
4198	head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4199	head->b_flags |= ARC_L2_WRITE_HEAD;
4200
4201	/*
4202	 * Copy buffers for L2ARC writing.
4203	 */
4204	mutex_enter(&l2arc_buflist_mtx);
4205	for (try = 0; try <= 3; try++) {
4206		list = l2arc_list_locked(try, &list_lock);
4207		passed_sz = 0;
4208
4209		/*
4210		 * L2ARC fast warmup.
4211		 *
4212		 * Until the ARC is warm and starts to evict, read from the
4213		 * head of the ARC lists rather than the tail.
4214		 */
4215		headroom = target_sz * l2arc_headroom;
4216		if (arc_warm == B_FALSE)
4217			ab = list_head(list);
4218		else
4219			ab = list_tail(list);
4220
4221		for (; ab; ab = ab_prev) {
4222			if (arc_warm == B_FALSE)
4223				ab_prev = list_next(list, ab);
4224			else
4225				ab_prev = list_prev(list, ab);
4226
4227			hash_lock = HDR_LOCK(ab);
4228			have_lock = MUTEX_HELD(hash_lock);
4229			if (!have_lock && !mutex_tryenter(hash_lock)) {
4230				/*
4231				 * Skip this buffer rather than waiting.
4232				 */
4233				continue;
4234			}
4235
4236			passed_sz += ab->b_size;
4237			if (passed_sz > headroom) {
4238				/*
4239				 * Searched too far.
4240				 */
4241				mutex_exit(hash_lock);
4242				break;
4243			}
4244
4245			if (ab->b_spa != spa) {
4246				mutex_exit(hash_lock);
4247				continue;
4248			}
4249
4250			if (ab->b_l2hdr != NULL) {
4251				/*
4252				 * Already in L2ARC.
4253				 */
4254				mutex_exit(hash_lock);
4255				continue;
4256			}
4257
4258			if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) {
4259				mutex_exit(hash_lock);
4260				continue;
4261			}
4262
4263			if ((write_sz + ab->b_size) > target_sz) {
4264				full = B_TRUE;
4265				mutex_exit(hash_lock);
4266				break;
4267			}
4268
4269			if (ab->b_buf == NULL) {
4270				DTRACE_PROBE1(l2arc__buf__null, void *, ab);
4271				mutex_exit(hash_lock);
4272				continue;
4273			}
4274
4275			if (pio == NULL) {
4276				/*
4277				 * Insert a dummy header on the buflist so
4278				 * l2arc_write_done() can find where the
4279				 * write buffers begin without searching.
4280				 */
4281				list_insert_head(dev->l2ad_buflist, head);
4282
4283				cb = kmem_alloc(
4284				    sizeof (l2arc_write_callback_t), KM_SLEEP);
4285				cb->l2wcb_dev = dev;
4286				cb->l2wcb_head = head;
4287				pio = zio_root(spa, l2arc_write_done, cb,
4288				    ZIO_FLAG_CANFAIL);
4289			}
4290
4291			/*
4292			 * Create and add a new L2ARC header.
4293			 */
4294			hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4295			hdrl2->b_dev = dev;
4296			hdrl2->b_daddr = dev->l2ad_hand;
4297
4298			ab->b_flags |= ARC_L2_WRITING;
4299			ab->b_l2hdr = hdrl2;
4300			list_insert_head(dev->l2ad_buflist, ab);
4301			buf_data = ab->b_buf->b_data;
4302			buf_sz = ab->b_size;
4303
4304			/*
4305			 * Compute and store the buffer cksum before
4306			 * writing.  On debug the cksum is verified first.
4307			 */
4308			arc_cksum_verify(ab->b_buf);
4309			arc_cksum_compute(ab->b_buf, B_TRUE);
4310
4311			mutex_exit(hash_lock);
4312
4313			wzio = zio_write_phys(pio, dev->l2ad_vdev,
4314			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4315			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4316			    ZIO_FLAG_CANFAIL, B_FALSE);
4317
4318			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4319			    zio_t *, wzio);
4320			(void) zio_nowait(wzio);
4321
4322			/*
4323			 * Keep the clock hand suitably device-aligned.
4324			 */
4325			buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4326
4327			write_sz += buf_sz;
4328			dev->l2ad_hand += buf_sz;
4329		}
4330
4331		mutex_exit(list_lock);
4332
4333		if (full == B_TRUE)
4334			break;
4335	}
4336	mutex_exit(&l2arc_buflist_mtx);
4337
4338	if (pio == NULL) {
4339		ASSERT3U(write_sz, ==, 0);
4340		kmem_cache_free(hdr_cache, head);
4341		return;
4342	}
4343
4344	ASSERT3U(write_sz, <=, target_sz);
4345	ARCSTAT_BUMP(arcstat_l2_writes_sent);
4346	ARCSTAT_INCR(arcstat_l2_size, write_sz);
4347	spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz);
4348
4349	/*
4350	 * Bump device hand to the device start if it is approaching the end.
4351	 * l2arc_evict() will already have evicted ahead for this case.
4352	 */
4353	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4354		spa_l2cache_space_update(dev->l2ad_vdev, 0,
4355		    dev->l2ad_end - dev->l2ad_hand);
4356		dev->l2ad_hand = dev->l2ad_start;
4357		dev->l2ad_evict = dev->l2ad_start;
4358		dev->l2ad_first = B_FALSE;
4359	}
4360
4361	(void) zio_wait(pio);
4362}
4363
4364/*
4365 * This thread feeds the L2ARC at regular intervals.  This is the beating
4366 * heart of the L2ARC.
4367 */
4368static void
4369l2arc_feed_thread(void *dummy __unused)
4370{
4371	callb_cpr_t cpr;
4372	l2arc_dev_t *dev;
4373	spa_t *spa;
4374	uint64_t size;
4375
4376	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4377
4378	mutex_enter(&l2arc_feed_thr_lock);
4379
4380	while (l2arc_thread_exit == 0) {
4381		/*
4382		 * Pause for l2arc_feed_secs seconds between writes.
4383		 */
4384		CALLB_CPR_SAFE_BEGIN(&cpr);
4385		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
4386		    hz * l2arc_feed_secs);
4387		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
4388
4389		/*
4390		 * Quick check for L2ARC devices.
4391		 */
4392		mutex_enter(&l2arc_dev_mtx);
4393		if (l2arc_ndev == 0) {
4394			mutex_exit(&l2arc_dev_mtx);
4395			continue;
4396		}
4397		mutex_exit(&l2arc_dev_mtx);
4398
4399		/*
4400		 * This selects the next l2arc device to write to, and in
4401		 * doing so the next spa to feed from: dev->l2ad_spa.   This
4402		 * will return NULL if there are now no l2arc devices or if
4403		 * they are all faulted.
4404		 *
4405		 * If a device is returned, its spa's config lock is also
4406		 * held to prevent device removal.  l2arc_dev_get_next()
4407		 * will grab and release l2arc_dev_mtx.
4408		 */
4409		if ((dev = l2arc_dev_get_next()) == NULL)
4410			continue;
4411
4412		spa = dev->l2ad_spa;
4413		ASSERT(spa != NULL);
4414
4415		/*
4416		 * Avoid contributing to memory pressure.
4417		 */
4418		if (arc_reclaim_needed()) {
4419			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
4420			spa_config_exit(spa, SCL_L2ARC, dev);
4421			continue;
4422		}
4423
4424		ARCSTAT_BUMP(arcstat_l2_feeds);
4425
4426		size = dev->l2ad_write;
4427		if (arc_warm == B_FALSE)
4428			size += dev->l2ad_boost;
4429
4430		/*
4431		 * Evict L2ARC buffers that will be overwritten.
4432		 */
4433		l2arc_evict(dev, size, B_FALSE);
4434
4435		/*
4436		 * Write ARC buffers.
4437		 */
4438		l2arc_write_buffers(spa, dev, size);
4439		spa_config_exit(spa, SCL_L2ARC, dev);
4440	}
4441
4442	l2arc_thread_exit = 0;
4443	cv_broadcast(&l2arc_feed_thr_cv);
4444	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
4445	thread_exit();
4446}
4447
4448boolean_t
4449l2arc_vdev_present(vdev_t *vd)
4450{
4451	l2arc_dev_t *dev;
4452
4453	mutex_enter(&l2arc_dev_mtx);
4454	for (dev = list_head(l2arc_dev_list); dev != NULL;
4455	    dev = list_next(l2arc_dev_list, dev)) {
4456		if (dev->l2ad_vdev == vd)
4457			break;
4458	}
4459	mutex_exit(&l2arc_dev_mtx);
4460
4461	return (dev != NULL);
4462}
4463
4464/*
4465 * Add a vdev for use by the L2ARC.  By this point the spa has already
4466 * validated the vdev and opened it.
4467 */
4468void
4469l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
4470{
4471	l2arc_dev_t *adddev;
4472
4473	ASSERT(!l2arc_vdev_present(vd));
4474
4475	/*
4476	 * Create a new l2arc device entry.
4477	 */
4478	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
4479	adddev->l2ad_spa = spa;
4480	adddev->l2ad_vdev = vd;
4481	adddev->l2ad_write = l2arc_write_max;
4482	adddev->l2ad_boost = l2arc_write_boost;
4483	adddev->l2ad_start = start;
4484	adddev->l2ad_end = end;
4485	adddev->l2ad_hand = adddev->l2ad_start;
4486	adddev->l2ad_evict = adddev->l2ad_start;
4487	adddev->l2ad_first = B_TRUE;
4488	ASSERT3U(adddev->l2ad_write, >, 0);
4489
4490	/*
4491	 * This is a list of all ARC buffers that are still valid on the
4492	 * device.
4493	 */
4494	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
4495	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
4496	    offsetof(arc_buf_hdr_t, b_l2node));
4497
4498	spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0);
4499
4500	/*
4501	 * Add device to global list
4502	 */
4503	mutex_enter(&l2arc_dev_mtx);
4504	list_insert_head(l2arc_dev_list, adddev);
4505	atomic_inc_64(&l2arc_ndev);
4506	mutex_exit(&l2arc_dev_mtx);
4507}
4508
4509/*
4510 * Remove a vdev from the L2ARC.
4511 */
4512void
4513l2arc_remove_vdev(vdev_t *vd)
4514{
4515	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
4516
4517	/*
4518	 * Find the device by vdev
4519	 */
4520	mutex_enter(&l2arc_dev_mtx);
4521	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
4522		nextdev = list_next(l2arc_dev_list, dev);
4523		if (vd == dev->l2ad_vdev) {
4524			remdev = dev;
4525			break;
4526		}
4527	}
4528	ASSERT(remdev != NULL);
4529
4530	/*
4531	 * Remove device from global list
4532	 */
4533	list_remove(l2arc_dev_list, remdev);
4534	l2arc_dev_last = NULL;		/* may have been invalidated */
4535	atomic_dec_64(&l2arc_ndev);
4536	mutex_exit(&l2arc_dev_mtx);
4537
4538	/*
4539	 * Clear all buflists and ARC references.  L2ARC device flush.
4540	 */
4541	l2arc_evict(remdev, 0, B_TRUE);
4542	list_destroy(remdev->l2ad_buflist);
4543	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
4544	kmem_free(remdev, sizeof (l2arc_dev_t));
4545}
4546
4547void
4548l2arc_init(void)
4549{
4550	l2arc_thread_exit = 0;
4551	l2arc_ndev = 0;
4552	l2arc_writes_sent = 0;
4553	l2arc_writes_done = 0;
4554
4555	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4556	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
4557	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
4558	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
4559	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
4560
4561	l2arc_dev_list = &L2ARC_dev_list;
4562	l2arc_free_on_write = &L2ARC_free_on_write;
4563	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
4564	    offsetof(l2arc_dev_t, l2ad_node));
4565	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
4566	    offsetof(l2arc_data_free_t, l2df_list_node));
4567}
4568
4569void
4570l2arc_fini(void)
4571{
4572	/*
4573	 * This is called from dmu_fini(), which is called from spa_fini().
4574	 * Because of this, we can assume that all l2arc devices have
4575	 * already been removed when the pools themselves were removed.
4576	 */
4577
4578	l2arc_do_free_on_write();
4579
4580	mutex_destroy(&l2arc_feed_thr_lock);
4581	cv_destroy(&l2arc_feed_thr_cv);
4582	mutex_destroy(&l2arc_dev_mtx);
4583	mutex_destroy(&l2arc_buflist_mtx);
4584	mutex_destroy(&l2arc_free_on_write_mtx);
4585
4586	list_destroy(l2arc_dev_list);
4587	list_destroy(l2arc_free_on_write);
4588}
4589
4590void
4591l2arc_start(void)
4592{
4593	if (!(spa_mode & FWRITE))
4594		return;
4595
4596	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
4597	    TS_RUN, minclsyspri);
4598}
4599
4600void
4601l2arc_stop(void)
4602{
4603	if (!(spa_mode & FWRITE))
4604		return;
4605
4606	mutex_enter(&l2arc_feed_thr_lock);
4607	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
4608	l2arc_thread_exit = 1;
4609	while (l2arc_thread_exit != 0)
4610		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
4611	mutex_exit(&l2arc_feed_thr_lock);
4612}
4613