arc.c revision 168481
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28/*
29 * DVA-based Adjustable Replacement Cache
30 *
31 * While much of the theory of operation used here is
32 * based on the self-tuning, low overhead replacement cache
33 * presented by Megiddo and Modha at FAST 2003, there are some
34 * significant differences:
35 *
36 * 1. The Megiddo and Modha model assumes any page is evictable.
37 * Pages in its cache cannot be "locked" into memory.  This makes
38 * the eviction algorithm simple: evict the last page in the list.
39 * This also makes the performance characteristics easy to reason
40 * about.  Our cache is not so simple.  At any given moment, some
41 * subset of the blocks in the cache are un-evictable because we
42 * have handed out a reference to them.  Blocks are only evictable
43 * when there are no external references active.  This makes
44 * eviction far more problematic:  we choose to evict the evictable
45 * blocks that are the "lowest" in the list.
46 *
47 * There are times when it is not possible to evict the requested
48 * space.  In these circumstances we are unable to adjust the cache
49 * size.  To prevent the cache from growing unbounded at these times,
50 * we implement a "cache throttle" that slows the flow of new data
51 * into the cache until we can make space available.
52 *
53 * 2. The Megiddo and Modha model assumes a fixed cache size.
54 * Pages are evicted when the cache is full and there is a cache
55 * miss.  Our model has a variable sized cache.  It grows with
56 * high use, but also tries to react to memory pressure from the
57 * operating system: decreasing its size when system memory is
58 * tight.
59 *
60 * 3. The Megiddo and Modha model assumes a fixed page size. All
61 * elements of the cache are therefore exactly the same size.  So
62 * when adjusting the cache size following a cache miss, it's simply
63 * a matter of choosing a single page to evict.  In our model, we
64 * have variable sized cache blocks (ranging from 512 bytes to
65 * 128K bytes).  We therefore choose a set of blocks to evict to make
66 * space for a cache miss that approximates as closely as possible
67 * the space used by the new block.
68 *
69 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
70 * by N. Megiddo & D. Modha, FAST 2003
71 */
72
73/*
74 * The locking model:
75 *
76 * A new reference to a cache buffer can be obtained in two
77 * ways: 1) via a hash table lookup using the DVA as a key,
78 * or 2) via one of the ARC lists.  The arc_read() interface
79 * uses method 1, while the internal arc algorithms for
80 * adjusting the cache use method 2.  We therefore provide two
81 * types of locks: 1) the hash table lock array, and 2) the
82 * arc list locks.
83 *
84 * Buffers do not have their own mutexes; rather, they rely on the
85 * hash table mutexes for the bulk of their protection (i.e. most
86 * fields in the arc_buf_hdr_t are protected by these mutexes).
87 *
88 * buf_hash_find() returns the appropriate mutex (held) when it
89 * locates the requested buffer in the hash table.  It returns
90 * NULL for the mutex if the buffer was not in the table.
91 *
92 * buf_hash_remove() expects the appropriate hash mutex to be
93 * already held before it is invoked.
94 *
95 * Each arc state also has a mutex which is used to protect the
96 * buffer list associated with the state.  When attempting to
97 * obtain a hash table lock while holding an arc list lock you
98 * must use mutex_tryenter() to avoid deadlock.  Also note that
99 * the active state mutex must be held before the ghost state mutex.
100 *
101 * Arc buffers may have an associated eviction callback function.
102 * This function will be invoked prior to removing the buffer (e.g.
103 * in arc_do_user_evicts()).  Note however that the data associated
104 * with the buffer may be evicted prior to the callback.  The callback
105 * must be made with *no locks held* (to prevent deadlock).  Additionally,
106 * the users of callbacks must ensure that their private data is
107 * protected from simultaneous callbacks from arc_buf_evict()
108 * and arc_do_user_evicts().
109 *
110 * Note that the majority of the performance stats are manipulated
111 * with atomic operations.
112 */
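/*
 * A minimal sketch of the lookup-side locking described above, mirroring
 * what buf_hash_find() and its callers do below (spa, dva and birth here
 * stand for the block's identity, however the caller obtains it):
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *hdr = buf_hash_find(spa, dva, birth, &hash_lock);
 *
 *	if (hdr != NULL) {
 *		... hdr fields are protected while hash_lock is held ...
 *		mutex_exit(hash_lock);
 *	}
 */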
113
114#include <sys/spa.h>
115#include <sys/zio.h>
116#include <sys/zio_checksum.h>
117#include <sys/zfs_context.h>
118#include <sys/arc.h>
119#include <sys/refcount.h>
120#ifdef _KERNEL
121#include <sys/dnlc.h>
122#endif
123#include <sys/callb.h>
124#include <sys/kstat.h>
125#include <sys/sdt.h>
126
127#define	ARC_FREE_AT_ONCE	4194304
128
129static kmutex_t		arc_reclaim_thr_lock;
130static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
131static uint8_t		arc_thread_exit;
132
133#define	ARC_REDUCE_DNLC_PERCENT	3
134uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
135
136typedef enum arc_reclaim_strategy {
137	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
138	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
139} arc_reclaim_strategy_t;
140
141/* number of seconds before growing cache again */
142static int		arc_grow_retry = 60;
143
144/*
145 * minimum lifespan of a prefetch block in clock ticks
146 * (initialized in arc_init())
147 */
148static int		arc_min_prefetch_lifespan;
149
150static int arc_dead;
151
152/*
153 * These tunables are for performance analysis.
154 */
155u_long zfs_arc_max;
156u_long zfs_arc_min;
157#ifdef _KERNEL
158TUNABLE_ULONG("vfs.zfs.arc_max", &zfs_arc_max);
159TUNABLE_ULONG("vfs.zfs.arc_min", &zfs_arc_min);
160SYSCTL_DECL(_vfs_zfs);
161SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RD, &zfs_arc_max, 0,
162    "Maximum ARC size");
163SYSCTL_ULONG(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RD, &zfs_arc_min, 0,
164    "Minimum ARC size");
165#endif
166
167/*
168 * Note that buffers can be in one of 5 states:
169 *	ARC_anon	- anonymous (discussed below)
170 *	ARC_mru		- recently used, currently cached
171 *	ARC_mru_ghost	- recently used, no longer in cache
172 *	ARC_mfu		- frequently used, currently cached
173 *	ARC_mfu_ghost	- frequently used, no longer in cache
174 * When there are no active references to a buffer, it is
175 * linked onto one of the lists in arc.  These are the
176 * only buffers that can be evicted or deleted.
177 *
178 * Anonymous buffers are buffers that are not associated with
179 * a DVA.  These are buffers that hold dirty block copies
180 * before they are written to stable storage.  By definition,
181 * they are "ref'd" and are considered part of arc_mru
182 * that cannot be freed.  Generally, they will acquire a DVA
183 * as they are written and migrate onto the arc_mru list.
184 */
185
186typedef struct arc_state {
187	list_t	arcs_list;	/* linked list of evictable buffers in state */
188	uint64_t arcs_lsize;	/* total size of buffers in the linked list */
189	uint64_t arcs_size;	/* total size of all buffers in this state */
190	kmutex_t arcs_mtx;
191} arc_state_t;
192
193/* The 5 states: */
194static arc_state_t ARC_anon;
195static arc_state_t ARC_mru;
196static arc_state_t ARC_mru_ghost;
197static arc_state_t ARC_mfu;
198static arc_state_t ARC_mfu_ghost;
199
200typedef struct arc_stats {
201	kstat_named_t arcstat_hits;
202	kstat_named_t arcstat_misses;
203	kstat_named_t arcstat_demand_data_hits;
204	kstat_named_t arcstat_demand_data_misses;
205	kstat_named_t arcstat_demand_metadata_hits;
206	kstat_named_t arcstat_demand_metadata_misses;
207	kstat_named_t arcstat_prefetch_data_hits;
208	kstat_named_t arcstat_prefetch_data_misses;
209	kstat_named_t arcstat_prefetch_metadata_hits;
210	kstat_named_t arcstat_prefetch_metadata_misses;
211	kstat_named_t arcstat_mru_hits;
212	kstat_named_t arcstat_mru_ghost_hits;
213	kstat_named_t arcstat_mfu_hits;
214	kstat_named_t arcstat_mfu_ghost_hits;
215	kstat_named_t arcstat_deleted;
216	kstat_named_t arcstat_recycle_miss;
217	kstat_named_t arcstat_mutex_miss;
218	kstat_named_t arcstat_evict_skip;
219	kstat_named_t arcstat_hash_elements;
220	kstat_named_t arcstat_hash_elements_max;
221	kstat_named_t arcstat_hash_collisions;
222	kstat_named_t arcstat_hash_chains;
223	kstat_named_t arcstat_hash_chain_max;
224	kstat_named_t arcstat_p;
225	kstat_named_t arcstat_c;
226	kstat_named_t arcstat_c_min;
227	kstat_named_t arcstat_c_max;
228	kstat_named_t arcstat_size;
229} arc_stats_t;
230
231static arc_stats_t arc_stats = {
232	{ "hits",			KSTAT_DATA_UINT64 },
233	{ "misses",			KSTAT_DATA_UINT64 },
234	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
235	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
236	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
237	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
238	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
239	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
240	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
241	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
242	{ "mru_hits",			KSTAT_DATA_UINT64 },
243	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
244	{ "mfu_hits",			KSTAT_DATA_UINT64 },
245	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
246	{ "deleted",			KSTAT_DATA_UINT64 },
247	{ "recycle_miss",		KSTAT_DATA_UINT64 },
248	{ "mutex_miss",			KSTAT_DATA_UINT64 },
249	{ "evict_skip",			KSTAT_DATA_UINT64 },
250	{ "hash_elements",		KSTAT_DATA_UINT64 },
251	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
252	{ "hash_collisions",		KSTAT_DATA_UINT64 },
253	{ "hash_chains",		KSTAT_DATA_UINT64 },
254	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
255	{ "p",				KSTAT_DATA_UINT64 },
256	{ "c",				KSTAT_DATA_UINT64 },
257	{ "c_min",			KSTAT_DATA_UINT64 },
258	{ "c_max",			KSTAT_DATA_UINT64 },
259	{ "size",			KSTAT_DATA_UINT64 }
260};
261
262#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
263
264#define	ARCSTAT_INCR(stat, val) \
265	atomic_add_64(&arc_stats.stat.value.ui64, (val));
266
267#define	ARCSTAT_BUMP(stat) 	ARCSTAT_INCR(stat, 1)
268#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
269
270#define	ARCSTAT_MAX(stat, val) {					\
271	uint64_t m;							\
272	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
273	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
274		continue;						\
275}
276
277#define	ARCSTAT_MAXSTAT(stat) \
278	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
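/*
 * ARCSTAT_MAX() maintains a running maximum without taking a lock: it
 * keeps retrying atomic_cas_64() until either the published maximum is
 * already >= val or the swap succeeds.  ARCSTAT_MAXSTAT(foo) records the
 * current value of the "foo" statistic into its "foo_max" counterpart
 * (e.g. arcstat_hash_elements -> arcstat_hash_elements_max).
 */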
279
280/*
281 * We define a macro to allow ARC hits/misses to be easily broken down by
282 * two separate conditions, giving a total of four different subtypes for
283 * each of hits and misses (so eight statistics total).
284 */
285#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
286	if (cond1) {							\
287		if (cond2) {						\
288			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
289		} else {						\
290			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
291		}							\
292	} else {							\
293		if (cond2) {						\
294			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
295		} else {						\
296			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
297		}							\
298	}
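/*
 * For example, the hit path in arc_buf_add_ref() below uses:
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 *
 * which bumps arcstat_demand_metadata_hits for a demand read of a
 * metadata buffer, arcstat_prefetch_data_hits for a prefetch read of a
 * data buffer, and so on.
 */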
299
300kstat_t			*arc_ksp;
301static arc_state_t 	*arc_anon;
302static arc_state_t	*arc_mru;
303static arc_state_t	*arc_mru_ghost;
304static arc_state_t	*arc_mfu;
305static arc_state_t	*arc_mfu_ghost;
306
307/*
308 * There are several ARC variables that are critical to export as kstats --
309 * but we don't want to have to grovel around in the kstat whenever we wish to
310 * manipulate them.  For these variables, we therefore define them to be in
311 * terms of the statistic variable.  This assures that we are not introducing
312 * the possibility of inconsistency by having shadow copies of the variables,
313 * while still allowing the code to be readable.
314 */
315#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
316#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
317#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
318#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
319#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
320
321static int		arc_no_grow;	/* Don't try to grow cache size */
322static uint64_t		arc_tempreserve;
323
324typedef struct arc_callback arc_callback_t;
325
326struct arc_callback {
327	void			*acb_private;
328	arc_done_func_t		*acb_done;
329	arc_byteswap_func_t	*acb_byteswap;
330	arc_buf_t		*acb_buf;
331	zio_t			*acb_zio_dummy;
332	arc_callback_t		*acb_next;
333};
334
335typedef struct arc_write_callback arc_write_callback_t;
336
337struct arc_write_callback {
338	void		*awcb_private;
339	arc_done_func_t	*awcb_ready;
340	arc_done_func_t	*awcb_done;
341	arc_buf_t	*awcb_buf;
342};
343
344struct arc_buf_hdr {
345	/* protected by hash lock */
346	dva_t			b_dva;
347	uint64_t		b_birth;
348	uint64_t		b_cksum0;
349
350	kmutex_t		b_freeze_lock;
351	zio_cksum_t		*b_freeze_cksum;
352
353	arc_buf_hdr_t		*b_hash_next;
354	arc_buf_t		*b_buf;
355	uint32_t		b_flags;
356	uint32_t		b_datacnt;
357
358	arc_callback_t		*b_acb;
359	kcondvar_t		b_cv;
360
361	/* immutable */
362	arc_buf_contents_t	b_type;
363	uint64_t		b_size;
364	spa_t			*b_spa;
365
366	/* protected by arc state mutex */
367	arc_state_t		*b_state;
368	list_node_t		b_arc_node;
369
370	/* updated atomically */
371	clock_t			b_arc_access;
372
373	/* self protecting */
374	refcount_t		b_refcnt;
375};
376
377static arc_buf_t *arc_eviction_list;
378static kmutex_t arc_eviction_mtx;
379static arc_buf_hdr_t arc_eviction_hdr;
380static void arc_get_data_buf(arc_buf_t *buf);
381static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
382
383#define	GHOST_STATE(state)	\
384	((state) == arc_mru_ghost || (state) == arc_mfu_ghost)
385
386/*
387 * Private ARC flags.  These flags are private ARC only flags that will show up
388 * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
389 * be passed in as arc_flags in things like arc_read.  However, these flags
390 * should never be passed and should only be set by ARC code.  When adding new
391 * public flags, make sure not to smash the private ones.
392 */
393
394#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
395#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
396#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
397#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
398#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
399#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
400
401#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
402#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
403#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
404#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
405#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
406
407/*
408 * Hash table routines
409 */
410
411#define	HT_LOCK_PAD	128
412
413struct ht_lock {
414	kmutex_t	ht_lock;
415#ifdef _KERNEL
416	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
417#endif
418};
419
420#define	BUF_LOCKS 256
421typedef struct buf_hash_table {
422	uint64_t ht_mask;
423	arc_buf_hdr_t **ht_table;
424	struct ht_lock ht_locks[BUF_LOCKS];
425} buf_hash_table_t;
426
427static buf_hash_table_t buf_hash_table;
428
429#define	BUF_HASH_INDEX(spa, dva, birth) \
430	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
431#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
432#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
433#define	HDR_LOCK(buf) \
434	(BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
435
436uint64_t zfs_crc64_table[256];
437
438static uint64_t
439buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
440{
441	uintptr_t spav = (uintptr_t)spa;
442	uint8_t *vdva = (uint8_t *)dva;
443	uint64_t crc = -1ULL;
444	int i;
445
446	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
447
448	for (i = 0; i < sizeof (dva_t); i++)
449		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
450
451	crc ^= (spav>>8) ^ birth;
452
453	return (crc);
454}
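/*
 * The hash above is consumed through BUF_HASH_INDEX() and BUF_HASH_LOCK():
 * the index is masked with ht_mask to pick a chain in ht_table, and the
 * same index (masked with BUF_LOCKS - 1) picks one of the padded chain
 * locks, so headers for the same (spa, dva, birth) always map to the same
 * chain and the same lock.
 */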
455
456#define	BUF_EMPTY(buf)						\
457	((buf)->b_dva.dva_word[0] == 0 &&			\
458	(buf)->b_dva.dva_word[1] == 0 &&			\
459	(buf)->b_birth == 0)
460
461#define	BUF_EQUAL(spa, dva, birth, buf)				\
462	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
463	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
464	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
465
466static arc_buf_hdr_t *
467buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
468{
469	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
470	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
471	arc_buf_hdr_t *buf;
472
473	mutex_enter(hash_lock);
474	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
475	    buf = buf->b_hash_next) {
476		if (BUF_EQUAL(spa, dva, birth, buf)) {
477			*lockp = hash_lock;
478			return (buf);
479		}
480	}
481	mutex_exit(hash_lock);
482	*lockp = NULL;
483	return (NULL);
484}
485
486/*
487 * Insert an entry into the hash table.  If there is already an element
488 * equal to elem in the hash table, then the already existing element
489 * will be returned and the new element will not be inserted.
490 * Otherwise returns NULL.
491 */
492static arc_buf_hdr_t *
493buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
494{
495	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
496	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
497	arc_buf_hdr_t *fbuf;
498	uint32_t i;
499
500	ASSERT(!HDR_IN_HASH_TABLE(buf));
501	*lockp = hash_lock;
502	mutex_enter(hash_lock);
503	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
504	    fbuf = fbuf->b_hash_next, i++) {
505		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
506			return (fbuf);
507	}
508
509	buf->b_hash_next = buf_hash_table.ht_table[idx];
510	buf_hash_table.ht_table[idx] = buf;
511	buf->b_flags |= ARC_IN_HASH_TABLE;
512
513	/* collect some hash table performance data */
514	if (i > 0) {
515		ARCSTAT_BUMP(arcstat_hash_collisions);
516		if (i == 1)
517			ARCSTAT_BUMP(arcstat_hash_chains);
518
519		ARCSTAT_MAX(arcstat_hash_chain_max, i);
520	}
521
522	ARCSTAT_BUMP(arcstat_hash_elements);
523	ARCSTAT_MAXSTAT(arcstat_hash_elements);
524
525	return (NULL);
526}
527
528static void
529buf_hash_remove(arc_buf_hdr_t *buf)
530{
531	arc_buf_hdr_t *fbuf, **bufp;
532	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
533
534	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
535	ASSERT(HDR_IN_HASH_TABLE(buf));
536
537	bufp = &buf_hash_table.ht_table[idx];
538	while ((fbuf = *bufp) != buf) {
539		ASSERT(fbuf != NULL);
540		bufp = &fbuf->b_hash_next;
541	}
542	*bufp = buf->b_hash_next;
543	buf->b_hash_next = NULL;
544	buf->b_flags &= ~ARC_IN_HASH_TABLE;
545
546	/* collect some hash table performance data */
547	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
548
549	if (buf_hash_table.ht_table[idx] &&
550	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
551		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
552}
553
554/*
555 * Global data structures and functions for the buf kmem cache.
556 */
557static kmem_cache_t *hdr_cache;
558static kmem_cache_t *buf_cache;
559
560static void
561buf_fini(void)
562{
563	int i;
564
565	kmem_free(buf_hash_table.ht_table,
566	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
567	for (i = 0; i < BUF_LOCKS; i++)
568		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
569	kmem_cache_destroy(hdr_cache);
570	kmem_cache_destroy(buf_cache);
571}
572
573/*
574 * Constructor callback - called when the cache is empty
575 * and a new buf is requested.
576 */
577/* ARGSUSED */
578static int
579hdr_cons(void *vbuf, void *unused, int kmflag)
580{
581	arc_buf_hdr_t *buf = vbuf;
582
583	bzero(buf, sizeof (arc_buf_hdr_t));
584	refcount_create(&buf->b_refcnt);
585	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
586	return (0);
587}
588
589/*
590 * Destructor callback - called when a cached buf is
591 * no longer required.
592 */
593/* ARGSUSED */
594static void
595hdr_dest(void *vbuf, void *unused)
596{
597	arc_buf_hdr_t *buf = vbuf;
598
599	refcount_destroy(&buf->b_refcnt);
600	cv_destroy(&buf->b_cv);
601}
602
603/*
604 * Reclaim callback -- invoked when memory is low.
605 */
606/* ARGSUSED */
607static void
608hdr_recl(void *unused)
609{
610	dprintf("hdr_recl called\n");
611	/*
612	 * umem calls the reclaim func when we destroy the buf cache,
613	 * which is after we do arc_fini().
614	 */
615	if (!arc_dead)
616		cv_signal(&arc_reclaim_thr_cv);
617}
618
619static void
620buf_init(void)
621{
622	uint64_t *ct;
623	uint64_t hsize = 1ULL << 12;
624	int i, j;
625
626	/*
627	 * The hash table is big enough to fill all of physical memory
628	 * with an average 64K block size.  The table will take up
629	 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
630	 */
631	while (hsize * 65536 < physmem * PAGESIZE)
632		hsize <<= 1;
633retry:
634	buf_hash_table.ht_mask = hsize - 1;
635	buf_hash_table.ht_table =
636	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
637	if (buf_hash_table.ht_table == NULL) {
638		ASSERT(hsize > (1ULL << 8));
639		hsize >>= 1;
640		goto retry;
641	}
642
643	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
644	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
645	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
646	    0, NULL, NULL, NULL, NULL, NULL, 0);
647
648	for (i = 0; i < 256; i++)
649		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
650			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
651
652	for (i = 0; i < BUF_LOCKS; i++) {
653		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
654		    NULL, MUTEX_DEFAULT, NULL);
655	}
656}
657
658#define	ARC_MINTIME	(hz>>4) /* 62 ms */
659
660static void
661arc_cksum_verify(arc_buf_t *buf)
662{
663	zio_cksum_t zc;
664
665	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
666		return;
667
668	mutex_enter(&buf->b_hdr->b_freeze_lock);
669	if (buf->b_hdr->b_freeze_cksum == NULL ||
670	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
671		mutex_exit(&buf->b_hdr->b_freeze_lock);
672		return;
673	}
674	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
675	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
676		panic("buffer modified while frozen!");
677	mutex_exit(&buf->b_hdr->b_freeze_lock);
678}
679
680static void
681arc_cksum_compute(arc_buf_t *buf)
682{
683	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
684		return;
685
686	mutex_enter(&buf->b_hdr->b_freeze_lock);
687	if (buf->b_hdr->b_freeze_cksum != NULL) {
688		mutex_exit(&buf->b_hdr->b_freeze_lock);
689		return;
690	}
691	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
692	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
693	    buf->b_hdr->b_freeze_cksum);
694	mutex_exit(&buf->b_hdr->b_freeze_lock);
695}
696
697void
698arc_buf_thaw(arc_buf_t *buf)
699{
700	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
701		return;
702
703	if (buf->b_hdr->b_state != arc_anon)
704		panic("modifying non-anon buffer!");
705	if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
706		panic("modifying buffer while i/o in progress!");
707	arc_cksum_verify(buf);
708	mutex_enter(&buf->b_hdr->b_freeze_lock);
709	if (buf->b_hdr->b_freeze_cksum != NULL) {
710		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
711		buf->b_hdr->b_freeze_cksum = NULL;
712	}
713	mutex_exit(&buf->b_hdr->b_freeze_lock);
714}
715
716void
717arc_buf_freeze(arc_buf_t *buf)
718{
719	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
720		return;
721
722	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
723	    buf->b_hdr->b_state == arc_anon);
724	arc_cksum_compute(buf);
725}
726
727static void
728add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
729{
730	ASSERT(MUTEX_HELD(hash_lock));
731
732	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
733	    (ab->b_state != arc_anon)) {
734		uint64_t delta = ab->b_size * ab->b_datacnt;
735
736		ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
737		mutex_enter(&ab->b_state->arcs_mtx);
738		ASSERT(list_link_active(&ab->b_arc_node));
739		list_remove(&ab->b_state->arcs_list, ab);
740		if (GHOST_STATE(ab->b_state)) {
741			ASSERT3U(ab->b_datacnt, ==, 0);
742			ASSERT3P(ab->b_buf, ==, NULL);
743			delta = ab->b_size;
744		}
745		ASSERT(delta > 0);
746		ASSERT3U(ab->b_state->arcs_lsize, >=, delta);
747		atomic_add_64(&ab->b_state->arcs_lsize, -delta);
748		mutex_exit(&ab->b_state->arcs_mtx);
749		/* remove the prefetch flag if we get a reference */
750		if (ab->b_flags & ARC_PREFETCH)
751			ab->b_flags &= ~ARC_PREFETCH;
752	}
753}
754
755static int
756remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
757{
758	int cnt;
759	arc_state_t *state = ab->b_state;
760
761	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
762	ASSERT(!GHOST_STATE(state));
763
764	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
765	    (state != arc_anon)) {
766		ASSERT(!MUTEX_HELD(&state->arcs_mtx));
767		mutex_enter(&state->arcs_mtx);
768		ASSERT(!list_link_active(&ab->b_arc_node));
769		list_insert_head(&state->arcs_list, ab);
770		ASSERT(ab->b_datacnt > 0);
771		atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt);
772		ASSERT3U(state->arcs_size, >=, state->arcs_lsize);
773		mutex_exit(&state->arcs_mtx);
774	}
775	return (cnt);
776}
777
778/*
779 * Move the supplied buffer to the indicated state.  The mutex
780 * for the buffer must be held by the caller.
781 */
782static void
783arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
784{
785	arc_state_t *old_state = ab->b_state;
786	int64_t refcnt = refcount_count(&ab->b_refcnt);
787	uint64_t from_delta, to_delta;
788
789	ASSERT(MUTEX_HELD(hash_lock));
790	ASSERT(new_state != old_state);
791	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
792	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
793
794	from_delta = to_delta = ab->b_datacnt * ab->b_size;
795
796	/*
797	 * If this buffer is evictable, transfer it from the
798	 * old state list to the new state list.
799	 */
800	if (refcnt == 0) {
801		if (old_state != arc_anon) {
802			int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
803
804			if (use_mutex)
805				mutex_enter(&old_state->arcs_mtx);
806
807			ASSERT(list_link_active(&ab->b_arc_node));
808			list_remove(&old_state->arcs_list, ab);
809
810			/*
811			 * If prefetching out of the ghost cache,
812			 * we will have a non-null datacnt.
813			 */
814			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
815				/* ghost elements have a ghost size */
816				ASSERT(ab->b_buf == NULL);
817				from_delta = ab->b_size;
818			}
819			ASSERT3U(old_state->arcs_lsize, >=, from_delta);
820			atomic_add_64(&old_state->arcs_lsize, -from_delta);
821
822			if (use_mutex)
823				mutex_exit(&old_state->arcs_mtx);
824		}
825		if (new_state != arc_anon) {
826			int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
827
828			if (use_mutex)
829				mutex_enter(&new_state->arcs_mtx);
830
831			list_insert_head(&new_state->arcs_list, ab);
832
833			/* ghost elements have a ghost size */
834			if (GHOST_STATE(new_state)) {
835				ASSERT(ab->b_datacnt == 0);
836				ASSERT(ab->b_buf == NULL);
837				to_delta = ab->b_size;
838			}
839			atomic_add_64(&new_state->arcs_lsize, to_delta);
840			ASSERT3U(new_state->arcs_size + to_delta, >=,
841			    new_state->arcs_lsize);
842
843			if (use_mutex)
844				mutex_exit(&new_state->arcs_mtx);
845		}
846	}
847
848	ASSERT(!BUF_EMPTY(ab));
849	if (new_state == arc_anon && old_state != arc_anon) {
850		buf_hash_remove(ab);
851	}
852
853	/* adjust state sizes */
854	if (to_delta)
855		atomic_add_64(&new_state->arcs_size, to_delta);
856	if (from_delta) {
857		ASSERT3U(old_state->arcs_size, >=, from_delta);
858		atomic_add_64(&old_state->arcs_size, -from_delta);
859	}
860	ab->b_state = new_state;
861}
862
863arc_buf_t *
864arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
865{
866	arc_buf_hdr_t *hdr;
867	arc_buf_t *buf;
868
869	ASSERT3U(size, >, 0);
870	hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
871	ASSERT(BUF_EMPTY(hdr));
872	hdr->b_size = size;
873	hdr->b_type = type;
874	hdr->b_spa = spa;
875	hdr->b_state = arc_anon;
876	hdr->b_arc_access = 0;
877	mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
878	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
879	buf->b_hdr = hdr;
880	buf->b_data = NULL;
881	buf->b_efunc = NULL;
882	buf->b_private = NULL;
883	buf->b_next = NULL;
884	hdr->b_buf = buf;
885	arc_get_data_buf(buf);
886	hdr->b_datacnt = 1;
887	hdr->b_flags = 0;
888	ASSERT(refcount_is_zero(&hdr->b_refcnt));
889	(void) refcount_add(&hdr->b_refcnt, tag);
890
891	return (buf);
892}
893
894static arc_buf_t *
895arc_buf_clone(arc_buf_t *from)
896{
897	arc_buf_t *buf;
898	arc_buf_hdr_t *hdr = from->b_hdr;
899	uint64_t size = hdr->b_size;
900
901	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
902	buf->b_hdr = hdr;
903	buf->b_data = NULL;
904	buf->b_efunc = NULL;
905	buf->b_private = NULL;
906	buf->b_next = hdr->b_buf;
907	hdr->b_buf = buf;
908	arc_get_data_buf(buf);
909	bcopy(from->b_data, buf->b_data, size);
910	hdr->b_datacnt += 1;
911	return (buf);
912}
913
914void
915arc_buf_add_ref(arc_buf_t *buf, void* tag)
916{
917	arc_buf_hdr_t *hdr;
918	kmutex_t *hash_lock;
919
920	/*
921	 * Check to see if this buffer is currently being evicted via
922	 * arc_do_user_evicts().
923	 */
924	mutex_enter(&arc_eviction_mtx);
925	hdr = buf->b_hdr;
926	if (hdr == NULL) {
927		mutex_exit(&arc_eviction_mtx);
928		return;
929	}
930	hash_lock = HDR_LOCK(hdr);
931	mutex_exit(&arc_eviction_mtx);
932
933	mutex_enter(hash_lock);
934	if (buf->b_data == NULL) {
935		/*
936		 * This buffer is evicted.
937		 */
938		mutex_exit(hash_lock);
939		return;
940	}
941
942	ASSERT(buf->b_hdr == hdr);
943	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
944	add_reference(hdr, hash_lock, tag);
945	arc_access(hdr, hash_lock);
946	mutex_exit(hash_lock);
947	ARCSTAT_BUMP(arcstat_hits);
948	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
949	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
950	    data, metadata, hits);
951}
952
953static void
954arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
955{
956	arc_buf_t **bufp;
957
958	/* free up data associated with the buf */
959	if (buf->b_data) {
960		arc_state_t *state = buf->b_hdr->b_state;
961		uint64_t size = buf->b_hdr->b_size;
962		arc_buf_contents_t type = buf->b_hdr->b_type;
963
964		arc_cksum_verify(buf);
965		if (!recycle) {
966			if (type == ARC_BUFC_METADATA) {
967				zio_buf_free(buf->b_data, size);
968			} else {
969				ASSERT(type == ARC_BUFC_DATA);
970				zio_data_buf_free(buf->b_data, size);
971			}
972			atomic_add_64(&arc_size, -size);
973		}
974		if (list_link_active(&buf->b_hdr->b_arc_node)) {
975			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
976			ASSERT(state != arc_anon);
977			ASSERT3U(state->arcs_lsize, >=, size);
978			atomic_add_64(&state->arcs_lsize, -size);
979		}
980		ASSERT3U(state->arcs_size, >=, size);
981		atomic_add_64(&state->arcs_size, -size);
982		buf->b_data = NULL;
983		ASSERT(buf->b_hdr->b_datacnt > 0);
984		buf->b_hdr->b_datacnt -= 1;
985	}
986
987	/* only remove the buf if requested */
988	if (!all)
989		return;
990
991	/* remove the buf from the hdr list */
992	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
993		continue;
994	*bufp = buf->b_next;
995
996	ASSERT(buf->b_efunc == NULL);
997
998	/* clean up the buf */
999	buf->b_hdr = NULL;
1000	kmem_cache_free(buf_cache, buf);
1001}
1002
1003static void
1004arc_hdr_destroy(arc_buf_hdr_t *hdr)
1005{
1006	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1007	ASSERT3P(hdr->b_state, ==, arc_anon);
1008	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1009
1010	if (!BUF_EMPTY(hdr)) {
1011		ASSERT(!HDR_IN_HASH_TABLE(hdr));
1012		bzero(&hdr->b_dva, sizeof (dva_t));
1013		hdr->b_birth = 0;
1014		hdr->b_cksum0 = 0;
1015	}
1016	while (hdr->b_buf) {
1017		arc_buf_t *buf = hdr->b_buf;
1018
1019		if (buf->b_efunc) {
1020			mutex_enter(&arc_eviction_mtx);
1021			ASSERT(buf->b_hdr != NULL);
1022			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1023			hdr->b_buf = buf->b_next;
1024			buf->b_hdr = &arc_eviction_hdr;
1025			buf->b_next = arc_eviction_list;
1026			arc_eviction_list = buf;
1027			mutex_exit(&arc_eviction_mtx);
1028		} else {
1029			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1030		}
1031	}
1032	if (hdr->b_freeze_cksum != NULL) {
1033		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1034		hdr->b_freeze_cksum = NULL;
1035	}
1036	mutex_destroy(&hdr->b_freeze_lock);
1037
1038	ASSERT(!list_link_active(&hdr->b_arc_node));
1039	ASSERT3P(hdr->b_hash_next, ==, NULL);
1040	ASSERT3P(hdr->b_acb, ==, NULL);
1041	kmem_cache_free(hdr_cache, hdr);
1042}
1043
1044void
1045arc_buf_free(arc_buf_t *buf, void *tag)
1046{
1047	arc_buf_hdr_t *hdr = buf->b_hdr;
1048	int hashed = hdr->b_state != arc_anon;
1049
1050	ASSERT(buf->b_efunc == NULL);
1051	ASSERT(buf->b_data != NULL);
1052
1053	if (hashed) {
1054		kmutex_t *hash_lock = HDR_LOCK(hdr);
1055
1056		mutex_enter(hash_lock);
1057		(void) remove_reference(hdr, hash_lock, tag);
1058		if (hdr->b_datacnt > 1)
1059			arc_buf_destroy(buf, FALSE, TRUE);
1060		else
1061			hdr->b_flags |= ARC_BUF_AVAILABLE;
1062		mutex_exit(hash_lock);
1063	} else if (HDR_IO_IN_PROGRESS(hdr)) {
1064		int destroy_hdr;
1065		/*
1066		 * We are in the middle of an async write.  Don't destroy
1067		 * this buffer unless the write completes before we finish
1068		 * decrementing the reference count.
1069		 */
1070		mutex_enter(&arc_eviction_mtx);
1071		(void) remove_reference(hdr, NULL, tag);
1072		ASSERT(refcount_is_zero(&hdr->b_refcnt));
1073		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1074		mutex_exit(&arc_eviction_mtx);
1075		if (destroy_hdr)
1076			arc_hdr_destroy(hdr);
1077	} else {
1078		if (remove_reference(hdr, NULL, tag) > 0) {
1079			ASSERT(HDR_IO_ERROR(hdr));
1080			arc_buf_destroy(buf, FALSE, TRUE);
1081		} else {
1082			arc_hdr_destroy(hdr);
1083		}
1084	}
1085}
1086
1087int
1088arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1089{
1090	arc_buf_hdr_t *hdr = buf->b_hdr;
1091	kmutex_t *hash_lock = HDR_LOCK(hdr);
1092	int no_callback = (buf->b_efunc == NULL);
1093
1094	if (hdr->b_state == arc_anon) {
1095		arc_buf_free(buf, tag);
1096		return (no_callback);
1097	}
1098
1099	mutex_enter(hash_lock);
1100	ASSERT(hdr->b_state != arc_anon);
1101	ASSERT(buf->b_data != NULL);
1102
1103	(void) remove_reference(hdr, hash_lock, tag);
1104	if (hdr->b_datacnt > 1) {
1105		if (no_callback)
1106			arc_buf_destroy(buf, FALSE, TRUE);
1107	} else if (no_callback) {
1108		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1109		hdr->b_flags |= ARC_BUF_AVAILABLE;
1110	}
1111	ASSERT(no_callback || hdr->b_datacnt > 1 ||
1112	    refcount_is_zero(&hdr->b_refcnt));
1113	mutex_exit(hash_lock);
1114	return (no_callback);
1115}
1116
1117int
1118arc_buf_size(arc_buf_t *buf)
1119{
1120	return (buf->b_hdr->b_size);
1121}
1122
1123/*
1124 * Evict buffers from list until we've removed the specified number of
1125 * bytes.  Move the removed buffers to the appropriate evict state.
1126 * If the recycle flag is set, then attempt to "recycle" a buffer:
1127 * - look for a buffer to evict that is `bytes' long.
1128 * - return the data block from this buffer rather than freeing it.
1129 * This flag is used by callers that are trying to make space for a
1130 * new buffer in a full arc cache.
1131 */
1132static void *
1133arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
1134    arc_buf_contents_t type)
1135{
1136	arc_state_t *evicted_state;
1137	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1138	arc_buf_hdr_t *ab, *ab_prev = NULL;
1139	kmutex_t *hash_lock;
1140	boolean_t have_lock;
1141	void *stolen = NULL;
1142
1143	ASSERT(state == arc_mru || state == arc_mfu);
1144
1145	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1146
1147	mutex_enter(&state->arcs_mtx);
1148	mutex_enter(&evicted_state->arcs_mtx);
1149
1150	for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) {
1151		ab_prev = list_prev(&state->arcs_list, ab);
1152		/* prefetch buffers have a minimum lifespan */
1153		if (HDR_IO_IN_PROGRESS(ab) ||
1154		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1155		    lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) {
1156			skipped++;
1157			continue;
1158		}
1159		/* "lookahead" for better eviction candidate */
1160		if (recycle && ab->b_size != bytes &&
1161		    ab_prev && ab_prev->b_size == bytes)
1162			continue;
1163		hash_lock = HDR_LOCK(ab);
1164		have_lock = MUTEX_HELD(hash_lock);
1165		if (have_lock || mutex_tryenter(hash_lock)) {
1166			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
1167			ASSERT(ab->b_datacnt > 0);
1168			while (ab->b_buf) {
1169				arc_buf_t *buf = ab->b_buf;
1170				if (buf->b_data) {
1171					bytes_evicted += ab->b_size;
1172					if (recycle && ab->b_type == type &&
1173					    ab->b_size == bytes) {
1174						stolen = buf->b_data;
1175						recycle = FALSE;
1176					}
1177				}
1178				if (buf->b_efunc) {
1179					mutex_enter(&arc_eviction_mtx);
1180					arc_buf_destroy(buf,
1181					    buf->b_data == stolen, FALSE);
1182					ab->b_buf = buf->b_next;
1183					buf->b_hdr = &arc_eviction_hdr;
1184					buf->b_next = arc_eviction_list;
1185					arc_eviction_list = buf;
1186					mutex_exit(&arc_eviction_mtx);
1187				} else {
1188					arc_buf_destroy(buf,
1189					    buf->b_data == stolen, TRUE);
1190				}
1191			}
1192			ASSERT(ab->b_datacnt == 0);
1193			arc_change_state(evicted_state, ab, hash_lock);
1194			ASSERT(HDR_IN_HASH_TABLE(ab));
1195			ab->b_flags = ARC_IN_HASH_TABLE;
1196			DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1197			if (!have_lock)
1198				mutex_exit(hash_lock);
1199			if (bytes >= 0 && bytes_evicted >= bytes)
1200				break;
1201		} else {
1202			missed += 1;
1203		}
1204	}
1205
1206	mutex_exit(&evicted_state->arcs_mtx);
1207	mutex_exit(&state->arcs_mtx);
1208
1209	if (bytes_evicted < bytes)
1210		dprintf("only evicted %lld bytes from %p",
1211		    (longlong_t)bytes_evicted, state);
1212
1213	if (skipped)
1214		ARCSTAT_INCR(arcstat_evict_skip, skipped);
1215
1216	if (missed)
1217		ARCSTAT_INCR(arcstat_mutex_miss, missed);
1218
1219	return (stolen);
1220}
1221
1222/*
1223 * Remove buffers from list until we've removed the specified number of
1224 * bytes.  Destroy the buffers that are removed.
1225 */
1226static void
1227arc_evict_ghost(arc_state_t *state, int64_t bytes)
1228{
1229	arc_buf_hdr_t *ab, *ab_prev;
1230	kmutex_t *hash_lock;
1231	uint64_t bytes_deleted = 0;
1232	uint64_t bufs_skipped = 0;
1233
1234	ASSERT(GHOST_STATE(state));
1235top:
1236	mutex_enter(&state->arcs_mtx);
1237	for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) {
1238		ab_prev = list_prev(&state->arcs_list, ab);
1239		hash_lock = HDR_LOCK(ab);
1240		if (mutex_tryenter(hash_lock)) {
1241			ASSERT(!HDR_IO_IN_PROGRESS(ab));
1242			ASSERT(ab->b_buf == NULL);
1243			arc_change_state(arc_anon, ab, hash_lock);
1244			mutex_exit(hash_lock);
1245			ARCSTAT_BUMP(arcstat_deleted);
1246			bytes_deleted += ab->b_size;
1247			arc_hdr_destroy(ab);
1248			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1249			if (bytes >= 0 && bytes_deleted >= bytes)
1250				break;
1251		} else {
1252			if (bytes < 0) {
1253				mutex_exit(&state->arcs_mtx);
1254				mutex_enter(hash_lock);
1255				mutex_exit(hash_lock);
1256				goto top;
1257			}
1258			bufs_skipped += 1;
1259		}
1260	}
1261	mutex_exit(&state->arcs_mtx);
1262
1263	if (bufs_skipped) {
1264		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1265		ASSERT(bytes >= 0);
1266	}
1267
1268	if (bytes_deleted < bytes)
1269		dprintf("only deleted %lld bytes from %p",
1270		    (longlong_t)bytes_deleted, state);
1271}
1272
1273static void
1274arc_adjust(void)
1275{
1276	int64_t top_sz, mru_over, arc_over, todelete;
1277
1278	top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
1279
1280	if (top_sz > arc_p && arc_mru->arcs_lsize > 0) {
1281		int64_t toevict = MIN(arc_mru->arcs_lsize, top_sz - arc_p);
1282		(void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_UNDEF);
1283		top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
1284	}
1285
1286	mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c;
1287
1288	if (mru_over > 0) {
1289		if (arc_mru_ghost->arcs_lsize > 0) {
1290			todelete = MIN(arc_mru_ghost->arcs_lsize, mru_over);
1291			arc_evict_ghost(arc_mru_ghost, todelete);
1292		}
1293	}
1294
1295	if ((arc_over = arc_size - arc_c) > 0) {
1296		int64_t tbl_over;
1297
1298		if (arc_mfu->arcs_lsize > 0) {
1299			int64_t toevict = MIN(arc_mfu->arcs_lsize, arc_over);
1300			(void) arc_evict(arc_mfu, toevict, FALSE,
1301			    ARC_BUFC_UNDEF);
1302		}
1303
1304		tbl_over = arc_size + arc_mru_ghost->arcs_lsize +
1305		    arc_mfu_ghost->arcs_lsize - arc_c*2;
1306
1307		if (tbl_over > 0 && arc_mfu_ghost->arcs_lsize > 0) {
1308			todelete = MIN(arc_mfu_ghost->arcs_lsize, tbl_over);
1309			arc_evict_ghost(arc_mfu_ghost, todelete);
1310		}
1311	}
1312}
1313
1314static void
1315arc_do_user_evicts(void)
1316{
1317	mutex_enter(&arc_eviction_mtx);
1318	while (arc_eviction_list != NULL) {
1319		arc_buf_t *buf = arc_eviction_list;
1320		arc_eviction_list = buf->b_next;
1321		buf->b_hdr = NULL;
1322		mutex_exit(&arc_eviction_mtx);
1323
1324		if (buf->b_efunc != NULL)
1325			VERIFY(buf->b_efunc(buf) == 0);
1326
1327		buf->b_efunc = NULL;
1328		buf->b_private = NULL;
1329		kmem_cache_free(buf_cache, buf);
1330		mutex_enter(&arc_eviction_mtx);
1331	}
1332	mutex_exit(&arc_eviction_mtx);
1333}
1334
1335/*
1336 * Flush all *evictable* data from the cache.
1337 * NOTE: this will not touch "active" (i.e. referenced) data.
1338 */
1339void
1340arc_flush(void)
1341{
1342	while (list_head(&arc_mru->arcs_list))
1343		(void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF);
1344	while (list_head(&arc_mfu->arcs_list))
1345		(void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF);
1346
1347	arc_evict_ghost(arc_mru_ghost, -1);
1348	arc_evict_ghost(arc_mfu_ghost, -1);
1349
1350	mutex_enter(&arc_reclaim_thr_lock);
1351	arc_do_user_evicts();
1352	mutex_exit(&arc_reclaim_thr_lock);
1353	ASSERT(arc_eviction_list == NULL);
1354}
1355
1356int arc_shrink_shift = 5;		/* log2(fraction of arc to reclaim) */
1357
1358void
1359arc_shrink(void)
1360{
1361	if (arc_c > arc_c_min) {
1362		uint64_t to_free;
1363
1364#ifdef _KERNEL
1365		to_free = arc_c >> arc_shrink_shift;
1366#else
1367		to_free = arc_c >> arc_shrink_shift;
1368#endif
1369		if (arc_c > arc_c_min + to_free)
1370			atomic_add_64(&arc_c, -to_free);
1371		else
1372			arc_c = arc_c_min;
1373
1374		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
1375		if (arc_c > arc_size)
1376			arc_c = MAX(arc_size, arc_c_min);
1377		if (arc_p > arc_c)
1378			arc_p = (arc_c >> 1);
1379		ASSERT(arc_c >= arc_c_min);
1380		ASSERT((int64_t)arc_p >= 0);
1381	}
1382
1383	if (arc_size > arc_c)
1384		arc_adjust();
1385}
1386
1387static int zfs_needfree = 0;
1388
1389static int
1390arc_reclaim_needed(void)
1391{
1392#if 0
1393	uint64_t extra;
1394#endif
1395
1396#ifdef _KERNEL
1397
1398	if (zfs_needfree)
1399		return (1);
1400
1401#if 0
1402	/*
1403	 * check to make sure that swapfs has enough space so that anon
1404	 * reservations can still succeed. anon_resvmem() checks that the
1405	 * availrmem is greater than swapfs_minfree, and the number of reserved
1406	 * swap pages.  We also add a bit of extra here just to prevent
1407	 * circumstances from getting really dire.
1408	 */
1409	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
1410		return (1);
1411
1412	/*
1413	 * If zio data pages are being allocated out of a separate heap segment,
1414	 * then check that the size of available vmem for this area remains
1415	 * above 1/4th free.  This needs to be done when the size of the
1416	 * non-default segment is smaller than physical memory, so we could
1417	 * conceivably run out of VA in that segment before running out of
1418	 * physical memory.
1419	 */
1420	if (zio_arena != NULL) {
1421		size_t arc_ziosize =
1422		    btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC));
1423
1424		if ((physmem > arc_ziosize) &&
1425		    (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2))
1426			return (1);
1427	}
1428
1429#if defined(__i386)
1430	/*
1431	 * If we're on an i386 platform, it's possible that we'll exhaust the
1432	 * kernel heap space before we ever run out of available physical
1433	 * memory.  Most checks of the size of the heap_area compare against
1434	 * tune.t_minarmem, which is the minimum available real memory that we
1435	 * can have in the system.  However, this is generally fixed at 25 pages
1436	 * which is so low that it's useless.  In this comparison, we seek to
1437	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
1438	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
1439	 * free)
1440	 */
1441	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
1442	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
1443		return (1);
1444#endif
1445#else
1446	if (kmem_map->size > (vm_kmem_size * 3) / 4)
1447		return (1);
1448#endif
1449
1450#else
1451	if (spa_get_random(100) == 0)
1452		return (1);
1453#endif
1454	return (0);
1455}
1456
1457static void
1458arc_kmem_reap_now(arc_reclaim_strategy_t strat)
1459{
1460#ifdef ZIO_USE_UMA
1461	size_t			i;
1462	kmem_cache_t		*prev_cache = NULL;
1463	kmem_cache_t		*prev_data_cache = NULL;
1464	extern kmem_cache_t	*zio_buf_cache[];
1465	extern kmem_cache_t	*zio_data_buf_cache[];
1466#endif
1467
1468#ifdef _KERNEL
1469	/*
1470	 * First purge some DNLC entries, in case the DNLC is using
1471	 * up too much memory.
1472	 */
1473	dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
1474
1475#if defined(__i386)
1476	/*
1477	 * Reclaim unused memory from all kmem caches.
1478	 */
1479	kmem_reap();
1480#endif
1481#endif
1482
1483	/*
1484	 * An aggressive reclamation will shrink the cache size as well as
1485	 * reap free buffers from the arc kmem caches.
1486	 */
1487	if (strat == ARC_RECLAIM_AGGR)
1488		arc_shrink();
1489
1490#ifdef ZIO_USE_UMA
1491	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
1492		if (zio_buf_cache[i] != prev_cache) {
1493			prev_cache = zio_buf_cache[i];
1494			kmem_cache_reap_now(zio_buf_cache[i]);
1495		}
1496		if (zio_data_buf_cache[i] != prev_data_cache) {
1497			prev_data_cache = zio_data_buf_cache[i];
1498			kmem_cache_reap_now(zio_data_buf_cache[i]);
1499		}
1500	}
1501#endif
1502	kmem_cache_reap_now(buf_cache);
1503	kmem_cache_reap_now(hdr_cache);
1504}
1505
1506static void
1507arc_reclaim_thread(void *dummy __unused)
1508{
1509	clock_t			growtime = 0;
1510	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
1511	callb_cpr_t		cpr;
1512
1513	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
1514
1515	mutex_enter(&arc_reclaim_thr_lock);
1516	while (arc_thread_exit == 0) {
1517		if (arc_reclaim_needed()) {
1518
1519			if (arc_no_grow) {
1520				if (last_reclaim == ARC_RECLAIM_CONS) {
1521					last_reclaim = ARC_RECLAIM_AGGR;
1522				} else {
1523					last_reclaim = ARC_RECLAIM_CONS;
1524				}
1525			} else {
1526				arc_no_grow = TRUE;
1527				last_reclaim = ARC_RECLAIM_AGGR;
1528				membar_producer();
1529			}
1530
1531			/* reset the growth delay for every reclaim */
1532			growtime = lbolt + (arc_grow_retry * hz);
1533			ASSERT(growtime > 0);
1534
1535			if (zfs_needfree && last_reclaim == ARC_RECLAIM_CONS) {
1536				/*
1537				 * If zfs_needfree is TRUE, our vm_lowmem hook
1538				 * was called; in that case we must free some
1539				 * memory, so switch to aggressive mode.
1540				 */
1541				arc_no_grow = TRUE;
1542				last_reclaim = ARC_RECLAIM_AGGR;
1543			}
1544			arc_kmem_reap_now(last_reclaim);
1545		} else if ((growtime > 0) && ((growtime - lbolt) <= 0)) {
1546			arc_no_grow = FALSE;
1547		}
1548
1549		if (zfs_needfree ||
1550		    (2 * arc_c < arc_size +
1551		    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size))
1552			arc_adjust();
1553
1554		if (arc_eviction_list != NULL)
1555			arc_do_user_evicts();
1556
1557		if (arc_reclaim_needed()) {
1558			zfs_needfree = 0;
1559#ifdef _KERNEL
1560			wakeup(&zfs_needfree);
1561#endif
1562		}
1563
1564		/* block until needed, or one second, whichever is shorter */
1565		CALLB_CPR_SAFE_BEGIN(&cpr);
1566		(void) cv_timedwait(&arc_reclaim_thr_cv,
1567		    &arc_reclaim_thr_lock, hz);
1568		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
1569	}
1570
1571	arc_thread_exit = 0;
1572	cv_broadcast(&arc_reclaim_thr_cv);
1573	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
1574	thread_exit();
1575}
1576
1577/*
1578 * Adapt arc info given the number of bytes we are trying to add and
1579 * the state that we are coming from.  This function is only called
1580 * when we are adding new content to the cache.
1581 */
1582static void
1583arc_adapt(int bytes, arc_state_t *state)
1584{
1585	int mult;
1586
1587	ASSERT(bytes > 0);
1588	/*
1589	 * Adapt the target size of the MRU list:
1590	 *	- if we just hit in the MRU ghost list, then increase
1591	 *	  the target size of the MRU list.
1592	 *	- if we just hit in the MFU ghost list, then increase
1593	 *	  the target size of the MFU list by decreasing the
1594	 *	  target size of the MRU list.
1595	 */
1596	if (state == arc_mru_ghost) {
1597		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
1598		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
1599
1600		arc_p = MIN(arc_c, arc_p + bytes * mult);
1601	} else if (state == arc_mfu_ghost) {
1602		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
1603		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
1604
1605		arc_p = MAX(0, (int64_t)arc_p - bytes * mult);
1606	}
1607	ASSERT((int64_t)arc_p >= 0);
1608
1609	if (arc_reclaim_needed()) {
1610		cv_signal(&arc_reclaim_thr_cv);
1611		return;
1612	}
1613
1614	if (arc_no_grow)
1615		return;
1616
1617	if (arc_c >= arc_c_max)
1618		return;
1619
1620	/*
1621	 * If we're within (2 * maxblocksize) bytes of the target
1622	 * cache size, increment the target cache size
1623	 */
1624	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
1625		atomic_add_64(&arc_c, (int64_t)bytes);
1626		if (arc_c > arc_c_max)
1627			arc_c = arc_c_max;
1628		else if (state == arc_anon)
1629			atomic_add_64(&arc_p, (int64_t)bytes);
1630		if (arc_p > arc_c)
1631			arc_p = arc_c;
1632	}
1633	ASSERT((int64_t)arc_p >= 0);
1634}
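/*
 * A worked example of the adaptation above: if a read hits in the MRU
 * ghost list while the MFU ghost list holds four times as much data as
 * the MRU ghost list, mult is 4 and arc_p grows by 4 * bytes (capped at
 * arc_c); a hit in the MFU ghost list shrinks arc_p symmetrically
 * (floored at 0).
 */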
1635
1636/*
1637 * Check if the cache has reached its limits and eviction is required
1638 * prior to insert.
1639 */
1640static int
1641arc_evict_needed()
1642{
1643	if (arc_reclaim_needed())
1644		return (1);
1645
1646	return (arc_size > arc_c);
1647}
1648
1649/*
1650 * The buffer, supplied as the first argument, needs a data block.
1651 * So, if we are at cache max, determine which cache should be victimized.
1652 * We have the following cases:
1653 *
1654 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
1655 * In this situation if we're out of space, but the resident size of the MFU is
1656 * under the limit, victimize the MFU cache to satisfy this insertion request.
1657 *
1658 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
1659 * Here, we've used up all of the available space for the MRU, so we need to
1660 * evict from our own cache instead.  Evict from the set of resident MRU
1661 * entries.
1662 *
1663 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
1664 * c minus p represents the MFU space in the cache, since p is the size of the
1665 * cache that is dedicated to the MRU.  In this situation there's still space on
1666 * the MFU side, so the MRU side needs to be victimized.
1667 *
1668 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
1669 * MFU's resident set is consuming more space than it has been allotted.  In
1670 * this situation, we must victimize our own cache, the MFU, for this insertion.
1671 */
1672static void
1673arc_get_data_buf(arc_buf_t *buf)
1674{
1675	arc_state_t		*state = buf->b_hdr->b_state;
1676	uint64_t		size = buf->b_hdr->b_size;
1677	arc_buf_contents_t	type = buf->b_hdr->b_type;
1678
1679	arc_adapt(size, state);
1680
1681	/*
1682	 * We have not yet reached cache maximum size,
1683	 * just allocate a new buffer.
1684	 */
1685	if (!arc_evict_needed()) {
1686		if (type == ARC_BUFC_METADATA) {
1687			buf->b_data = zio_buf_alloc(size);
1688		} else {
1689			ASSERT(type == ARC_BUFC_DATA);
1690			buf->b_data = zio_data_buf_alloc(size);
1691		}
1692		atomic_add_64(&arc_size, size);
1693		goto out;
1694	}
1695
1696	/*
1697	 * If we are prefetching from the mfu ghost list, this buffer
1698	 * will end up on the mru list; so steal space from there.
1699	 */
1700	if (state == arc_mfu_ghost)
1701		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
1702	else if (state == arc_mru_ghost)
1703		state = arc_mru;
1704
1705	if (state == arc_mru || state == arc_anon) {
1706		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
1707		state = (arc_p > mru_used) ? arc_mfu : arc_mru;
1708	} else {
1709		/* MFU cases */
1710		uint64_t mfu_space = arc_c - arc_p;
1711		state =  (mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
1712	}
1713	if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) {
1714		if (type == ARC_BUFC_METADATA) {
1715			buf->b_data = zio_buf_alloc(size);
1716		} else {
1717			ASSERT(type == ARC_BUFC_DATA);
1718			buf->b_data = zio_data_buf_alloc(size);
1719		}
1720		atomic_add_64(&arc_size, size);
1721		ARCSTAT_BUMP(arcstat_recycle_miss);
1722	}
1723	ASSERT(buf->b_data != NULL);
1724out:
1725	/*
1726	 * Update the state size.  Note that ghost states have a
1727	 * "ghost size" and so don't need to be updated.
1728	 */
1729	if (!GHOST_STATE(buf->b_hdr->b_state)) {
1730		arc_buf_hdr_t *hdr = buf->b_hdr;
1731
1732		atomic_add_64(&hdr->b_state->arcs_size, size);
1733		if (list_link_active(&hdr->b_arc_node)) {
1734			ASSERT(refcount_is_zero(&hdr->b_refcnt));
1735			atomic_add_64(&hdr->b_state->arcs_lsize, size);
1736		}
1737		/*
1738		 * If we are growing the cache, and we are adding anonymous
1739		 * data, and we have outgrown arc_p, update arc_p
1740		 */
1741		if (arc_size < arc_c && hdr->b_state == arc_anon &&
1742		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
1743			arc_p = MIN(arc_c, arc_p + size);
1744	}
1745}
1746
1747/*
1748 * This routine is called whenever a buffer is accessed.
1749 * NOTE: the hash lock is dropped in this function.
1750 */
1751static void
1752arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
1753{
1754	ASSERT(MUTEX_HELD(hash_lock));
1755
1756	if (buf->b_state == arc_anon) {
1757		/*
1758		 * This buffer is not in the cache, and does not
1759		 * appear in our "ghost" list.  Add the new buffer
1760		 * to the MRU state.
1761		 */
1762
1763		ASSERT(buf->b_arc_access == 0);
1764		buf->b_arc_access = lbolt;
1765		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
1766		arc_change_state(arc_mru, buf, hash_lock);
1767
1768	} else if (buf->b_state == arc_mru) {
1769		/*
1770		 * If this buffer is here because of a prefetch, then either:
1771		 * - clear the flag if this is a "referencing" read
1772		 *   (any subsequent access will bump this into the MFU state).
1773		 * or
1774		 * - move the buffer to the head of the list if this is
1775		 *   another prefetch (to make it less likely to be evicted).
1776		 */
1777		if ((buf->b_flags & ARC_PREFETCH) != 0) {
1778			if (refcount_count(&buf->b_refcnt) == 0) {
1779				ASSERT(list_link_active(&buf->b_arc_node));
1780				mutex_enter(&arc_mru->arcs_mtx);
1781				list_remove(&arc_mru->arcs_list, buf);
1782				list_insert_head(&arc_mru->arcs_list, buf);
1783				mutex_exit(&arc_mru->arcs_mtx);
1784			} else {
1785				buf->b_flags &= ~ARC_PREFETCH;
1786				ARCSTAT_BUMP(arcstat_mru_hits);
1787			}
1788			buf->b_arc_access = lbolt;
1789			return;
1790		}
1791
1792		/*
1793		 * This buffer has been "accessed" only once so far,
1794		 * but it is still in the cache. Move it to the MFU
1795		 * state.
1796		 */
1797		if (lbolt > buf->b_arc_access + ARC_MINTIME) {
1798			/*
1799			 * More than ARC_MINTIME (roughly 62 ms) has passed
1800			 * since we instantiated this buffer.  Move it to the
1801			 * most frequently used state.
1802			 */
1803			buf->b_arc_access = lbolt;
1804			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
1805			arc_change_state(arc_mfu, buf, hash_lock);
1806		}
1807		ARCSTAT_BUMP(arcstat_mru_hits);
1808	} else if (buf->b_state == arc_mru_ghost) {
1809		arc_state_t	*new_state;
1810		/*
1811		 * This buffer has been "accessed" recently, but
1812		 * was evicted from the cache.  Move it to the
1813		 * MFU state.
1814		 */
1815
1816		if (buf->b_flags & ARC_PREFETCH) {
1817			new_state = arc_mru;
1818			if (refcount_count(&buf->b_refcnt) > 0)
1819				buf->b_flags &= ~ARC_PREFETCH;
1820			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
1821		} else {
1822			new_state = arc_mfu;
1823			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
1824		}
1825
1826		buf->b_arc_access = lbolt;
1827		arc_change_state(new_state, buf, hash_lock);
1828
1829		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
1830	} else if (buf->b_state == arc_mfu) {
1831		/*
1832		 * This buffer has been accessed more than once and is
1833		 * still in the cache.  Keep it in the MFU state.
1834		 *
1835		 * NOTE: an add_reference() that occurred when we did
1836		 * the arc_read() will have kicked this off the list.
1837		 * If it was a prefetch, we will explicitly move it to
1838		 * the head of the list now.
1839		 */
1840		if ((buf->b_flags & ARC_PREFETCH) != 0) {
1841			ASSERT(refcount_count(&buf->b_refcnt) == 0);
1842			ASSERT(list_link_active(&buf->b_arc_node));
1843			mutex_enter(&arc_mfu->arcs_mtx);
1844			list_remove(&arc_mfu->arcs_list, buf);
1845			list_insert_head(&arc_mfu->arcs_list, buf);
1846			mutex_exit(&arc_mfu->arcs_mtx);
1847		}
1848		ARCSTAT_BUMP(arcstat_mfu_hits);
1849		buf->b_arc_access = lbolt;
1850	} else if (buf->b_state == arc_mfu_ghost) {
1851		arc_state_t	*new_state = arc_mfu;
1852		/*
1853		 * This buffer has been accessed more than once but has
1854		 * been evicted from the cache.  Move it back to the
1855		 * MFU state.
1856		 */
1857
1858		if (buf->b_flags & ARC_PREFETCH) {
1859			/*
1860			 * This is a prefetch access...
1861			 * move this block back to the MRU state.
1862			 */
1863			ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
1864			new_state = arc_mru;
1865		}
1866
1867		buf->b_arc_access = lbolt;
1868		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
1869		arc_change_state(new_state, buf, hash_lock);
1870
1871		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
1872	} else {
1873		ASSERT(!"invalid arc state");
1874	}
1875}
1876
1877/* a generic arc_done_func_t which you can use */
1878/* ARGSUSED */
1879void
1880arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
1881{
1882	bcopy(buf->b_data, arg, buf->b_hdr->b_size);
1883	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
1884}
1885
1886/* a generic arc_done_func_t which you can use */
1887void
1888arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
1889{
1890	arc_buf_t **bufp = arg;
1891	if (zio && zio->io_error) {
1892		VERIFY(arc_buf_remove_ref(buf, arg) == 1);
1893		*bufp = NULL;
1894	} else {
1895		*bufp = buf;
1896	}
1897}
1898
1899static void
1900arc_read_done(zio_t *zio)
1901{
1902	arc_buf_hdr_t	*hdr, *found;
1903	arc_buf_t	*buf;
1904	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
1905	kmutex_t	*hash_lock;
1906	arc_callback_t	*callback_list, *acb;
1907	int		freeable = FALSE;
1908
1909	buf = zio->io_private;
1910	hdr = buf->b_hdr;
1911
1912	/*
1913	 * The hdr was inserted into the hash table and removed from the lists
1914	 * prior to starting I/O.  We should find this header, since
1915	 * it's in the hash table, and it should be legit since it's
1916	 * not possible to evict it during the I/O.  The only possible
1917	 * reason for it not to be found is if the block was freed during
1918	 * the read.
1919	 */
1920	found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
1921	    &hash_lock);
1922
1923	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
1924	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))));
1925
1926	/* byteswap if necessary */
1927	callback_list = hdr->b_acb;
1928	ASSERT(callback_list != NULL);
1929	if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
1930		callback_list->acb_byteswap(buf->b_data, hdr->b_size);
1931
1932	arc_cksum_compute(buf);
1933
1934	/* create copies of the data buffer for the callers */
1935	abuf = buf;
1936	for (acb = callback_list; acb; acb = acb->acb_next) {
1937		if (acb->acb_done) {
1938			if (abuf == NULL)
1939				abuf = arc_buf_clone(buf);
1940			acb->acb_buf = abuf;
1941			abuf = NULL;
1942		}
1943	}
1944	hdr->b_acb = NULL;
1945	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
1946	ASSERT(!HDR_BUF_AVAILABLE(hdr));
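	/*
	 * If no "done" callback claimed the original buffer (abuf is
	 * still buf), mark it available to a future reader.
	 */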
1947	if (abuf == buf)
1948		hdr->b_flags |= ARC_BUF_AVAILABLE;
1949
1950	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
1951
1952	if (zio->io_error != 0) {
1953		hdr->b_flags |= ARC_IO_ERROR;
1954		if (hdr->b_state != arc_anon)
1955			arc_change_state(arc_anon, hdr, hash_lock);
1956		if (HDR_IN_HASH_TABLE(hdr))
1957			buf_hash_remove(hdr);
1958		freeable = refcount_is_zero(&hdr->b_refcnt);
1959		/* convert checksum errors into IO errors */
1960		if (zio->io_error == ECKSUM)
1961			zio->io_error = EIO;
1962	}
1963
1964	/*
1965	 * Broadcast before we drop the hash_lock to avoid the possibility
1966	 * that the hdr (and hence the cv) might be freed before we get to
1967	 * the cv_broadcast().
1968	 */
1969	cv_broadcast(&hdr->b_cv);
1970
1971	if (hash_lock) {
1972		/*
1973		 * Only call arc_access on anonymous buffers.  This is because
1974		 * if we've issued an I/O for an evicted buffer, we've already
1975		 * called arc_access (to prevent any simultaneous readers from
1976		 * getting confused).
1977		 */
1978		if (zio->io_error == 0 && hdr->b_state == arc_anon)
1979			arc_access(hdr, hash_lock);
1980		mutex_exit(hash_lock);
1981	} else {
1982		/*
1983		 * This block was freed while we waited for the read to
1984		 * complete.  It has been removed from the hash table and
1985		 * moved to the anonymous state (so that it won't show up
1986		 * in the cache).
1987		 */
1988		ASSERT3P(hdr->b_state, ==, arc_anon);
1989		freeable = refcount_is_zero(&hdr->b_refcnt);
1990	}
1991
1992	/* execute each callback and free its structure */
1993	while ((acb = callback_list) != NULL) {
1994		if (acb->acb_done)
1995			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
1996
1997		if (acb->acb_zio_dummy != NULL) {
1998			acb->acb_zio_dummy->io_error = zio->io_error;
1999			zio_nowait(acb->acb_zio_dummy);
2000		}
2001
2002		callback_list = acb->acb_next;
2003		kmem_free(acb, sizeof (arc_callback_t));
2004	}
2005
2006	if (freeable)
2007		arc_hdr_destroy(hdr);
2008}
2009
2010/*
2011 * "Read" the block at the specified DVA (in bp) via the
2012 * cache.  If the block is found in the cache, invoke the provided
2013 * callback immediately and return.  Note that the `zio' parameter
2014 * in the callback will be NULL in this case, since no IO was
2015 * required.  If the block is not in the cache, pass the read request
2016 * on to the spa with a substitute callback function, so that the
2017 * requested block will be added to the cache.
2018 *
2019 * If a read request arrives for a block that already has a read in
2020 * progress, then either wait for the in-progress read to complete (and
2021 * return its results); or, if this is a read with a "done" func, add a
2022 * record to the in-progress read so that the "done" func is invoked
2023 * when the read completes, and then return; or simply return.
2024 *
2025 * arc_read_done() will invoke all the requested "done" functions
2026 * for readers of this block.
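 *
 * As an illustration only (a hypothetical sketch: spa, bp, zb, dest, the
 * byteswap routine and the zio priority/flag values below are assumptions,
 * not taken from this file), a blocking caller might do:
 *
 *	arc_buf_t *abuf = NULL;
 *	uint32_t aflags = ARC_WAIT;
 *	int error;
 *
 *	error = arc_read(NULL, spa, bp, byteswap_uint64_array,
 *	    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
 *	    ZIO_FLAG_CANFAIL, &aflags, &zb);
 *	if (error == 0 && abuf != NULL) {
 *		bcopy(abuf->b_data, dest, abuf->b_hdr->b_size);
 *		(void) arc_buf_remove_ref(abuf, &abuf);
 *	}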
2027 */
2028int
2029arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
2030    arc_done_func_t *done, void *private, int priority, int flags,
2031    uint32_t *arc_flags, zbookmark_t *zb)
2032{
2033	arc_buf_hdr_t *hdr;
2034	arc_buf_t *buf;
2035	kmutex_t *hash_lock;
2036	zio_t	*rzio;
2037
2038top:
2039	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
2040	if (hdr && hdr->b_datacnt > 0) {
2041
2042		*arc_flags |= ARC_CACHED;
2043
2044		if (HDR_IO_IN_PROGRESS(hdr)) {
2045
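			/*
			 * A read for this block is already in flight:
			 * either block until it completes (ARC_WAIT) or
			 * attach our "done" callback to the in-progress
			 * I/O and return right away (ARC_NOWAIT).
			 */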
2046			if (*arc_flags & ARC_WAIT) {
2047				cv_wait(&hdr->b_cv, hash_lock);
2048				mutex_exit(hash_lock);
2049				goto top;
2050			}
2051			ASSERT(*arc_flags & ARC_NOWAIT);
2052
2053			if (done) {
2054				arc_callback_t	*acb = NULL;
2055
2056				acb = kmem_zalloc(sizeof (arc_callback_t),
2057				    KM_SLEEP);
2058				acb->acb_done = done;
2059				acb->acb_private = private;
2060				acb->acb_byteswap = swap;
2061				if (pio != NULL)
2062					acb->acb_zio_dummy = zio_null(pio,
2063					    spa, NULL, NULL, flags);
2064
2065				ASSERT(acb->acb_done != NULL);
2066				acb->acb_next = hdr->b_acb;
2067				hdr->b_acb = acb;
2068				add_reference(hdr, hash_lock, private);
2069				mutex_exit(hash_lock);
2070				return (0);
2071			}
2072			mutex_exit(hash_lock);
2073			return (0);
2074		}
2075
2076		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2077
2078		if (done) {
2079			add_reference(hdr, hash_lock, private);
2080			/*
2081			 * If this block is already in use, create a new
2082			 * copy of the data so that we will be guaranteed
2083			 * that arc_release() will always succeed.
2084			 */
2085			buf = hdr->b_buf;
2086			ASSERT(buf);
2087			ASSERT(buf->b_data);
2088			if (HDR_BUF_AVAILABLE(hdr)) {
2089				ASSERT(buf->b_efunc == NULL);
2090				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2091			} else {
2092				buf = arc_buf_clone(buf);
2093			}
2094		} else if (*arc_flags & ARC_PREFETCH &&
2095		    refcount_count(&hdr->b_refcnt) == 0) {
2096			hdr->b_flags |= ARC_PREFETCH;
2097		}
2098		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2099		arc_access(hdr, hash_lock);
2100		mutex_exit(hash_lock);
2101		ARCSTAT_BUMP(arcstat_hits);
2102		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2103		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2104		    data, metadata, hits);
2105
2106		if (done)
2107			done(NULL, buf, private);
2108	} else {
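		/*
		 * Cache miss: either allocate a brand new header or revive
		 * one found in a ghost state, attach a data buffer to it,
		 * and issue the read with arc_read_done() as the I/O
		 * completion callback.
		 */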
2109		uint64_t size = BP_GET_LSIZE(bp);
2110		arc_callback_t	*acb;
2111
2112		if (hdr == NULL) {
2113			/* this block is not in the cache */
2114			arc_buf_hdr_t	*exists;
2115			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2116			buf = arc_buf_alloc(spa, size, private, type);
2117			hdr = buf->b_hdr;
2118			hdr->b_dva = *BP_IDENTITY(bp);
2119			hdr->b_birth = bp->blk_birth;
2120			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2121			exists = buf_hash_insert(hdr, &hash_lock);
2122			if (exists) {
2123				/* somebody beat us to the hash insert */
2124				mutex_exit(hash_lock);
2125				bzero(&hdr->b_dva, sizeof (dva_t));
2126				hdr->b_birth = 0;
2127				hdr->b_cksum0 = 0;
2128				(void) arc_buf_remove_ref(buf, private);
2129				goto top; /* restart the IO request */
2130			}
2131			/* if this is a prefetch, we don't have a reference */
2132			if (*arc_flags & ARC_PREFETCH) {
2133				(void) remove_reference(hdr, hash_lock,
2134				    private);
2135				hdr->b_flags |= ARC_PREFETCH;
2136			}
2137			if (BP_GET_LEVEL(bp) > 0)
2138				hdr->b_flags |= ARC_INDIRECT;
2139		} else {
2140			/* this block is in the ghost cache */
2141			ASSERT(GHOST_STATE(hdr->b_state));
2142			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2143			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
2144			ASSERT(hdr->b_buf == NULL);
2145
2146			/* if this is a prefetch, we don't have a reference */
2147			if (*arc_flags & ARC_PREFETCH)
2148				hdr->b_flags |= ARC_PREFETCH;
2149			else
2150				add_reference(hdr, hash_lock, private);
2151			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
2152			buf->b_hdr = hdr;
2153			buf->b_data = NULL;
2154			buf->b_efunc = NULL;
2155			buf->b_private = NULL;
2156			buf->b_next = NULL;
2157			hdr->b_buf = buf;
2158			arc_get_data_buf(buf);
2159			ASSERT(hdr->b_datacnt == 0);
2160			hdr->b_datacnt = 1;
2161
2162		}
2163
2164		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
2165		acb->acb_done = done;
2166		acb->acb_private = private;
2167		acb->acb_byteswap = swap;
2168
2169		ASSERT(hdr->b_acb == NULL);
2170		hdr->b_acb = acb;
2171		hdr->b_flags |= ARC_IO_IN_PROGRESS;
2172
2173		/*
2174		 * If the buffer has been evicted, migrate it to a present state
2175		 * before issuing the I/O.  Once we drop the hash-table lock,
2176		 * the header will be marked as I/O in progress and have an
2177		 * attached buffer.  At this point, anybody who finds this
2178		 * buffer ought to notice that it's legit but has a pending I/O.
2179		 */
2180
2181		if (GHOST_STATE(hdr->b_state))
2182			arc_access(hdr, hash_lock);
2183		mutex_exit(hash_lock);
2184
2185		ASSERT3U(hdr->b_size, ==, size);
2186		DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
2187		    zbookmark_t *, zb);
2188		ARCSTAT_BUMP(arcstat_misses);
2189		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2190		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2191		    data, metadata, misses);
2192
2193		rzio = zio_read(pio, spa, bp, buf->b_data, size,
2194		    arc_read_done, buf, priority, flags, zb);
2195
2196		if (*arc_flags & ARC_WAIT)
2197			return (zio_wait(rzio));
2198
2199		ASSERT(*arc_flags & ARC_NOWAIT);
2200		zio_nowait(rzio);
2201	}
2202	return (0);
2203}
2204
2205/*
2206 * arc_read() variant to support pool traversal.  If the block is already
2207 * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
2208 * The idea is that we don't want pool traversal filling up memory, but
2209 * if the ARC already has the data anyway, we shouldn't pay for the I/O.
2210 */
2211int
2212arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
2213{
2214	arc_buf_hdr_t *hdr;
2215	kmutex_t *hash_mtx;
2216	int rc = 0;
2217
2218	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
2219
2220	if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
2221		arc_buf_t *buf = hdr->b_buf;
2222
2223		ASSERT(buf);
2224		while (buf->b_data == NULL) {
2225			buf = buf->b_next;
2226			ASSERT(buf);
2227		}
2228		bcopy(buf->b_data, data, hdr->b_size);
2229	} else {
2230		rc = ENOENT;
2231	}
2232
2233	if (hash_mtx)
2234		mutex_exit(hash_mtx);
2235
2236	return (rc);
2237}
2238
2239void
2240arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
2241{
2242	ASSERT(buf->b_hdr != NULL);
2243	ASSERT(buf->b_hdr->b_state != arc_anon);
2244	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
2245	buf->b_efunc = func;
2246	buf->b_private = private;
2247}
2248
2249/*
2250 * This is used by the DMU to let the ARC know that a buffer is
2251 * being evicted, so the ARC should clean up.  If this arc buf
2252 * is not yet in the evicted state, it will be put there.
2253 */
2254int
2255arc_buf_evict(arc_buf_t *buf)
2256{
2257	arc_buf_hdr_t *hdr;
2258	kmutex_t *hash_lock;
2259	arc_buf_t **bufp;
2260
2261	mutex_enter(&arc_eviction_mtx);
2262	hdr = buf->b_hdr;
2263	if (hdr == NULL) {
2264		/*
2265		 * We are in arc_do_user_evicts().
2266		 */
2267		ASSERT(buf->b_data == NULL);
2268		mutex_exit(&arc_eviction_mtx);
2269		return (0);
2270	}
2271	hash_lock = HDR_LOCK(hdr);
2272	mutex_exit(&arc_eviction_mtx);
2273
2274	mutex_enter(hash_lock);
2275
2276	if (buf->b_data == NULL) {
2277		/*
2278		 * We are on the eviction list.
2279		 */
2280		mutex_exit(hash_lock);
2281		mutex_enter(&arc_eviction_mtx);
2282		if (buf->b_hdr == NULL) {
2283			/*
2284			 * We are already in arc_do_user_evicts().
2285			 */
2286			mutex_exit(&arc_eviction_mtx);
2287			return (0);
2288		} else {
2289			arc_buf_t copy = *buf; /* structure assignment */
2290			/*
2291			 * Process this buffer now
2292			 * but let arc_do_user_evicts() do the reaping.
2293			 */
2294			buf->b_efunc = NULL;
2295			mutex_exit(&arc_eviction_mtx);
2296			VERIFY(copy.b_efunc(&copy) == 0);
2297			return (1);
2298		}
2299	}
2300
2301	ASSERT(buf->b_hdr == hdr);
2302	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
2303	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2304
2305	/*
2306	 * Pull this buffer off of the hdr
2307	 */
2308	bufp = &hdr->b_buf;
2309	while (*bufp != buf)
2310		bufp = &(*bufp)->b_next;
2311	*bufp = buf->b_next;
2312
2313	ASSERT(buf->b_data != NULL);
2314	arc_buf_destroy(buf, FALSE, FALSE);
2315
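	/*
	 * If that was the last data buffer for this header, move the
	 * header to the matching ghost state so its access history is
	 * preserved.
	 */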
2316	if (hdr->b_datacnt == 0) {
2317		arc_state_t *old_state = hdr->b_state;
2318		arc_state_t *evicted_state;
2319
2320		ASSERT(refcount_is_zero(&hdr->b_refcnt));
2321
2322		evicted_state =
2323		    (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2324
2325		mutex_enter(&old_state->arcs_mtx);
2326		mutex_enter(&evicted_state->arcs_mtx);
2327
2328		arc_change_state(evicted_state, hdr, hash_lock);
2329		ASSERT(HDR_IN_HASH_TABLE(hdr));
2330		hdr->b_flags = ARC_IN_HASH_TABLE;
2331
2332		mutex_exit(&evicted_state->arcs_mtx);
2333		mutex_exit(&old_state->arcs_mtx);
2334	}
2335	mutex_exit(hash_lock);
2336
2337	VERIFY(buf->b_efunc(buf) == 0);
2338	buf->b_efunc = NULL;
2339	buf->b_private = NULL;
2340	buf->b_hdr = NULL;
2341	kmem_cache_free(buf_cache, buf);
2342	return (1);
2343}
2344
2345/*
2346 * Release this buffer from the cache.  This must be done
2347 * after a read and prior to modifying the buffer contents.
2348 * If the buffer has more than one reference, we must make
2349 * a new hdr for the buffer.
2350 */
2351void
2352arc_release(arc_buf_t *buf, void *tag)
2353{
2354	arc_buf_hdr_t *hdr = buf->b_hdr;
2355	kmutex_t *hash_lock = HDR_LOCK(hdr);
2356
2357	/* this buffer is not on any list */
2358	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
2359
2360	if (hdr->b_state == arc_anon) {
2361		/* this buffer is already released */
2362		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
2363		ASSERT(BUF_EMPTY(hdr));
2364		ASSERT(buf->b_efunc == NULL);
2365		arc_buf_thaw(buf);
2366		return;
2367	}
2368
2369	mutex_enter(hash_lock);
2370
2371	/*
2372	 * Do we have more than one buf?
2373	 */
2374	if (hdr->b_buf != buf || buf->b_next != NULL) {
2375		arc_buf_hdr_t *nhdr;
2376		arc_buf_t **bufp;
2377		uint64_t blksz = hdr->b_size;
2378		spa_t *spa = hdr->b_spa;
2379		arc_buf_contents_t type = hdr->b_type;
2380
2381		ASSERT(hdr->b_datacnt > 1);
2382		/*
2383		 * Pull this buf off of the hdr and attach it to a new
2384		 * anonymous hdr.
2385		 */
2386		(void) remove_reference(hdr, hash_lock, tag);
2387		bufp = &hdr->b_buf;
2388		while (*bufp != buf)
2389			bufp = &(*bufp)->b_next;
2390		*bufp = (*bufp)->b_next;
2391		buf->b_next = NULL;
2392
2393		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
2394		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
2395		if (refcount_is_zero(&hdr->b_refcnt)) {
2396			ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size);
2397			atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size);
2398		}
2399		hdr->b_datacnt -= 1;
2400		arc_cksum_verify(buf);
2401
2402		mutex_exit(hash_lock);
2403
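		/*
		 * Give this buf a brand new anonymous hdr; the original
		 * hdr keeps its identity and the remaining bufs.
		 */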
2404		nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
2405		nhdr->b_size = blksz;
2406		nhdr->b_spa = spa;
2407		nhdr->b_type = type;
2408		nhdr->b_buf = buf;
2409		nhdr->b_state = arc_anon;
2410		nhdr->b_arc_access = 0;
2411		nhdr->b_flags = 0;
2412		nhdr->b_datacnt = 1;
2413		nhdr->b_freeze_cksum = NULL;
2414		mutex_init(&nhdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
2415		(void) refcount_add(&nhdr->b_refcnt, tag);
2416		buf->b_hdr = nhdr;
2417		atomic_add_64(&arc_anon->arcs_size, blksz);
2418
2419		hdr = nhdr;
2420	} else {
2421		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
2422		ASSERT(!list_link_active(&hdr->b_arc_node));
2423		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2424		arc_change_state(arc_anon, hdr, hash_lock);
2425		hdr->b_arc_access = 0;
2426		mutex_exit(hash_lock);
2427		bzero(&hdr->b_dva, sizeof (dva_t));
2428		hdr->b_birth = 0;
2429		hdr->b_cksum0 = 0;
2430		arc_buf_thaw(buf);
2431	}
2432	buf->b_efunc = NULL;
2433	buf->b_private = NULL;
2434}
2435
2436int
2437arc_released(arc_buf_t *buf)
2438{
2439	return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
2440}
2441
2442int
2443arc_has_callback(arc_buf_t *buf)
2444{
2445	return (buf->b_efunc != NULL);
2446}
2447
2448#ifdef ZFS_DEBUG
2449int
2450arc_referenced(arc_buf_t *buf)
2451{
2452	return (refcount_count(&buf->b_hdr->b_refcnt));
2453}
2454#endif
2455
2456static void
2457arc_write_ready(zio_t *zio)
2458{
2459	arc_write_callback_t *callback = zio->io_private;
2460	arc_buf_t *buf = callback->awcb_buf;
2461
2462	if (callback->awcb_ready) {
2463		ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
2464		callback->awcb_ready(zio, buf, callback->awcb_private);
2465	}
2466	arc_cksum_compute(buf);
2467}
2468
2469static void
2470arc_write_done(zio_t *zio)
2471{
2472	arc_write_callback_t *callback = zio->io_private;
2473	arc_buf_t *buf = callback->awcb_buf;
2474	arc_buf_hdr_t *hdr = buf->b_hdr;
2475
2476	hdr->b_acb = NULL;
2477
2478	/* this buffer is on no lists and is not in the hash table */
2479	ASSERT3P(hdr->b_state, ==, arc_anon);
2480
2481	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
2482	hdr->b_birth = zio->io_bp->blk_birth;
2483	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
2484	/*
2485	 * If the block to be written was all-zero, we may have
2486	 * compressed it away.  In this case no write was performed,
2487	 * so there will be no dva/birth-date/checksum.  The buffer
2488	 * must therefore remain anonymous (and uncached).
2489	 */
2490	if (!BUF_EMPTY(hdr)) {
2491		arc_buf_hdr_t *exists;
2492		kmutex_t *hash_lock;
2493
2494		arc_cksum_verify(buf);
2495
2496		exists = buf_hash_insert(hdr, &hash_lock);
2497		if (exists) {
2498			/*
2499			 * This can only happen if we overwrite for
2500			 * sync-to-convergence, because we remove
2501			 * buffers from the hash table when we arc_free().
2502			 */
2503			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
2504			    BP_IDENTITY(zio->io_bp)));
2505			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
2506			    zio->io_bp->blk_birth);
2507
2508			ASSERT(refcount_is_zero(&exists->b_refcnt));
2509			arc_change_state(arc_anon, exists, hash_lock);
2510			mutex_exit(hash_lock);
2511			arc_hdr_destroy(exists);
2512			exists = buf_hash_insert(hdr, &hash_lock);
2513			ASSERT3P(exists, ==, NULL);
2514		}
2515		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2516		arc_access(hdr, hash_lock);
2517		mutex_exit(hash_lock);
2518	} else if (callback->awcb_done == NULL) {
2519		int destroy_hdr;
2520		/*
2521		 * This is an anonymous buffer with no user callback;
2522		 * destroy it if there are no active references.
2523		 */
2524		mutex_enter(&arc_eviction_mtx);
2525		destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
2526		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2527		mutex_exit(&arc_eviction_mtx);
2528		if (destroy_hdr)
2529			arc_hdr_destroy(hdr);
2530	} else {
2531		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2532	}
2533
2534	if (callback->awcb_done) {
2535		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
2536		callback->awcb_done(zio, buf, callback->awcb_private);
2537	}
2538
2539	kmem_free(callback, sizeof (arc_write_callback_t));
2540}
2541
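/*
 * Write out an anonymous arc buffer.  arc_write_ready() invokes the
 * caller's "ready" callback and computes the buffer checksum;
 * arc_write_done() (above) inserts the header into the hash table once
 * the write completes, unless the block was compressed away to nothing.
 */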
2542zio_t *
2543arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
2544    uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
2545    arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
2546    int flags, zbookmark_t *zb)
2547{
2548	arc_buf_hdr_t *hdr = buf->b_hdr;
2549	arc_write_callback_t *callback;
2550	zio_t	*zio;
2551
2552	/* this is a private buffer - no locking required */
2553	ASSERT3P(hdr->b_state, ==, arc_anon);
2554	ASSERT(BUF_EMPTY(hdr));
2555	ASSERT(!HDR_IO_ERROR(hdr));
2556	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
2557	ASSERT(hdr->b_acb == 0);
2558	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
2559	callback->awcb_ready = ready;
2560	callback->awcb_done = done;
2561	callback->awcb_private = private;
2562	callback->awcb_buf = buf;
2563	hdr->b_flags |= ARC_IO_IN_PROGRESS;
2564	zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
2565	    buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback,
2566	    priority, flags, zb);
2567
2568	return (zio);
2569}
2570
2571int
2572arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
2573    zio_done_func_t *done, void *private, uint32_t arc_flags)
2574{
2575	arc_buf_hdr_t *ab;
2576	kmutex_t *hash_lock;
2577	zio_t	*zio;
2578
2579	/*
2580	 * If this buffer is in the cache, release it, so it
2581	 * can be re-used.
2582	 */
2583	ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
2584	if (ab != NULL) {
2585		/*
2586		 * The checksum of blocks to free is not always
2587		 * preserved (eg. on the deadlist).  However, if it is
2588		 * preserved (e.g. on the deadlist).  However, if it is
2589		 */
2590		ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
2591		    ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
2592		if (ab->b_state != arc_anon)
2593			arc_change_state(arc_anon, ab, hash_lock);
2594		if (HDR_IO_IN_PROGRESS(ab)) {
2595			/*
2596			 * This should only happen when we prefetch.
2597			 */
2598			ASSERT(ab->b_flags & ARC_PREFETCH);
2599			ASSERT3U(ab->b_datacnt, ==, 1);
2600			ab->b_flags |= ARC_FREED_IN_READ;
2601			if (HDR_IN_HASH_TABLE(ab))
2602				buf_hash_remove(ab);
2603			ab->b_arc_access = 0;
2604			bzero(&ab->b_dva, sizeof (dva_t));
2605			ab->b_birth = 0;
2606			ab->b_cksum0 = 0;
2607			ab->b_buf->b_efunc = NULL;
2608			ab->b_buf->b_private = NULL;
2609			mutex_exit(hash_lock);
2610		} else if (refcount_is_zero(&ab->b_refcnt)) {
2611			mutex_exit(hash_lock);
2612			arc_hdr_destroy(ab);
2613			ARCSTAT_BUMP(arcstat_deleted);
2614		} else {
2615			/*
2616			 * We still have an active reference on this
2617			 * buffer.  This can happen, e.g., from
2618			 * dbuf_unoverride().
2619			 */
2620			ASSERT(!HDR_IN_HASH_TABLE(ab));
2621			ab->b_arc_access = 0;
2622			bzero(&ab->b_dva, sizeof (dva_t));
2623			ab->b_birth = 0;
2624			ab->b_cksum0 = 0;
2625			ab->b_buf->b_efunc = NULL;
2626			ab->b_buf->b_private = NULL;
2627			mutex_exit(hash_lock);
2628		}
2629	}
2630
2631	zio = zio_free(pio, spa, txg, bp, done, private);
2632
2633	if (arc_flags & ARC_WAIT)
2634		return (zio_wait(zio));
2635
2636	ASSERT(arc_flags & ARC_NOWAIT);
2637	zio_nowait(zio);
2638
2639	return (0);
2640}
2641
2642void
2643arc_tempreserve_clear(uint64_t tempreserve)
2644{
2645	atomic_add_64(&arc_tempreserve, -tempreserve);
2646	ASSERT((int64_t)arc_tempreserve >= 0);
2647}
2648
2649int
2650arc_tempreserve_space(uint64_t tempreserve)
2651{
2652#ifdef ZFS_DEBUG
2653	/*
2654	 * Once in a while, fail for no reason.  Everything should cope.
2655	 */
2656	if (spa_get_random(10000) == 0) {
2657		dprintf("forcing random failure\n");
2658		return (ERESTART);
2659	}
2660#endif
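	/*
	 * A large reservation may grow the target cache size (never past
	 * arc_c_max); a reservation larger than the entire cache cannot
	 * be satisfied.
	 */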
2661	if (tempreserve > arc_c/4 && !arc_no_grow)
2662		arc_c = MIN(arc_c_max, tempreserve * 4);
2663	if (tempreserve > arc_c)
2664		return (ENOMEM);
2665
2666	/*
2667	 * Throttle writes when the amount of dirty data in the cache
2668	 * gets too large.  We try to keep the cache less than half full
2669	 * of dirty blocks so that our sync times don't grow too large.
2670	 * Note: if two requests come in concurrently, we might let them
2671	 * both succeed, when one of them should fail.  Not a huge deal.
2672	 *
2673	 * XXX The limit should be adjusted dynamically to keep the time
2674	 * to sync a dataset fixed (around 1-5 seconds?).
2675	 */
2676
2677	if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
2678	    arc_tempreserve + arc_anon->arcs_size > arc_c / 4) {
2679		dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
2680		    "tempreserve=%lluK arc_c=%lluK\n",
2681		    arc_tempreserve>>10, arc_anon->arcs_lsize>>10,
2682		    tempreserve>>10, arc_c>>10);
2683		return (ERESTART);
2684	}
2685	atomic_add_64(&arc_tempreserve, tempreserve);
2686	return (0);
2687}
2688
2689#ifdef _KERNEL
2690static eventhandler_tag zfs_event_lowmem = NULL;
2691
2692static void
2693zfs_lowmem(void *arg __unused, int howto __unused)
2694{
2695
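	/*
	 * Low-memory event from the VM system: flag the reclaim thread
	 * and wait, polling a few times per second, until zfs_needfree
	 * has been cleared.
	 */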
2696	zfs_needfree = 1;
2697	cv_signal(&arc_reclaim_thr_cv);
2698	while (zfs_needfree)
2699		tsleep(&zfs_needfree, 0, "zfs:lowmem", hz / 5);
2700}
2701#endif
2702
2703void
2704arc_init(void)
2705{
2706	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
2707	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
2708
2709	/* Convert seconds to clock ticks */
2710	arc_min_prefetch_lifespan = 1 * hz;
2711
2712	/* Start out with 1/8 of all memory */
2713	arc_c = physmem * PAGESIZE / 8;
2714#if 0
2715#ifdef _KERNEL
2716	/*
2717	 * On architectures where the physical memory can be larger
2718	 * than the addressable space (Intel in 32-bit mode), we may
2719	 * need to limit the cache to 1/8 of VM size.
2720	 */
2721	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
2722#endif
2723#endif
2724	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
2725	arc_c_min = MAX(arc_c / 4, 64<<20);
2726	/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
2727	if (arc_c * 8 >= 1<<30)
2728		arc_c_max = (arc_c * 8) - (1<<30);
2729	else
2730		arc_c_max = arc_c_min;
2731	arc_c_max = MAX(arc_c * 6, arc_c_max);
2732#ifdef _KERNEL
2733	/*
2734	 * Allow the tunables to override our calculations if they are
2735	 * reasonable (i.e. over 64MB)
2736	 */
2737	if (zfs_arc_max > 64<<20 && zfs_arc_max < vm_kmem_size)
2738		arc_c_max = zfs_arc_max;
2739	if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
2740		arc_c_min = zfs_arc_min;
2741#endif
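	/* initially run the cache at its maximum size, with arc_p at the midpoint */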
2742	arc_c = arc_c_max;
2743	arc_p = (arc_c >> 1);
2744
2745	/* if kmem_flags are set, let's try to use less memory */
2746	if (kmem_debugging())
2747		arc_c = arc_c / 2;
2748	if (arc_c < arc_c_min)
2749		arc_c = arc_c_min;
2750
2751	zfs_arc_min = arc_c_min;
2752	zfs_arc_max = arc_c_max;
2753
2754	arc_anon = &ARC_anon;
2755	arc_mru = &ARC_mru;
2756	arc_mru_ghost = &ARC_mru_ghost;
2757	arc_mfu = &ARC_mfu;
2758	arc_mfu_ghost = &ARC_mfu_ghost;
2759	arc_size = 0;
2760
2761	mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
2762	mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
2763	mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
2764	mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
2765	mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
2766
2767	list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t),
2768	    offsetof(arc_buf_hdr_t, b_arc_node));
2769	list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t),
2770	    offsetof(arc_buf_hdr_t, b_arc_node));
2771	list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t),
2772	    offsetof(arc_buf_hdr_t, b_arc_node));
2773	list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t),
2774	    offsetof(arc_buf_hdr_t, b_arc_node));
2775
2776	buf_init();
2777
2778	arc_thread_exit = 0;
2779	arc_eviction_list = NULL;
2780	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
2781	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
2782
2783	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
2784	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
2785
2786	if (arc_ksp != NULL) {
2787		arc_ksp->ks_data = &arc_stats;
2788		kstat_install(arc_ksp);
2789	}
2790
2791	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
2792	    TS_RUN, minclsyspri);
2793
2794#ifdef _KERNEL
2795	zfs_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, zfs_lowmem, NULL,
2796	    EVENTHANDLER_PRI_FIRST);
2797#endif
2798
2799	arc_dead = FALSE;
2800}
2801
2802void
2803arc_fini(void)
2804{
2805	mutex_enter(&arc_reclaim_thr_lock);
2806	arc_thread_exit = 1;
2807	cv_signal(&arc_reclaim_thr_cv);
2808	while (arc_thread_exit != 0)
2809		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
2810	mutex_exit(&arc_reclaim_thr_lock);
2811
2812	arc_flush();
2813
2814	arc_dead = TRUE;
2815
2816	if (arc_ksp != NULL) {
2817		kstat_delete(arc_ksp);
2818		arc_ksp = NULL;
2819	}
2820
2821	mutex_destroy(&arc_eviction_mtx);
2822	mutex_destroy(&arc_reclaim_thr_lock);
2823	cv_destroy(&arc_reclaim_thr_cv);
2824
2825	list_destroy(&arc_mru->arcs_list);
2826	list_destroy(&arc_mru_ghost->arcs_list);
2827	list_destroy(&arc_mfu->arcs_list);
2828	list_destroy(&arc_mfu_ghost->arcs_list);
2829
2830	mutex_destroy(&arc_anon->arcs_mtx);
2831	mutex_destroy(&arc_mru->arcs_mtx);
2832	mutex_destroy(&arc_mru_ghost->arcs_mtx);
2833	mutex_destroy(&arc_mfu->arcs_mtx);
2834	mutex_destroy(&arc_mfu_ghost->arcs_mtx);
2835
2836	buf_fini();
2837
2838#ifdef _KERNEL
2839	if (zfs_event_lowmem != NULL)
2840		EVENTHANDLER_DEREGISTER(vm_lowmem, zfs_event_lowmem);
2841#endif
2842}
2843