arc.c revision 168404
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28/*
29 * DVA-based Adjustable Replacement Cache
30 *
31 * While much of the theory of operation used here is
32 * based on the self-tuning, low overhead replacement cache
33 * presented by Megiddo and Modha at FAST 2003, there are some
34 * significant differences:
35 *
36 * 1. The Megiddo and Modha model assumes any page is evictable.
37 * Pages in its cache cannot be "locked" into memory.  This makes
38 * the eviction algorithm simple: evict the last page in the list.
39 * This also makes the performance characteristics easy to reason
40 * about.  Our cache is not so simple.  At any given moment, some
41 * subset of the blocks in the cache are un-evictable because we
42 * have handed out a reference to them.  Blocks are only evictable
43 * when there are no external references active.  This makes
44 * eviction far more problematic:  we choose to evict the evictable
45 * blocks that are the "lowest" in the list.
46 *
47 * There are times when it is not possible to evict the requested
48 * space.  In these circumstances we are unable to adjust the cache
49 * size.  To prevent the cache from growing unbounded at these times we
50 * implement a "cache throttle" that slows the flow of new data
51 * into the cache until we can make space available.
52 *
53 * 2. The Megiddo and Modha model assumes a fixed cache size.
54 * Pages are evicted when the cache is full and there is a cache
55 * miss.  Our model has a variable sized cache.  It grows with
56 * high use, but also tries to react to memory pressure from the
57 * operating system: decreasing its size when system memory is
58 * tight.
59 *
60 * 3. The Megiddo and Modha model assumes a fixed page size. All
61 * elements of the cache are therefore exactly the same size.  So
62 * when adjusting the cache size following a cache miss, it's simply
63 * a matter of choosing a single page to evict.  In our model, we
64 * have variable sized cache blocks (ranging from 512 bytes to
65 * 128K bytes).  We therefore choose a set of blocks to evict to make
66 * space for a cache miss that approximates as closely as possible
67 * the space used by the new block.
68 *
69 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
70 * by N. Megiddo & D. Modha, FAST 2003
71 */
72
73/*
74 * The locking model:
75 *
76 * A new reference to a cache buffer can be obtained in two
77 * ways: 1) via a hash table lookup using the DVA as a key,
78 * or 2) via one of the ARC lists.  The arc_read() interface
79 * uses method 1, while the internal arc algorithms for
80 * adjusting the cache use method 2.  We therefore provide two
81 * types of locks: 1) the hash table lock array, and 2) the
82 * arc list locks.
83 *
84 * Buffers do not have their own mutexes; rather, they rely on the
85 * hash table mutexes for the bulk of their protection (i.e. most
86 * fields in the arc_buf_hdr_t are protected by these mutexes).
87 *
88 * buf_hash_find() returns the appropriate mutex (held) when it
89 * locates the requested buffer in the hash table.  It returns
90 * NULL for the mutex if the buffer was not in the table.
91 *
92 * buf_hash_remove() expects the appropriate hash mutex to be
93 * already held before it is invoked.
94 *
95 * Each arc state also has a mutex which is used to protect the
96 * buffer list associated with the state.  When attempting to
97 * obtain a hash table lock while holding an arc list lock you
98 * must use mutex_tryenter() to avoid deadlock.  Also note that
99 * the active state mutex must be held before the ghost state mutex.
100 *
101 * Arc buffers may have an associated eviction callback function.
102 * This function will be invoked prior to removing the buffer (e.g.
103 * in arc_do_user_evicts()).  Note however that the data associated
104 * with the buffer may be evicted prior to the callback.  The callback
105 * must be made with *no locks held* (to prevent deadlock).  Additionally,
106 * the users of callbacks must ensure that their private data is
107 * protected from simultaneous callbacks from arc_buf_evict()
108 * and arc_do_user_evicts().
109 *
110 * Note that the majority of the performance stats are manipulated
111 * with atomic operations.
112 */
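/*
 * A hedged usage sketch (not part of the build) of the locking rule stated
 * above for taking a hash table lock while an arc list lock is already
 * held; the eviction path below (arc_evict()) follows this pattern:
 *
 *	hash_lock = HDR_LOCK(ab);
 *	if (MUTEX_HELD(hash_lock) || mutex_tryenter(hash_lock)) {
 *		... operate on ab, drop hash_lock if we took it ...
 *	} else {
 *		... skip this buffer; blocking here could deadlock ...
 *	}
 */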
113
114#include <sys/spa.h>
115#include <sys/zio.h>
116#include <sys/zio_checksum.h>
117#include <sys/zfs_context.h>
118#include <sys/arc.h>
119#include <sys/refcount.h>
120#ifdef _KERNEL
121#include <sys/dnlc.h>
122#endif
123#include <sys/callb.h>
124#include <sys/kstat.h>
125#include <sys/sdt.h>
126
127#define	ARC_FREE_AT_ONCE	4194304
128
129static kmutex_t		arc_reclaim_thr_lock;
130static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
131static uint8_t		arc_thread_exit;
132
133#define	ARC_REDUCE_DNLC_PERCENT	3
134uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
135
136typedef enum arc_reclaim_strategy {
137	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
138	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
139} arc_reclaim_strategy_t;
140
141/* number of seconds before growing cache again */
142static int		arc_grow_retry = 60;
143
144/*
145 * minimum lifespan of a prefetch block in clock ticks
146 * (initialized in arc_init())
147 */
148static int		arc_min_prefetch_lifespan;
149
150static int arc_dead;
151
152/*
153 * These tunables are for performance analysis.
154 */
155uint64_t zfs_arc_max;
156uint64_t zfs_arc_min;
157
158/*
159 * Note that buffers can be in one of 5 states:
160 *	ARC_anon	- anonymous (discussed below)
161 *	ARC_mru		- recently used, currently cached
162 *	ARC_mru_ghost	- recently used, no longer in cache
163 *	ARC_mfu		- frequently used, currently cached
164 *	ARC_mfu_ghost	- frequently used, no longer in cache
165 * When there are no active references to a buffer, it is
166 * linked onto one of the lists in arc.  These are the
167 * only buffers that can be evicted or deleted.
168 *
169 * Anonymous buffers are buffers that are not associated with
170 * a DVA.  These are buffers that hold dirty block copies
171 * before they are written to stable storage.  By definition,
172 * they are "ref'd" and are considered part of arc_mru
173 * that cannot be freed.  Generally, they will acquire a DVA
174 * as they are written and migrate onto the arc_mru list.
175 */
176
177typedef struct arc_state {
178	list_t	arcs_list;	/* linked list of evictable buffers in state */
179	uint64_t arcs_lsize;	/* total size of buffers in the linked list */
180	uint64_t arcs_size;	/* total size of all buffers in this state */
181	kmutex_t arcs_mtx;
182} arc_state_t;
183
184/* The 5 states: */
185static arc_state_t ARC_anon;
186static arc_state_t ARC_mru;
187static arc_state_t ARC_mru_ghost;
188static arc_state_t ARC_mfu;
189static arc_state_t ARC_mfu_ghost;
190
191typedef struct arc_stats {
192	kstat_named_t arcstat_hits;
193	kstat_named_t arcstat_misses;
194	kstat_named_t arcstat_demand_data_hits;
195	kstat_named_t arcstat_demand_data_misses;
196	kstat_named_t arcstat_demand_metadata_hits;
197	kstat_named_t arcstat_demand_metadata_misses;
198	kstat_named_t arcstat_prefetch_data_hits;
199	kstat_named_t arcstat_prefetch_data_misses;
200	kstat_named_t arcstat_prefetch_metadata_hits;
201	kstat_named_t arcstat_prefetch_metadata_misses;
202	kstat_named_t arcstat_mru_hits;
203	kstat_named_t arcstat_mru_ghost_hits;
204	kstat_named_t arcstat_mfu_hits;
205	kstat_named_t arcstat_mfu_ghost_hits;
206	kstat_named_t arcstat_deleted;
207	kstat_named_t arcstat_recycle_miss;
208	kstat_named_t arcstat_mutex_miss;
209	kstat_named_t arcstat_evict_skip;
210	kstat_named_t arcstat_hash_elements;
211	kstat_named_t arcstat_hash_elements_max;
212	kstat_named_t arcstat_hash_collisions;
213	kstat_named_t arcstat_hash_chains;
214	kstat_named_t arcstat_hash_chain_max;
215	kstat_named_t arcstat_p;
216	kstat_named_t arcstat_c;
217	kstat_named_t arcstat_c_min;
218	kstat_named_t arcstat_c_max;
219	kstat_named_t arcstat_size;
220} arc_stats_t;
221
222static arc_stats_t arc_stats = {
223	{ "hits",			KSTAT_DATA_UINT64 },
224	{ "misses",			KSTAT_DATA_UINT64 },
225	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
226	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
227	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
228	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
229	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
230	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
231	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
232	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
233	{ "mru_hits",			KSTAT_DATA_UINT64 },
234	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
235	{ "mfu_hits",			KSTAT_DATA_UINT64 },
236	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
237	{ "deleted",			KSTAT_DATA_UINT64 },
238	{ "recycle_miss",		KSTAT_DATA_UINT64 },
239	{ "mutex_miss",			KSTAT_DATA_UINT64 },
240	{ "evict_skip",			KSTAT_DATA_UINT64 },
241	{ "hash_elements",		KSTAT_DATA_UINT64 },
242	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
243	{ "hash_collisions",		KSTAT_DATA_UINT64 },
244	{ "hash_chains",		KSTAT_DATA_UINT64 },
245	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
246	{ "p",				KSTAT_DATA_UINT64 },
247	{ "c",				KSTAT_DATA_UINT64 },
248	{ "c_min",			KSTAT_DATA_UINT64 },
249	{ "c_max",			KSTAT_DATA_UINT64 },
250	{ "size",			KSTAT_DATA_UINT64 }
251};
252
253#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
254
255#define	ARCSTAT_INCR(stat, val) \
256	atomic_add_64(&arc_stats.stat.value.ui64, (val));
257
258#define	ARCSTAT_BUMP(stat) 	ARCSTAT_INCR(stat, 1)
259#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
260
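/*
 * ARCSTAT_MAX() raises a maximum without a lock: re-read the current
 * value and retry the compare-and-swap until either our value is no
 * longer the larger one or the swap succeeds (another thread may be
 * racing to raise the same maximum).
 */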
261#define	ARCSTAT_MAX(stat, val) {					\
262	uint64_t m;							\
263	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
264	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
265		continue;						\
266}
267
268#define	ARCSTAT_MAXSTAT(stat) \
269	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
270
271/*
272 * We define a macro to allow ARC hits/misses to be easily broken down by
273 * two separate conditions, giving a total of four different subtypes for
274 * each of hits and misses (so eight statistics total).
275 */
276#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
277	if (cond1) {							\
278		if (cond2) {						\
279			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
280		} else {						\
281			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
282		}							\
283	} else {							\
284		if (cond2) {						\
285			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
286		} else {						\
287			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
288		}							\
289	}
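/*
 * For example (a sketch mirroring the real call in arc_buf_add_ref()
 * below), a cache hit is classified with:
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 *
 * which bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
 * arcstat_prefetch_metadata_hits.
 */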
290
291kstat_t			*arc_ksp;
292static arc_state_t 	*arc_anon;
293static arc_state_t	*arc_mru;
294static arc_state_t	*arc_mru_ghost;
295static arc_state_t	*arc_mfu;
296static arc_state_t	*arc_mfu_ghost;
297
298/*
299 * There are several ARC variables that are critical to export as kstats --
300 * but we don't want to have to grovel around in the kstat whenever we wish to
301 * manipulate them.  For these variables, we therefore define them to be in
302 * terms of the statistic variable.  This assures that we are not introducing
303 * the possibility of inconsistency by having shadow copies of the variables,
304 * while still allowing the code to be readable.
305 */
306#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
307#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
308#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
309#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
310#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
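/*
 * For example, code below updates the total with
 * atomic_add_64(&arc_size, size); because arc_size is just the "size"
 * kstat value, the change is visible to kstat readers with no separate
 * bookkeeping.
 */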
311
312static int		arc_no_grow;	/* Don't try to grow cache size */
313static uint64_t		arc_tempreserve;
314
315typedef struct arc_callback arc_callback_t;
316
317struct arc_callback {
318	void			*acb_private;
319	arc_done_func_t		*acb_done;
320	arc_byteswap_func_t	*acb_byteswap;
321	arc_buf_t		*acb_buf;
322	zio_t			*acb_zio_dummy;
323	arc_callback_t		*acb_next;
324};
325
326typedef struct arc_write_callback arc_write_callback_t;
327
328struct arc_write_callback {
329	void		*awcb_private;
330	arc_done_func_t	*awcb_ready;
331	arc_done_func_t	*awcb_done;
332	arc_buf_t	*awcb_buf;
333};
334
335struct arc_buf_hdr {
336	/* protected by hash lock */
337	dva_t			b_dva;
338	uint64_t		b_birth;
339	uint64_t		b_cksum0;
340
341	kmutex_t		b_freeze_lock;
342	zio_cksum_t		*b_freeze_cksum;
343
344	arc_buf_hdr_t		*b_hash_next;
345	arc_buf_t		*b_buf;
346	uint32_t		b_flags;
347	uint32_t		b_datacnt;
348
349	arc_callback_t		*b_acb;
350	kcondvar_t		b_cv;
351
352	/* immutable */
353	arc_buf_contents_t	b_type;
354	uint64_t		b_size;
355	spa_t			*b_spa;
356
357	/* protected by arc state mutex */
358	arc_state_t		*b_state;
359	list_node_t		b_arc_node;
360
361	/* updated atomically */
362	clock_t			b_arc_access;
363
364	/* self protecting */
365	refcount_t		b_refcnt;
366};
367
368static arc_buf_t *arc_eviction_list;
369static kmutex_t arc_eviction_mtx;
370static arc_buf_hdr_t arc_eviction_hdr;
371static void arc_get_data_buf(arc_buf_t *buf);
372static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
373
374#define	GHOST_STATE(state)	\
375	((state) == arc_mru_ghost || (state) == arc_mfu_ghost)
376
377/*
378 * Private ARC flags.  These are ARC-internal flags that show up
379 * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared and can
380 * be passed in as arc_flags in things like arc_read.  The flags below,
381 * however, should never be passed in and should only be set by ARC code.
382 * When adding new public flags, make sure not to smash the private ones.
383 */
384
385#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
386#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
387#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
388#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
389#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
390#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
391
392#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
393#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
394#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
395#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
396#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
397
398/*
399 * Hash table routines
400 */
401
402#define	HT_LOCK_PAD	128
403
404struct ht_lock {
405	kmutex_t	ht_lock;
406#ifdef _KERNEL
407	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
408#endif
409};
410
411#define	BUF_LOCKS 256
412typedef struct buf_hash_table {
413	uint64_t ht_mask;
414	arc_buf_hdr_t **ht_table;
415	struct ht_lock ht_locks[BUF_LOCKS];
416} buf_hash_table_t;
417
418static buf_hash_table_t buf_hash_table;
419
420#define	BUF_HASH_INDEX(spa, dva, birth) \
421	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
422#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
423#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
424#define	HDR_LOCK(buf) \
425	(BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
426
427uint64_t zfs_crc64_table[256];
428
429static uint64_t
430buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
431{
432	uintptr_t spav = (uintptr_t)spa;
433	uint8_t *vdva = (uint8_t *)dva;
434	uint64_t crc = -1ULL;
435	int i;
436
437	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
438
439	for (i = 0; i < sizeof (dva_t); i++)
440		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
441
442	crc ^= (spav>>8) ^ birth;
443
444	return (crc);
445}
446
447#define	BUF_EMPTY(buf)						\
448	((buf)->b_dva.dva_word[0] == 0 &&			\
449	(buf)->b_dva.dva_word[1] == 0 &&			\
450	(buf)->b_birth == 0)
451
452#define	BUF_EQUAL(spa, dva, birth, buf)				\
453	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
454	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
455	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
456
457static arc_buf_hdr_t *
458buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
459{
460	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
461	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
462	arc_buf_hdr_t *buf;
463
464	mutex_enter(hash_lock);
465	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
466	    buf = buf->b_hash_next) {
467		if (BUF_EQUAL(spa, dva, birth, buf)) {
468			*lockp = hash_lock;
469			return (buf);
470		}
471	}
472	mutex_exit(hash_lock);
473	*lockp = NULL;
474	return (NULL);
475}
476
477/*
478 * Insert an entry into the hash table.  If there is already an element
479 * equal to elem in the hash table, then the already existing element
480 * will be returned and the new element will not be inserted.
481 * Otherwise returns NULL.
482 */
483static arc_buf_hdr_t *
484buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
485{
486	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
487	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
488	arc_buf_hdr_t *fbuf;
489	uint32_t i;
490
491	ASSERT(!HDR_IN_HASH_TABLE(buf));
492	*lockp = hash_lock;
493	mutex_enter(hash_lock);
494	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
495	    fbuf = fbuf->b_hash_next, i++) {
496		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
497			return (fbuf);
498	}
499
500	buf->b_hash_next = buf_hash_table.ht_table[idx];
501	buf_hash_table.ht_table[idx] = buf;
502	buf->b_flags |= ARC_IN_HASH_TABLE;
503
504	/* collect some hash table performance data */
505	if (i > 0) {
506		ARCSTAT_BUMP(arcstat_hash_collisions);
507		if (i == 1)
508			ARCSTAT_BUMP(arcstat_hash_chains);
509
510		ARCSTAT_MAX(arcstat_hash_chain_max, i);
511	}
512
513	ARCSTAT_BUMP(arcstat_hash_elements);
514	ARCSTAT_MAXSTAT(arcstat_hash_elements);
515
516	return (NULL);
517}
518
519static void
520buf_hash_remove(arc_buf_hdr_t *buf)
521{
522	arc_buf_hdr_t *fbuf, **bufp;
523	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
524
525	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
526	ASSERT(HDR_IN_HASH_TABLE(buf));
527
528	bufp = &buf_hash_table.ht_table[idx];
529	while ((fbuf = *bufp) != buf) {
530		ASSERT(fbuf != NULL);
531		bufp = &fbuf->b_hash_next;
532	}
533	*bufp = buf->b_hash_next;
534	buf->b_hash_next = NULL;
535	buf->b_flags &= ~ARC_IN_HASH_TABLE;
536
537	/* collect some hash table performance data */
538	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
539
540	if (buf_hash_table.ht_table[idx] &&
541	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
542		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
543}
544
545/*
546 * Global data structures and functions for the buf kmem cache.
547 */
548static kmem_cache_t *hdr_cache;
549static kmem_cache_t *buf_cache;
550
551static void
552buf_fini(void)
553{
554	int i;
555
556	kmem_free(buf_hash_table.ht_table,
557	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
558	for (i = 0; i < BUF_LOCKS; i++)
559		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
560	kmem_cache_destroy(hdr_cache);
561	kmem_cache_destroy(buf_cache);
562}
563
564/*
565 * Constructor callback - called when the cache is empty
566 * and a new buf is requested.
567 */
568/* ARGSUSED */
569static int
570hdr_cons(void *vbuf, void *unused, int kmflag)
571{
572	arc_buf_hdr_t *buf = vbuf;
573
574	bzero(buf, sizeof (arc_buf_hdr_t));
575	refcount_create(&buf->b_refcnt);
576	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
577	return (0);
578}
579
580/*
581 * Destructor callback - called when a cached buf is
582 * no longer required.
583 */
584/* ARGSUSED */
585static void
586hdr_dest(void *vbuf, void *unused)
587{
588	arc_buf_hdr_t *buf = vbuf;
589
590	refcount_destroy(&buf->b_refcnt);
591	cv_destroy(&buf->b_cv);
592}
593
594/*
595 * Reclaim callback -- invoked when memory is low.
596 */
597/* ARGSUSED */
598static void
599hdr_recl(void *unused)
600{
601	dprintf("hdr_recl called\n");
602	/*
603	 * umem calls the reclaim func when we destroy the buf cache,
604	 * which is after we do arc_fini().
605	 */
606	if (!arc_dead)
607		cv_signal(&arc_reclaim_thr_cv);
608}
609
610static void
611buf_init(void)
612{
613	uint64_t *ct;
614	uint64_t hsize = 1ULL << 12;
615	int i, j;
616
617	/*
618	 * The hash table is big enough to fill all of physical memory
619	 * with an average 64K block size.  The table will take up
620	 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
621	 */
622	while (hsize * 65536 < physmem * PAGESIZE)
623		hsize <<= 1;
624retry:
625	buf_hash_table.ht_mask = hsize - 1;
626	buf_hash_table.ht_table =
627	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
628	if (buf_hash_table.ht_table == NULL) {
629		ASSERT(hsize > (1ULL << 8));
630		hsize >>= 1;
631		goto retry;
632	}
633
634	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
635	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
636	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
637	    0, NULL, NULL, NULL, NULL, NULL, 0);
638
639	for (i = 0; i < 256; i++)
640		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
641			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
642
643	for (i = 0; i < BUF_LOCKS; i++) {
644		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
645		    NULL, MUTEX_DEFAULT, NULL);
646	}
647}
648
649#define	ARC_MINTIME	(hz>>4) /* 62 ms */
650
651static void
652arc_cksum_verify(arc_buf_t *buf)
653{
654	zio_cksum_t zc;
655
656	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
657		return;
658
659	mutex_enter(&buf->b_hdr->b_freeze_lock);
660	if (buf->b_hdr->b_freeze_cksum == NULL ||
661	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
662		mutex_exit(&buf->b_hdr->b_freeze_lock);
663		return;
664	}
665	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
666	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
667		panic("buffer modified while frozen!");
668	mutex_exit(&buf->b_hdr->b_freeze_lock);
669}
670
671static void
672arc_cksum_compute(arc_buf_t *buf)
673{
674	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
675		return;
676
677	mutex_enter(&buf->b_hdr->b_freeze_lock);
678	if (buf->b_hdr->b_freeze_cksum != NULL) {
679		mutex_exit(&buf->b_hdr->b_freeze_lock);
680		return;
681	}
682	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
683	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
684	    buf->b_hdr->b_freeze_cksum);
685	mutex_exit(&buf->b_hdr->b_freeze_lock);
686}
687
688void
689arc_buf_thaw(arc_buf_t *buf)
690{
691	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
692		return;
693
694	if (buf->b_hdr->b_state != arc_anon)
695		panic("modifying non-anon buffer!");
696	if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
697		panic("modifying buffer while i/o in progress!");
698	arc_cksum_verify(buf);
699	mutex_enter(&buf->b_hdr->b_freeze_lock);
700	if (buf->b_hdr->b_freeze_cksum != NULL) {
701		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
702		buf->b_hdr->b_freeze_cksum = NULL;
703	}
704	mutex_exit(&buf->b_hdr->b_freeze_lock);
705}
706
707void
708arc_buf_freeze(arc_buf_t *buf)
709{
710	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
711		return;
712
713	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
714	    buf->b_hdr->b_state == arc_anon);
715	arc_cksum_compute(buf);
716}
717
718static void
719add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
720{
721	ASSERT(MUTEX_HELD(hash_lock));
722
723	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
724	    (ab->b_state != arc_anon)) {
725		uint64_t delta = ab->b_size * ab->b_datacnt;
726
727		ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
728		mutex_enter(&ab->b_state->arcs_mtx);
729		ASSERT(list_link_active(&ab->b_arc_node));
730		list_remove(&ab->b_state->arcs_list, ab);
731		if (GHOST_STATE(ab->b_state)) {
732			ASSERT3U(ab->b_datacnt, ==, 0);
733			ASSERT3P(ab->b_buf, ==, NULL);
734			delta = ab->b_size;
735		}
736		ASSERT(delta > 0);
737		ASSERT3U(ab->b_state->arcs_lsize, >=, delta);
738		atomic_add_64(&ab->b_state->arcs_lsize, -delta);
739		mutex_exit(&ab->b_state->arcs_mtx);
740		/* remove the prefetch flag if we get a reference */
741		if (ab->b_flags & ARC_PREFETCH)
742			ab->b_flags &= ~ARC_PREFETCH;
743	}
744}
745
746static int
747remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
748{
749	int cnt;
750	arc_state_t *state = ab->b_state;
751
752	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
753	ASSERT(!GHOST_STATE(state));
754
755	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
756	    (state != arc_anon)) {
757		ASSERT(!MUTEX_HELD(&state->arcs_mtx));
758		mutex_enter(&state->arcs_mtx);
759		ASSERT(!list_link_active(&ab->b_arc_node));
760		list_insert_head(&state->arcs_list, ab);
761		ASSERT(ab->b_datacnt > 0);
762		atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt);
763		ASSERT3U(state->arcs_size, >=, state->arcs_lsize);
764		mutex_exit(&state->arcs_mtx);
765	}
766	return (cnt);
767}
768
769/*
770 * Move the supplied buffer to the indicated state.  The mutex
771 * for the buffer must be held by the caller.
772 */
773static void
774arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
775{
776	arc_state_t *old_state = ab->b_state;
777	int64_t refcnt = refcount_count(&ab->b_refcnt);
778	uint64_t from_delta, to_delta;
779
780	ASSERT(MUTEX_HELD(hash_lock));
781	ASSERT(new_state != old_state);
782	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
783	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
784
785	from_delta = to_delta = ab->b_datacnt * ab->b_size;
786
787	/*
788	 * If this buffer is evictable, transfer it from the
789	 * old state list to the new state list.
790	 */
791	if (refcnt == 0) {
792		if (old_state != arc_anon) {
793			int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
794
795			if (use_mutex)
796				mutex_enter(&old_state->arcs_mtx);
797
798			ASSERT(list_link_active(&ab->b_arc_node));
799			list_remove(&old_state->arcs_list, ab);
800
801			/*
802			 * If prefetching out of the ghost cache,
803			 * we will have a non-null datacnt.
804			 */
805			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
806				/* ghost elements have a ghost size */
807				ASSERT(ab->b_buf == NULL);
808				from_delta = ab->b_size;
809			}
810			ASSERT3U(old_state->arcs_lsize, >=, from_delta);
811			atomic_add_64(&old_state->arcs_lsize, -from_delta);
812
813			if (use_mutex)
814				mutex_exit(&old_state->arcs_mtx);
815		}
816		if (new_state != arc_anon) {
817			int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
818
819			if (use_mutex)
820				mutex_enter(&new_state->arcs_mtx);
821
822			list_insert_head(&new_state->arcs_list, ab);
823
824			/* ghost elements have a ghost size */
825			if (GHOST_STATE(new_state)) {
826				ASSERT(ab->b_datacnt == 0);
827				ASSERT(ab->b_buf == NULL);
828				to_delta = ab->b_size;
829			}
830			atomic_add_64(&new_state->arcs_lsize, to_delta);
831			ASSERT3U(new_state->arcs_size + to_delta, >=,
832			    new_state->arcs_lsize);
833
834			if (use_mutex)
835				mutex_exit(&new_state->arcs_mtx);
836		}
837	}
838
839	ASSERT(!BUF_EMPTY(ab));
840	if (new_state == arc_anon && old_state != arc_anon) {
841		buf_hash_remove(ab);
842	}
843
844	/* adjust state sizes */
845	if (to_delta)
846		atomic_add_64(&new_state->arcs_size, to_delta);
847	if (from_delta) {
848		ASSERT3U(old_state->arcs_size, >=, from_delta);
849		atomic_add_64(&old_state->arcs_size, -from_delta);
850	}
851	ab->b_state = new_state;
852}
853
854arc_buf_t *
855arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
856{
857	arc_buf_hdr_t *hdr;
858	arc_buf_t *buf;
859
860	ASSERT3U(size, >, 0);
861	hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
862	ASSERT(BUF_EMPTY(hdr));
863	hdr->b_size = size;
864	hdr->b_type = type;
865	hdr->b_spa = spa;
866	hdr->b_state = arc_anon;
867	hdr->b_arc_access = 0;
868	mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
869	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
870	buf->b_hdr = hdr;
871	buf->b_data = NULL;
872	buf->b_efunc = NULL;
873	buf->b_private = NULL;
874	buf->b_next = NULL;
875	hdr->b_buf = buf;
876	arc_get_data_buf(buf);
877	hdr->b_datacnt = 1;
878	hdr->b_flags = 0;
879	ASSERT(refcount_is_zero(&hdr->b_refcnt));
880	(void) refcount_add(&hdr->b_refcnt, tag);
881
882	return (buf);
883}
884
885static arc_buf_t *
886arc_buf_clone(arc_buf_t *from)
887{
888	arc_buf_t *buf;
889	arc_buf_hdr_t *hdr = from->b_hdr;
890	uint64_t size = hdr->b_size;
891
892	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
893	buf->b_hdr = hdr;
894	buf->b_data = NULL;
895	buf->b_efunc = NULL;
896	buf->b_private = NULL;
897	buf->b_next = hdr->b_buf;
898	hdr->b_buf = buf;
899	arc_get_data_buf(buf);
900	bcopy(from->b_data, buf->b_data, size);
901	hdr->b_datacnt += 1;
902	return (buf);
903}
904
905void
906arc_buf_add_ref(arc_buf_t *buf, void* tag)
907{
908	arc_buf_hdr_t *hdr;
909	kmutex_t *hash_lock;
910
911	/*
912	 * Check to see if this buffer is currently being evicted via
913	 * arc_do_user_evicts().
914	 */
915	mutex_enter(&arc_eviction_mtx);
916	hdr = buf->b_hdr;
917	if (hdr == NULL) {
918		mutex_exit(&arc_eviction_mtx);
919		return;
920	}
921	hash_lock = HDR_LOCK(hdr);
922	mutex_exit(&arc_eviction_mtx);
923
924	mutex_enter(hash_lock);
925	if (buf->b_data == NULL) {
926		/*
927		 * This buffer is evicted.
928		 */
929		mutex_exit(hash_lock);
930		return;
931	}
932
933	ASSERT(buf->b_hdr == hdr);
934	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
935	add_reference(hdr, hash_lock, tag);
936	arc_access(hdr, hash_lock);
937	mutex_exit(hash_lock);
938	ARCSTAT_BUMP(arcstat_hits);
939	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
940	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
941	    data, metadata, hits);
942}
943
944static void
945arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
946{
947	arc_buf_t **bufp;
948
949	/* free up data associated with the buf */
950	if (buf->b_data) {
951		arc_state_t *state = buf->b_hdr->b_state;
952		uint64_t size = buf->b_hdr->b_size;
953		arc_buf_contents_t type = buf->b_hdr->b_type;
954
955		arc_cksum_verify(buf);
956		if (!recycle) {
957			if (type == ARC_BUFC_METADATA) {
958				zio_buf_free(buf->b_data, size);
959			} else {
960				ASSERT(type == ARC_BUFC_DATA);
961				zio_data_buf_free(buf->b_data, size);
962			}
963			atomic_add_64(&arc_size, -size);
964		}
965		if (list_link_active(&buf->b_hdr->b_arc_node)) {
966			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
967			ASSERT(state != arc_anon);
968			ASSERT3U(state->arcs_lsize, >=, size);
969			atomic_add_64(&state->arcs_lsize, -size);
970		}
971		ASSERT3U(state->arcs_size, >=, size);
972		atomic_add_64(&state->arcs_size, -size);
973		buf->b_data = NULL;
974		ASSERT(buf->b_hdr->b_datacnt > 0);
975		buf->b_hdr->b_datacnt -= 1;
976	}
977
978	/* only remove the buf if requested */
979	if (!all)
980		return;
981
982	/* remove the buf from the hdr list */
983	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
984		continue;
985	*bufp = buf->b_next;
986
987	ASSERT(buf->b_efunc == NULL);
988
989	/* clean up the buf */
990	buf->b_hdr = NULL;
991	kmem_cache_free(buf_cache, buf);
992}
993
994static void
995arc_hdr_destroy(arc_buf_hdr_t *hdr)
996{
997	ASSERT(refcount_is_zero(&hdr->b_refcnt));
998	ASSERT3P(hdr->b_state, ==, arc_anon);
999	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1000
1001	if (!BUF_EMPTY(hdr)) {
1002		ASSERT(!HDR_IN_HASH_TABLE(hdr));
1003		bzero(&hdr->b_dva, sizeof (dva_t));
1004		hdr->b_birth = 0;
1005		hdr->b_cksum0 = 0;
1006	}
1007	while (hdr->b_buf) {
1008		arc_buf_t *buf = hdr->b_buf;
1009
1010		if (buf->b_efunc) {
1011			mutex_enter(&arc_eviction_mtx);
1012			ASSERT(buf->b_hdr != NULL);
1013			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1014			hdr->b_buf = buf->b_next;
1015			buf->b_hdr = &arc_eviction_hdr;
1016			buf->b_next = arc_eviction_list;
1017			arc_eviction_list = buf;
1018			mutex_exit(&arc_eviction_mtx);
1019		} else {
1020			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1021		}
1022	}
1023	if (hdr->b_freeze_cksum != NULL) {
1024		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1025		hdr->b_freeze_cksum = NULL;
1026	}
1027	mutex_destroy(&hdr->b_freeze_lock);
1028
1029	ASSERT(!list_link_active(&hdr->b_arc_node));
1030	ASSERT3P(hdr->b_hash_next, ==, NULL);
1031	ASSERT3P(hdr->b_acb, ==, NULL);
1032	kmem_cache_free(hdr_cache, hdr);
1033}
1034
1035void
1036arc_buf_free(arc_buf_t *buf, void *tag)
1037{
1038	arc_buf_hdr_t *hdr = buf->b_hdr;
1039	int hashed = hdr->b_state != arc_anon;
1040
1041	ASSERT(buf->b_efunc == NULL);
1042	ASSERT(buf->b_data != NULL);
1043
1044	if (hashed) {
1045		kmutex_t *hash_lock = HDR_LOCK(hdr);
1046
1047		mutex_enter(hash_lock);
1048		(void) remove_reference(hdr, hash_lock, tag);
1049		if (hdr->b_datacnt > 1)
1050			arc_buf_destroy(buf, FALSE, TRUE);
1051		else
1052			hdr->b_flags |= ARC_BUF_AVAILABLE;
1053		mutex_exit(hash_lock);
1054	} else if (HDR_IO_IN_PROGRESS(hdr)) {
1055		int destroy_hdr;
1056		/*
1057		 * We are in the middle of an async write.  Don't destroy
1058		 * this buffer unless the write completes before we finish
1059		 * decrementing the reference count.
1060		 */
1061		mutex_enter(&arc_eviction_mtx);
1062		(void) remove_reference(hdr, NULL, tag);
1063		ASSERT(refcount_is_zero(&hdr->b_refcnt));
1064		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1065		mutex_exit(&arc_eviction_mtx);
1066		if (destroy_hdr)
1067			arc_hdr_destroy(hdr);
1068	} else {
1069		if (remove_reference(hdr, NULL, tag) > 0) {
1070			ASSERT(HDR_IO_ERROR(hdr));
1071			arc_buf_destroy(buf, FALSE, TRUE);
1072		} else {
1073			arc_hdr_destroy(hdr);
1074		}
1075	}
1076}
1077
1078int
1079arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1080{
1081	arc_buf_hdr_t *hdr = buf->b_hdr;
1082	kmutex_t *hash_lock = HDR_LOCK(hdr);
1083	int no_callback = (buf->b_efunc == NULL);
1084
1085	if (hdr->b_state == arc_anon) {
1086		arc_buf_free(buf, tag);
1087		return (no_callback);
1088	}
1089
1090	mutex_enter(hash_lock);
1091	ASSERT(hdr->b_state != arc_anon);
1092	ASSERT(buf->b_data != NULL);
1093
1094	(void) remove_reference(hdr, hash_lock, tag);
1095	if (hdr->b_datacnt > 1) {
1096		if (no_callback)
1097			arc_buf_destroy(buf, FALSE, TRUE);
1098	} else if (no_callback) {
1099		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1100		hdr->b_flags |= ARC_BUF_AVAILABLE;
1101	}
1102	ASSERT(no_callback || hdr->b_datacnt > 1 ||
1103	    refcount_is_zero(&hdr->b_refcnt));
1104	mutex_exit(hash_lock);
1105	return (no_callback);
1106}
1107
1108int
1109arc_buf_size(arc_buf_t *buf)
1110{
1111	return (buf->b_hdr->b_size);
1112}
1113
1114/*
1115 * Evict buffers from list until we've removed the specified number of
1116 * bytes.  Move the removed buffers to the appropriate evict state.
1117 * If the recycle flag is set, then attempt to "recycle" a buffer:
1118 * - look for a buffer to evict that is `bytes' long.
1119 * - return the data block from this buffer rather than freeing it.
1120 * This flag is used by callers that are trying to make space for a
1121 * new buffer in a full arc cache.
1122 */
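/*
 * For example, arc_get_data_buf() below calls
 * arc_evict(state, size, TRUE, type) and, when an evictable buffer of
 * the same size and type is found, reuses the returned data block for
 * the new buffer instead of freeing and reallocating it.
 */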
1123static void *
1124arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
1125    arc_buf_contents_t type)
1126{
1127	arc_state_t *evicted_state;
1128	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1129	arc_buf_hdr_t *ab, *ab_prev = NULL;
1130	kmutex_t *hash_lock;
1131	boolean_t have_lock;
1132	void *stolen = NULL;
1133
1134	ASSERT(state == arc_mru || state == arc_mfu);
1135
1136	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1137
1138	mutex_enter(&state->arcs_mtx);
1139	mutex_enter(&evicted_state->arcs_mtx);
1140
1141	for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) {
1142		ab_prev = list_prev(&state->arcs_list, ab);
1143		/* prefetch buffers have a minimum lifespan */
1144		if (HDR_IO_IN_PROGRESS(ab) ||
1145		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1146		    lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) {
1147			skipped++;
1148			continue;
1149		}
1150		/* "lookahead" for better eviction candidate */
1151		if (recycle && ab->b_size != bytes &&
1152		    ab_prev && ab_prev->b_size == bytes)
1153			continue;
1154		hash_lock = HDR_LOCK(ab);
1155		have_lock = MUTEX_HELD(hash_lock);
1156		if (have_lock || mutex_tryenter(hash_lock)) {
1157			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
1158			ASSERT(ab->b_datacnt > 0);
1159			while (ab->b_buf) {
1160				arc_buf_t *buf = ab->b_buf;
1161				if (buf->b_data) {
1162					bytes_evicted += ab->b_size;
1163					if (recycle && ab->b_type == type &&
1164					    ab->b_size == bytes) {
1165						stolen = buf->b_data;
1166						recycle = FALSE;
1167					}
1168				}
1169				if (buf->b_efunc) {
1170					mutex_enter(&arc_eviction_mtx);
1171					arc_buf_destroy(buf,
1172					    buf->b_data == stolen, FALSE);
1173					ab->b_buf = buf->b_next;
1174					buf->b_hdr = &arc_eviction_hdr;
1175					buf->b_next = arc_eviction_list;
1176					arc_eviction_list = buf;
1177					mutex_exit(&arc_eviction_mtx);
1178				} else {
1179					arc_buf_destroy(buf,
1180					    buf->b_data == stolen, TRUE);
1181				}
1182			}
1183			ASSERT(ab->b_datacnt == 0);
1184			arc_change_state(evicted_state, ab, hash_lock);
1185			ASSERT(HDR_IN_HASH_TABLE(ab));
1186			ab->b_flags = ARC_IN_HASH_TABLE;
1187			DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1188			if (!have_lock)
1189				mutex_exit(hash_lock);
1190			if (bytes >= 0 && bytes_evicted >= bytes)
1191				break;
1192		} else {
1193			missed += 1;
1194		}
1195	}
1196
1197	mutex_exit(&evicted_state->arcs_mtx);
1198	mutex_exit(&state->arcs_mtx);
1199
1200	if (bytes_evicted < bytes)
1201		dprintf("only evicted %lld bytes from %p",
1202		    (longlong_t)bytes_evicted, state);
1203
1204	if (skipped)
1205		ARCSTAT_INCR(arcstat_evict_skip, skipped);
1206
1207	if (missed)
1208		ARCSTAT_INCR(arcstat_mutex_miss, missed);
1209
1210	return (stolen);
1211}
1212
1213/*
1214 * Remove buffers from list until we've removed the specified number of
1215 * bytes.  Destroy the buffers that are removed.
1216 */
1217static void
1218arc_evict_ghost(arc_state_t *state, int64_t bytes)
1219{
1220	arc_buf_hdr_t *ab, *ab_prev;
1221	kmutex_t *hash_lock;
1222	uint64_t bytes_deleted = 0;
1223	uint64_t bufs_skipped = 0;
1224
1225	ASSERT(GHOST_STATE(state));
1226top:
1227	mutex_enter(&state->arcs_mtx);
1228	for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) {
1229		ab_prev = list_prev(&state->arcs_list, ab);
1230		hash_lock = HDR_LOCK(ab);
1231		if (mutex_tryenter(hash_lock)) {
1232			ASSERT(!HDR_IO_IN_PROGRESS(ab));
1233			ASSERT(ab->b_buf == NULL);
1234			arc_change_state(arc_anon, ab, hash_lock);
1235			mutex_exit(hash_lock);
1236			ARCSTAT_BUMP(arcstat_deleted);
1237			bytes_deleted += ab->b_size;
1238			arc_hdr_destroy(ab);
1239			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
1240			if (bytes >= 0 && bytes_deleted >= bytes)
1241				break;
1242		} else {
1243			if (bytes < 0) {
1244				mutex_exit(&state->arcs_mtx);
1245				mutex_enter(hash_lock);
1246				mutex_exit(hash_lock);
1247				goto top;
1248			}
1249			bufs_skipped += 1;
1250		}
1251	}
1252	mutex_exit(&state->arcs_mtx);
1253
1254	if (bufs_skipped) {
1255		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
1256		ASSERT(bytes >= 0);
1257	}
1258
1259	if (bytes_deleted < bytes)
1260		dprintf("only deleted %lld bytes from %p",
1261		    (longlong_t)bytes_deleted, state);
1262}
1263
1264static void
1265arc_adjust(void)
1266{
1267	int64_t top_sz, mru_over, arc_over, todelete;
1268
1269	top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
1270
1271	if (top_sz > arc_p && arc_mru->arcs_lsize > 0) {
1272		int64_t toevict = MIN(arc_mru->arcs_lsize, top_sz - arc_p);
1273		(void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_UNDEF);
1274		top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
1275	}
1276
1277	mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c;
1278
1279	if (mru_over > 0) {
1280		if (arc_mru_ghost->arcs_lsize > 0) {
1281			todelete = MIN(arc_mru_ghost->arcs_lsize, mru_over);
1282			arc_evict_ghost(arc_mru_ghost, todelete);
1283		}
1284	}
1285
1286	if ((arc_over = arc_size - arc_c) > 0) {
1287		int64_t tbl_over;
1288
1289		if (arc_mfu->arcs_lsize > 0) {
1290			int64_t toevict = MIN(arc_mfu->arcs_lsize, arc_over);
1291			(void) arc_evict(arc_mfu, toevict, FALSE,
1292			    ARC_BUFC_UNDEF);
1293		}
1294
1295		tbl_over = arc_size + arc_mru_ghost->arcs_lsize +
1296		    arc_mfu_ghost->arcs_lsize - arc_c*2;
1297
1298		if (tbl_over > 0 && arc_mfu_ghost->arcs_lsize > 0) {
1299			todelete = MIN(arc_mfu_ghost->arcs_lsize, tbl_over);
1300			arc_evict_ghost(arc_mfu_ghost, todelete);
1301		}
1302	}
1303}
1304
1305static void
1306arc_do_user_evicts(void)
1307{
1308	mutex_enter(&arc_eviction_mtx);
1309	while (arc_eviction_list != NULL) {
1310		arc_buf_t *buf = arc_eviction_list;
1311		arc_eviction_list = buf->b_next;
1312		buf->b_hdr = NULL;
1313		mutex_exit(&arc_eviction_mtx);
1314
1315		if (buf->b_efunc != NULL)
1316			VERIFY(buf->b_efunc(buf) == 0);
1317
1318		buf->b_efunc = NULL;
1319		buf->b_private = NULL;
1320		kmem_cache_free(buf_cache, buf);
1321		mutex_enter(&arc_eviction_mtx);
1322	}
1323	mutex_exit(&arc_eviction_mtx);
1324}
1325
1326/*
1327 * Flush all *evictable* data from the cache.
1328 * NOTE: this will not touch "active" (i.e. referenced) data.
1329 */
1330void
1331arc_flush(void)
1332{
1333	while (list_head(&arc_mru->arcs_list))
1334		(void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF);
1335	while (list_head(&arc_mfu->arcs_list))
1336		(void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF);
1337
1338	arc_evict_ghost(arc_mru_ghost, -1);
1339	arc_evict_ghost(arc_mfu_ghost, -1);
1340
1341	mutex_enter(&arc_reclaim_thr_lock);
1342	arc_do_user_evicts();
1343	mutex_exit(&arc_reclaim_thr_lock);
1344	ASSERT(arc_eviction_list == NULL);
1345}
1346
1347int arc_shrink_shift = 5;		/* log2(fraction of arc to reclaim) */
1348
1349void
1350arc_shrink(void)
1351{
1352	if (arc_c > arc_c_min) {
1353		uint64_t to_free;
1354
1355#ifdef _KERNEL
1356		to_free = arc_c >> arc_shrink_shift;
1357#else
1358		to_free = arc_c >> arc_shrink_shift;
1359#endif
1360		if (arc_c > arc_c_min + to_free)
1361			atomic_add_64(&arc_c, -to_free);
1362		else
1363			arc_c = arc_c_min;
1364
1365		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
1366		if (arc_c > arc_size)
1367			arc_c = MAX(arc_size, arc_c_min);
1368		if (arc_p > arc_c)
1369			arc_p = (arc_c >> 1);
1370		ASSERT(arc_c >= arc_c_min);
1371		ASSERT((int64_t)arc_p >= 0);
1372	}
1373
1374	if (arc_size > arc_c)
1375		arc_adjust();
1376}
1377
1378static int zfs_needfree = 0;
1379
1380static int
1381arc_reclaim_needed(void)
1382{
1383#if 0
1384	uint64_t extra;
1385#endif
1386
1387#ifdef _KERNEL
1388
1389	if (zfs_needfree)
1390		return (1);
1391
1392#if 0
1393	/*
1394	 * check to make sure that swapfs has enough space so that anon
1395 * reservations can still succeed. anon_resvmem() checks that the
1396	 * availrmem is greater than swapfs_minfree, and the number of reserved
1397	 * swap pages.  We also add a bit of extra here just to prevent
1398	 * circumstances from getting really dire.
1399	 */
1400	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
1401		return (1);
1402
1403	/*
1404	 * If zio data pages are being allocated out of a separate heap segment,
1405	 * then check that the size of available vmem for this area remains
1406	 * above 1/4th free.  This needs to be done when the size of the
1407	 * non-default segment is smaller than physical memory, so we could
1408	 * conceivably run out of VA in that segment before running out of
1409	 * physical memory.
1410	 */
1411	if (zio_arena != NULL) {
1412		size_t arc_ziosize =
1413		    btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC));
1414
1415		if ((physmem > arc_ziosize) &&
1416		    (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2))
1417			return (1);
1418	}
1419
1420#if defined(__i386)
1421	/*
1422	 * If we're on an i386 platform, it's possible that we'll exhaust the
1423	 * kernel heap space before we ever run out of available physical
1424	 * memory.  Most checks of the size of the heap_area compare against
1425	 * tune.t_minarmem, which is the minimum available real memory that we
1426	 * can have in the system.  However, this is generally fixed at 25 pages
1427	 * which is so low that it's useless.  In this comparison, we seek to
1428	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
1429 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
1430	 * free)
1431	 */
1432	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
1433	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
1434		return (1);
1435#endif
1436#else
1437	if (kmem_map->size > (vm_kmem_size * 3) / 4)
1438		return (1);
1439#endif
1440
1441#else
1442	if (spa_get_random(100) == 0)
1443		return (1);
1444#endif
1445	return (0);
1446}
1447
1448static void
1449arc_kmem_reap_now(arc_reclaim_strategy_t strat)
1450{
1451#ifdef ZIO_USE_UMA
1452	size_t			i;
1453	kmem_cache_t		*prev_cache = NULL;
1454	kmem_cache_t		*prev_data_cache = NULL;
1455	extern kmem_cache_t	*zio_buf_cache[];
1456	extern kmem_cache_t	*zio_data_buf_cache[];
1457#endif
1458
1459#ifdef _KERNEL
1460	/*
1461	 * First purge some DNLC entries, in case the DNLC is using
1462	 * up too much memory.
1463	 */
1464	dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
1465
1466#if defined(__i386)
1467	/*
1468	 * Reclaim unused memory from all kmem caches.
1469	 */
1470	kmem_reap();
1471#endif
1472#endif
1473
1474	/*
1475 * An aggressive reclamation will shrink the cache size as well as
1476	 * reap free buffers from the arc kmem caches.
1477	 */
1478	if (strat == ARC_RECLAIM_AGGR)
1479		arc_shrink();
1480
1481#ifdef ZIO_USE_UMA
1482	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
1483		if (zio_buf_cache[i] != prev_cache) {
1484			prev_cache = zio_buf_cache[i];
1485			kmem_cache_reap_now(zio_buf_cache[i]);
1486		}
1487		if (zio_data_buf_cache[i] != prev_data_cache) {
1488			prev_data_cache = zio_data_buf_cache[i];
1489			kmem_cache_reap_now(zio_data_buf_cache[i]);
1490		}
1491	}
1492#endif
1493	kmem_cache_reap_now(buf_cache);
1494	kmem_cache_reap_now(hdr_cache);
1495}
1496
1497static void
1498arc_reclaim_thread(void *dummy __unused)
1499{
1500	clock_t			growtime = 0;
1501	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
1502	callb_cpr_t		cpr;
1503
1504	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
1505
1506	mutex_enter(&arc_reclaim_thr_lock);
1507	while (arc_thread_exit == 0) {
1508		if (arc_reclaim_needed()) {
1509
1510			if (arc_no_grow) {
1511				if (last_reclaim == ARC_RECLAIM_CONS) {
1512					last_reclaim = ARC_RECLAIM_AGGR;
1513				} else {
1514					last_reclaim = ARC_RECLAIM_CONS;
1515				}
1516			} else {
1517				arc_no_grow = TRUE;
1518				last_reclaim = ARC_RECLAIM_AGGR;
1519				membar_producer();
1520			}
1521
1522			/* reset the growth delay for every reclaim */
1523			growtime = lbolt + (arc_grow_retry * hz);
1524			ASSERT(growtime > 0);
1525
1526			if (zfs_needfree && last_reclaim == ARC_RECLAIM_CONS) {
1527				/*
1528				 * If zfs_needfree is TRUE, our vm_lowmem hook
1529				 * was called; in that case we must free some
1530				 * memory, so switch to the aggressive strategy.
1531				 */
1532				arc_no_grow = TRUE;
1533				last_reclaim = ARC_RECLAIM_AGGR;
1534			}
1535			arc_kmem_reap_now(last_reclaim);
1536		} else if ((growtime > 0) && ((growtime - lbolt) <= 0)) {
1537			arc_no_grow = FALSE;
1538		}
1539
1540		if (zfs_needfree ||
1541		    (2 * arc_c < arc_size +
1542		    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size))
1543			arc_adjust();
1544
1545		if (arc_eviction_list != NULL)
1546			arc_do_user_evicts();
1547
1548		if (arc_reclaim_needed()) {
1549			zfs_needfree = 0;
1550#ifdef _KERNEL
1551			wakeup(&zfs_needfree);
1552#endif
1553		}
1554
1555		/* block until needed, or one second, whichever is shorter */
1556		CALLB_CPR_SAFE_BEGIN(&cpr);
1557		(void) cv_timedwait(&arc_reclaim_thr_cv,
1558		    &arc_reclaim_thr_lock, hz);
1559		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
1560	}
1561
1562	arc_thread_exit = 0;
1563	cv_broadcast(&arc_reclaim_thr_cv);
1564	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
1565	thread_exit();
1566}
1567
1568/*
1569 * Adapt arc info given the number of bytes we are trying to add and
1570 * the state that we are coming from.  This function is only called
1571 * when we are adding new content to the cache.
1572 */
1573static void
1574arc_adapt(int bytes, arc_state_t *state)
1575{
1576	int mult;
1577
1578	ASSERT(bytes > 0);
1579	/*
1580	 * Adapt the target size of the MRU list:
1581	 *	- if we just hit in the MRU ghost list, then increase
1582	 *	  the target size of the MRU list.
1583	 *	- if we just hit in the MFU ghost list, then increase
1584	 *	  the target size of the MFU list by decreasing the
1585	 *	  target size of the MRU list.
1586	 */
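	/*
	 * Illustrative numbers: if the MFU ghost list is four times the
	 * size of the MRU ghost list, a hit in the MRU ghost list below
	 * grows arc_p by 4 * bytes (capped at arc_c), while the symmetric
	 * MFU ghost hit shrinks arc_p by the analogous multiple (floored
	 * at 0).
	 */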
1587	if (state == arc_mru_ghost) {
1588		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
1589		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
1590
1591		arc_p = MIN(arc_c, arc_p + bytes * mult);
1592	} else if (state == arc_mfu_ghost) {
1593		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
1594		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
1595
1596		arc_p = MAX(0, (int64_t)arc_p - bytes * mult);
1597	}
1598	ASSERT((int64_t)arc_p >= 0);
1599
1600	if (arc_reclaim_needed()) {
1601		cv_signal(&arc_reclaim_thr_cv);
1602		return;
1603	}
1604
1605	if (arc_no_grow)
1606		return;
1607
1608	if (arc_c >= arc_c_max)
1609		return;
1610
1611	/*
1612	 * If we're within (2 * maxblocksize) bytes of the target
1613	 * cache size, increment the target cache size
1614	 */
1615	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
1616		atomic_add_64(&arc_c, (int64_t)bytes);
1617		if (arc_c > arc_c_max)
1618			arc_c = arc_c_max;
1619		else if (state == arc_anon)
1620			atomic_add_64(&arc_p, (int64_t)bytes);
1621		if (arc_p > arc_c)
1622			arc_p = arc_c;
1623	}
1624	ASSERT((int64_t)arc_p >= 0);
1625}
1626
1627/*
1628 * Check if the cache has reached its limits and eviction is required
1629 * prior to insert.
1630 */
1631static int
1632arc_evict_needed()
1633{
1634	if (arc_reclaim_needed())
1635		return (1);
1636
1637	return (arc_size > arc_c);
1638}
1639
1640/*
1641 * The buffer, supplied as the first argument, needs a data block.
1642 * So, if we are at cache max, determine which cache should be victimized.
1643 * We have the following cases:
1644 *
1645 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
1646 * In this situation if we're out of space, but the resident size of the MFU is
1647 * under the limit, victimize the MFU cache to satisfy this insertion request.
1648 *
1649 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
1650 * Here, we've used up all of the available space for the MRU, so we need to
1651 * evict from our own cache instead.  Evict from the set of resident MRU
1652 * entries.
1653 *
1654 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
1655 * c minus p represents the MFU space in the cache, since p is the size of the
1656 * cache that is dedicated to the MRU.  In this situation there's still space on
1657 * the MFU side, so the MRU side needs to be victimized.
1658 *
1659 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
1660 * MFU's resident set is consuming more space than it has been allotted.  In
1661 * this situation, we must victimize our own cache, the MFU, for this insertion.
1662 */
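/*
 * In code terms (a sketch of the decision made below): for the MRU and
 * anonymous cases, choosing between 1 and 2 reduces to comparing arc_p
 * with arc_anon->arcs_size + arc_mru->arcs_size; for the MFU cases 3
 * and 4, it reduces to comparing (arc_c - arc_p) with arc_mfu->arcs_size.
 */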
1663static void
1664arc_get_data_buf(arc_buf_t *buf)
1665{
1666	arc_state_t		*state = buf->b_hdr->b_state;
1667	uint64_t		size = buf->b_hdr->b_size;
1668	arc_buf_contents_t	type = buf->b_hdr->b_type;
1669
1670	arc_adapt(size, state);
1671
1672	/*
1673	 * We have not yet reached cache maximum size,
1674	 * just allocate a new buffer.
1675	 */
1676	if (!arc_evict_needed()) {
1677		if (type == ARC_BUFC_METADATA) {
1678			buf->b_data = zio_buf_alloc(size);
1679		} else {
1680			ASSERT(type == ARC_BUFC_DATA);
1681			buf->b_data = zio_data_buf_alloc(size);
1682		}
1683		atomic_add_64(&arc_size, size);
1684		goto out;
1685	}
1686
1687	/*
1688	 * If we are prefetching from the mfu ghost list, this buffer
1689	 * will end up on the mru list; so steal space from there.
1690	 */
1691	if (state == arc_mfu_ghost)
1692		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
1693	else if (state == arc_mru_ghost)
1694		state = arc_mru;
1695
1696	if (state == arc_mru || state == arc_anon) {
1697		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
1698		state = (arc_p > mru_used) ? arc_mfu : arc_mru;
1699	} else {
1700		/* MFU cases */
1701		uint64_t mfu_space = arc_c - arc_p;
1702		state =  (mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
1703	}
1704	if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) {
1705		if (type == ARC_BUFC_METADATA) {
1706			buf->b_data = zio_buf_alloc(size);
1707		} else {
1708			ASSERT(type == ARC_BUFC_DATA);
1709			buf->b_data = zio_data_buf_alloc(size);
1710		}
1711		atomic_add_64(&arc_size, size);
1712		ARCSTAT_BUMP(arcstat_recycle_miss);
1713	}
1714	ASSERT(buf->b_data != NULL);
1715out:
1716	/*
1717	 * Update the state size.  Note that ghost states have a
1718	 * "ghost size" and so don't need to be updated.
1719	 */
1720	if (!GHOST_STATE(buf->b_hdr->b_state)) {
1721		arc_buf_hdr_t *hdr = buf->b_hdr;
1722
1723		atomic_add_64(&hdr->b_state->arcs_size, size);
1724		if (list_link_active(&hdr->b_arc_node)) {
1725			ASSERT(refcount_is_zero(&hdr->b_refcnt));
1726			atomic_add_64(&hdr->b_state->arcs_lsize, size);
1727		}
1728		/*
1729		 * If we are growing the cache, and we are adding anonymous
1730		 * data, and we have outgrown arc_p, update arc_p
1731		 */
1732		if (arc_size < arc_c && hdr->b_state == arc_anon &&
1733		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
1734			arc_p = MIN(arc_c, arc_p + size);
1735	}
1736}
1737
1738/*
1739 * This routine is called whenever a buffer is accessed.
1740 * NOTE: the caller must hold the buffer's hash lock; it is not dropped here.
1741 */
1742static void
1743arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
1744{
1745	ASSERT(MUTEX_HELD(hash_lock));
1746
1747	if (buf->b_state == arc_anon) {
1748		/*
1749		 * This buffer is not in the cache, and does not
1750		 * appear in our "ghost" list.  Add the new buffer
1751		 * to the MRU state.
1752		 */
1753
1754		ASSERT(buf->b_arc_access == 0);
1755		buf->b_arc_access = lbolt;
1756		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
1757		arc_change_state(arc_mru, buf, hash_lock);
1758
1759	} else if (buf->b_state == arc_mru) {
1760		/*
1761		 * If this buffer is here because of a prefetch, then either:
1762		 * - clear the flag if this is a "referencing" read
1763		 *   (any subsequent access will bump this into the MFU state).
1764		 * or
1765		 * - move the buffer to the head of the list if this is
1766		 *   another prefetch (to make it less likely to be evicted).
1767		 */
1768		if ((buf->b_flags & ARC_PREFETCH) != 0) {
1769			if (refcount_count(&buf->b_refcnt) == 0) {
1770				ASSERT(list_link_active(&buf->b_arc_node));
1771				mutex_enter(&arc_mru->arcs_mtx);
1772				list_remove(&arc_mru->arcs_list, buf);
1773				list_insert_head(&arc_mru->arcs_list, buf);
1774				mutex_exit(&arc_mru->arcs_mtx);
1775			} else {
1776				buf->b_flags &= ~ARC_PREFETCH;
1777				ARCSTAT_BUMP(arcstat_mru_hits);
1778			}
1779			buf->b_arc_access = lbolt;
1780			return;
1781		}
1782
1783		/*
1784		 * This buffer has been "accessed" only once so far,
1785		 * but it is still in the cache. Move it to the MFU
1786		 * state.
1787		 */
1788		if (lbolt > buf->b_arc_access + ARC_MINTIME) {
1789			/*
1790			 * More than ARC_MINTIME (~62ms) has passed since we
1791			 * instantiated this buffer.  Move it to the
1792			 * most frequently used state.
1793			 */
1794			buf->b_arc_access = lbolt;
1795			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
1796			arc_change_state(arc_mfu, buf, hash_lock);
1797		}
1798		ARCSTAT_BUMP(arcstat_mru_hits);
1799	} else if (buf->b_state == arc_mru_ghost) {
1800		arc_state_t	*new_state;
1801		/*
1802		 * This buffer has been "accessed" recently, but was
1803		 * evicted from the cache.  Move it back to the MFU
1804		 * state (or the MRU state if it is a prefetch).
1805		 */
1806
1807		if (buf->b_flags & ARC_PREFETCH) {
1808			new_state = arc_mru;
1809			if (refcount_count(&buf->b_refcnt) > 0)
1810				buf->b_flags &= ~ARC_PREFETCH;
1811			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
1812		} else {
1813			new_state = arc_mfu;
1814			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
1815		}
1816
1817		buf->b_arc_access = lbolt;
1818		arc_change_state(new_state, buf, hash_lock);
1819
1820		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
1821	} else if (buf->b_state == arc_mfu) {
1822		/*
1823		 * This buffer has been accessed more than once and is
1824		 * still in the cache.  Keep it in the MFU state.
1825		 *
1826		 * NOTE: an add_reference() that occurred when we did
1827		 * the arc_read() will have kicked this off the list.
1828		 * If it was a prefetch, we will explicitly move it to
1829		 * the head of the list now.
1830		 */
1831		if ((buf->b_flags & ARC_PREFETCH) != 0) {
1832			ASSERT(refcount_count(&buf->b_refcnt) == 0);
1833			ASSERT(list_link_active(&buf->b_arc_node));
1834			mutex_enter(&arc_mfu->arcs_mtx);
1835			list_remove(&arc_mfu->arcs_list, buf);
1836			list_insert_head(&arc_mfu->arcs_list, buf);
1837			mutex_exit(&arc_mfu->arcs_mtx);
1838		}
1839		ARCSTAT_BUMP(arcstat_mfu_hits);
1840		buf->b_arc_access = lbolt;
1841	} else if (buf->b_state == arc_mfu_ghost) {
1842		arc_state_t	*new_state = arc_mfu;
1843		/*
1844		 * This buffer has been accessed more than once but has
1845		 * been evicted from the cache.  Move it back to the
1846		 * MFU state.
1847		 */
1848
1849		if (buf->b_flags & ARC_PREFETCH) {
1850			/*
1851			 * This is a prefetch access...
1852			 * move this block back to the MRU state.
1853			 */
1854			ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
1855			new_state = arc_mru;
1856		}
1857
1858		buf->b_arc_access = lbolt;
1859		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
1860		arc_change_state(new_state, buf, hash_lock);
1861
1862		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
1863	} else {
1864		ASSERT(!"invalid arc state");
1865	}
1866}
1867
1868/* a generic arc_done_func_t which you can use */
1869/* ARGSUSED */
1870void
1871arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
1872{
1873	bcopy(buf->b_data, arg, buf->b_hdr->b_size);
1874	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
1875}
1876
1877/* a generic arc_done_func_t which you can use */
1878void
1879arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
1880{
1881	arc_buf_t **bufp = arg;
1882	if (zio && zio->io_error) {
1883		VERIFY(arc_buf_remove_ref(buf, arg) == 1);
1884		*bufp = NULL;
1885	} else {
1886		*bufp = buf;
1887	}
1888}
1889
1890static void
1891arc_read_done(zio_t *zio)
1892{
1893	arc_buf_hdr_t	*hdr, *found;
1894	arc_buf_t	*buf;
1895	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
1896	kmutex_t	*hash_lock;
1897	arc_callback_t	*callback_list, *acb;
1898	int		freeable = FALSE;
1899
1900	buf = zio->io_private;
1901	hdr = buf->b_hdr;
1902
1903	/*
1904	 * The hdr was inserted into hash-table and removed from lists
1905	 * prior to starting I/O.  We should find this header, since
1906	 * it's in the hash table, and it should be legit since it's
1907	 * not possible to evict it during the I/O.  The only possible
1908	 * reason for it not to be found is if we were freed during the
1909	 * read.
1910	 */
1911	found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
1912	    &hash_lock);
1913
1914	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
1915	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))));
1916
1917	/* byteswap if necessary */
1918	callback_list = hdr->b_acb;
1919	ASSERT(callback_list != NULL);
1920	if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
1921		callback_list->acb_byteswap(buf->b_data, hdr->b_size);
1922
1923	arc_cksum_compute(buf);
1924
1925	/* create copies of the data buffer for the callers */
1926	abuf = buf;
1927	for (acb = callback_list; acb; acb = acb->acb_next) {
1928		if (acb->acb_done) {
1929			if (abuf == NULL)
1930				abuf = arc_buf_clone(buf);
1931			acb->acb_buf = abuf;
1932			abuf = NULL;
1933		}
1934	}
1935	hdr->b_acb = NULL;
1936	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
1937	ASSERT(!HDR_BUF_AVAILABLE(hdr));
1938	if (abuf == buf)
1939		hdr->b_flags |= ARC_BUF_AVAILABLE;
1940
1941	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
1942
1943	if (zio->io_error != 0) {
1944		hdr->b_flags |= ARC_IO_ERROR;
1945		if (hdr->b_state != arc_anon)
1946			arc_change_state(arc_anon, hdr, hash_lock);
1947		if (HDR_IN_HASH_TABLE(hdr))
1948			buf_hash_remove(hdr);
1949		freeable = refcount_is_zero(&hdr->b_refcnt);
1950		/* convert checksum errors into IO errors */
1951		if (zio->io_error == ECKSUM)
1952			zio->io_error = EIO;
1953	}
1954
1955	/*
1956	 * Broadcast before we drop the hash_lock to avoid the possibility
1957	 * that the hdr (and hence the cv) might be freed before we get to
1958	 * the cv_broadcast().
1959	 */
1960	cv_broadcast(&hdr->b_cv);
1961
1962	if (hash_lock) {
1963		/*
1964		 * Only call arc_access on anonymous buffers.  This is because
1965		 * if we've issued an I/O for an evicted buffer, we've already
1966		 * called arc_access (to prevent any simultaneous readers from
1967		 * getting confused).
1968		 */
1969		if (zio->io_error == 0 && hdr->b_state == arc_anon)
1970			arc_access(hdr, hash_lock);
1971		mutex_exit(hash_lock);
1972	} else {
1973		/*
1974		 * This block was freed while we waited for the read to
1975		 * complete.  It has been removed from the hash table and
1976		 * moved to the anonymous state (so that it won't show up
1977		 * in the cache).
1978		 */
1979		ASSERT3P(hdr->b_state, ==, arc_anon);
1980		freeable = refcount_is_zero(&hdr->b_refcnt);
1981	}
1982
1983	/* execute each callback and free its structure */
1984	while ((acb = callback_list) != NULL) {
1985		if (acb->acb_done)
1986			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
1987
1988		if (acb->acb_zio_dummy != NULL) {
1989			acb->acb_zio_dummy->io_error = zio->io_error;
1990			zio_nowait(acb->acb_zio_dummy);
1991		}
1992
1993		callback_list = acb->acb_next;
1994		kmem_free(acb, sizeof (arc_callback_t));
1995	}
1996
1997	if (freeable)
1998		arc_hdr_destroy(hdr);
1999}
2000
2001/*
2002 * "Read" the block at the specified DVA (in bp) via the
2003 * cache.  If the block is found in the cache, invoke the provided
2004 * callback immediately and return.  Note that the `zio' parameter
2005 * in the callback will be NULL in this case, since no IO was
2006 * required.  If the block is not in the cache, pass the read request
2007 * on to the spa with a substitute callback function, so that the
2008 * requested block will be added to the cache.
2009 *
2010 * If a read request arrives for a block that has a read in-progress,
2011 * either wait for the in-progress read to complete (and return the
2012 * results); or, if this is a read with a "done" func, add a record
2013 * to the read to invoke the "done" func when the read completes,
2014 * and return; or just return.
2015 *
2016 * arc_read_done() will invoke all the requested "done" functions
2017 * for readers of this block.
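 *
 * As a hedged illustration (a sketch, not code from this file): a
 * blocking, cacheable read might be issued roughly as
 *
 *	uint32_t aflags = ARC_WAIT;
 *	(void) arc_read(NULL, spa, bp, byteswap_uint64_array,
 *	    my_done, my_arg, ZIO_PRIORITY_SYNC_READ,
 *	    ZIO_FLAG_CANFAIL, &aflags, &zb);
 *
 * where "my_done"/"my_arg" are a hypothetical arc_done_func_t and its
 * argument.  A prefetch would instead pass a NULL "done" callback and
 * set ARC_NOWAIT | ARC_PREFETCH in aflags.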
2018 */
2019int
2020arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
2021    arc_done_func_t *done, void *private, int priority, int flags,
2022    uint32_t *arc_flags, zbookmark_t *zb)
2023{
2024	arc_buf_hdr_t *hdr;
2025	arc_buf_t *buf;
2026	kmutex_t *hash_lock;
2027	zio_t	*rzio;
2028
2029top:
2030	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
2031	if (hdr && hdr->b_datacnt > 0) {
2032
2033		*arc_flags |= ARC_CACHED;
2034
2035		if (HDR_IO_IN_PROGRESS(hdr)) {
2036
2037			if (*arc_flags & ARC_WAIT) {
2038				cv_wait(&hdr->b_cv, hash_lock);
2039				mutex_exit(hash_lock);
2040				goto top;
2041			}
2042			ASSERT(*arc_flags & ARC_NOWAIT);
2043
2044			if (done) {
2045				arc_callback_t	*acb = NULL;
2046
2047				acb = kmem_zalloc(sizeof (arc_callback_t),
2048				    KM_SLEEP);
2049				acb->acb_done = done;
2050				acb->acb_private = private;
2051				acb->acb_byteswap = swap;
2052				if (pio != NULL)
2053					acb->acb_zio_dummy = zio_null(pio,
2054					    spa, NULL, NULL, flags);
2055
2056				ASSERT(acb->acb_done != NULL);
2057				acb->acb_next = hdr->b_acb;
2058				hdr->b_acb = acb;
2059				add_reference(hdr, hash_lock, private);
2060				mutex_exit(hash_lock);
2061				return (0);
2062			}
2063			mutex_exit(hash_lock);
2064			return (0);
2065		}
2066
2067		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2068
2069		if (done) {
2070			add_reference(hdr, hash_lock, private);
2071			/*
2072			 * If this block is already in use, create a new
2073			 * copy of the data so that arc_release() is
2074			 * guaranteed to succeed.
2075			 */
2076			buf = hdr->b_buf;
2077			ASSERT(buf);
2078			ASSERT(buf->b_data);
2079			if (HDR_BUF_AVAILABLE(hdr)) {
2080				ASSERT(buf->b_efunc == NULL);
2081				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2082			} else {
2083				buf = arc_buf_clone(buf);
2084			}
2085		} else if (*arc_flags & ARC_PREFETCH &&
2086		    refcount_count(&hdr->b_refcnt) == 0) {
2087			hdr->b_flags |= ARC_PREFETCH;
2088		}
2089		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2090		arc_access(hdr, hash_lock);
2091		mutex_exit(hash_lock);
2092		ARCSTAT_BUMP(arcstat_hits);
2093		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2094		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2095		    data, metadata, hits);
2096
2097		if (done)
2098			done(NULL, buf, private);
2099	} else {
2100		uint64_t size = BP_GET_LSIZE(bp);
2101		arc_callback_t	*acb;
2102
2103		if (hdr == NULL) {
2104			/* this block is not in the cache */
2105			arc_buf_hdr_t	*exists;
2106			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
2107			buf = arc_buf_alloc(spa, size, private, type);
2108			hdr = buf->b_hdr;
2109			hdr->b_dva = *BP_IDENTITY(bp);
2110			hdr->b_birth = bp->blk_birth;
2111			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
2112			exists = buf_hash_insert(hdr, &hash_lock);
2113			if (exists) {
2114				/* somebody beat us to the hash insert */
2115				mutex_exit(hash_lock);
2116				bzero(&hdr->b_dva, sizeof (dva_t));
2117				hdr->b_birth = 0;
2118				hdr->b_cksum0 = 0;
2119				(void) arc_buf_remove_ref(buf, private);
2120				goto top; /* restart the IO request */
2121			}
2122			/* if this is a prefetch, we don't have a reference */
2123			if (*arc_flags & ARC_PREFETCH) {
2124				(void) remove_reference(hdr, hash_lock,
2125				    private);
2126				hdr->b_flags |= ARC_PREFETCH;
2127			}
2128			if (BP_GET_LEVEL(bp) > 0)
2129				hdr->b_flags |= ARC_INDIRECT;
2130		} else {
2131			/* this block is in the ghost cache */
2132			ASSERT(GHOST_STATE(hdr->b_state));
2133			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2134			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
2135			ASSERT(hdr->b_buf == NULL);
2136
2137			/* if this is a prefetch, we don't have a reference */
2138			if (*arc_flags & ARC_PREFETCH)
2139				hdr->b_flags |= ARC_PREFETCH;
2140			else
2141				add_reference(hdr, hash_lock, private);
2142			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
2143			buf->b_hdr = hdr;
2144			buf->b_data = NULL;
2145			buf->b_efunc = NULL;
2146			buf->b_private = NULL;
2147			buf->b_next = NULL;
2148			hdr->b_buf = buf;
2149			arc_get_data_buf(buf);
2150			ASSERT(hdr->b_datacnt == 0);
2151			hdr->b_datacnt = 1;
2152
2153		}
2154
2155		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
2156		acb->acb_done = done;
2157		acb->acb_private = private;
2158		acb->acb_byteswap = swap;
2159
2160		ASSERT(hdr->b_acb == NULL);
2161		hdr->b_acb = acb;
2162		hdr->b_flags |= ARC_IO_IN_PROGRESS;
2163
2164		/*
2165		 * If the buffer has been evicted, migrate it to a present state
2166		 * before issuing the I/O.  Once we drop the hash-table lock,
2167		 * the header will be marked as I/O in progress and have an
2168		 * attached buffer.  At this point, anybody who finds this
2169		 * buffer ought to notice that it's legit but has a pending I/O.
2170		 */
2171
2172		if (GHOST_STATE(hdr->b_state))
2173			arc_access(hdr, hash_lock);
2174		mutex_exit(hash_lock);
2175
2176		ASSERT3U(hdr->b_size, ==, size);
2177		DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
2178		    zbookmark_t *, zb);
2179		ARCSTAT_BUMP(arcstat_misses);
2180		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
2181		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
2182		    data, metadata, misses);
2183
2184		rzio = zio_read(pio, spa, bp, buf->b_data, size,
2185		    arc_read_done, buf, priority, flags, zb);
2186
2187		if (*arc_flags & ARC_WAIT)
2188			return (zio_wait(rzio));
2189
2190		ASSERT(*arc_flags & ARC_NOWAIT);
2191		zio_nowait(rzio);
2192	}
2193	return (0);
2194}
2195
2196/*
2197 * arc_read() variant to support pool traversal.  If the block is already
2198 * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
2199 * The idea is that we don't want pool traversal filling up memory, but
2200 * if the ARC already has the data anyway, we shouldn't pay for the I/O.
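 *
 * A hedged sketch of the intended use ("data" is the caller's own
 * buffer of at least BP_GET_LSIZE(bp) bytes):
 *
 *	if (arc_tryread(spa, bp, data) == ENOENT)
 *		... the traversal code performs its own read into data ...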
2201 */
2202int
2203arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
2204{
2205	arc_buf_hdr_t *hdr;
2206	kmutex_t *hash_mtx;
2207	int rc = 0;
2208
2209	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
2210
2211	if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
2212		arc_buf_t *buf = hdr->b_buf;
2213
2214		ASSERT(buf);
2215		while (buf->b_data == NULL) {
2216			buf = buf->b_next;
2217			ASSERT(buf);
2218		}
2219		bcopy(buf->b_data, data, hdr->b_size);
2220	} else {
2221		rc = ENOENT;
2222	}
2223
2224	if (hash_mtx)
2225		mutex_exit(hash_mtx);
2226
2227	return (rc);
2228}
2229
2230void
2231arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
2232{
2233	ASSERT(buf->b_hdr != NULL);
2234	ASSERT(buf->b_hdr->b_state != arc_anon);
2235	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
2236	buf->b_efunc = func;
2237	buf->b_private = private;
2238}
2239
2240/*
2241 * This is used by the DMU to let the ARC know that a buffer is
2242 * being evicted, so the ARC should clean up.  If this arc buf
2243 * is not yet in the evicted state, it will be put there.
2244 */
2245int
2246arc_buf_evict(arc_buf_t *buf)
2247{
2248	arc_buf_hdr_t *hdr;
2249	kmutex_t *hash_lock;
2250	arc_buf_t **bufp;
2251
2252	mutex_enter(&arc_eviction_mtx);
2253	hdr = buf->b_hdr;
2254	if (hdr == NULL) {
2255		/*
2256		 * We are in arc_do_user_evicts().
2257		 */
2258		ASSERT(buf->b_data == NULL);
2259		mutex_exit(&arc_eviction_mtx);
2260		return (0);
2261	}
2262	hash_lock = HDR_LOCK(hdr);
2263	mutex_exit(&arc_eviction_mtx);
2264
2265	mutex_enter(hash_lock);
2266
2267	if (buf->b_data == NULL) {
2268		/*
2269		 * We are on the eviction list.
2270		 */
2271		mutex_exit(hash_lock);
2272		mutex_enter(&arc_eviction_mtx);
2273		if (buf->b_hdr == NULL) {
2274			/*
2275			 * We are already in arc_do_user_evicts().
2276			 */
2277			mutex_exit(&arc_eviction_mtx);
2278			return (0);
2279		} else {
2280			arc_buf_t copy = *buf; /* structure assignment */
2281			/*
2282			 * Process this buffer now
2283			 * but let arc_do_user_evicts() do the reaping.
2284			 */
2285			buf->b_efunc = NULL;
2286			mutex_exit(&arc_eviction_mtx);
2287			VERIFY(copy.b_efunc(&copy) == 0);
2288			return (1);
2289		}
2290	}
2291
2292	ASSERT(buf->b_hdr == hdr);
2293	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
2294	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2295
2296	/*
2297	 * Pull this buffer off of the hdr
2298	 */
2299	bufp = &hdr->b_buf;
2300	while (*bufp != buf)
2301		bufp = &(*bufp)->b_next;
2302	*bufp = buf->b_next;
2303
2304	ASSERT(buf->b_data != NULL);
2305	arc_buf_destroy(buf, FALSE, FALSE);
2306
2307	if (hdr->b_datacnt == 0) {
2308		arc_state_t *old_state = hdr->b_state;
2309		arc_state_t *evicted_state;
2310
2311		ASSERT(refcount_is_zero(&hdr->b_refcnt));
2312
2313		evicted_state =
2314		    (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2315
2316		mutex_enter(&old_state->arcs_mtx);
2317		mutex_enter(&evicted_state->arcs_mtx);
2318
2319		arc_change_state(evicted_state, hdr, hash_lock);
2320		ASSERT(HDR_IN_HASH_TABLE(hdr));
2321		hdr->b_flags = ARC_IN_HASH_TABLE;
2322
2323		mutex_exit(&evicted_state->arcs_mtx);
2324		mutex_exit(&old_state->arcs_mtx);
2325	}
2326	mutex_exit(hash_lock);
2327
2328	VERIFY(buf->b_efunc(buf) == 0);
2329	buf->b_efunc = NULL;
2330	buf->b_private = NULL;
2331	buf->b_hdr = NULL;
2332	kmem_cache_free(buf_cache, buf);
2333	return (1);
2334}
2335
2336/*
2337 * Release this buffer from the cache.  This must be done
2338 * after a read and prior to modifying the buffer contents.
2339 * If the buffer has more than one reference, we must make
2340 * a new hdr for the buffer.
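 *
 * A hedged usage sketch (the names are illustrative, not from this
 * file): after obtaining "buf" via arc_read() with tag "db", a caller
 * that intends to dirty the data would do
 *
 *	arc_release(buf, db);
 *	bcopy(newdata, buf->b_data, buf->b_hdr->b_size);
 *
 * leaving the buffer anonymous until it is rewritten via arc_write().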
2341 */
2342void
2343arc_release(arc_buf_t *buf, void *tag)
2344{
2345	arc_buf_hdr_t *hdr = buf->b_hdr;
2346	kmutex_t *hash_lock = HDR_LOCK(hdr);
2347
2348	/* this buffer is not on any list */
2349	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
2350
2351	if (hdr->b_state == arc_anon) {
2352		/* this buffer is already released */
2353		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
2354		ASSERT(BUF_EMPTY(hdr));
2355		ASSERT(buf->b_efunc == NULL);
2356		arc_buf_thaw(buf);
2357		return;
2358	}
2359
2360	mutex_enter(hash_lock);
2361
2362	/*
2363	 * Do we have more than one buf?
2364	 */
2365	if (hdr->b_buf != buf || buf->b_next != NULL) {
2366		arc_buf_hdr_t *nhdr;
2367		arc_buf_t **bufp;
2368		uint64_t blksz = hdr->b_size;
2369		spa_t *spa = hdr->b_spa;
2370		arc_buf_contents_t type = hdr->b_type;
2371
2372		ASSERT(hdr->b_datacnt > 1);
2373		/*
2374		 * Pull the data off of this buf and attach it to
2375		 * a new anonymous buf.
2376		 */
2377		(void) remove_reference(hdr, hash_lock, tag);
2378		bufp = &hdr->b_buf;
2379		while (*bufp != buf)
2380			bufp = &(*bufp)->b_next;
2381		*bufp = (*bufp)->b_next;
2382		buf->b_next = NULL;
2383
2384		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
2385		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
2386		if (refcount_is_zero(&hdr->b_refcnt)) {
2387			ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size);
2388			atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size);
2389		}
2390		hdr->b_datacnt -= 1;
2391		arc_cksum_verify(buf);
2392
2393		mutex_exit(hash_lock);
2394
2395		nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
2396		nhdr->b_size = blksz;
2397		nhdr->b_spa = spa;
2398		nhdr->b_type = type;
2399		nhdr->b_buf = buf;
2400		nhdr->b_state = arc_anon;
2401		nhdr->b_arc_access = 0;
2402		nhdr->b_flags = 0;
2403		nhdr->b_datacnt = 1;
2404		nhdr->b_freeze_cksum = NULL;
2405		(void) refcount_add(&nhdr->b_refcnt, tag);
2406		buf->b_hdr = nhdr;
2407		atomic_add_64(&arc_anon->arcs_size, blksz);
2408
2409		hdr = nhdr;
2410	} else {
2411		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
2412		ASSERT(!list_link_active(&hdr->b_arc_node));
2413		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2414		arc_change_state(arc_anon, hdr, hash_lock);
2415		hdr->b_arc_access = 0;
2416		mutex_exit(hash_lock);
2417		bzero(&hdr->b_dva, sizeof (dva_t));
2418		hdr->b_birth = 0;
2419		hdr->b_cksum0 = 0;
2420		arc_buf_thaw(buf);
2421	}
2422	buf->b_efunc = NULL;
2423	buf->b_private = NULL;
2424}
2425
2426int
2427arc_released(arc_buf_t *buf)
2428{
2429	return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
2430}
2431
2432int
2433arc_has_callback(arc_buf_t *buf)
2434{
2435	return (buf->b_efunc != NULL);
2436}
2437
2438#ifdef ZFS_DEBUG
2439int
2440arc_referenced(arc_buf_t *buf)
2441{
2442	return (refcount_count(&buf->b_hdr->b_refcnt));
2443}
2444#endif
2445
2446static void
2447arc_write_ready(zio_t *zio)
2448{
2449	arc_write_callback_t *callback = zio->io_private;
2450	arc_buf_t *buf = callback->awcb_buf;
2451
2452	if (callback->awcb_ready) {
2453		ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
2454		callback->awcb_ready(zio, buf, callback->awcb_private);
2455	}
2456	arc_cksum_compute(buf);
2457}
2458
2459static void
2460arc_write_done(zio_t *zio)
2461{
2462	arc_write_callback_t *callback = zio->io_private;
2463	arc_buf_t *buf = callback->awcb_buf;
2464	arc_buf_hdr_t *hdr = buf->b_hdr;
2465
2466	hdr->b_acb = NULL;
2467
2468	/* this buffer is on no lists and is not in the hash table */
2469	ASSERT3P(hdr->b_state, ==, arc_anon);
2470
2471	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
2472	hdr->b_birth = zio->io_bp->blk_birth;
2473	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
2474	/*
2475	 * If the block to be written was all-zero, we may have
2476	 * compressed it away.  In this case no write was performed,
2477	 * so there will be no dva/birth-date/checksum.  The buffer
2478	 * must therefore remain anonymous (and uncached).
2479	 */
2480	if (!BUF_EMPTY(hdr)) {
2481		arc_buf_hdr_t *exists;
2482		kmutex_t *hash_lock;
2483
2484		arc_cksum_verify(buf);
2485
2486		exists = buf_hash_insert(hdr, &hash_lock);
2487		if (exists) {
2488			/*
2489			 * This can only happen if we overwrite for
2490			 * sync-to-convergence, because we remove
2491			 * buffers from the hash table when we arc_free().
2492			 */
2493			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
2494			    BP_IDENTITY(zio->io_bp)));
2495			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
2496			    zio->io_bp->blk_birth);
2497
2498			ASSERT(refcount_is_zero(&exists->b_refcnt));
2499			arc_change_state(arc_anon, exists, hash_lock);
2500			mutex_exit(hash_lock);
2501			arc_hdr_destroy(exists);
2502			exists = buf_hash_insert(hdr, &hash_lock);
2503			ASSERT3P(exists, ==, NULL);
2504		}
2505		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2506		arc_access(hdr, hash_lock);
2507		mutex_exit(hash_lock);
2508	} else if (callback->awcb_done == NULL) {
2509		int destroy_hdr;
2510		/*
2511		 * This is an anonymous buffer with no user callback;
2512		 * destroy it if there are no active references.
2513		 */
2514		mutex_enter(&arc_eviction_mtx);
2515		destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
2516		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2517		mutex_exit(&arc_eviction_mtx);
2518		if (destroy_hdr)
2519			arc_hdr_destroy(hdr);
2520	} else {
2521		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2522	}
2523
2524	if (callback->awcb_done) {
2525		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
2526		callback->awcb_done(zio, buf, callback->awcb_private);
2527	}
2528
2529	kmem_free(callback, sizeof (arc_write_callback_t));
2530}
2531
2532zio_t *
2533arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
2534    uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
2535    arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
2536    int flags, zbookmark_t *zb)
2537{
2538	arc_buf_hdr_t *hdr = buf->b_hdr;
2539	arc_write_callback_t *callback;
2540	zio_t	*zio;
2541
2542	/* this is a private buffer - no locking required */
2543	ASSERT3P(hdr->b_state, ==, arc_anon);
2544	ASSERT(BUF_EMPTY(hdr));
2545	ASSERT(!HDR_IO_ERROR(hdr));
2546	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
2547	ASSERT(hdr->b_acb == 0);
2548	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
2549	callback->awcb_ready = ready;
2550	callback->awcb_done = done;
2551	callback->awcb_private = private;
2552	callback->awcb_buf = buf;
2553	hdr->b_flags |= ARC_IO_IN_PROGRESS;
2554	zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
2555	    buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback,
2556	    priority, flags, zb);
2557
2558	return (zio);
2559}
2560
2561int
2562arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
2563    zio_done_func_t *done, void *private, uint32_t arc_flags)
2564{
2565	arc_buf_hdr_t *ab;
2566	kmutex_t *hash_lock;
2567	zio_t	*zio;
2568
2569	/*
2570	 * If this buffer is in the cache, release it, so it
2571	 * can be re-used.
2572	 */
2573	ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
2574	if (ab != NULL) {
2575		/*
2576		 * The checksum of blocks to free is not always
2577		 * preserved (e.g. on the deadlist).  However, if it is
2578		 * nonzero, it should match what we have in the cache.
2579		 */
2580		ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
2581		    ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
2582		if (ab->b_state != arc_anon)
2583			arc_change_state(arc_anon, ab, hash_lock);
2584		if (HDR_IO_IN_PROGRESS(ab)) {
2585			/*
2586			 * This should only happen when we prefetch.
2587			 */
2588			ASSERT(ab->b_flags & ARC_PREFETCH);
2589			ASSERT3U(ab->b_datacnt, ==, 1);
2590			ab->b_flags |= ARC_FREED_IN_READ;
2591			if (HDR_IN_HASH_TABLE(ab))
2592				buf_hash_remove(ab);
2593			ab->b_arc_access = 0;
2594			bzero(&ab->b_dva, sizeof (dva_t));
2595			ab->b_birth = 0;
2596			ab->b_cksum0 = 0;
2597			ab->b_buf->b_efunc = NULL;
2598			ab->b_buf->b_private = NULL;
2599			mutex_exit(hash_lock);
2600		} else if (refcount_is_zero(&ab->b_refcnt)) {
2601			mutex_exit(hash_lock);
2602			arc_hdr_destroy(ab);
2603			ARCSTAT_BUMP(arcstat_deleted);
2604		} else {
2605			/*
2606			 * We still have an active reference on this
2607			 * buffer.  This can happen, e.g., from
2608			 * dbuf_unoverride().
2609			 */
2610			ASSERT(!HDR_IN_HASH_TABLE(ab));
2611			ab->b_arc_access = 0;
2612			bzero(&ab->b_dva, sizeof (dva_t));
2613			ab->b_birth = 0;
2614			ab->b_cksum0 = 0;
2615			ab->b_buf->b_efunc = NULL;
2616			ab->b_buf->b_private = NULL;
2617			mutex_exit(hash_lock);
2618		}
2619	}
2620
2621	zio = zio_free(pio, spa, txg, bp, done, private);
2622
2623	if (arc_flags & ARC_WAIT)
2624		return (zio_wait(zio));
2625
2626	ASSERT(arc_flags & ARC_NOWAIT);
2627	zio_nowait(zio);
2628
2629	return (0);
2630}
2631
2632void
2633arc_tempreserve_clear(uint64_t tempreserve)
2634{
2635	atomic_add_64(&arc_tempreserve, -tempreserve);
2636	ASSERT((int64_t)arc_tempreserve >= 0);
2637}
2638
2639int
2640arc_tempreserve_space(uint64_t tempreserve)
2641{
2642#ifdef ZFS_DEBUG
2643	/*
2644	 * Once in a while, fail for no reason.  Everything should cope.
2645	 */
2646	if (spa_get_random(10000) == 0) {
2647		dprintf("forcing random failure\n");
2648		return (ERESTART);
2649	}
2650#endif
2651	if (tempreserve > arc_c/4 && !arc_no_grow)
2652		arc_c = MIN(arc_c_max, tempreserve * 4);
2653	if (tempreserve > arc_c)
2654		return (ENOMEM);
2655
2656	/*
2657	 * Throttle writes when the amount of dirty data in the cache
2658	 * gets too large.  We try to keep the cache less than half full
2659	 * of dirty blocks so that our sync times don't grow too large.
2660	 * Note: if two requests come in concurrently, we might let them
2661	 * both succeed, when one of them should fail.  Not a huge deal.
2662	 *
2663	 * XXX The limit should be adjusted dynamically to keep the time
2664	 * to sync a dataset fixed (around 1-5 seconds?).
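	 *
	 * A worked example of the check below (assuming arc_c is 1GB):
	 * a new reservation is refused with ERESTART once the proposed
	 * reservation plus outstanding reservations plus anonymous data
	 * would exceed 512MB, provided the outstanding reservations
	 * plus anonymous data alone already exceed 256MB.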
2665	 */
2666
2667	if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
2668	    arc_tempreserve + arc_anon->arcs_size > arc_c / 4) {
2669		dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
2670		    "tempreserve=%lluK arc_c=%lluK\n",
2671		    arc_tempreserve>>10, arc_anon->arcs_lsize>>10,
2672		    tempreserve>>10, arc_c>>10);
2673		return (ERESTART);
2674	}
2675	atomic_add_64(&arc_tempreserve, tempreserve);
2676	return (0);
2677}
2678
2679#ifdef _KERNEL
2680static eventhandler_tag zfs_event_lowmem = NULL;
2681
2682static void
2683zfs_lowmem(void *arg __unused, int howto __unused)
2684{
2685
2686	zfs_needfree = 1;
2687	cv_signal(&arc_reclaim_thr_cv);
2688	while (zfs_needfree)
2689		tsleep(&zfs_needfree, 0, "zfs:lowmem", hz / 5);
2690}
2691#endif
2692
2693void
2694arc_init(void)
2695{
2696	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
2697	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
2698
2699	/* Convert seconds to clock ticks */
2700	arc_min_prefetch_lifespan = 1 * hz;
2701
2702	/* Start out with 1/8 of all memory */
2703	arc_c = physmem * PAGESIZE / 8;
2704#if 0
2705#ifdef _KERNEL
2706	/*
2707	 * On architectures where the physical memory can be larger
2708	 * than the addressable space (intel in 32-bit mode), we may
2709	 * need to limit the cache to 1/8 of VM size.
2710	 */
2711	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
2712#endif
2713#endif
2714	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
2715	arc_c_min = MAX(arc_c / 4, 64<<20);
2716	/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
2717	if (arc_c * 8 >= 1<<30)
2718		arc_c_max = (arc_c * 8) - (1<<30);
2719	else
2720		arc_c_max = arc_c_min;
2721	arc_c_max = MAX(arc_c * 6, arc_c_max);
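	/*
	 * A worked example of the defaults above, assuming 4GB of
	 * physical memory: arc_c starts at 512MB, so arc_c_min is
	 * MAX(128MB, 64MB) = 128MB and arc_c_max is
	 * MAX(3GB, 4GB - 1GB) = 3GB.
	 */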
2722#ifdef notyet
2723	/*
2724	 * Allow the tunables to override our calculations if they are
2725	 * reasonable (i.e., over 64MB).
2726	 */
2727	if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
2728		arc_c_max = zfs_arc_max;
2729	if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
2730		arc_c_min = zfs_arc_min;
2731#endif
2732	arc_c = arc_c_max;
2733	arc_p = (arc_c >> 1);
2734
2735	/* if kmem_flags are set, let's try to use less memory */
2736	if (kmem_debugging())
2737		arc_c = arc_c / 2;
2738	if (arc_c < arc_c_min)
2739		arc_c = arc_c_min;
2740
2741	arc_anon = &ARC_anon;
2742	arc_mru = &ARC_mru;
2743	arc_mru_ghost = &ARC_mru_ghost;
2744	arc_mfu = &ARC_mfu;
2745	arc_mfu_ghost = &ARC_mfu_ghost;
2746	arc_size = 0;
2747
2748	mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
2749	mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
2750	mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
2751	mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
2752	mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
2753
2754	list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t),
2755	    offsetof(arc_buf_hdr_t, b_arc_node));
2756	list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t),
2757	    offsetof(arc_buf_hdr_t, b_arc_node));
2758	list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t),
2759	    offsetof(arc_buf_hdr_t, b_arc_node));
2760	list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t),
2761	    offsetof(arc_buf_hdr_t, b_arc_node));
2762
2763	buf_init();
2764
2765	arc_thread_exit = 0;
2766	arc_eviction_list = NULL;
2767	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
2768	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
2769
2770	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
2771	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
2772
2773	if (arc_ksp != NULL) {
2774		arc_ksp->ks_data = &arc_stats;
2775		kstat_install(arc_ksp);
2776	}
2777
2778	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
2779	    TS_RUN, minclsyspri);
2780
2781#ifdef _KERNEL
2782	zfs_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, zfs_lowmem, NULL,
2783	    EVENTHANDLER_PRI_FIRST);
2784#endif
2785
2786	arc_dead = FALSE;
2787}
2788
2789void
2790arc_fini(void)
2791{
2792	mutex_enter(&arc_reclaim_thr_lock);
2793	arc_thread_exit = 1;
2794	cv_signal(&arc_reclaim_thr_cv);
2795	while (arc_thread_exit != 0)
2796		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
2797	mutex_exit(&arc_reclaim_thr_lock);
2798
2799	arc_flush();
2800
2801	arc_dead = TRUE;
2802
2803	if (arc_ksp != NULL) {
2804		kstat_delete(arc_ksp);
2805		arc_ksp = NULL;
2806	}
2807
2808	mutex_destroy(&arc_eviction_mtx);
2809	mutex_destroy(&arc_reclaim_thr_lock);
2810	cv_destroy(&arc_reclaim_thr_cv);
2811
2812	list_destroy(&arc_mru->arcs_list);
2813	list_destroy(&arc_mru_ghost->arcs_list);
2814	list_destroy(&arc_mfu->arcs_list);
2815	list_destroy(&arc_mfu_ghost->arcs_list);
2816
2817	mutex_destroy(&arc_anon->arcs_mtx);
2818	mutex_destroy(&arc_mru->arcs_mtx);
2819	mutex_destroy(&arc_mru_ghost->arcs_mtx);
2820	mutex_destroy(&arc_mfu->arcs_mtx);
2821	mutex_destroy(&arc_mfu_ghost->arcs_mtx);
2822
2823	buf_fini();
2824
2825#ifdef _KERNEL
2826	if (zfs_event_lowmem != NULL)
2827		EVENTHANDLER_DEREGISTER(vm_lowmem, zfs_event_lowmem);
2828#endif
2829}
2830