arc.c revision 242845
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24 * Copyright (c) 2011 by Delphix. All rights reserved.
25 */
26
27/*
28 * DVA-based Adjustable Replacement Cache
29 *
30 * While much of the theory of operation used here is
31 * based on the self-tuning, low overhead replacement cache
32 * presented by Megiddo and Modha at FAST 2003, there are some
33 * significant differences:
34 *
35 * 1. The Megiddo and Modha model assumes any page is evictable.
36 * Pages in its cache cannot be "locked" into memory.  This makes
37 * the eviction algorithm simple: evict the last page in the list.
38 * This also makes the performance characteristics easy to reason
39 * about.  Our cache is not so simple.  At any given moment, some
40 * subset of the blocks in the cache are un-evictable because we
41 * have handed out a reference to them.  Blocks are only evictable
42 * when there are no external references active.  This makes
43 * eviction far more problematic:  we choose to evict the evictable
44 * blocks that are the "lowest" in the list.
45 *
46 * There are times when it is not possible to evict the requested
47 * space.  In these circumstances we are unable to adjust the cache
48 * size.  To prevent the cache growing unbounded at these times we
49 * implement a "cache throttle" that slows the flow of new data
50 * into the cache until we can make space available.
51 *
52 * 2. The Megiddo and Modha model assumes a fixed cache size.
53 * Pages are evicted when the cache is full and there is a cache
54 * miss.  Our model has a variable sized cache.  It grows with
55 * high use, but also tries to react to memory pressure from the
56 * operating system: decreasing its size when system memory is
57 * tight.
58 *
59 * 3. The Megiddo and Modha model assumes a fixed page size. All
60 * elements of the cache are therefore exactly the same size.  So
61 * when adjusting the cache size following a cache miss, it's simply
62 * a matter of choosing a single page to evict.  In our model, we
63 * have variable sized cache blocks (ranging from 512 bytes to
64 * 128K bytes).  We therefore choose a set of blocks to evict to make
65 * space for a cache miss that approximates as closely as possible
66 * the space used by the new block.
67 *
68 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
69 * by N. Megiddo & D. Modha, FAST 2003
70 */
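/*
 * Editorial example (not part of the original comment) of point 3 above:
 * a miss for a 128K block might be satisfied by evicting one 64K, one 48K
 * and one 16K buffer from the tails of the eviction lists -- a set whose
 * combined size (128K) approximates the space needed -- rather than a
 * single fixed-size page as in the original Megiddo/Modha scheme.
 */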
71
72/*
73 * The locking model:
74 *
75 * A new reference to a cache buffer can be obtained in two
76 * ways: 1) via a hash table lookup using the DVA as a key,
77 * or 2) via one of the ARC lists.  The arc_read() interface
78 * uses method 1, while the internal arc algorithms for
79 * adjusting the cache use method 2.  We therefore provide two
80 * types of locks: 1) the hash table lock array, and 2) the
81 * arc list locks.
82 *
83 * Buffers do not have their own mutexes, rather they rely on the
84 * hash table mutexes for the bulk of their protection (i.e. most
85 * fields in the arc_buf_hdr_t are protected by these mutexes).
86 *
87 * buf_hash_find() returns the appropriate mutex (held) when it
88 * locates the requested buffer in the hash table.  It returns
89 * NULL for the mutex if the buffer was not in the table.
90 *
91 * buf_hash_remove() expects the appropriate hash mutex to be
92 * already held before it is invoked.
93 *
94 * Each arc state also has a mutex which is used to protect the
95 * buffer list associated with the state.  When attempting to
96 * obtain a hash table lock while holding an arc list lock you
97 * must use mutex_tryenter() to avoid deadlock.  Also note that
98 * the active state mutex must be held before the ghost state mutex.
99 *
100 * Arc buffers may have an associated eviction callback function.
101 * This function will be invoked prior to removing the buffer (e.g.
102 * in arc_do_user_evicts()).  Note however that the data associated
103 * with the buffer may be evicted prior to the callback.  The callback
104 * must be made with *no locks held* (to prevent deadlock).  Additionally,
105 * the users of callbacks must ensure that their private data is
106 * protected from simultaneous callbacks from arc_buf_evict()
107 * and arc_do_user_evicts().
108 *
109 * Note that the majority of the performance stats are manipulated
110 * with atomic operations.
111 *
112 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
113 *
114 *	- L2ARC buflist creation
115 *	- L2ARC buflist eviction
116 *	- L2ARC write completion, which walks L2ARC buflists
117 *	- ARC header destruction, as it removes from L2ARC buflists
118 *	- ARC header release, as it removes from L2ARC buflists
119 */
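/*
 * Illustrative sketch (editorial addition, not from the original source)
 * of the method-1 lookup convention described above; "guid", "dva" and
 * "birth" stand in for a caller's block identity:
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *hdr = buf_hash_find(guid, &dva, birth, &hash_lock);
 *
 *	if (hdr != NULL) {
 *		...examine or modify hdr; hash_lock is returned held...
 *		mutex_exit(hash_lock);
 *	}
 *
 * When an arc list lock is already held, the hash lock must instead be
 * taken with mutex_tryenter() (retrying on failure) to avoid the
 * deadlock noted above.
 */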
120
121#include <sys/spa.h>
122#include <sys/zio.h>
123#include <sys/zfs_context.h>
124#include <sys/arc.h>
125#include <sys/refcount.h>
126#include <sys/vdev.h>
127#include <sys/vdev_impl.h>
128#ifdef _KERNEL
129#include <sys/dnlc.h>
130#endif
131#include <sys/callb.h>
132#include <sys/kstat.h>
133#include <zfs_fletcher.h>
134#include <sys/sdt.h>
135
136#include <vm/vm_pageout.h>
137
138#ifdef illumos
139#ifndef _KERNEL
140/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
141boolean_t arc_watch = B_FALSE;
142int arc_procfd;
143#endif
144#endif /* illumos */
145
146static kmutex_t		arc_reclaim_thr_lock;
147static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
148static uint8_t		arc_thread_exit;
149
150extern int zfs_write_limit_shift;
151extern uint64_t zfs_write_limit_max;
152extern kmutex_t zfs_write_limit_lock;
153
154#define	ARC_REDUCE_DNLC_PERCENT	3
155uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
156
157typedef enum arc_reclaim_strategy {
158	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
159	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
160} arc_reclaim_strategy_t;
161
162/* number of seconds before growing cache again */
163static int		arc_grow_retry = 60;
164
165/* shift of arc_c for calculating both min and max arc_p */
166static int		arc_p_min_shift = 4;
167
168/* log2(fraction of arc to reclaim) */
169static int		arc_shrink_shift = 5;
170
171/*
172 * minimum lifespan of a prefetch block in clock ticks
173 * (initialized in arc_init())
174 */
175static int		arc_min_prefetch_lifespan;
176
177static int arc_dead;
178extern int zfs_prefetch_disable;
179
180/*
181 * The arc has filled available memory and has now warmed up.
182 */
183static boolean_t arc_warm;
184
185/*
186 * These tunables are for performance analysis.
187 */
188uint64_t zfs_arc_max;
189uint64_t zfs_arc_min;
190uint64_t zfs_arc_meta_limit = 0;
191int zfs_arc_grow_retry = 0;
192int zfs_arc_shrink_shift = 0;
193int zfs_arc_p_min_shift = 0;
194int zfs_disable_dup_eviction = 0;
195
196TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
197TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
198TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
199SYSCTL_DECL(_vfs_zfs);
200SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
201    "Maximum ARC size");
202SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
203    "Minimum ARC size");
204
205/*
206 * Note that buffers can be in one of 6 states:
207 *	ARC_anon	- anonymous (discussed below)
208 *	ARC_mru		- recently used, currently cached
209 *	ARC_mru_ghost	- recently used, no longer in cache
210 *	ARC_mfu		- frequently used, currently cached
211 *	ARC_mfu_ghost	- frequently used, no longer in cache
212 *	ARC_l2c_only	- exists in L2ARC but not other states
213 * When there are no active references to the buffer, they are
214 * linked onto a list in one of these arc states.  These are
215 * the only buffers that can be evicted or deleted.  Within each
216 * state there are multiple lists, one for meta-data and one for
217 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
218 * etc.) is tracked separately so that it can be managed more
219 * explicitly: favored over data, limited explicitly.
220 *
221 * Anonymous buffers are buffers that are not associated with
222 * a DVA.  These are buffers that hold dirty block copies
223 * before they are written to stable storage.  By definition,
224 * they are "ref'd" and are considered part of arc_mru
225 * that cannot be freed.  Generally, they will acquire a DVA
226 * as they are written and migrate onto the arc_mru list.
227 *
228 * The ARC_l2c_only state is for buffers that are in the second
229 * level ARC but no longer in any of the ARC_m* lists.  The second
230 * level ARC itself may also contain buffers that are in any of
231 * the ARC_m* states - meaning that a buffer can exist in two
232 * places.  The reason for the ARC_l2c_only state is to keep the
233 * buffer header in the hash table, so that reads that hit the
234 * second level ARC benefit from these fast lookups.
235 */
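/*
 * Editorial sketch of the typical lifecycle implied by the states above
 * (not part of the original comment):
 *
 *	write path:	ARC_anon --(DVA assigned, written)--> ARC_mru
 *	read path:	miss -> ARC_mru; a later hit promotes to ARC_mfu
 *	eviction:	ARC_mru/ARC_mfu -> ARC_mru_ghost/ARC_mfu_ghost
 *			(header only; the data may also reside in the L2ARC)
 *	L2ARC only:	ARC_l2c_only keeps the header hashed so that L2
 *			hits are still found quickly
 */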
236
237#define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
238struct arcs_lock {
239	kmutex_t	arcs_lock;
240#ifdef _KERNEL
241	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
242#endif
243};
244
245/*
246 * must be a power of two for mask use to work
247 *
248 */
249#define ARC_BUFC_NUMDATALISTS		16
250#define ARC_BUFC_NUMMETADATALISTS	16
251#define ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
252
253typedef struct arc_state {
254	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
255	uint64_t arcs_size;	/* total amount of data in this state */
256	list_t	arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
257	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
258} arc_state_t;
259
260#define ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))
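/*
 * Example (editorial sketch): because the list counts above are powers of
 * two, a buffer's hash can be mapped to a sublist and its lock with a
 * simple mask, e.g. for a metadata buffer in state "s":
 *
 *	uint64_t idx = buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) &
 *	    (ARC_BUFC_NUMMETADATALISTS - 1);
 *	list_t *list = &s->arcs_lists[idx];
 *	kmutex_t *lock = ARCS_LOCK(s, idx);
 *
 * get_buf_info() below implements exactly this mapping.
 */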
261
262/* The 6 states: */
263static arc_state_t ARC_anon;
264static arc_state_t ARC_mru;
265static arc_state_t ARC_mru_ghost;
266static arc_state_t ARC_mfu;
267static arc_state_t ARC_mfu_ghost;
268static arc_state_t ARC_l2c_only;
269
270typedef struct arc_stats {
271	kstat_named_t arcstat_hits;
272	kstat_named_t arcstat_misses;
273	kstat_named_t arcstat_demand_data_hits;
274	kstat_named_t arcstat_demand_data_misses;
275	kstat_named_t arcstat_demand_metadata_hits;
276	kstat_named_t arcstat_demand_metadata_misses;
277	kstat_named_t arcstat_prefetch_data_hits;
278	kstat_named_t arcstat_prefetch_data_misses;
279	kstat_named_t arcstat_prefetch_metadata_hits;
280	kstat_named_t arcstat_prefetch_metadata_misses;
281	kstat_named_t arcstat_mru_hits;
282	kstat_named_t arcstat_mru_ghost_hits;
283	kstat_named_t arcstat_mfu_hits;
284	kstat_named_t arcstat_mfu_ghost_hits;
285	kstat_named_t arcstat_allocated;
286	kstat_named_t arcstat_deleted;
287	kstat_named_t arcstat_stolen;
288	kstat_named_t arcstat_recycle_miss;
289	kstat_named_t arcstat_mutex_miss;
290	kstat_named_t arcstat_evict_skip;
291	kstat_named_t arcstat_evict_l2_cached;
292	kstat_named_t arcstat_evict_l2_eligible;
293	kstat_named_t arcstat_evict_l2_ineligible;
294	kstat_named_t arcstat_hash_elements;
295	kstat_named_t arcstat_hash_elements_max;
296	kstat_named_t arcstat_hash_collisions;
297	kstat_named_t arcstat_hash_chains;
298	kstat_named_t arcstat_hash_chain_max;
299	kstat_named_t arcstat_p;
300	kstat_named_t arcstat_c;
301	kstat_named_t arcstat_c_min;
302	kstat_named_t arcstat_c_max;
303	kstat_named_t arcstat_size;
304	kstat_named_t arcstat_hdr_size;
305	kstat_named_t arcstat_data_size;
306	kstat_named_t arcstat_other_size;
307	kstat_named_t arcstat_l2_hits;
308	kstat_named_t arcstat_l2_misses;
309	kstat_named_t arcstat_l2_feeds;
310	kstat_named_t arcstat_l2_rw_clash;
311	kstat_named_t arcstat_l2_read_bytes;
312	kstat_named_t arcstat_l2_write_bytes;
313	kstat_named_t arcstat_l2_writes_sent;
314	kstat_named_t arcstat_l2_writes_done;
315	kstat_named_t arcstat_l2_writes_error;
316	kstat_named_t arcstat_l2_writes_hdr_miss;
317	kstat_named_t arcstat_l2_evict_lock_retry;
318	kstat_named_t arcstat_l2_evict_reading;
319	kstat_named_t arcstat_l2_free_on_write;
320	kstat_named_t arcstat_l2_abort_lowmem;
321	kstat_named_t arcstat_l2_cksum_bad;
322	kstat_named_t arcstat_l2_io_error;
323	kstat_named_t arcstat_l2_size;
324	kstat_named_t arcstat_l2_hdr_size;
325	kstat_named_t arcstat_l2_write_trylock_fail;
326	kstat_named_t arcstat_l2_write_passed_headroom;
327	kstat_named_t arcstat_l2_write_spa_mismatch;
328	kstat_named_t arcstat_l2_write_in_l2;
329	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
330	kstat_named_t arcstat_l2_write_not_cacheable;
331	kstat_named_t arcstat_l2_write_full;
332	kstat_named_t arcstat_l2_write_buffer_iter;
333	kstat_named_t arcstat_l2_write_pios;
334	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
335	kstat_named_t arcstat_l2_write_buffer_list_iter;
336	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
337	kstat_named_t arcstat_memory_throttle_count;
338	kstat_named_t arcstat_duplicate_buffers;
339	kstat_named_t arcstat_duplicate_buffers_size;
340	kstat_named_t arcstat_duplicate_reads;
341} arc_stats_t;
342
343static arc_stats_t arc_stats = {
344	{ "hits",			KSTAT_DATA_UINT64 },
345	{ "misses",			KSTAT_DATA_UINT64 },
346	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
347	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
348	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
349	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
350	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
351	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
352	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
353	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
354	{ "mru_hits",			KSTAT_DATA_UINT64 },
355	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
356	{ "mfu_hits",			KSTAT_DATA_UINT64 },
357	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
358	{ "allocated",			KSTAT_DATA_UINT64 },
359	{ "deleted",			KSTAT_DATA_UINT64 },
360	{ "stolen",			KSTAT_DATA_UINT64 },
361	{ "recycle_miss",		KSTAT_DATA_UINT64 },
362	{ "mutex_miss",			KSTAT_DATA_UINT64 },
363	{ "evict_skip",			KSTAT_DATA_UINT64 },
364	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
365	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
366	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
367	{ "hash_elements",		KSTAT_DATA_UINT64 },
368	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
369	{ "hash_collisions",		KSTAT_DATA_UINT64 },
370	{ "hash_chains",		KSTAT_DATA_UINT64 },
371	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
372	{ "p",				KSTAT_DATA_UINT64 },
373	{ "c",				KSTAT_DATA_UINT64 },
374	{ "c_min",			KSTAT_DATA_UINT64 },
375	{ "c_max",			KSTAT_DATA_UINT64 },
376	{ "size",			KSTAT_DATA_UINT64 },
377	{ "hdr_size",			KSTAT_DATA_UINT64 },
378	{ "data_size",			KSTAT_DATA_UINT64 },
379	{ "other_size",			KSTAT_DATA_UINT64 },
380	{ "l2_hits",			KSTAT_DATA_UINT64 },
381	{ "l2_misses",			KSTAT_DATA_UINT64 },
382	{ "l2_feeds",			KSTAT_DATA_UINT64 },
383	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
384	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
385	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
386	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
387	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
388	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
389	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
390	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
391	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
392	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
393	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
394	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
395	{ "l2_io_error",		KSTAT_DATA_UINT64 },
396	{ "l2_size",			KSTAT_DATA_UINT64 },
397	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
398	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
399	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
400	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
401	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
402	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
403	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
404	{ "l2_write_full",		KSTAT_DATA_UINT64 },
405	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
406	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
407	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
408	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
409	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
410	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
411	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
412	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
413	{ "duplicate_reads",		KSTAT_DATA_UINT64 }
414};
415
416#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
417
418#define	ARCSTAT_INCR(stat, val) \
419	atomic_add_64(&arc_stats.stat.value.ui64, (val));
420
421#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
422#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
423
424#define	ARCSTAT_MAX(stat, val) {					\
425	uint64_t m;							\
426	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
427	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
428		continue;						\
429}
430
431#define	ARCSTAT_MAXSTAT(stat) \
432	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
433
434/*
435 * We define a macro to allow ARC hits/misses to be easily broken down by
436 * two separate conditions, giving a total of four different subtypes for
437 * each of hits and misses (so eight statistics total).
438 */
439#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
440	if (cond1) {							\
441		if (cond2) {						\
442			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
443		} else {						\
444			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
445		}							\
446	} else {							\
447		if (cond2) {						\
448			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
449		} else {						\
450			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
451		}							\
452	}
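/*
 * Example (editorial note, mirroring an actual use later in this file):
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 *
 * bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
 * arcstat_prefetch_metadata_hits.
 */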
453
454kstat_t			*arc_ksp;
455static arc_state_t	*arc_anon;
456static arc_state_t	*arc_mru;
457static arc_state_t	*arc_mru_ghost;
458static arc_state_t	*arc_mfu;
459static arc_state_t	*arc_mfu_ghost;
460static arc_state_t	*arc_l2c_only;
461
462/*
463 * There are several ARC variables that are critical to export as kstats --
464 * but we don't want to have to grovel around in the kstat whenever we wish to
465 * manipulate them.  For these variables, we therefore define them to be in
466 * terms of the statistic variable.  This assures that we are not introducing
467 * the possibility of inconsistency by having shadow copies of the variables,
468 * while still allowing the code to be readable.
469 */
470#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
471#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
472#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
473#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
474#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
475
476static int		arc_no_grow;	/* Don't try to grow cache size */
477static uint64_t		arc_tempreserve;
478static uint64_t		arc_loaned_bytes;
479static uint64_t		arc_meta_used;
480static uint64_t		arc_meta_limit;
481static uint64_t		arc_meta_max = 0;
482SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RD, &arc_meta_used, 0,
483    "ARC metadata used");
484SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RW, &arc_meta_limit, 0,
485    "ARC metadata limit");
486
487typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
488
489typedef struct arc_callback arc_callback_t;
490
491struct arc_callback {
492	void			*acb_private;
493	arc_done_func_t		*acb_done;
494	arc_buf_t		*acb_buf;
495	zio_t			*acb_zio_dummy;
496	arc_callback_t		*acb_next;
497};
498
499typedef struct arc_write_callback arc_write_callback_t;
500
501struct arc_write_callback {
502	void		*awcb_private;
503	arc_done_func_t	*awcb_ready;
504	arc_done_func_t	*awcb_done;
505	arc_buf_t	*awcb_buf;
506};
507
508struct arc_buf_hdr {
509	/* protected by hash lock */
510	dva_t			b_dva;
511	uint64_t		b_birth;
512	uint64_t		b_cksum0;
513
514	kmutex_t		b_freeze_lock;
515	zio_cksum_t		*b_freeze_cksum;
516	void			*b_thawed;
517
518	arc_buf_hdr_t		*b_hash_next;
519	arc_buf_t		*b_buf;
520	uint32_t		b_flags;
521	uint32_t		b_datacnt;
522
523	arc_callback_t		*b_acb;
524	kcondvar_t		b_cv;
525
526	/* immutable */
527	arc_buf_contents_t	b_type;
528	uint64_t		b_size;
529	uint64_t		b_spa;
530
531	/* protected by arc state mutex */
532	arc_state_t		*b_state;
533	list_node_t		b_arc_node;
534
535	/* updated atomically */
536	clock_t			b_arc_access;
537
538	/* self protecting */
539	refcount_t		b_refcnt;
540
541	l2arc_buf_hdr_t		*b_l2hdr;
542	list_node_t		b_l2node;
543};
544
545static arc_buf_t *arc_eviction_list;
546static kmutex_t arc_eviction_mtx;
547static arc_buf_hdr_t arc_eviction_hdr;
548static void arc_get_data_buf(arc_buf_t *buf);
549static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
550static int arc_evict_needed(arc_buf_contents_t type);
551static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
552#ifdef illumos
553static void arc_buf_watch(arc_buf_t *buf);
554#endif /* illumos */
555
556static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
557
558#define	GHOST_STATE(state)	\
559	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
560	(state) == arc_l2c_only)
561
562/*
563 * Private ARC flags.  These flags are private ARC only flags that will show up
564 * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
565 * be passed in as arc_flags in things like arc_read.  However, these flags
566 * should never be passed and should only be set by ARC code.  When adding new
567 * public flags, make sure not to smash the private ones.
568 */
569
570#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
571#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
572#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
573#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
574#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
575#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
576#define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
577#define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
578#define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
579#define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */
580
581#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
582#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
583#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
584#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_PREFETCH)
585#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
586#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
587#define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
588#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_L2CACHE)
589#define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
590				    (hdr)->b_l2hdr != NULL)
591#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
592#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
593#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)
594
595/*
596 * Other sizes
597 */
598
599#define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
600#define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
601
602/*
603 * Hash table routines
604 */
605
606#define	HT_LOCK_PAD	CACHE_LINE_SIZE
607
608struct ht_lock {
609	kmutex_t	ht_lock;
610#ifdef _KERNEL
611	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
612#endif
613};
614
615#define	BUF_LOCKS 256
616typedef struct buf_hash_table {
617	uint64_t ht_mask;
618	arc_buf_hdr_t **ht_table;
619	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
620} buf_hash_table_t;
621
622static buf_hash_table_t buf_hash_table;
623
624#define	BUF_HASH_INDEX(spa, dva, birth) \
625	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
626#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
627#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
628#define	HDR_LOCK(hdr) \
629	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
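/*
 * Example (editorial sketch): the macros above map a header's identity to
 * one of the BUF_LOCKS hash-chain mutexes, so the usual pattern is:
 *
 *	kmutex_t *hash_lock = HDR_LOCK(hdr);
 *	mutex_enter(hash_lock);
 *	...hdr's hashed identity and most arc_buf_hdr_t fields are stable...
 *	mutex_exit(hash_lock);
 *
 * BUF_HASH_LOCK_NTRY() folds the table index into BUF_LOCKS (a power of
 * two) buckets, so many hash chains share each mutex.
 */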
630
631uint64_t zfs_crc64_table[256];
632
633/*
634 * Level 2 ARC
635 */
636
637#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
638#define	L2ARC_HEADROOM		2		/* num of writes */
639#define	L2ARC_FEED_SECS		1		/* caching interval secs */
640#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
641
642#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
643#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
644
645/*
646 * L2ARC Performance Tunables
647 */
648uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
649uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
650uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
651uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
652uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
653boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
654boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
655boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
656
657SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
658    &l2arc_write_max, 0, "max write size");
659SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
660    &l2arc_write_boost, 0, "extra write during warmup");
661SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
662    &l2arc_headroom, 0, "number of dev writes");
663SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
664    &l2arc_feed_secs, 0, "interval seconds");
665SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
666    &l2arc_feed_min_ms, 0, "min interval milliseconds");
667
668SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
669    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
670SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
671    &l2arc_feed_again, 0, "turbo warmup");
672SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
673    &l2arc_norw, 0, "no reads during writes");
674
675SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
676    &ARC_anon.arcs_size, 0, "size of anonymous state");
677SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
678    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in anonymous state");
679SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
680    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in anonymous state");
681
682SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
683    &ARC_mru.arcs_size, 0, "size of mru state");
684SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
685    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
686SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
687    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
688
689SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
690    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
691SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
692    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
693    "size of metadata in mru ghost state");
694SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
695    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
696    "size of data in mru ghost state");
697
698SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
699    &ARC_mfu.arcs_size, 0, "size of mfu state");
700SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
701    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
702SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
703    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
704
705SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
706    &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
707SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
708    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
709    "size of metadata in mfu ghost state");
710SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
711    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
712    "size of data in mfu ghost state");
713
714SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
715    &ARC_l2c_only.arcs_size, 0, "size of l2c_only state");
716
717/*
718 * L2ARC Internals
719 */
720typedef struct l2arc_dev {
721	vdev_t			*l2ad_vdev;	/* vdev */
722	spa_t			*l2ad_spa;	/* spa */
723	uint64_t		l2ad_hand;	/* next write location */
724	uint64_t		l2ad_write;	/* desired write size, bytes */
725	uint64_t		l2ad_boost;	/* warmup write boost, bytes */
726	uint64_t		l2ad_start;	/* first addr on device */
727	uint64_t		l2ad_end;	/* last addr on device */
728	uint64_t		l2ad_evict;	/* last addr eviction reached */
729	boolean_t		l2ad_first;	/* first sweep through */
730	boolean_t		l2ad_writing;	/* currently writing */
731	list_t			*l2ad_buflist;	/* buffer list */
732	list_node_t		l2ad_node;	/* device list node */
733} l2arc_dev_t;
734
735static list_t L2ARC_dev_list;			/* device list */
736static list_t *l2arc_dev_list;			/* device list pointer */
737static kmutex_t l2arc_dev_mtx;			/* device list mutex */
738static l2arc_dev_t *l2arc_dev_last;		/* last device used */
739static kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
740static list_t L2ARC_free_on_write;		/* free after write buf list */
741static list_t *l2arc_free_on_write;		/* free after write list ptr */
742static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
743static uint64_t l2arc_ndev;			/* number of devices */
744
745typedef struct l2arc_read_callback {
746	arc_buf_t	*l2rcb_buf;		/* read buffer */
747	spa_t		*l2rcb_spa;		/* spa */
748	blkptr_t	l2rcb_bp;		/* original blkptr */
749	zbookmark_t	l2rcb_zb;		/* original bookmark */
750	int		l2rcb_flags;		/* original flags */
751} l2arc_read_callback_t;
752
753typedef struct l2arc_write_callback {
754	l2arc_dev_t	*l2wcb_dev;		/* device info */
755	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
756} l2arc_write_callback_t;
757
758struct l2arc_buf_hdr {
759	/* protected by arc_buf_hdr  mutex */
760	l2arc_dev_t	*b_dev;			/* L2ARC device */
761	uint64_t	b_daddr;		/* disk address, offset byte */
762};
763
764typedef struct l2arc_data_free {
765	/* protected by l2arc_free_on_write_mtx */
766	void		*l2df_data;
767	size_t		l2df_size;
768	void		(*l2df_func)(void *, size_t);
769	list_node_t	l2df_list_node;
770} l2arc_data_free_t;
771
772static kmutex_t l2arc_feed_thr_lock;
773static kcondvar_t l2arc_feed_thr_cv;
774static uint8_t l2arc_thread_exit;
775
776static void l2arc_read_done(zio_t *zio);
777static void l2arc_hdr_stat_add(void);
778static void l2arc_hdr_stat_remove(void);
779
780static uint64_t
781buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
782{
783	uint8_t *vdva = (uint8_t *)dva;
784	uint64_t crc = -1ULL;
785	int i;
786
787	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
788
789	for (i = 0; i < sizeof (dva_t); i++)
790		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
791
792	crc ^= (spa>>8) ^ birth;
793
794	return (crc);
795}
796
797#define	BUF_EMPTY(buf)						\
798	((buf)->b_dva.dva_word[0] == 0 &&			\
799	(buf)->b_dva.dva_word[1] == 0 &&			\
800	(buf)->b_birth == 0)
801
802#define	BUF_EQUAL(spa, dva, birth, buf)				\
803	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
804	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
805	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
806
807static void
808buf_discard_identity(arc_buf_hdr_t *hdr)
809{
810	hdr->b_dva.dva_word[0] = 0;
811	hdr->b_dva.dva_word[1] = 0;
812	hdr->b_birth = 0;
813	hdr->b_cksum0 = 0;
814}
815
816static arc_buf_hdr_t *
817buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
818{
819	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
820	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
821	arc_buf_hdr_t *buf;
822
823	mutex_enter(hash_lock);
824	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
825	    buf = buf->b_hash_next) {
826		if (BUF_EQUAL(spa, dva, birth, buf)) {
827			*lockp = hash_lock;
828			return (buf);
829		}
830	}
831	mutex_exit(hash_lock);
832	*lockp = NULL;
833	return (NULL);
834}
835
836/*
837 * Insert an entry into the hash table.  If there is already an element
838 * equal to elem in the hash table, then the already existing element
839 * will be returned and the new element will not be inserted.
840 * Otherwise returns NULL.
841 */
842static arc_buf_hdr_t *
843buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
844{
845	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
846	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
847	arc_buf_hdr_t *fbuf;
848	uint32_t i;
849
850	ASSERT(!HDR_IN_HASH_TABLE(buf));
851	*lockp = hash_lock;
852	mutex_enter(hash_lock);
853	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
854	    fbuf = fbuf->b_hash_next, i++) {
855		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
856			return (fbuf);
857	}
858
859	buf->b_hash_next = buf_hash_table.ht_table[idx];
860	buf_hash_table.ht_table[idx] = buf;
861	buf->b_flags |= ARC_IN_HASH_TABLE;
862
863	/* collect some hash table performance data */
864	if (i > 0) {
865		ARCSTAT_BUMP(arcstat_hash_collisions);
866		if (i == 1)
867			ARCSTAT_BUMP(arcstat_hash_chains);
868
869		ARCSTAT_MAX(arcstat_hash_chain_max, i);
870	}
871
872	ARCSTAT_BUMP(arcstat_hash_elements);
873	ARCSTAT_MAXSTAT(arcstat_hash_elements);
874
875	return (NULL);
876}
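/*
 * Example (editorial sketch) of the insert-or-find pattern supported by
 * the function above; "hdr" is a header whose identity is already set:
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *exists = buf_hash_insert(hdr, &hash_lock);
 *
 *	if (exists != NULL) {
 *		...an equal header was already hashed; use it instead...
 *	}
 *	mutex_exit(hash_lock);
 *
 * In both cases *lockp is returned held.  The ARC read and write
 * completion paths use this shape to detect that an identical block
 * identity is already cached.
 */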
877
878static void
879buf_hash_remove(arc_buf_hdr_t *buf)
880{
881	arc_buf_hdr_t *fbuf, **bufp;
882	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
883
884	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
885	ASSERT(HDR_IN_HASH_TABLE(buf));
886
887	bufp = &buf_hash_table.ht_table[idx];
888	while ((fbuf = *bufp) != buf) {
889		ASSERT(fbuf != NULL);
890		bufp = &fbuf->b_hash_next;
891	}
892	*bufp = buf->b_hash_next;
893	buf->b_hash_next = NULL;
894	buf->b_flags &= ~ARC_IN_HASH_TABLE;
895
896	/* collect some hash table performance data */
897	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
898
899	if (buf_hash_table.ht_table[idx] &&
900	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
901		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
902}
903
904/*
905 * Global data structures and functions for the buf kmem cache.
906 */
907static kmem_cache_t *hdr_cache;
908static kmem_cache_t *buf_cache;
909
910static void
911buf_fini(void)
912{
913	int i;
914
915	kmem_free(buf_hash_table.ht_table,
916	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
917	for (i = 0; i < BUF_LOCKS; i++)
918		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
919	kmem_cache_destroy(hdr_cache);
920	kmem_cache_destroy(buf_cache);
921}
922
923/*
924 * Constructor callback - called when the cache is empty
925 * and a new buf is requested.
926 */
927/* ARGSUSED */
928static int
929hdr_cons(void *vbuf, void *unused, int kmflag)
930{
931	arc_buf_hdr_t *buf = vbuf;
932
933	bzero(buf, sizeof (arc_buf_hdr_t));
934	refcount_create(&buf->b_refcnt);
935	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
936	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
937	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
938
939	return (0);
940}
941
942/* ARGSUSED */
943static int
944buf_cons(void *vbuf, void *unused, int kmflag)
945{
946	arc_buf_t *buf = vbuf;
947
948	bzero(buf, sizeof (arc_buf_t));
949	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
950	rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL);
951	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
952
953	return (0);
954}
955
956/*
957 * Destructor callback - called when a cached buf is
958 * no longer required.
959 */
960/* ARGSUSED */
961static void
962hdr_dest(void *vbuf, void *unused)
963{
964	arc_buf_hdr_t *buf = vbuf;
965
966	ASSERT(BUF_EMPTY(buf));
967	refcount_destroy(&buf->b_refcnt);
968	cv_destroy(&buf->b_cv);
969	mutex_destroy(&buf->b_freeze_lock);
970	arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
971}
972
973/* ARGSUSED */
974static void
975buf_dest(void *vbuf, void *unused)
976{
977	arc_buf_t *buf = vbuf;
978
979	mutex_destroy(&buf->b_evict_lock);
980	rw_destroy(&buf->b_data_lock);
981	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
982}
983
984/*
985 * Reclaim callback -- invoked when memory is low.
986 */
987/* ARGSUSED */
988static void
989hdr_recl(void *unused)
990{
991	dprintf("hdr_recl called\n");
992	/*
993	 * umem calls the reclaim func when we destroy the buf cache,
994	 * which is after we do arc_fini().
995	 */
996	if (!arc_dead)
997		cv_signal(&arc_reclaim_thr_cv);
998}
999
1000static void
1001buf_init(void)
1002{
1003	uint64_t *ct;
1004	uint64_t hsize = 1ULL << 12;
1005	int i, j;
1006
1007	/*
1008	 * The hash table is big enough to fill all of physical memory
1009	 * with an average 64K block size.  The table will take up
1010	 * totalmem*sizeof(void*)/64K (e.g. 128KB/GB with 8-byte pointers).
1011	 */
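	/*
	 * Worked example (editorial): with 4GB of physical memory
	 * (physmem * PAGESIZE == 4GB), the loop below stops at
	 * hsize == 65536, since 65536 * 64K == 4GB; that yields a
	 * 65536-entry table of 8-byte pointers, i.e. 512KB, matching
	 * the 128KB/GB figure above.
	 */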
1012	while (hsize * 65536 < (uint64_t)physmem * PAGESIZE)
1013		hsize <<= 1;
1014retry:
1015	buf_hash_table.ht_mask = hsize - 1;
1016	buf_hash_table.ht_table =
1017	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1018	if (buf_hash_table.ht_table == NULL) {
1019		ASSERT(hsize > (1ULL << 8));
1020		hsize >>= 1;
1021		goto retry;
1022	}
1023
1024	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
1025	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
1026	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1027	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1028
1029	for (i = 0; i < 256; i++)
1030		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1031			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1032
1033	for (i = 0; i < BUF_LOCKS; i++) {
1034		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1035		    NULL, MUTEX_DEFAULT, NULL);
1036	}
1037}
1038
1039#define	ARC_MINTIME	(hz>>4) /* 62 ms */
1040
1041static void
1042arc_cksum_verify(arc_buf_t *buf)
1043{
1044	zio_cksum_t zc;
1045
1046	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1047		return;
1048
1049	mutex_enter(&buf->b_hdr->b_freeze_lock);
1050	if (buf->b_hdr->b_freeze_cksum == NULL ||
1051	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
1052		mutex_exit(&buf->b_hdr->b_freeze_lock);
1053		return;
1054	}
1055	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1056	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1057		panic("buffer modified while frozen!");
1058	mutex_exit(&buf->b_hdr->b_freeze_lock);
1059}
1060
1061static int
1062arc_cksum_equal(arc_buf_t *buf)
1063{
1064	zio_cksum_t zc;
1065	int equal;
1066
1067	mutex_enter(&buf->b_hdr->b_freeze_lock);
1068	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1069	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1070	mutex_exit(&buf->b_hdr->b_freeze_lock);
1071
1072	return (equal);
1073}
1074
1075static void
1076arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1077{
1078	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1079		return;
1080
1081	mutex_enter(&buf->b_hdr->b_freeze_lock);
1082	if (buf->b_hdr->b_freeze_cksum != NULL) {
1083		mutex_exit(&buf->b_hdr->b_freeze_lock);
1084		return;
1085	}
1086	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1087	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1088	    buf->b_hdr->b_freeze_cksum);
1089	mutex_exit(&buf->b_hdr->b_freeze_lock);
1090#ifdef illumos
1091	arc_buf_watch(buf);
1092#endif /* illumos */
1093}
1094
1095#ifdef illumos
1096#ifndef _KERNEL
1097typedef struct procctl {
1098	long cmd;
1099	prwatch_t prwatch;
1100} procctl_t;
1101#endif
1102
1103/* ARGSUSED */
1104static void
1105arc_buf_unwatch(arc_buf_t *buf)
1106{
1107#ifndef _KERNEL
1108	if (arc_watch) {
1109		int result;
1110		procctl_t ctl;
1111		ctl.cmd = PCWATCH;
1112		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1113		ctl.prwatch.pr_size = 0;
1114		ctl.prwatch.pr_wflags = 0;
1115		result = write(arc_procfd, &ctl, sizeof (ctl));
1116		ASSERT3U(result, ==, sizeof (ctl));
1117	}
1118#endif
1119}
1120
1121/* ARGSUSED */
1122static void
1123arc_buf_watch(arc_buf_t *buf)
1124{
1125#ifndef _KERNEL
1126	if (arc_watch) {
1127		int result;
1128		procctl_t ctl;
1129		ctl.cmd = PCWATCH;
1130		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1131		ctl.prwatch.pr_size = buf->b_hdr->b_size;
1132		ctl.prwatch.pr_wflags = WA_WRITE;
1133		result = write(arc_procfd, &ctl, sizeof (ctl));
1134		ASSERT3U(result, ==, sizeof (ctl));
1135	}
1136#endif
1137}
1138#endif /* illumos */
1139
1140void
1141arc_buf_thaw(arc_buf_t *buf)
1142{
1143	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1144		if (buf->b_hdr->b_state != arc_anon)
1145			panic("modifying non-anon buffer!");
1146		if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1147			panic("modifying buffer while i/o in progress!");
1148		arc_cksum_verify(buf);
1149	}
1150
1151	mutex_enter(&buf->b_hdr->b_freeze_lock);
1152	if (buf->b_hdr->b_freeze_cksum != NULL) {
1153		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1154		buf->b_hdr->b_freeze_cksum = NULL;
1155	}
1156
1157	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1158		if (buf->b_hdr->b_thawed)
1159			kmem_free(buf->b_hdr->b_thawed, 1);
1160		buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1161	}
1162
1163	mutex_exit(&buf->b_hdr->b_freeze_lock);
1164
1165#ifdef illumos
1166	arc_buf_unwatch(buf);
1167#endif /* illumos */
1168}
1169
1170void
1171arc_buf_freeze(arc_buf_t *buf)
1172{
1173	kmutex_t *hash_lock;
1174
1175	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1176		return;
1177
1178	hash_lock = HDR_LOCK(buf->b_hdr);
1179	mutex_enter(hash_lock);
1180
1181	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1182	    buf->b_hdr->b_state == arc_anon);
1183	arc_cksum_compute(buf, B_FALSE);
1184	mutex_exit(hash_lock);
1185
1186}
1187
1188static void
1189get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock)
1190{
1191	uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth);
1192
1193	if (ab->b_type == ARC_BUFC_METADATA)
1194		buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
1195	else {
1196		buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
1197		buf_hashid += ARC_BUFC_NUMMETADATALISTS;
1198	}
1199
1200	*list = &state->arcs_lists[buf_hashid];
1201	*lock = ARCS_LOCK(state, buf_hashid);
1202}
1203
1204
1205static void
1206add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1207{
1208	ASSERT(MUTEX_HELD(hash_lock));
1209
1210	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1211	    (ab->b_state != arc_anon)) {
1212		uint64_t delta = ab->b_size * ab->b_datacnt;
1213		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1214		list_t *list;
1215		kmutex_t *lock;
1216
1217		get_buf_info(ab, ab->b_state, &list, &lock);
1218		ASSERT(!MUTEX_HELD(lock));
1219		mutex_enter(lock);
1220		ASSERT(list_link_active(&ab->b_arc_node));
1221		list_remove(list, ab);
1222		if (GHOST_STATE(ab->b_state)) {
1223			ASSERT0(ab->b_datacnt);
1224			ASSERT3P(ab->b_buf, ==, NULL);
1225			delta = ab->b_size;
1226		}
1227		ASSERT(delta > 0);
1228		ASSERT3U(*size, >=, delta);
1229		atomic_add_64(size, -delta);
1230		mutex_exit(lock);
1231		/* remove the prefetch flag if we get a reference */
1232		if (ab->b_flags & ARC_PREFETCH)
1233			ab->b_flags &= ~ARC_PREFETCH;
1234	}
1235}
1236
1237static int
1238remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1239{
1240	int cnt;
1241	arc_state_t *state = ab->b_state;
1242
1243	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1244	ASSERT(!GHOST_STATE(state));
1245
1246	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1247	    (state != arc_anon)) {
1248		uint64_t *size = &state->arcs_lsize[ab->b_type];
1249		list_t *list;
1250		kmutex_t *lock;
1251
1252		get_buf_info(ab, state, &list, &lock);
1253		ASSERT(!MUTEX_HELD(lock));
1254		mutex_enter(lock);
1255		ASSERT(!list_link_active(&ab->b_arc_node));
1256		list_insert_head(list, ab);
1257		ASSERT(ab->b_datacnt > 0);
1258		atomic_add_64(size, ab->b_size * ab->b_datacnt);
1259		mutex_exit(lock);
1260	}
1261	return (cnt);
1262}
1263
1264/*
1265 * Move the supplied buffer to the indicated state.  The mutex
1266 * for the buffer must be held by the caller.
1267 */
1268static void
1269arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1270{
1271	arc_state_t *old_state = ab->b_state;
1272	int64_t refcnt = refcount_count(&ab->b_refcnt);
1273	uint64_t from_delta, to_delta;
1274	list_t *list;
1275	kmutex_t *lock;
1276
1277	ASSERT(MUTEX_HELD(hash_lock));
1278	ASSERT(new_state != old_state);
1279	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1280	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1281	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1282
1283	from_delta = to_delta = ab->b_datacnt * ab->b_size;
1284
1285	/*
1286	 * If this buffer is evictable, transfer it from the
1287	 * old state list to the new state list.
1288	 */
1289	if (refcnt == 0) {
1290		if (old_state != arc_anon) {
1291			int use_mutex;
1292			uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1293
1294			get_buf_info(ab, old_state, &list, &lock);
1295			use_mutex = !MUTEX_HELD(lock);
1296			if (use_mutex)
1297				mutex_enter(lock);
1298
1299			ASSERT(list_link_active(&ab->b_arc_node));
1300			list_remove(list, ab);
1301
1302			/*
1303			 * If prefetching out of the ghost cache,
1304			 * we will have a non-zero datacnt.
1305			 */
1306			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1307				/* ghost elements have a ghost size */
1308				ASSERT(ab->b_buf == NULL);
1309				from_delta = ab->b_size;
1310			}
1311			ASSERT3U(*size, >=, from_delta);
1312			atomic_add_64(size, -from_delta);
1313
1314			if (use_mutex)
1315				mutex_exit(lock);
1316		}
1317		if (new_state != arc_anon) {
1318			int use_mutex;
1319			uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1320
1321			get_buf_info(ab, new_state, &list, &lock);
1322			use_mutex = !MUTEX_HELD(lock);
1323			if (use_mutex)
1324				mutex_enter(lock);
1325
1326			list_insert_head(list, ab);
1327
1328			/* ghost elements have a ghost size */
1329			if (GHOST_STATE(new_state)) {
1330				ASSERT(ab->b_datacnt == 0);
1331				ASSERT(ab->b_buf == NULL);
1332				to_delta = ab->b_size;
1333			}
1334			atomic_add_64(size, to_delta);
1335
1336			if (use_mutex)
1337				mutex_exit(lock);
1338		}
1339	}
1340
1341	ASSERT(!BUF_EMPTY(ab));
1342	if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1343		buf_hash_remove(ab);
1344
1345	/* adjust state sizes */
1346	if (to_delta)
1347		atomic_add_64(&new_state->arcs_size, to_delta);
1348	if (from_delta) {
1349		ASSERT3U(old_state->arcs_size, >=, from_delta);
1350		atomic_add_64(&old_state->arcs_size, -from_delta);
1351	}
1352	ab->b_state = new_state;
1353
1354	/* adjust l2arc hdr stats */
1355	if (new_state == arc_l2c_only)
1356		l2arc_hdr_stat_add();
1357	else if (old_state == arc_l2c_only)
1358		l2arc_hdr_stat_remove();
1359}
1360
1361void
1362arc_space_consume(uint64_t space, arc_space_type_t type)
1363{
1364	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1365
1366	switch (type) {
1367	case ARC_SPACE_DATA:
1368		ARCSTAT_INCR(arcstat_data_size, space);
1369		break;
1370	case ARC_SPACE_OTHER:
1371		ARCSTAT_INCR(arcstat_other_size, space);
1372		break;
1373	case ARC_SPACE_HDRS:
1374		ARCSTAT_INCR(arcstat_hdr_size, space);
1375		break;
1376	case ARC_SPACE_L2HDRS:
1377		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1378		break;
1379	}
1380
1381	atomic_add_64(&arc_meta_used, space);
1382	atomic_add_64(&arc_size, space);
1383}
1384
1385void
1386arc_space_return(uint64_t space, arc_space_type_t type)
1387{
1388	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1389
1390	switch (type) {
1391	case ARC_SPACE_DATA:
1392		ARCSTAT_INCR(arcstat_data_size, -space);
1393		break;
1394	case ARC_SPACE_OTHER:
1395		ARCSTAT_INCR(arcstat_other_size, -space);
1396		break;
1397	case ARC_SPACE_HDRS:
1398		ARCSTAT_INCR(arcstat_hdr_size, -space);
1399		break;
1400	case ARC_SPACE_L2HDRS:
1401		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1402		break;
1403	}
1404
1405	ASSERT(arc_meta_used >= space);
1406	if (arc_meta_max < arc_meta_used)
1407		arc_meta_max = arc_meta_used;
1408	atomic_add_64(&arc_meta_used, -space);
1409	ASSERT(arc_size >= space);
1410	atomic_add_64(&arc_size, -space);
1411}
1412
1413void *
1414arc_data_buf_alloc(uint64_t size)
1415{
1416	if (arc_evict_needed(ARC_BUFC_DATA))
1417		cv_signal(&arc_reclaim_thr_cv);
1418	atomic_add_64(&arc_size, size);
1419	return (zio_data_buf_alloc(size));
1420}
1421
1422void
1423arc_data_buf_free(void *buf, uint64_t size)
1424{
1425	zio_data_buf_free(buf, size);
1426	ASSERT(arc_size >= size);
1427	atomic_add_64(&arc_size, -size);
1428}
1429
1430arc_buf_t *
1431arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1432{
1433	arc_buf_hdr_t *hdr;
1434	arc_buf_t *buf;
1435
1436	ASSERT3U(size, >, 0);
1437	hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1438	ASSERT(BUF_EMPTY(hdr));
1439	hdr->b_size = size;
1440	hdr->b_type = type;
1441	hdr->b_spa = spa_load_guid(spa);
1442	hdr->b_state = arc_anon;
1443	hdr->b_arc_access = 0;
1444	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1445	buf->b_hdr = hdr;
1446	buf->b_data = NULL;
1447	buf->b_efunc = NULL;
1448	buf->b_private = NULL;
1449	buf->b_next = NULL;
1450	hdr->b_buf = buf;
1451	arc_get_data_buf(buf);
1452	hdr->b_datacnt = 1;
1453	hdr->b_flags = 0;
1454	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1455	(void) refcount_add(&hdr->b_refcnt, tag);
1456
1457	return (buf);
1458}
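/*
 * Example (editorial sketch, not part of the original source): a typical
 * allocate/use/free cycle with a caller-supplied tag; "src" and "owner"
 * are hypothetical stand-ins for the caller's data and reference holder:
 *
 *	arc_buf_t *buf = arc_buf_alloc(spa, size, owner, ARC_BUFC_DATA);
 *	bcopy(src, buf->b_data, size);
 *	...
 *	arc_buf_free(buf, owner);
 *
 * The tag passed to arc_buf_free() must match the one used when the
 * reference was taken, which is what the refcount_add()/refcount_remove()
 * pairs in these functions track.
 */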
1459
1460static char *arc_onloan_tag = "onloan";
1461
1462/*
1463 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1464 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1465 * buffers must be returned to the arc before they can be used by the DMU or
1466 * freed.
1467 */
1468arc_buf_t *
1469arc_loan_buf(spa_t *spa, int size)
1470{
1471	arc_buf_t *buf;
1472
1473	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1474
1475	atomic_add_64(&arc_loaned_bytes, size);
1476	return (buf);
1477}
1478
1479/*
1480 * Return a loaned arc buffer to the arc.
1481 */
1482void
1483arc_return_buf(arc_buf_t *buf, void *tag)
1484{
1485	arc_buf_hdr_t *hdr = buf->b_hdr;
1486
1487	ASSERT(buf->b_data != NULL);
1488	(void) refcount_add(&hdr->b_refcnt, tag);
1489	(void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1490
1491	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1492}
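/*
 * Example (editorial sketch) of the loan cycle described above; "owner"
 * is a hypothetical tag for the eventual holder of the buffer:
 *
 *	arc_buf_t *buf = arc_loan_buf(spa, size);	(tagged "onloan")
 *	...fill buf->b_data...
 *	arc_return_buf(buf, owner);	(re-tag to owner, drop loan accounting)
 *
 * arc_loan_inuse_buf() below performs the inverse re-tagging when a
 * buffer already owned by a dbuf is handed out on loan.
 */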
1493
1494/* Detach an arc_buf from a dbuf (tag) */
1495void
1496arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1497{
1498	arc_buf_hdr_t *hdr;
1499
1500	ASSERT(buf->b_data != NULL);
1501	hdr = buf->b_hdr;
1502	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1503	(void) refcount_remove(&hdr->b_refcnt, tag);
1504	buf->b_efunc = NULL;
1505	buf->b_private = NULL;
1506
1507	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1508}
1509
1510static arc_buf_t *
1511arc_buf_clone(arc_buf_t *from)
1512{
1513	arc_buf_t *buf;
1514	arc_buf_hdr_t *hdr = from->b_hdr;
1515	uint64_t size = hdr->b_size;
1516
1517	ASSERT(hdr->b_state != arc_anon);
1518
1519	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1520	buf->b_hdr = hdr;
1521	buf->b_data = NULL;
1522	buf->b_efunc = NULL;
1523	buf->b_private = NULL;
1524	buf->b_next = hdr->b_buf;
1525	hdr->b_buf = buf;
1526	arc_get_data_buf(buf);
1527	bcopy(from->b_data, buf->b_data, size);
1528
1529	/*
1530	 * This buffer already exists in the arc so create a duplicate
1531	 * copy for the caller.  If the buffer is associated with user data
1532	 * then track the size and number of duplicates.  These stats will be
1533	 * updated as duplicate buffers are created and destroyed.
1534	 */
1535	if (hdr->b_type == ARC_BUFC_DATA) {
1536		ARCSTAT_BUMP(arcstat_duplicate_buffers);
1537		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1538	}
1539	hdr->b_datacnt += 1;
1540	return (buf);
1541}
1542
1543void
1544arc_buf_add_ref(arc_buf_t *buf, void* tag)
1545{
1546	arc_buf_hdr_t *hdr;
1547	kmutex_t *hash_lock;
1548
1549	/*
1550	 * Check to see if this buffer is evicted.  Callers
1551	 * must verify b_data != NULL to know if the add_ref
1552	 * was successful.
1553	 */
1554	mutex_enter(&buf->b_evict_lock);
1555	if (buf->b_data == NULL) {
1556		mutex_exit(&buf->b_evict_lock);
1557		return;
1558	}
1559	hash_lock = HDR_LOCK(buf->b_hdr);
1560	mutex_enter(hash_lock);
1561	hdr = buf->b_hdr;
1562	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1563	mutex_exit(&buf->b_evict_lock);
1564
1565	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1566	add_reference(hdr, hash_lock, tag);
1567	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1568	arc_access(hdr, hash_lock);
1569	mutex_exit(hash_lock);
1570	ARCSTAT_BUMP(arcstat_hits);
1571	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1572	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1573	    data, metadata, hits);
1574}
1575
1576/*
1577 * Free the arc data buffer.  If it is an l2arc write in progress,
1578 * the buffer is placed on l2arc_free_on_write to be freed later.
1579 */
1580static void
1581arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1582{
1583	arc_buf_hdr_t *hdr = buf->b_hdr;
1584
1585	if (HDR_L2_WRITING(hdr)) {
1586		l2arc_data_free_t *df;
1587		df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1588		df->l2df_data = buf->b_data;
1589		df->l2df_size = hdr->b_size;
1590		df->l2df_func = free_func;
1591		mutex_enter(&l2arc_free_on_write_mtx);
1592		list_insert_head(l2arc_free_on_write, df);
1593		mutex_exit(&l2arc_free_on_write_mtx);
1594		ARCSTAT_BUMP(arcstat_l2_free_on_write);
1595	} else {
1596		free_func(buf->b_data, hdr->b_size);
1597	}
1598}
1599
1600static void
1601arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1602{
1603	arc_buf_t **bufp;
1604
1605	/* free up data associated with the buf */
1606	if (buf->b_data) {
1607		arc_state_t *state = buf->b_hdr->b_state;
1608		uint64_t size = buf->b_hdr->b_size;
1609		arc_buf_contents_t type = buf->b_hdr->b_type;
1610
1611		arc_cksum_verify(buf);
1612#ifdef illumos
1613		arc_buf_unwatch(buf);
1614#endif /* illumos */
1615
1616		if (!recycle) {
1617			if (type == ARC_BUFC_METADATA) {
1618				arc_buf_data_free(buf, zio_buf_free);
1619				arc_space_return(size, ARC_SPACE_DATA);
1620			} else {
1621				ASSERT(type == ARC_BUFC_DATA);
1622				arc_buf_data_free(buf, zio_data_buf_free);
1623				ARCSTAT_INCR(arcstat_data_size, -size);
1624				atomic_add_64(&arc_size, -size);
1625			}
1626		}
1627		if (list_link_active(&buf->b_hdr->b_arc_node)) {
1628			uint64_t *cnt = &state->arcs_lsize[type];
1629
1630			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1631			ASSERT(state != arc_anon);
1632
1633			ASSERT3U(*cnt, >=, size);
1634			atomic_add_64(cnt, -size);
1635		}
1636		ASSERT3U(state->arcs_size, >=, size);
1637		atomic_add_64(&state->arcs_size, -size);
1638		buf->b_data = NULL;
1639
1640		/*
1641		 * If we're destroying a duplicate buffer make sure
1642		 * that the appropriate statistics are updated.
1643		 */
1644		if (buf->b_hdr->b_datacnt > 1 &&
1645		    buf->b_hdr->b_type == ARC_BUFC_DATA) {
1646			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1647			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1648		}
1649		ASSERT(buf->b_hdr->b_datacnt > 0);
1650		buf->b_hdr->b_datacnt -= 1;
1651	}
1652
1653	/* only remove the buf if requested */
1654	if (!all)
1655		return;
1656
1657	/* remove the buf from the hdr list */
1658	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1659		continue;
1660	*bufp = buf->b_next;
1661	buf->b_next = NULL;
1662
1663	ASSERT(buf->b_efunc == NULL);
1664
1665	/* clean up the buf */
1666	buf->b_hdr = NULL;
1667	kmem_cache_free(buf_cache, buf);
1668}
1669
1670static void
1671arc_hdr_destroy(arc_buf_hdr_t *hdr)
1672{
1673	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1674	ASSERT3P(hdr->b_state, ==, arc_anon);
1675	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1676	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1677
1678	if (l2hdr != NULL) {
1679		boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1680		/*
1681		 * To prevent arc_free() and l2arc_evict() from
1682		 * attempting to free the same buffer at the same time,
1683		 * a FREE_IN_PROGRESS flag is given to arc_free() to
1684		 * give it priority.  l2arc_evict() can't destroy this
1685		 * header while we are waiting on l2arc_buflist_mtx.
1686		 *
1687		 * The hdr may be removed from l2ad_buflist before we
1688		 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1689		 */
1690		if (!buflist_held) {
1691			mutex_enter(&l2arc_buflist_mtx);
1692			l2hdr = hdr->b_l2hdr;
1693		}
1694
1695		if (l2hdr != NULL) {
1696			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1697			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1698			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1699			if (hdr->b_state == arc_l2c_only)
1700				l2arc_hdr_stat_remove();
1701			hdr->b_l2hdr = NULL;
1702		}
1703
1704		if (!buflist_held)
1705			mutex_exit(&l2arc_buflist_mtx);
1706	}
1707
1708	if (!BUF_EMPTY(hdr)) {
1709		ASSERT(!HDR_IN_HASH_TABLE(hdr));
1710		buf_discard_identity(hdr);
1711	}
1712	while (hdr->b_buf) {
1713		arc_buf_t *buf = hdr->b_buf;
1714
1715		if (buf->b_efunc) {
1716			mutex_enter(&arc_eviction_mtx);
1717			mutex_enter(&buf->b_evict_lock);
1718			ASSERT(buf->b_hdr != NULL);
1719			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1720			hdr->b_buf = buf->b_next;
1721			buf->b_hdr = &arc_eviction_hdr;
1722			buf->b_next = arc_eviction_list;
1723			arc_eviction_list = buf;
1724			mutex_exit(&buf->b_evict_lock);
1725			mutex_exit(&arc_eviction_mtx);
1726		} else {
1727			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1728		}
1729	}
1730	if (hdr->b_freeze_cksum != NULL) {
1731		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1732		hdr->b_freeze_cksum = NULL;
1733	}
1734	if (hdr->b_thawed) {
1735		kmem_free(hdr->b_thawed, 1);
1736		hdr->b_thawed = NULL;
1737	}
1738
1739	ASSERT(!list_link_active(&hdr->b_arc_node));
1740	ASSERT3P(hdr->b_hash_next, ==, NULL);
1741	ASSERT3P(hdr->b_acb, ==, NULL);
1742	kmem_cache_free(hdr_cache, hdr);
1743}
1744
1745void
1746arc_buf_free(arc_buf_t *buf, void *tag)
1747{
1748	arc_buf_hdr_t *hdr = buf->b_hdr;
1749	int hashed = hdr->b_state != arc_anon;
1750
1751	ASSERT(buf->b_efunc == NULL);
1752	ASSERT(buf->b_data != NULL);
1753
1754	if (hashed) {
1755		kmutex_t *hash_lock = HDR_LOCK(hdr);
1756
1757		mutex_enter(hash_lock);
1758		hdr = buf->b_hdr;
1759		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1760
1761		(void) remove_reference(hdr, hash_lock, tag);
1762		if (hdr->b_datacnt > 1) {
1763			arc_buf_destroy(buf, FALSE, TRUE);
1764		} else {
1765			ASSERT(buf == hdr->b_buf);
1766			ASSERT(buf->b_efunc == NULL);
1767			hdr->b_flags |= ARC_BUF_AVAILABLE;
1768		}
1769		mutex_exit(hash_lock);
1770	} else if (HDR_IO_IN_PROGRESS(hdr)) {
1771		int destroy_hdr;
1772		/*
1773		 * We are in the middle of an async write.  Don't destroy
1774		 * this buffer unless the write completes before we finish
1775		 * decrementing the reference count.
1776		 */
1777		mutex_enter(&arc_eviction_mtx);
1778		(void) remove_reference(hdr, NULL, tag);
1779		ASSERT(refcount_is_zero(&hdr->b_refcnt));
1780		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1781		mutex_exit(&arc_eviction_mtx);
1782		if (destroy_hdr)
1783			arc_hdr_destroy(hdr);
1784	} else {
1785		if (remove_reference(hdr, NULL, tag) > 0)
1786			arc_buf_destroy(buf, FALSE, TRUE);
1787		else
1788			arc_hdr_destroy(hdr);
1789	}
1790}
1791
1792int
1793arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1794{
1795	arc_buf_hdr_t *hdr = buf->b_hdr;
1796	kmutex_t *hash_lock = HDR_LOCK(hdr);
1797	int no_callback = (buf->b_efunc == NULL);
1798
1799	if (hdr->b_state == arc_anon) {
1800		ASSERT(hdr->b_datacnt == 1);
1801		arc_buf_free(buf, tag);
1802		return (no_callback);
1803	}
1804
1805	mutex_enter(hash_lock);
1806	hdr = buf->b_hdr;
1807	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1808	ASSERT(hdr->b_state != arc_anon);
1809	ASSERT(buf->b_data != NULL);
1810
1811	(void) remove_reference(hdr, hash_lock, tag);
1812	if (hdr->b_datacnt > 1) {
1813		if (no_callback)
1814			arc_buf_destroy(buf, FALSE, TRUE);
1815	} else if (no_callback) {
1816		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1817		ASSERT(buf->b_efunc == NULL);
1818		hdr->b_flags |= ARC_BUF_AVAILABLE;
1819	}
1820	ASSERT(no_callback || hdr->b_datacnt > 1 ||
1821	    refcount_is_zero(&hdr->b_refcnt));
1822	mutex_exit(hash_lock);
1823	return (no_callback);
1824}
1825
1826int
1827arc_buf_size(arc_buf_t *buf)
1828{
1829	return (buf->b_hdr->b_size);
1830}
1831
1832/*
1833 * Called from the DMU to determine if the current buffer should be
1834 * evicted. In order to ensure proper locking, the eviction must be initiated
1835 * from the DMU. Return true if the buffer is associated with user data and
1836 * duplicate buffers still exist.
1837 */
1838boolean_t
1839arc_buf_eviction_needed(arc_buf_t *buf)
1840{
1841	arc_buf_hdr_t *hdr;
1842	boolean_t evict_needed = B_FALSE;
1843
1844	if (zfs_disable_dup_eviction)
1845		return (B_FALSE);
1846
1847	mutex_enter(&buf->b_evict_lock);
1848	hdr = buf->b_hdr;
1849	if (hdr == NULL) {
1850		/*
1851		 * We are in arc_do_user_evicts(); let that function
1852		 * perform the eviction.
1853		 */
1854		ASSERT(buf->b_data == NULL);
1855		mutex_exit(&buf->b_evict_lock);
1856		return (B_FALSE);
1857	} else if (buf->b_data == NULL) {
1858		/*
1859		 * We have already been added to the arc eviction list;
1860		 * recommend eviction.
1861		 */
1862		ASSERT3P(hdr, ==, &arc_eviction_hdr);
1863		mutex_exit(&buf->b_evict_lock);
1864		return (B_TRUE);
1865	}
1866
1867	if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1868		evict_needed = B_TRUE;
1869
1870	mutex_exit(&buf->b_evict_lock);
1871	return (evict_needed);
1872}
1873
1874/*
1875 * Evict buffers from list until we've removed the specified number of
1876 * bytes.  Move the removed buffers to the appropriate evict state.
1877 * If the recycle flag is set, then attempt to "recycle" a buffer:
1878 * - look for a buffer to evict that is `bytes' long.
1879 * - return the data block from this buffer rather than freeing it.
1880 * This flag is used by callers that are trying to make space for a
1881 * new buffer in a full arc cache.
1882 *
1883 * This function makes a "best effort".  It skips over any buffers
1884 * it can't get a hash_lock on, and so may not catch all candidates.
1885 * It may also return without evicting as much space as requested.
1886 */
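/*
 * A condensed sketch of the recycle path as seen from a caller (compare
 * arc_get_data_buf(), later in this file, which is where the flag is set
 * in practice; `size' and `type' stand for the new buffer's size and
 * contents type, and the data branch is shown):
 *
 *	buf->b_data = arc_evict(state, 0, size, TRUE, type);
 *	if (buf->b_data == NULL) {
 *		buf->b_data = zio_data_buf_alloc(size);
 *		ARCSTAT_BUMP(arcstat_recycle_miss);
 *	}
 */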
1887static void *
1888arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1889    arc_buf_contents_t type)
1890{
1891	arc_state_t *evicted_state;
1892	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1893	int64_t bytes_remaining;
1894	arc_buf_hdr_t *ab, *ab_prev = NULL;
1895	list_t *evicted_list, *list, *evicted_list_start, *list_start;
1896	kmutex_t *lock, *evicted_lock;
1897	kmutex_t *hash_lock;
1898	boolean_t have_lock;
1899	void *stolen = NULL;
1900	static int evict_metadata_offset, evict_data_offset;
1901	int i, idx, offset, list_count, count;
1902
1903	ASSERT(state == arc_mru || state == arc_mfu);
1904
1905	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1906
1907	if (type == ARC_BUFC_METADATA) {
1908		offset = 0;
1909		list_count = ARC_BUFC_NUMMETADATALISTS;
1910		list_start = &state->arcs_lists[0];
1911		evicted_list_start = &evicted_state->arcs_lists[0];
1912		idx = evict_metadata_offset;
1913	} else {
1914		offset = ARC_BUFC_NUMMETADATALISTS;
1915		list_start = &state->arcs_lists[offset];
1916		evicted_list_start = &evicted_state->arcs_lists[offset];
1917		list_count = ARC_BUFC_NUMDATALISTS;
1918		idx = evict_data_offset;
1919	}
1920	bytes_remaining = evicted_state->arcs_lsize[type];
1921	count = 0;
1922
1923evict_start:
1924	list = &list_start[idx];
1925	evicted_list = &evicted_list_start[idx];
1926	lock = ARCS_LOCK(state, (offset + idx));
1927	evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
1928
1929	mutex_enter(lock);
1930	mutex_enter(evicted_lock);
1931
1932	for (ab = list_tail(list); ab; ab = ab_prev) {
1933		ab_prev = list_prev(list, ab);
1934		bytes_remaining -= (ab->b_size * ab->b_datacnt);
1935		/* prefetch buffers have a minimum lifespan */
1936		if (HDR_IO_IN_PROGRESS(ab) ||
1937		    (spa && ab->b_spa != spa) ||
1938		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1939		    ddi_get_lbolt() - ab->b_arc_access <
1940		    arc_min_prefetch_lifespan)) {
1941			skipped++;
1942			continue;
1943		}
1944		/* "lookahead" for better eviction candidate */
1945		if (recycle && ab->b_size != bytes &&
1946		    ab_prev && ab_prev->b_size == bytes)
1947			continue;
1948		hash_lock = HDR_LOCK(ab);
1949		have_lock = MUTEX_HELD(hash_lock);
1950		if (have_lock || mutex_tryenter(hash_lock)) {
1951			ASSERT0(refcount_count(&ab->b_refcnt));
1952			ASSERT(ab->b_datacnt > 0);
1953			while (ab->b_buf) {
1954				arc_buf_t *buf = ab->b_buf;
1955				if (!mutex_tryenter(&buf->b_evict_lock)) {
1956					missed += 1;
1957					break;
1958				}
1959				if (buf->b_data) {
1960					bytes_evicted += ab->b_size;
1961					if (recycle && ab->b_type == type &&
1962					    ab->b_size == bytes &&
1963					    !HDR_L2_WRITING(ab)) {
1964						stolen = buf->b_data;
1965						recycle = FALSE;
1966					}
1967				}
1968				if (buf->b_efunc) {
1969					mutex_enter(&arc_eviction_mtx);
1970					arc_buf_destroy(buf,
1971					    buf->b_data == stolen, FALSE);
1972					ab->b_buf = buf->b_next;
1973					buf->b_hdr = &arc_eviction_hdr;
1974					buf->b_next = arc_eviction_list;
1975					arc_eviction_list = buf;
1976					mutex_exit(&arc_eviction_mtx);
1977					mutex_exit(&buf->b_evict_lock);
1978				} else {
1979					mutex_exit(&buf->b_evict_lock);
1980					arc_buf_destroy(buf,
1981					    buf->b_data == stolen, TRUE);
1982				}
1983			}
1984
1985			if (ab->b_l2hdr) {
1986				ARCSTAT_INCR(arcstat_evict_l2_cached,
1987				    ab->b_size);
1988			} else {
1989				if (l2arc_write_eligible(ab->b_spa, ab)) {
1990					ARCSTAT_INCR(arcstat_evict_l2_eligible,
1991					    ab->b_size);
1992				} else {
1993					ARCSTAT_INCR(
1994					    arcstat_evict_l2_ineligible,
1995					    ab->b_size);
1996				}
1997			}
1998
1999			if (ab->b_datacnt == 0) {
2000				arc_change_state(evicted_state, ab, hash_lock);
2001				ASSERT(HDR_IN_HASH_TABLE(ab));
2002				ab->b_flags |= ARC_IN_HASH_TABLE;
2003				ab->b_flags &= ~ARC_BUF_AVAILABLE;
2004				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
2005			}
2006			if (!have_lock)
2007				mutex_exit(hash_lock);
2008			if (bytes >= 0 && bytes_evicted >= bytes)
2009				break;
2010			if (bytes_remaining > 0) {
2011				mutex_exit(evicted_lock);
2012				mutex_exit(lock);
2013				idx  = ((idx + 1) & (list_count - 1));
2014				count++;
2015				goto evict_start;
2016			}
2017		} else {
2018			missed += 1;
2019		}
2020	}
2021
2022	mutex_exit(evicted_lock);
2023	mutex_exit(lock);
2024
2025	idx  = ((idx + 1) & (list_count - 1));
2026	count++;
2027
2028	if (bytes_evicted < bytes) {
2029		if (count < list_count)
2030			goto evict_start;
2031		else
2032			dprintf("only evicted %lld bytes from %p",
2033			    (longlong_t)bytes_evicted, state);
2034	}
2035	if (type == ARC_BUFC_METADATA)
2036		evict_metadata_offset = idx;
2037	else
2038		evict_data_offset = idx;
2039
2040	if (skipped)
2041		ARCSTAT_INCR(arcstat_evict_skip, skipped);
2042
2043	if (missed)
2044		ARCSTAT_INCR(arcstat_mutex_miss, missed);
2045
2046	/*
2047	 * We have just evicted some data into the ghost state; make
2048	 * sure we also adjust the ghost state size if necessary.
2049	 */
2050	if (arc_no_grow &&
2051	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
2052		int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
2053		    arc_mru_ghost->arcs_size - arc_c;
2054
2055		if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
2056			int64_t todelete =
2057			    MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
2058			arc_evict_ghost(arc_mru_ghost, 0, todelete);
2059		} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
2060			int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
2061			    arc_mru_ghost->arcs_size +
2062			    arc_mfu_ghost->arcs_size - arc_c);
2063			arc_evict_ghost(arc_mfu_ghost, 0, todelete);
2064		}
2065	}
2066	if (stolen)
2067		ARCSTAT_BUMP(arcstat_stolen);
2068
2069	return (stolen);
2070}
2071
2072/*
2073 * Remove buffers from list until we've removed the specified number of
2074 * bytes.  Destroy the buffers that are removed.
2075 */
2076static void
2077arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
2078{
2079	arc_buf_hdr_t *ab, *ab_prev;
2080	arc_buf_hdr_t marker = { 0 };
2081	list_t *list, *list_start;
2082	kmutex_t *hash_lock, *lock;
2083	uint64_t bytes_deleted = 0;
2084	uint64_t bufs_skipped = 0;
2085	static int evict_offset;
2086	int list_count, idx = evict_offset;
2087	int offset, count = 0;
2088
2089	ASSERT(GHOST_STATE(state));
2090
2091	/*
2092	 * data lists come after metadata lists
2093	 */
2094	list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
2095	list_count = ARC_BUFC_NUMDATALISTS;
2096	offset = ARC_BUFC_NUMMETADATALISTS;
2097
2098evict_start:
2099	list = &list_start[idx];
2100	lock = ARCS_LOCK(state, idx + offset);
2101
2102	mutex_enter(lock);
2103	for (ab = list_tail(list); ab; ab = ab_prev) {
2104		ab_prev = list_prev(list, ab);
2105		if (spa && ab->b_spa != spa)
2106			continue;
2107
2108		/* ignore markers */
2109		if (ab->b_spa == 0)
2110			continue;
2111
2112		hash_lock = HDR_LOCK(ab);
2113		/* caller may be trying to modify this buffer, skip it */
2114		if (MUTEX_HELD(hash_lock))
2115			continue;
2116		if (mutex_tryenter(hash_lock)) {
2117			ASSERT(!HDR_IO_IN_PROGRESS(ab));
2118			ASSERT(ab->b_buf == NULL);
2119			ARCSTAT_BUMP(arcstat_deleted);
2120			bytes_deleted += ab->b_size;
2121
2122			if (ab->b_l2hdr != NULL) {
2123				/*
2124				 * This buffer is cached on the 2nd Level ARC;
2125				 * don't destroy the header.
2126				 */
2127				arc_change_state(arc_l2c_only, ab, hash_lock);
2128				mutex_exit(hash_lock);
2129			} else {
2130				arc_change_state(arc_anon, ab, hash_lock);
2131				mutex_exit(hash_lock);
2132				arc_hdr_destroy(ab);
2133			}
2134
2135			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2136			if (bytes >= 0 && bytes_deleted >= bytes)
2137				break;
2138		} else if (bytes < 0) {
2139			/*
2140			 * Insert a list marker and then wait for the
2141			 * hash lock to become available. Once it's
2142			 * available, restart from where we left off.
2143			 */
2144			list_insert_after(list, ab, &marker);
2145			mutex_exit(lock);
2146			mutex_enter(hash_lock);
2147			mutex_exit(hash_lock);
2148			mutex_enter(lock);
2149			ab_prev = list_prev(list, &marker);
2150			list_remove(list, &marker);
2151		} else
2152			bufs_skipped += 1;
2153	}
2154	mutex_exit(lock);
2155	idx  = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
2156	count++;
2157
2158	if (count < list_count)
2159		goto evict_start;
2160
2161	evict_offset = idx;
2162	if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
2163	    (bytes < 0 || bytes_deleted < bytes)) {
2164		list_start = &state->arcs_lists[0];
2165		list_count = ARC_BUFC_NUMMETADATALISTS;
2166		offset = count = 0;
2167		goto evict_start;
2168	}
2169
2170	if (bufs_skipped) {
2171		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2172		ASSERT(bytes >= 0);
2173	}
2174
2175	if (bytes_deleted < bytes)
2176		dprintf("only deleted %lld bytes from %p",
2177		    (longlong_t)bytes_deleted, state);
2178}
2179
2180static void
2181arc_adjust(void)
2182{
2183	int64_t adjustment, delta;
2184
2185	/*
2186	 * Adjust MRU size
2187	 */
2188
2189	adjustment = MIN((int64_t)(arc_size - arc_c),
2190	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2191	    arc_p));
2192
2193	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2194		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2195		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
2196		adjustment -= delta;
2197	}
2198
2199	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2200		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2201		(void) arc_evict(arc_mru, 0, delta, FALSE,
2202		    ARC_BUFC_METADATA);
2203	}
2204
2205	/*
2206	 * Adjust MFU size
2207	 */
2208
2209	adjustment = arc_size - arc_c;
2210
2211	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2212		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2213		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2214		adjustment -= delta;
2215	}
2216
2217	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2218		int64_t delta = MIN(adjustment,
2219		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2220		(void) arc_evict(arc_mfu, 0, delta, FALSE,
2221		    ARC_BUFC_METADATA);
2222	}
2223
2224	/*
2225	 * Adjust ghost lists
2226	 */
2227
2228	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2229
2230	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2231		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2232		arc_evict_ghost(arc_mru_ghost, 0, delta);
2233	}
2234
2235	adjustment =
2236	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2237
2238	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2239		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2240		arc_evict_ghost(arc_mfu_ghost, 0, delta);
2241	}
2242}
2243
2244static void
2245arc_do_user_evicts(void)
2246{
2247	static arc_buf_t *tmp_arc_eviction_list;
2248
2249	/*
2250	 * Move the list over to avoid a lock order reversal (LOR)
2251	 */
2252restart:
2253	mutex_enter(&arc_eviction_mtx);
2254	tmp_arc_eviction_list = arc_eviction_list;
2255	arc_eviction_list = NULL;
2256	mutex_exit(&arc_eviction_mtx);
2257
2258	while (tmp_arc_eviction_list != NULL) {
2259		arc_buf_t *buf = tmp_arc_eviction_list;
2260		tmp_arc_eviction_list = buf->b_next;
2261		mutex_enter(&buf->b_evict_lock);
2262		buf->b_hdr = NULL;
2263		mutex_exit(&buf->b_evict_lock);
2264
2265		if (buf->b_efunc != NULL)
2266			VERIFY(buf->b_efunc(buf) == 0);
2267
2268		buf->b_efunc = NULL;
2269		buf->b_private = NULL;
2270		kmem_cache_free(buf_cache, buf);
2271	}
2272
2273	if (arc_eviction_list != NULL)
2274		goto restart;
2275}
2276
2277/*
2278 * Flush all *evictable* data from the cache for the given spa.
2279 * NOTE: this will not touch "active" (i.e. referenced) data.
2280 */
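/*
 * Typical invocations, as a sketch (the pool-teardown caller lives outside
 * this file and is an assumption here):
 *
 *	arc_flush(spa);		drop this pool's evictable buffers, e.g.
 *				while the pool is being unloaded
 *	arc_flush(NULL);	drop evictable buffers for every pool
 */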
2281void
2282arc_flush(spa_t *spa)
2283{
2284	uint64_t guid = 0;
2285
2286	if (spa)
2287		guid = spa_load_guid(spa);
2288
2289	while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
2290		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2291		if (spa)
2292			break;
2293	}
2294	while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
2295		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2296		if (spa)
2297			break;
2298	}
2299	while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
2300		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2301		if (spa)
2302			break;
2303	}
2304	while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
2305		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2306		if (spa)
2307			break;
2308	}
2309
2310	arc_evict_ghost(arc_mru_ghost, guid, -1);
2311	arc_evict_ghost(arc_mfu_ghost, guid, -1);
2312
2313	mutex_enter(&arc_reclaim_thr_lock);
2314	arc_do_user_evicts();
2315	mutex_exit(&arc_reclaim_thr_lock);
2316	ASSERT(spa || arc_eviction_list == NULL);
2317}
2318
2319void
2320arc_shrink(void)
2321{
2322	if (arc_c > arc_c_min) {
2323		uint64_t to_free;
2324
2325#ifdef _KERNEL
2326		to_free = arc_c >> arc_shrink_shift;
2327#else
2328		to_free = arc_c >> arc_shrink_shift;
2329#endif
2330		if (arc_c > arc_c_min + to_free)
2331			atomic_add_64(&arc_c, -to_free);
2332		else
2333			arc_c = arc_c_min;
2334
2335		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2336		if (arc_c > arc_size)
2337			arc_c = MAX(arc_size, arc_c_min);
2338		if (arc_p > arc_c)
2339			arc_p = (arc_c >> 1);
2340		ASSERT(arc_c >= arc_c_min);
2341		ASSERT((int64_t)arc_p >= 0);
2342	}
2343
2344	if (arc_size > arc_c)
2345		arc_adjust();
2346}
2347
2348static int needfree = 0;
2349
2350static int
2351arc_reclaim_needed(void)
2352{
2353
2354#ifdef _KERNEL
2355
2356	if (needfree)
2357		return (1);
2358
2359	/*
2360	 * Cooperate with pagedaemon when it's time for it to scan
2361	 * and reclaim some pages.
2362	 */
2363	if (vm_paging_needed())
2364		return (1);
2365
2366#ifdef sun
2367	/*
2368	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2369	 */
2370	extra = desfree;
2371
2372	/*
2373	 * check that we're out of range of the pageout scanner.  It starts to
2374	 * schedule paging if freemem is less than lotsfree and needfree.
2375	 * lotsfree is the high-water mark for pageout, and needfree is the
2376	 * number of needed free pages.  We add extra pages here to make sure
2377	 * the scanner doesn't start up while we're freeing memory.
2378	 */
2379	if (freemem < lotsfree + needfree + extra)
2380		return (1);
2381
2382	/*
2383	 * check to make sure that swapfs has enough space so that anon
2384	 * reservations can still succeed. anon_resvmem() checks that the
2385	 * availrmem is greater than swapfs_minfree, and the number of reserved
2386	 * swap pages.  We also add a bit of extra here just to prevent
2387	 * circumstances from getting really dire.
2388	 */
2389	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2390		return (1);
2391
2392#if defined(__i386)
2393	/*
2394	 * If we're on an i386 platform, it's possible that we'll exhaust the
2395	 * kernel heap space before we ever run out of available physical
2396	 * memory.  Most checks of the size of the heap_area compare against
2397	 * tune.t_minarmem, which is the minimum available real memory that we
2398	 * can have in the system.  However, this is generally fixed at 25 pages
2399	 * which is so low that it's useless.  In this comparison, we seek to
2400	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2401	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
2402	 * free)
2403	 */
2404	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
2405	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
2406		return (1);
2407#endif
2408#else	/* !sun */
2409	if (kmem_used() > (kmem_size() * 3) / 4)
2410		return (1);
2411#endif	/* sun */
2412
2413#else
2414	if (spa_get_random(100) == 0)
2415		return (1);
2416#endif
2417	return (0);
2418}
2419
2420extern kmem_cache_t	*zio_buf_cache[];
2421extern kmem_cache_t	*zio_data_buf_cache[];
2422
2423static void
2424arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2425{
2426	size_t			i;
2427	kmem_cache_t		*prev_cache = NULL;
2428	kmem_cache_t		*prev_data_cache = NULL;
2429
2430#ifdef _KERNEL
2431	if (arc_meta_used >= arc_meta_limit) {
2432		/*
2433		 * We are exceeding our meta-data cache limit.
2434		 * Purge some DNLC entries to release holds on meta-data.
2435		 */
2436		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2437	}
2438#if defined(__i386)
2439	/*
2440	 * Reclaim unused memory from all kmem caches.
2441	 */
2442	kmem_reap();
2443#endif
2444#endif
2445
2446	/*
2447	 * An aggressive reclamation will shrink the cache size as well as
2448	 * reap free buffers from the arc kmem caches.
2449	 */
2450	if (strat == ARC_RECLAIM_AGGR)
2451		arc_shrink();
2452
2453	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2454		if (zio_buf_cache[i] != prev_cache) {
2455			prev_cache = zio_buf_cache[i];
2456			kmem_cache_reap_now(zio_buf_cache[i]);
2457		}
2458		if (zio_data_buf_cache[i] != prev_data_cache) {
2459			prev_data_cache = zio_data_buf_cache[i];
2460			kmem_cache_reap_now(zio_data_buf_cache[i]);
2461		}
2462	}
2463	kmem_cache_reap_now(buf_cache);
2464	kmem_cache_reap_now(hdr_cache);
2465}
2466
2467static void
2468arc_reclaim_thread(void *dummy __unused)
2469{
2470	clock_t			growtime = 0;
2471	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
2472	callb_cpr_t		cpr;
2473
2474	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2475
2476	mutex_enter(&arc_reclaim_thr_lock);
2477	while (arc_thread_exit == 0) {
2478		if (arc_reclaim_needed()) {
2479
2480			if (arc_no_grow) {
2481				if (last_reclaim == ARC_RECLAIM_CONS) {
2482					last_reclaim = ARC_RECLAIM_AGGR;
2483				} else {
2484					last_reclaim = ARC_RECLAIM_CONS;
2485				}
2486			} else {
2487				arc_no_grow = TRUE;
2488				last_reclaim = ARC_RECLAIM_AGGR;
2489				membar_producer();
2490			}
2491
2492			/* reset the growth delay for every reclaim */
2493			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2494
2495			if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
2496				/*
2497				 * If needfree is TRUE, our vm_lowmem hook
2498				 * was called; in that case we must free some
2499				 * memory, so switch to aggressive mode.
2500				 */
2501				arc_no_grow = TRUE;
2502				last_reclaim = ARC_RECLAIM_AGGR;
2503			}
2504			arc_kmem_reap_now(last_reclaim);
2505			arc_warm = B_TRUE;
2506
2507		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2508			arc_no_grow = FALSE;
2509		}
2510
2511		arc_adjust();
2512
2513		if (arc_eviction_list != NULL)
2514			arc_do_user_evicts();
2515
2516#ifdef _KERNEL
2517		if (needfree) {
2518			needfree = 0;
2519			wakeup(&needfree);
2520		}
2521#endif
2522
2523		/* block until needed, or one second, whichever is shorter */
2524		CALLB_CPR_SAFE_BEGIN(&cpr);
2525		(void) cv_timedwait(&arc_reclaim_thr_cv,
2526		    &arc_reclaim_thr_lock, hz);
2527		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2528	}
2529
2530	arc_thread_exit = 0;
2531	cv_broadcast(&arc_reclaim_thr_cv);
2532	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
2533	thread_exit();
2534}
2535
2536/*
2537 * Adapt arc info given the number of bytes we are trying to add and
2538 * the state that we are coming from.  This function is only called
2539 * when we are adding new content to the cache.
2540 */
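/*
 * A worked example of the MRU-ghost branch below, assuming arc_mru_ghost
 * currently holds 1GB, arc_mfu_ghost holds 4GB, and a 128KB buffer is
 * being added (the sizes are illustrative only):
 *
 *	mult  = 4GB / 1GB = 4			(capped at 10)
 *	arc_p = MIN(arc_c - arc_p_min, arc_p + 128KB * 4)
 *
 * i.e. the MRU target grows by 512KB, but never beyond the point where
 * less than arc_c >> arc_p_min_shift bytes would remain for the MFU side.
 */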
2541static void
2542arc_adapt(int bytes, arc_state_t *state)
2543{
2544	int mult;
2545	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2546
2547	if (state == arc_l2c_only)
2548		return;
2549
2550	ASSERT(bytes > 0);
2551	/*
2552	 * Adapt the target size of the MRU list:
2553	 *	- if we just hit in the MRU ghost list, then increase
2554	 *	  the target size of the MRU list.
2555	 *	- if we just hit in the MFU ghost list, then increase
2556	 *	  the target size of the MFU list by decreasing the
2557	 *	  target size of the MRU list.
2558	 */
2559	if (state == arc_mru_ghost) {
2560		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2561		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2562		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2563
2564		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2565	} else if (state == arc_mfu_ghost) {
2566		uint64_t delta;
2567
2568		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2569		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2570		mult = MIN(mult, 10);
2571
2572		delta = MIN(bytes * mult, arc_p);
2573		arc_p = MAX(arc_p_min, arc_p - delta);
2574	}
2575	ASSERT((int64_t)arc_p >= 0);
2576
2577	if (arc_reclaim_needed()) {
2578		cv_signal(&arc_reclaim_thr_cv);
2579		return;
2580	}
2581
2582	if (arc_no_grow)
2583		return;
2584
2585	if (arc_c >= arc_c_max)
2586		return;
2587
2588	/*
2589	 * If we're within (2 * maxblocksize) bytes of the target
2590	 * cache size, increment the target cache size
2591	 */
2592	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2593		atomic_add_64(&arc_c, (int64_t)bytes);
2594		if (arc_c > arc_c_max)
2595			arc_c = arc_c_max;
2596		else if (state == arc_anon)
2597			atomic_add_64(&arc_p, (int64_t)bytes);
2598		if (arc_p > arc_c)
2599			arc_p = arc_c;
2600	}
2601	ASSERT((int64_t)arc_p >= 0);
2602}
2603
2604/*
2605 * Check if the cache has reached its limits and eviction is required
2606 * prior to inserting a new buffer.
2607 */
2608static int
2609arc_evict_needed(arc_buf_contents_t type)
2610{
2611	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2612		return (1);
2613
2614#ifdef sun
2615#ifdef _KERNEL
2616	/*
2617	 * If zio data pages are being allocated out of a separate heap segment,
2618	 * then enforce that the size of available vmem for this area remains
2619	 * above about 1/32nd free.
2620	 */
2621	if (type == ARC_BUFC_DATA && zio_arena != NULL &&
2622	    vmem_size(zio_arena, VMEM_FREE) <
2623	    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
2624		return (1);
2625#endif
2626#endif	/* sun */
2627
2628	if (arc_reclaim_needed())
2629		return (1);
2630
2631	return (arc_size > arc_c);
2632}
2633
2634/*
2635 * The buffer, supplied as the first argument, needs a data block.
2636 * So, if we are at cache max, determine which cache should be victimized.
2637 * We have the following cases:
2638 *
2639 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2640 * In this situation if we're out of space, but the resident size of the MFU is
2641 * under the limit, victimize the MFU cache to satisfy this insertion request.
2642 *
2643 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2644 * Here, we've used up all of the available space for the MRU, so we need to
2645 * evict from our own cache instead.  Evict from the set of resident MRU
2646 * entries.
2647 *
2648 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2649 * c minus p represents the MFU space in the cache, since p is the size of the
2650 * cache that is dedicated to the MRU.  In this situation there's still space on
2651 * the MFU side, so the MRU side needs to be victimized.
2652 *
2653 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2654 * MFU's resident set is consuming more space than it has been allotted.  In
2655 * this situation, we must victimize our own cache, the MFU, for this insertion.
2656 */
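/*
 * Condensed, the victim selection in the body below (after the ghost-state
 * remapping it performs first) reduces to:
 *
 *	insert for MRU/anon:
 *		victim = (MFU has >= size evictable of this type &&
 *		    arc_p > anon + mru) ? arc_mfu : arc_mru;	cases 1, 2
 *	insert for MFU:
 *		victim = (MRU has >= size evictable of this type &&
 *		    (arc_c - arc_p) > mfu) ? arc_mru : arc_mfu;	cases 3, 4
 */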
2657static void
2658arc_get_data_buf(arc_buf_t *buf)
2659{
2660	arc_state_t		*state = buf->b_hdr->b_state;
2661	uint64_t		size = buf->b_hdr->b_size;
2662	arc_buf_contents_t	type = buf->b_hdr->b_type;
2663
2664	arc_adapt(size, state);
2665
2666	/*
2667	 * We have not yet reached cache maximum size,
2668	 * just allocate a new buffer.
2669	 */
2670	if (!arc_evict_needed(type)) {
2671		if (type == ARC_BUFC_METADATA) {
2672			buf->b_data = zio_buf_alloc(size);
2673			arc_space_consume(size, ARC_SPACE_DATA);
2674		} else {
2675			ASSERT(type == ARC_BUFC_DATA);
2676			buf->b_data = zio_data_buf_alloc(size);
2677			ARCSTAT_INCR(arcstat_data_size, size);
2678			atomic_add_64(&arc_size, size);
2679		}
2680		goto out;
2681	}
2682
2683	/*
2684	 * If we are prefetching from the mfu ghost list, this buffer
2685	 * will end up on the mru list; so steal space from there.
2686	 */
2687	if (state == arc_mfu_ghost)
2688		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2689	else if (state == arc_mru_ghost)
2690		state = arc_mru;
2691
2692	if (state == arc_mru || state == arc_anon) {
2693		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2694		state = (arc_mfu->arcs_lsize[type] >= size &&
2695		    arc_p > mru_used) ? arc_mfu : arc_mru;
2696	} else {
2697		/* MFU cases */
2698		uint64_t mfu_space = arc_c - arc_p;
2699		state =  (arc_mru->arcs_lsize[type] >= size &&
2700		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2701	}
2702	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
2703		if (type == ARC_BUFC_METADATA) {
2704			buf->b_data = zio_buf_alloc(size);
2705			arc_space_consume(size, ARC_SPACE_DATA);
2706		} else {
2707			ASSERT(type == ARC_BUFC_DATA);
2708			buf->b_data = zio_data_buf_alloc(size);
2709			ARCSTAT_INCR(arcstat_data_size, size);
2710			atomic_add_64(&arc_size, size);
2711		}
2712		ARCSTAT_BUMP(arcstat_recycle_miss);
2713	}
2714	ASSERT(buf->b_data != NULL);
2715out:
2716	/*
2717	 * Update the state size.  Note that ghost states have a
2718	 * "ghost size" and so don't need to be updated.
2719	 */
2720	if (!GHOST_STATE(buf->b_hdr->b_state)) {
2721		arc_buf_hdr_t *hdr = buf->b_hdr;
2722
2723		atomic_add_64(&hdr->b_state->arcs_size, size);
2724		if (list_link_active(&hdr->b_arc_node)) {
2725			ASSERT(refcount_is_zero(&hdr->b_refcnt));
2726			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2727		}
2728		/*
2729		 * If we are growing the cache, and we are adding anonymous
2730		 * data, and we have outgrown arc_p, update arc_p
2731		 */
2732		if (arc_size < arc_c && hdr->b_state == arc_anon &&
2733		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2734			arc_p = MIN(arc_c, arc_p + size);
2735	}
2736	ARCSTAT_BUMP(arcstat_allocated);
2737}
2738
2739/*
2740 * This routine is called whenever a buffer is accessed.
2741 * NOTE: the hash lock is dropped in this function.
2742 */
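/*
 * Summary of the state transitions performed below (the target depends on
 * the ARC_PREFETCH flag and on how recently the buffer was last accessed):
 *
 *	arc_anon      -> arc_mru
 *	arc_mru       -> arc_mru   (arc_mfu once ARC_MINTIME has elapsed)
 *	arc_mru_ghost -> arc_mfu   (arc_mru if this access is a prefetch)
 *	arc_mfu       -> arc_mfu
 *	arc_mfu_ghost -> arc_mfu   (arc_mru if this access is a prefetch)
 *	arc_l2c_only  -> arc_mfu
 */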
2743static void
2744arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2745{
2746	clock_t now;
2747
2748	ASSERT(MUTEX_HELD(hash_lock));
2749
2750	if (buf->b_state == arc_anon) {
2751		/*
2752		 * This buffer is not in the cache, and does not
2753		 * appear in our "ghost" list.  Add the new buffer
2754		 * to the MRU state.
2755		 */
2756
2757		ASSERT(buf->b_arc_access == 0);
2758		buf->b_arc_access = ddi_get_lbolt();
2759		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2760		arc_change_state(arc_mru, buf, hash_lock);
2761
2762	} else if (buf->b_state == arc_mru) {
2763		now = ddi_get_lbolt();
2764
2765		/*
2766		 * If this buffer is here because of a prefetch, then either:
2767		 * - clear the flag if this is a "referencing" read
2768		 *   (any subsequent access will bump this into the MFU state).
2769		 * or
2770		 * - move the buffer to the head of the list if this is
2771		 *   another prefetch (to make it less likely to be evicted).
2772		 */
2773		if ((buf->b_flags & ARC_PREFETCH) != 0) {
2774			if (refcount_count(&buf->b_refcnt) == 0) {
2775				ASSERT(list_link_active(&buf->b_arc_node));
2776			} else {
2777				buf->b_flags &= ~ARC_PREFETCH;
2778				ARCSTAT_BUMP(arcstat_mru_hits);
2779			}
2780			buf->b_arc_access = now;
2781			return;
2782		}
2783
2784		/*
2785		 * This buffer has been "accessed" only once so far,
2786		 * but it is still in the cache. If enough time has
2787		 * passed, move it to the MFU state.
2788		 */
2789		if (now > buf->b_arc_access + ARC_MINTIME) {
2790			/*
2791			 * More than 125ms have passed since we
2792			 * instantiated this buffer.  Move it to the
2793			 * most frequently used state.
2794			 */
2795			buf->b_arc_access = now;
2796			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2797			arc_change_state(arc_mfu, buf, hash_lock);
2798		}
2799		ARCSTAT_BUMP(arcstat_mru_hits);
2800	} else if (buf->b_state == arc_mru_ghost) {
2801		arc_state_t	*new_state;
2802		/*
2803		 * This buffer has been "accessed" recently, but
2804		 * was evicted from the cache.  Move it to the
2805		 * MFU state.
2806		 */
2807
2808		if (buf->b_flags & ARC_PREFETCH) {
2809			new_state = arc_mru;
2810			if (refcount_count(&buf->b_refcnt) > 0)
2811				buf->b_flags &= ~ARC_PREFETCH;
2812			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2813		} else {
2814			new_state = arc_mfu;
2815			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2816		}
2817
2818		buf->b_arc_access = ddi_get_lbolt();
2819		arc_change_state(new_state, buf, hash_lock);
2820
2821		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2822	} else if (buf->b_state == arc_mfu) {
2823		/*
2824		 * This buffer has been accessed more than once and is
2825		 * still in the cache.  Keep it in the MFU state.
2826		 *
2827		 * NOTE: an add_reference() that occurred when we did
2828		 * the arc_read() will have kicked this off the list.
2829		 * If it was a prefetch, we will explicitly move it to
2830		 * the head of the list now.
2831		 */
2832		if ((buf->b_flags & ARC_PREFETCH) != 0) {
2833			ASSERT(refcount_count(&buf->b_refcnt) == 0);
2834			ASSERT(list_link_active(&buf->b_arc_node));
2835		}
2836		ARCSTAT_BUMP(arcstat_mfu_hits);
2837		buf->b_arc_access = ddi_get_lbolt();
2838	} else if (buf->b_state == arc_mfu_ghost) {
2839		arc_state_t	*new_state = arc_mfu;
2840		/*
2841		 * This buffer has been accessed more than once but has
2842		 * been evicted from the cache.  Move it back to the
2843		 * MFU state.
2844		 */
2845
2846		if (buf->b_flags & ARC_PREFETCH) {
2847			/*
2848			 * This is a prefetch access...
2849			 * move this block back to the MRU state.
2850			 */
2851			ASSERT0(refcount_count(&buf->b_refcnt));
2852			new_state = arc_mru;
2853		}
2854
2855		buf->b_arc_access = ddi_get_lbolt();
2856		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2857		arc_change_state(new_state, buf, hash_lock);
2858
2859		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2860	} else if (buf->b_state == arc_l2c_only) {
2861		/*
2862		 * This buffer is on the 2nd Level ARC.
2863		 */
2864
2865		buf->b_arc_access = ddi_get_lbolt();
2866		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2867		arc_change_state(arc_mfu, buf, hash_lock);
2868	} else {
2869		ASSERT(!"invalid arc state");
2870	}
2871}
2872
2873/* a generic arc_done_func_t which you can use */
2874/* ARGSUSED */
2875void
2876arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2877{
2878	if (zio == NULL || zio->io_error == 0)
2879		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2880	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2881}
2882
2883/* a generic arc_done_func_t */
2884void
2885arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2886{
2887	arc_buf_t **bufp = arg;
2888	if (zio && zio->io_error) {
2889		VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2890		*bufp = NULL;
2891	} else {
2892		*bufp = buf;
2893		ASSERT(buf->b_data);
2894	}
2895}
2896
2897static void
2898arc_read_done(zio_t *zio)
2899{
2900	arc_buf_hdr_t	*hdr, *found;
2901	arc_buf_t	*buf;
2902	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
2903	kmutex_t	*hash_lock;
2904	arc_callback_t	*callback_list, *acb;
2905	int		freeable = FALSE;
2906
2907	buf = zio->io_private;
2908	hdr = buf->b_hdr;
2909
2910	/*
2911	 * The hdr was inserted into the hash table and removed from lists
2912	 * prior to starting I/O.  We should find this header, since
2913	 * it's in the hash table, and it should be legit since it's
2914	 * not possible to evict it during the I/O.  The only possible
2915	 * reason for it not to be found is if we were freed during the
2916	 * read.
2917	 */
2918	found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
2919	    &hash_lock);
2920
2921	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2922	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2923	    (found == hdr && HDR_L2_READING(hdr)));
2924
2925	hdr->b_flags &= ~ARC_L2_EVICTED;
2926	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2927		hdr->b_flags &= ~ARC_L2CACHE;
2928
2929	/* byteswap if necessary */
2930	callback_list = hdr->b_acb;
2931	ASSERT(callback_list != NULL);
2932	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2933		dmu_object_byteswap_t bswap =
2934		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
2935		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2936		    byteswap_uint64_array :
2937		    dmu_ot_byteswap[bswap].ob_func;
2938		func(buf->b_data, hdr->b_size);
2939	}
2940
2941	arc_cksum_compute(buf, B_FALSE);
2942#ifdef illumos
2943	arc_buf_watch(buf);
2944#endif /* illumos */
2945
2946	if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2947		/*
2948		 * Only call arc_access on anonymous buffers.  This is because
2949		 * if we've issued an I/O for an evicted buffer, we've already
2950		 * called arc_access (to prevent any simultaneous readers from
2951		 * getting confused).
2952		 */
2953		arc_access(hdr, hash_lock);
2954	}
2955
2956	/* create copies of the data buffer for the callers */
2957	abuf = buf;
2958	for (acb = callback_list; acb; acb = acb->acb_next) {
2959		if (acb->acb_done) {
2960			if (abuf == NULL) {
2961				ARCSTAT_BUMP(arcstat_duplicate_reads);
2962				abuf = arc_buf_clone(buf);
2963			}
2964			acb->acb_buf = abuf;
2965			abuf = NULL;
2966		}
2967	}
2968	hdr->b_acb = NULL;
2969	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2970	ASSERT(!HDR_BUF_AVAILABLE(hdr));
2971	if (abuf == buf) {
2972		ASSERT(buf->b_efunc == NULL);
2973		ASSERT(hdr->b_datacnt == 1);
2974		hdr->b_flags |= ARC_BUF_AVAILABLE;
2975	}
2976
2977	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2978
2979	if (zio->io_error != 0) {
2980		hdr->b_flags |= ARC_IO_ERROR;
2981		if (hdr->b_state != arc_anon)
2982			arc_change_state(arc_anon, hdr, hash_lock);
2983		if (HDR_IN_HASH_TABLE(hdr))
2984			buf_hash_remove(hdr);
2985		freeable = refcount_is_zero(&hdr->b_refcnt);
2986	}
2987
2988	/*
2989	 * Broadcast before we drop the hash_lock to avoid the possibility
2990	 * that the hdr (and hence the cv) might be freed before we get to
2991	 * the cv_broadcast().
2992	 */
2993	cv_broadcast(&hdr->b_cv);
2994
2995	if (hash_lock) {
2996		mutex_exit(hash_lock);
2997	} else {
2998		/*
2999		 * This block was freed while we waited for the read to
3000		 * complete.  It has been removed from the hash table and
3001		 * moved to the anonymous state (so that it won't show up
3002		 * in the cache).
3003		 */
3004		ASSERT3P(hdr->b_state, ==, arc_anon);
3005		freeable = refcount_is_zero(&hdr->b_refcnt);
3006	}
3007
3008	/* execute each callback and free its structure */
3009	while ((acb = callback_list) != NULL) {
3010		if (acb->acb_done)
3011			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3012
3013		if (acb->acb_zio_dummy != NULL) {
3014			acb->acb_zio_dummy->io_error = zio->io_error;
3015			zio_nowait(acb->acb_zio_dummy);
3016		}
3017
3018		callback_list = acb->acb_next;
3019		kmem_free(acb, sizeof (arc_callback_t));
3020	}
3021
3022	if (freeable)
3023		arc_hdr_destroy(hdr);
3024}
3025
3026/*
3027 * "Read" the block at the specified DVA (in bp) via the
3028 * cache.  If the block is found in the cache, invoke the provided
3029 * callback immediately and return.  Note that the `zio' parameter
3030 * in the callback will be NULL in this case, since no IO was
3031 * required.  If the block is not in the cache pass the read request
3032 * on to the spa with a substitute callback function, so that the
3033 * requested block will be added to the cache.
3034 *
3035 * If a read request arrives for a block that has a read in-progress,
3036 * either wait for the in-progress read to complete (and return the
3037 * results); or, if this is a read with a "done" func, add a record
3038 * to the read to invoke the "done" func when the read completes,
3039 * and return; or just return.
3040 *
3041 * arc_read_done() will invoke all the requested "done" functions
3042 * for readers of this block.
3043 *
3044 * Normal callers should use arc_read and pass the arc buffer and offset
3045 * for the bp.  But if you know you don't need locking, you can use
3046 * arc_read_nolock.
3047 */
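/*
 * A minimal, blocking usage sketch (not copied from any real caller; the
 * priority and zio flag choices are illustrative) using the generic
 * arc_getbuf_func callback defined earlier in this file:
 *
 *	arc_buf_t *abuf = NULL;
 *	uint32_t aflags = ARC_WAIT;
 *	int err;
 *
 *	err = arc_read_nolock(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 *	if (err == 0) {
 *		... use abuf->b_data for arc_buf_size(abuf) bytes ...
 *		(void) arc_buf_remove_ref(abuf, &abuf);
 *	}
 */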
3048int
3049arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
3050    arc_done_func_t *done, void *private, int priority, int zio_flags,
3051    uint32_t *arc_flags, const zbookmark_t *zb)
3052{
3053	int err;
3054
3055	if (pbuf == NULL) {
3056		/*
3057		 * XXX This happens from traverse callback funcs, for
3058		 * the objset_phys_t block.
3059		 */
3060		return (arc_read_nolock(pio, spa, bp, done, private, priority,
3061		    zio_flags, arc_flags, zb));
3062	}
3063
3064	ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
3065	ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
3066	rw_enter(&pbuf->b_data_lock, RW_READER);
3067
3068	err = arc_read_nolock(pio, spa, bp, done, private, priority,
3069	    zio_flags, arc_flags, zb);
3070	rw_exit(&pbuf->b_data_lock);
3071
3072	return (err);
3073}
3074
3075int
3076arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
3077    arc_done_func_t *done, void *private, int priority, int zio_flags,
3078    uint32_t *arc_flags, const zbookmark_t *zb)
3079{
3080	arc_buf_hdr_t *hdr;
3081	arc_buf_t *buf;
3082	kmutex_t *hash_lock;
3083	zio_t *rzio;
3084	uint64_t guid = spa_load_guid(spa);
3085
3086top:
3087	hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3088	    &hash_lock);
3089	if (hdr && hdr->b_datacnt > 0) {
3090
3091		*arc_flags |= ARC_CACHED;
3092
3093		if (HDR_IO_IN_PROGRESS(hdr)) {
3094
3095			if (*arc_flags & ARC_WAIT) {
3096				cv_wait(&hdr->b_cv, hash_lock);
3097				mutex_exit(hash_lock);
3098				goto top;
3099			}
3100			ASSERT(*arc_flags & ARC_NOWAIT);
3101
3102			if (done) {
3103				arc_callback_t	*acb = NULL;
3104
3105				acb = kmem_zalloc(sizeof (arc_callback_t),
3106				    KM_SLEEP);
3107				acb->acb_done = done;
3108				acb->acb_private = private;
3109				if (pio != NULL)
3110					acb->acb_zio_dummy = zio_null(pio,
3111					    spa, NULL, NULL, NULL, zio_flags);
3112
3113				ASSERT(acb->acb_done != NULL);
3114				acb->acb_next = hdr->b_acb;
3115				hdr->b_acb = acb;
3116				add_reference(hdr, hash_lock, private);
3117				mutex_exit(hash_lock);
3118				return (0);
3119			}
3120			mutex_exit(hash_lock);
3121			return (0);
3122		}
3123
3124		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3125
3126		if (done) {
3127			add_reference(hdr, hash_lock, private);
3128			/*
3129			 * If this block is already in use, create a new
3130			 * copy of the data so that we will be guaranteed
3131			 * that arc_release() will always succeed.
3132			 */
3133			buf = hdr->b_buf;
3134			ASSERT(buf);
3135			ASSERT(buf->b_data);
3136			if (HDR_BUF_AVAILABLE(hdr)) {
3137				ASSERT(buf->b_efunc == NULL);
3138				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3139			} else {
3140				buf = arc_buf_clone(buf);
3141			}
3142
3143		} else if (*arc_flags & ARC_PREFETCH &&
3144		    refcount_count(&hdr->b_refcnt) == 0) {
3145			hdr->b_flags |= ARC_PREFETCH;
3146		}
3147		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3148		arc_access(hdr, hash_lock);
3149		if (*arc_flags & ARC_L2CACHE)
3150			hdr->b_flags |= ARC_L2CACHE;
3151		mutex_exit(hash_lock);
3152		ARCSTAT_BUMP(arcstat_hits);
3153		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3154		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3155		    data, metadata, hits);
3156
3157		if (done)
3158			done(NULL, buf, private);
3159	} else {
3160		uint64_t size = BP_GET_LSIZE(bp);
3161		arc_callback_t	*acb;
3162		vdev_t *vd = NULL;
3163		uint64_t addr;
3164		boolean_t devw = B_FALSE;
3165
3166		if (hdr == NULL) {
3167			/* this block is not in the cache */
3168			arc_buf_hdr_t	*exists;
3169			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3170			buf = arc_buf_alloc(spa, size, private, type);
3171			hdr = buf->b_hdr;
3172			hdr->b_dva = *BP_IDENTITY(bp);
3173			hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3174			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3175			exists = buf_hash_insert(hdr, &hash_lock);
3176			if (exists) {
3177				/* somebody beat us to the hash insert */
3178				mutex_exit(hash_lock);
3179				buf_discard_identity(hdr);
3180				(void) arc_buf_remove_ref(buf, private);
3181				goto top; /* restart the IO request */
3182			}
3183			/* if this is a prefetch, we don't have a reference */
3184			if (*arc_flags & ARC_PREFETCH) {
3185				(void) remove_reference(hdr, hash_lock,
3186				    private);
3187				hdr->b_flags |= ARC_PREFETCH;
3188			}
3189			if (*arc_flags & ARC_L2CACHE)
3190				hdr->b_flags |= ARC_L2CACHE;
3191			if (BP_GET_LEVEL(bp) > 0)
3192				hdr->b_flags |= ARC_INDIRECT;
3193		} else {
3194			/* this block is in the ghost cache */
3195			ASSERT(GHOST_STATE(hdr->b_state));
3196			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3197			ASSERT0(refcount_count(&hdr->b_refcnt));
3198			ASSERT(hdr->b_buf == NULL);
3199
3200			/* if this is a prefetch, we don't have a reference */
3201			if (*arc_flags & ARC_PREFETCH)
3202				hdr->b_flags |= ARC_PREFETCH;
3203			else
3204				add_reference(hdr, hash_lock, private);
3205			if (*arc_flags & ARC_L2CACHE)
3206				hdr->b_flags |= ARC_L2CACHE;
3207			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3208			buf->b_hdr = hdr;
3209			buf->b_data = NULL;
3210			buf->b_efunc = NULL;
3211			buf->b_private = NULL;
3212			buf->b_next = NULL;
3213			hdr->b_buf = buf;
3214			ASSERT(hdr->b_datacnt == 0);
3215			hdr->b_datacnt = 1;
3216			arc_get_data_buf(buf);
3217			arc_access(hdr, hash_lock);
3218		}
3219
3220		ASSERT(!GHOST_STATE(hdr->b_state));
3221
3222		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3223		acb->acb_done = done;
3224		acb->acb_private = private;
3225
3226		ASSERT(hdr->b_acb == NULL);
3227		hdr->b_acb = acb;
3228		hdr->b_flags |= ARC_IO_IN_PROGRESS;
3229
3230		if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
3231		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3232			devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3233			addr = hdr->b_l2hdr->b_daddr;
3234			/*
3235			 * Lock out device removal.
3236			 */
3237			if (vdev_is_dead(vd) ||
3238			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3239				vd = NULL;
3240		}
3241
3242		mutex_exit(hash_lock);
3243
3244		ASSERT3U(hdr->b_size, ==, size);
3245		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3246		    uint64_t, size, zbookmark_t *, zb);
3247		ARCSTAT_BUMP(arcstat_misses);
3248		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3249		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3250		    data, metadata, misses);
3251#ifdef _KERNEL
3252		curthread->td_ru.ru_inblock++;
3253#endif
3254
3255		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3256			/*
3257			 * Read from the L2ARC if the following are true:
3258			 * 1. The L2ARC vdev was previously cached.
3259			 * 2. This buffer still has L2ARC metadata.
3260			 * 3. This buffer isn't currently writing to the L2ARC.
3261			 * 4. The L2ARC entry wasn't evicted, which may
3262			 *    also have invalidated the vdev.
3263			 * 5. This isn't a prefetch, or l2arc_noprefetch isn't set.
3264			 */
3265			if (hdr->b_l2hdr != NULL &&
3266			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3267			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3268				l2arc_read_callback_t *cb;
3269
3270				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3271				ARCSTAT_BUMP(arcstat_l2_hits);
3272
3273				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3274				    KM_SLEEP);
3275				cb->l2rcb_buf = buf;
3276				cb->l2rcb_spa = spa;
3277				cb->l2rcb_bp = *bp;
3278				cb->l2rcb_zb = *zb;
3279				cb->l2rcb_flags = zio_flags;
3280
3281				/*
3282				 * l2arc read.  The SCL_L2ARC lock will be
3283				 * released by l2arc_read_done().
3284				 */
3285				rzio = zio_read_phys(pio, vd, addr, size,
3286				    buf->b_data, ZIO_CHECKSUM_OFF,
3287				    l2arc_read_done, cb, priority, zio_flags |
3288				    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
3289				    ZIO_FLAG_DONT_PROPAGATE |
3290				    ZIO_FLAG_DONT_RETRY, B_FALSE);
3291				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3292				    zio_t *, rzio);
3293				ARCSTAT_INCR(arcstat_l2_read_bytes, size);
3294
3295				if (*arc_flags & ARC_NOWAIT) {
3296					zio_nowait(rzio);
3297					return (0);
3298				}
3299
3300				ASSERT(*arc_flags & ARC_WAIT);
3301				if (zio_wait(rzio) == 0)
3302					return (0);
3303
3304				/* l2arc read error; goto zio_read() */
3305			} else {
3306				DTRACE_PROBE1(l2arc__miss,
3307				    arc_buf_hdr_t *, hdr);
3308				ARCSTAT_BUMP(arcstat_l2_misses);
3309				if (HDR_L2_WRITING(hdr))
3310					ARCSTAT_BUMP(arcstat_l2_rw_clash);
3311				spa_config_exit(spa, SCL_L2ARC, vd);
3312			}
3313		} else {
3314			if (vd != NULL)
3315				spa_config_exit(spa, SCL_L2ARC, vd);
3316			if (l2arc_ndev != 0) {
3317				DTRACE_PROBE1(l2arc__miss,
3318				    arc_buf_hdr_t *, hdr);
3319				ARCSTAT_BUMP(arcstat_l2_misses);
3320			}
3321		}
3322
3323		rzio = zio_read(pio, spa, bp, buf->b_data, size,
3324		    arc_read_done, buf, priority, zio_flags, zb);
3325
3326		if (*arc_flags & ARC_WAIT)
3327			return (zio_wait(rzio));
3328
3329		ASSERT(*arc_flags & ARC_NOWAIT);
3330		zio_nowait(rzio);
3331	}
3332	return (0);
3333}
3334
3335void
3336arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3337{
3338	ASSERT(buf->b_hdr != NULL);
3339	ASSERT(buf->b_hdr->b_state != arc_anon);
3340	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3341	ASSERT(buf->b_efunc == NULL);
3342	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3343
3344	buf->b_efunc = func;
3345	buf->b_private = private;
3346}
3347
3348/*
3349 * This is used by the DMU to let the ARC know that a buffer is
3350 * being evicted, so the ARC should clean up.  If this arc buf
3351 * is not yet in the evicted state, it will be put there.
3352 */
3353int
3354arc_buf_evict(arc_buf_t *buf)
3355{
3356	arc_buf_hdr_t *hdr;
3357	kmutex_t *hash_lock;
3358	arc_buf_t **bufp;
3359	list_t *list, *evicted_list;
3360	kmutex_t *lock, *evicted_lock;
3361
3362	mutex_enter(&buf->b_evict_lock);
3363	hdr = buf->b_hdr;
3364	if (hdr == NULL) {
3365		/*
3366		 * We are in arc_do_user_evicts().
3367		 */
3368		ASSERT(buf->b_data == NULL);
3369		mutex_exit(&buf->b_evict_lock);
3370		return (0);
3371	} else if (buf->b_data == NULL) {
3372		arc_buf_t copy = *buf; /* structure assignment */
3373		/*
3374		 * We are on the eviction list; process this buffer now
3375		 * but let arc_do_user_evicts() do the reaping.
3376		 */
3377		buf->b_efunc = NULL;
3378		mutex_exit(&buf->b_evict_lock);
3379		VERIFY(copy.b_efunc(&copy) == 0);
3380		return (1);
3381	}
3382	hash_lock = HDR_LOCK(hdr);
3383	mutex_enter(hash_lock);
3384	hdr = buf->b_hdr;
3385	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3386
3387	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3388	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3389
3390	/*
3391	 * Pull this buffer off of the hdr
3392	 */
3393	bufp = &hdr->b_buf;
3394	while (*bufp != buf)
3395		bufp = &(*bufp)->b_next;
3396	*bufp = buf->b_next;
3397
3398	ASSERT(buf->b_data != NULL);
3399	arc_buf_destroy(buf, FALSE, FALSE);
3400
3401	if (hdr->b_datacnt == 0) {
3402		arc_state_t *old_state = hdr->b_state;
3403		arc_state_t *evicted_state;
3404
3405		ASSERT(hdr->b_buf == NULL);
3406		ASSERT(refcount_is_zero(&hdr->b_refcnt));
3407
3408		evicted_state =
3409		    (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3410
3411		get_buf_info(hdr, old_state, &list, &lock);
3412		get_buf_info(hdr, evicted_state, &evicted_list, &evicted_lock);
3413		mutex_enter(lock);
3414		mutex_enter(evicted_lock);
3415
3416		arc_change_state(evicted_state, hdr, hash_lock);
3417		ASSERT(HDR_IN_HASH_TABLE(hdr));
3418		hdr->b_flags |= ARC_IN_HASH_TABLE;
3419		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3420
3421		mutex_exit(evicted_lock);
3422		mutex_exit(lock);
3423	}
3424	mutex_exit(hash_lock);
3425	mutex_exit(&buf->b_evict_lock);
3426
3427	VERIFY(buf->b_efunc(buf) == 0);
3428	buf->b_efunc = NULL;
3429	buf->b_private = NULL;
3430	buf->b_hdr = NULL;
3431	buf->b_next = NULL;
3432	kmem_cache_free(buf_cache, buf);
3433	return (1);
3434}
3435
3436/*
3437 * Release this buffer from the cache.  This must be done
3438 * after a read and prior to modifying the buffer contents.
3439 * If the buffer has more than one reference, we must make
3440 * a new hdr for the buffer.
3441 */
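/*
 * A minimal sketch of the intended call sequence (the caller shown here is
 * hypothetical; `tag' is whatever hold tag the caller already owns):
 *
 *	arc_buf_t *abuf;		obtained earlier via arc_read()
 *
 *	arc_release(abuf, tag);		detach from the on-disk identity
 *	ASSERT(arc_released(abuf));
 *	... overwrite abuf->b_data (up to arc_buf_size(abuf) bytes) ...
 *	... then hand the now-anonymous buffer to arc_write() ...
 */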
3442void
3443arc_release(arc_buf_t *buf, void *tag)
3444{
3445	arc_buf_hdr_t *hdr;
3446	kmutex_t *hash_lock = NULL;
3447	l2arc_buf_hdr_t *l2hdr;
3448	uint64_t buf_size;
3449
3450	/*
3451	 * It would be nice to assert that if it's DMU metadata (level >
3452	 * 0 || it's the dnode file), then it must be syncing context.
3453	 * But we don't know that information at this level.
3454	 */
3455
3456	mutex_enter(&buf->b_evict_lock);
3457	hdr = buf->b_hdr;
3458
3459	/* this buffer is not on any list */
3460	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3461
3462	if (hdr->b_state == arc_anon) {
3463		/* this buffer is already released */
3464		ASSERT(buf->b_efunc == NULL);
3465	} else {
3466		hash_lock = HDR_LOCK(hdr);
3467		mutex_enter(hash_lock);
3468		hdr = buf->b_hdr;
3469		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3470	}
3471
3472	l2hdr = hdr->b_l2hdr;
3473	if (l2hdr) {
3474		mutex_enter(&l2arc_buflist_mtx);
3475		hdr->b_l2hdr = NULL;
3476		buf_size = hdr->b_size;
3477	}
3478
3479	/*
3480	 * Do we have more than one buf?
3481	 */
3482	if (hdr->b_datacnt > 1) {
3483		arc_buf_hdr_t *nhdr;
3484		arc_buf_t **bufp;
3485		uint64_t blksz = hdr->b_size;
3486		uint64_t spa = hdr->b_spa;
3487		arc_buf_contents_t type = hdr->b_type;
3488		uint32_t flags = hdr->b_flags;
3489
3490		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3491		/*
3492		 * Pull the data off of this hdr and attach it to
3493		 * a new anonymous hdr.
3494		 */
3495		(void) remove_reference(hdr, hash_lock, tag);
3496		bufp = &hdr->b_buf;
3497		while (*bufp != buf)
3498			bufp = &(*bufp)->b_next;
3499		*bufp = buf->b_next;
3500		buf->b_next = NULL;
3501
3502		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3503		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3504		if (refcount_is_zero(&hdr->b_refcnt)) {
3505			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3506			ASSERT3U(*size, >=, hdr->b_size);
3507			atomic_add_64(size, -hdr->b_size);
3508		}
3509
3510		/*
3511		 * We're releasing a duplicate user data buffer; update
3512		 * our statistics accordingly.
3513		 */
3514		if (hdr->b_type == ARC_BUFC_DATA) {
3515			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3516			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3517			    -hdr->b_size);
3518		}
3519		hdr->b_datacnt -= 1;
3520		arc_cksum_verify(buf);
3521#ifdef illumos
3522		arc_buf_unwatch(buf);
3523#endif /* illumos */
3524
3525		mutex_exit(hash_lock);
3526
3527		nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3528		nhdr->b_size = blksz;
3529		nhdr->b_spa = spa;
3530		nhdr->b_type = type;
3531		nhdr->b_buf = buf;
3532		nhdr->b_state = arc_anon;
3533		nhdr->b_arc_access = 0;
3534		nhdr->b_flags = flags & ARC_L2_WRITING;
3535		nhdr->b_l2hdr = NULL;
3536		nhdr->b_datacnt = 1;
3537		nhdr->b_freeze_cksum = NULL;
3538		(void) refcount_add(&nhdr->b_refcnt, tag);
3539		buf->b_hdr = nhdr;
3540		mutex_exit(&buf->b_evict_lock);
3541		atomic_add_64(&arc_anon->arcs_size, blksz);
3542	} else {
3543		mutex_exit(&buf->b_evict_lock);
3544		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3545		ASSERT(!list_link_active(&hdr->b_arc_node));
3546		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3547		if (hdr->b_state != arc_anon)
3548			arc_change_state(arc_anon, hdr, hash_lock);
3549		hdr->b_arc_access = 0;
3550		if (hash_lock)
3551			mutex_exit(hash_lock);
3552
3553		buf_discard_identity(hdr);
3554		arc_buf_thaw(buf);
3555	}
3556	buf->b_efunc = NULL;
3557	buf->b_private = NULL;
3558
3559	if (l2hdr) {
3560		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3561		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3562		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3563		mutex_exit(&l2arc_buflist_mtx);
3564	}
3565}
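
/*
 * Illustrative caller sketch for arc_release() above (comment only,
 * not compiled): a consumer that read a buffer and now intends to
 * modify it.  The "abuf" and "my_tag" names are hypothetical.
 *
 *	arc_buf_t *abuf;	(obtained earlier via arc_read())
 *
 *	if (!arc_released(abuf))
 *		arc_release(abuf, my_tag);
 *
 *	abuf->b_data is now backed by an anonymous hdr and is safe to
 *	modify; the dirty data is written out later via arc_write().
 */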
3566
3567/*
3568 * Release this buffer.  If it does not match the provided BP, fill it
3569 * with that block's contents.
3570 */
3571/* ARGSUSED */
3572int
3573arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
3574    zbookmark_t *zb)
3575{
3576	arc_release(buf, tag);
3577	return (0);
3578}
3579
3580int
3581arc_released(arc_buf_t *buf)
3582{
3583	int released;
3584
3585	mutex_enter(&buf->b_evict_lock);
3586	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3587	mutex_exit(&buf->b_evict_lock);
3588	return (released);
3589}
3590
3591int
3592arc_has_callback(arc_buf_t *buf)
3593{
3594	int callback;
3595
3596	mutex_enter(&buf->b_evict_lock);
3597	callback = (buf->b_efunc != NULL);
3598	mutex_exit(&buf->b_evict_lock);
3599	return (callback);
3600}
3601
3602#ifdef ZFS_DEBUG
3603int
3604arc_referenced(arc_buf_t *buf)
3605{
3606	int referenced;
3607
3608	mutex_enter(&buf->b_evict_lock);
3609	referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3610	mutex_exit(&buf->b_evict_lock);
3611	return (referenced);
3612}
3613#endif
3614
3615static void
3616arc_write_ready(zio_t *zio)
3617{
3618	arc_write_callback_t *callback = zio->io_private;
3619	arc_buf_t *buf = callback->awcb_buf;
3620	arc_buf_hdr_t *hdr = buf->b_hdr;
3621
3622	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3623	callback->awcb_ready(zio, buf, callback->awcb_private);
3624
3625	/*
3626	 * If the IO is already in progress, then this is a re-write
3627	 * attempt, so we need to thaw and re-compute the cksum.
3628	 * It is the responsibility of the callback to handle the
3629	 * accounting for any re-write attempt.
3630	 */
3631	if (HDR_IO_IN_PROGRESS(hdr)) {
3632		mutex_enter(&hdr->b_freeze_lock);
3633		if (hdr->b_freeze_cksum != NULL) {
3634			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3635			hdr->b_freeze_cksum = NULL;
3636		}
3637		mutex_exit(&hdr->b_freeze_lock);
3638	}
3639	arc_cksum_compute(buf, B_FALSE);
3640	hdr->b_flags |= ARC_IO_IN_PROGRESS;
3641}
3642
3643static void
3644arc_write_done(zio_t *zio)
3645{
3646	arc_write_callback_t *callback = zio->io_private;
3647	arc_buf_t *buf = callback->awcb_buf;
3648	arc_buf_hdr_t *hdr = buf->b_hdr;
3649
3650	ASSERT(hdr->b_acb == NULL);
3651
3652	if (zio->io_error == 0) {
3653		hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3654		hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3655		hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3656	} else {
3657		ASSERT(BUF_EMPTY(hdr));
3658	}
3659
3660	/*
3661	 * If the block to be written was all-zero, we may have
3662	 * compressed it away.  In this case no write was performed
3663	 * so there will be no dva/birth/checksum.  The buffer must
3664	 * therefore remain anonymous (and uncached).
3665	 */
3666	if (!BUF_EMPTY(hdr)) {
3667		arc_buf_hdr_t *exists;
3668		kmutex_t *hash_lock;
3669
3670		ASSERT(zio->io_error == 0);
3671
3672		arc_cksum_verify(buf);
3673
3674		exists = buf_hash_insert(hdr, &hash_lock);
3675		if (exists) {
3676			/*
3677			 * This can only happen if we overwrite for
3678			 * sync-to-convergence, because we remove
3679			 * buffers from the hash table when we arc_free().
3680			 */
3681			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3682				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3683					panic("bad overwrite, hdr=%p exists=%p",
3684					    (void *)hdr, (void *)exists);
3685				ASSERT(refcount_is_zero(&exists->b_refcnt));
3686				arc_change_state(arc_anon, exists, hash_lock);
3687				mutex_exit(hash_lock);
3688				arc_hdr_destroy(exists);
3689				exists = buf_hash_insert(hdr, &hash_lock);
3690				ASSERT3P(exists, ==, NULL);
3691			} else {
3692				/* Dedup */
3693				ASSERT(hdr->b_datacnt == 1);
3694				ASSERT(hdr->b_state == arc_anon);
3695				ASSERT(BP_GET_DEDUP(zio->io_bp));
3696				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3697			}
3698		}
3699		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3700		/* if it's not anon, we are doing a scrub */
3701		if (!exists && hdr->b_state == arc_anon)
3702			arc_access(hdr, hash_lock);
3703		mutex_exit(hash_lock);
3704	} else {
3705		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3706	}
3707
3708	ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3709	callback->awcb_done(zio, buf, callback->awcb_private);
3710
3711	kmem_free(callback, sizeof (arc_write_callback_t));
3712}
3713
3714zio_t *
3715arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3716    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
3717    arc_done_func_t *ready, arc_done_func_t *done, void *private,
3718    int priority, int zio_flags, const zbookmark_t *zb)
3719{
3720	arc_buf_hdr_t *hdr = buf->b_hdr;
3721	arc_write_callback_t *callback;
3722	zio_t *zio;
3723
3724	ASSERT(ready != NULL);
3725	ASSERT(done != NULL);
3726	ASSERT(!HDR_IO_ERROR(hdr));
3727	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3728	ASSERT(hdr->b_acb == NULL);
3729	if (l2arc)
3730		hdr->b_flags |= ARC_L2CACHE;
3731	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3732	callback->awcb_ready = ready;
3733	callback->awcb_done = done;
3734	callback->awcb_private = private;
3735	callback->awcb_buf = buf;
3736
3737	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3738	    arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3739
3740	return (zio);
3741}
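
/*
 * Illustrative caller sketch for arc_write() above (comment only, not
 * compiled).  my_ready(), my_done(), my_state and the other lower-case
 * variables are placeholders for caller state; the callback signatures
 * follow arc_done_func_t as invoked by arc_write_ready()/arc_write_done().
 *
 *	static void
 *	my_ready(zio_t *zio, arc_buf_t *buf, void *priv)
 *	{
 *		(fill in block pointer properties, dirty accounting)
 *	}
 *
 *	static void
 *	my_done(zio_t *zio, arc_buf_t *buf, void *priv)
 *	{
 *		(buffer is now cached, or stayed anonymous if all-zero)
 *	}
 *
 *	zio = arc_write(pio, spa, txg, bp, abuf, l2arc, &zp,
 *	    my_ready, my_done, my_state, ZIO_PRIORITY_ASYNC_WRITE,
 *	    ZIO_FLAG_CANFAIL, &zb);
 *	(void) zio_nowait(zio);
 */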
3742
3743static int
3744arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
3745{
3746#ifdef _KERNEL
3747	uint64_t available_memory =
3748	    ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count);
3749	static uint64_t page_load = 0;
3750	static uint64_t last_txg = 0;
3751
3752#ifdef sun
3753#if defined(__i386)
3754	available_memory =
3755	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3756#endif
3757#endif	/* sun */
3758	if (available_memory >= zfs_write_limit_max)
3759		return (0);
3760
3761	if (txg > last_txg) {
3762		last_txg = txg;
3763		page_load = 0;
3764	}
3765	/*
3766	 * If we are in pageout, we know that memory is already tight
3767	 * and the ARC is already going to be evicting, so we just want
3768	 * to let page writes continue as quickly as possible.
3769	 */
3770	if (curproc == pageproc) {
3771		if (page_load > available_memory / 4)
3772			return (ERESTART);
3773		/* Note: reserve is inflated, so we deflate */
3774		page_load += reserve / 8;
3775		return (0);
3776	} else if (page_load > 0 && arc_reclaim_needed()) {
3777		/* memory is low, delay before restarting */
3778		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3779		return (EAGAIN);
3780	}
3781	page_load = 0;
3782
3783	if (arc_size > arc_c_min) {
3784		uint64_t evictable_memory =
3785		    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3786		    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3787		    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3788		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3789		available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3790	}
3791
3792	if (inflight_data > available_memory / 4) {
3793		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3794		return (ERESTART);
3795	}
3796#endif
3797	return (0);
3798}
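
/*
 * Worked example for arc_memory_throttle() above (figures purely
 * illustrative): with 1GB of free+cache memory, zfs_write_limit_max
 * above that, and no evictable-ARC credit, a regular writer is told
 * to retry (ERESTART) once its in-flight dirty data exceeds
 * available_memory / 4 = 256MB, while the pageout process is only
 * throttled after its accumulated page_load passes that same quarter.
 */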
3799
3800void
3801arc_tempreserve_clear(uint64_t reserve)
3802{
3803	atomic_add_64(&arc_tempreserve, -reserve);
3804	ASSERT((int64_t)arc_tempreserve >= 0);
3805}
3806
3807int
3808arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3809{
3810	int error;
3811	uint64_t anon_size;
3812
3813#ifdef ZFS_DEBUG
3814	/*
3815	 * Once in a while, fail for no reason.  Everything should cope.
3816	 */
3817	if (spa_get_random(10000) == 0) {
3818		dprintf("forcing random failure\n");
3819		return (ERESTART);
3820	}
3821#endif
3822	if (reserve > arc_c/4 && !arc_no_grow)
3823		arc_c = MIN(arc_c_max, reserve * 4);
3824	if (reserve > arc_c)
3825		return (ENOMEM);
3826
3827	/*
3828	 * Don't count loaned bufs as in flight dirty data to prevent long
3829	 * network delays from blocking transactions that are ready to be
3830	 * assigned to a txg.
3831	 */
3832	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3833
3834	/*
3835	 * Writes will almost always require additional memory allocations
3836	 * in order to compress/encrypt/etc the data.  We therefore need to
3837	 * make sure that there is sufficient available memory for this.
3838	 */
3839	if ((error = arc_memory_throttle(reserve, anon_size, txg)) != 0)
3840		return (error);
3841
3842	/*
3843	 * Throttle writes when the amount of dirty data in the cache
3844	 * gets too large.  We try to keep the cache less than half full
3845	 * of dirty blocks so that our sync times don't grow too large.
3846	 * Note: if two requests come in concurrently, we might let them
3847	 * both succeed, when one of them should fail.  Not a huge deal.
3848	 */
3849
3850	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3851	    anon_size > arc_c / 4) {
3852		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3853		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3854		    arc_tempreserve>>10,
3855		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3856		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3857		    reserve>>10, arc_c>>10);
3858		return (ERESTART);
3859	}
3860	atomic_add_64(&arc_tempreserve, reserve);
3861	return (0);
3862}
3863
3864static kmutex_t arc_lowmem_lock;
3865#ifdef _KERNEL
3866static eventhandler_tag arc_event_lowmem = NULL;
3867
3868static void
3869arc_lowmem(void *arg __unused, int howto __unused)
3870{
3871
3872	/* Serialize access via arc_lowmem_lock. */
3873	mutex_enter(&arc_lowmem_lock);
3874	mutex_enter(&arc_reclaim_thr_lock);
3875	needfree = 1;
3876	cv_signal(&arc_reclaim_thr_cv);
3877
3878	/*
3879	 * It is unsafe to block here in arbitrary threads, because we can come
3880	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
3881	 * with the ARC reclaim thread.
3882	 */
3883	if (curproc == pageproc) {
3884		while (needfree)
3885			msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
3886	}
3887	mutex_exit(&arc_reclaim_thr_lock);
3888	mutex_exit(&arc_lowmem_lock);
3889}
3890#endif
3891
3892void
3893arc_init(void)
3894{
3895	int i, prefetch_tunable_set = 0;
3896
3897	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3898	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3899	mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
3900
3901	/* Convert seconds to clock ticks */
3902	arc_min_prefetch_lifespan = 1 * hz;
3903
3904	/* Start out with 1/8 of all memory */
3905	arc_c = kmem_size() / 8;
3906
3907#ifdef sun
3908#ifdef _KERNEL
3909	/*
3910	 * On architectures where the physical memory can be larger
3911	 * than the addressable space (Intel in 32-bit mode), we may
3912	 * need to limit the cache to 1/8 of VM size.
3913	 */
3914	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3915#endif
3916#endif	/* sun */
3917	/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
3918	arc_c_min = MAX(arc_c / 4, 64<<18);
3919	/* set max to 5/8 of all memory, or all but 1GB, whichever is more */
3920	if (arc_c * 8 >= 1<<30)
3921		arc_c_max = (arc_c * 8) - (1<<30);
3922	else
3923		arc_c_max = arc_c_min;
3924	arc_c_max = MAX(arc_c * 5, arc_c_max);
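
	/*
	 * Worked example of the defaults above, ignoring the tunable
	 * overrides below (illustrative): if kmem_size() reports 4GB,
	 * then arc_c = 512MB, arc_c_min = MAX(128MB, 16MB) = 128MB,
	 * and arc_c_max = MAX(5 * 512MB, 4GB - 1GB) = 3GB.
	 */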
3925
3926#ifdef _KERNEL
3927	/*
3928	 * Allow the tunables to override our calculations if they are
3929	 * reasonable (i.e., over 16MB)
3930	 */
3931	if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size())
3932		arc_c_max = zfs_arc_max;
3933	if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max)
3934		arc_c_min = zfs_arc_min;
3935#endif
3936
3937	arc_c = arc_c_max;
3938	arc_p = (arc_c >> 1);
3939
3940	/* limit meta-data to 1/4 of the arc capacity */
3941	arc_meta_limit = arc_c_max / 4;
3942
3943	/* Allow the tunable to override if it is reasonable */
3944	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3945		arc_meta_limit = zfs_arc_meta_limit;
3946
3947	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3948		arc_c_min = arc_meta_limit / 2;
3949
3950	if (zfs_arc_grow_retry > 0)
3951		arc_grow_retry = zfs_arc_grow_retry;
3952
3953	if (zfs_arc_shrink_shift > 0)
3954		arc_shrink_shift = zfs_arc_shrink_shift;
3955
3956	if (zfs_arc_p_min_shift > 0)
3957		arc_p_min_shift = zfs_arc_p_min_shift;
3958
3959	/* if kmem_flags are set, let's try to use less memory */
3960	if (kmem_debugging())
3961		arc_c = arc_c / 2;
3962	if (arc_c < arc_c_min)
3963		arc_c = arc_c_min;
3964
3965	zfs_arc_min = arc_c_min;
3966	zfs_arc_max = arc_c_max;
3967
3968	arc_anon = &ARC_anon;
3969	arc_mru = &ARC_mru;
3970	arc_mru_ghost = &ARC_mru_ghost;
3971	arc_mfu = &ARC_mfu;
3972	arc_mfu_ghost = &ARC_mfu_ghost;
3973	arc_l2c_only = &ARC_l2c_only;
3974	arc_size = 0;
3975
3976	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
3977		mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
3978		    NULL, MUTEX_DEFAULT, NULL);
3979		mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
3980		    NULL, MUTEX_DEFAULT, NULL);
3981		mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
3982		    NULL, MUTEX_DEFAULT, NULL);
3983		mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
3984		    NULL, MUTEX_DEFAULT, NULL);
3985		mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
3986		    NULL, MUTEX_DEFAULT, NULL);
3987		mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
3988		    NULL, MUTEX_DEFAULT, NULL);
3989
3990		list_create(&arc_mru->arcs_lists[i],
3991		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3992		list_create(&arc_mru_ghost->arcs_lists[i],
3993		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3994		list_create(&arc_mfu->arcs_lists[i],
3995		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3996		list_create(&arc_mfu_ghost->arcs_lists[i],
3997		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4000		list_create(&arc_l2c_only->arcs_lists[i],
4001		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4002	}
4003
4004	buf_init();
4005
4006	arc_thread_exit = 0;
4007	arc_eviction_list = NULL;
4008	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4009	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4010
4011	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4012	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4013
4014	if (arc_ksp != NULL) {
4015		arc_ksp->ks_data = &arc_stats;
4016		kstat_install(arc_ksp);
4017	}
4018
4019	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4020	    TS_RUN, minclsyspri);
4021
4022#ifdef _KERNEL
4023	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
4024	    EVENTHANDLER_PRI_FIRST);
4025#endif
4026
4027	arc_dead = FALSE;
4028	arc_warm = B_FALSE;
4029
4030	if (zfs_write_limit_max == 0)
4031		zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
4032	else
4033		zfs_write_limit_shift = 0;
4034	mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
4035
4036#ifdef _KERNEL
4037	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
4038		prefetch_tunable_set = 1;
4039
4040#ifdef __i386__
4041	if (prefetch_tunable_set == 0) {
4042		printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
4043		    "-- to enable,\n");
4044		printf("            add \"vfs.zfs.prefetch_disable=0\" "
4045		    "to /boot/loader.conf.\n");
4046		zfs_prefetch_disable = 1;
4047	}
4048#else
4049	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
4050	    prefetch_tunable_set == 0) {
4051		printf("ZFS NOTICE: Prefetch is disabled by default if less "
4052		    "than 4GB of RAM is present;\n"
4053		    "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
4054		    "to /boot/loader.conf.\n");
4055		zfs_prefetch_disable = 1;
4056	}
4057#endif
4058	/* Warn about ZFS memory and address space requirements. */
4059	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
4060		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
4061		    "expect unstable behavior.\n");
4062	}
4063	if (kmem_size() < 512 * (1 << 20)) {
4064		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
4065		    "expect unstable behavior.\n");
4066		printf("             Consider tuning vm.kmem_size and "
4067		    "vm.kmem_size_max\n");
4068		printf("             in /boot/loader.conf.\n");
4069	}
4070#endif
4071}
4072
4073void
4074arc_fini(void)
4075{
4076	int i;
4077
4078	mutex_enter(&arc_reclaim_thr_lock);
4079	arc_thread_exit = 1;
4080	cv_signal(&arc_reclaim_thr_cv);
4081	while (arc_thread_exit != 0)
4082		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4083	mutex_exit(&arc_reclaim_thr_lock);
4084
4085	arc_flush(NULL);
4086
4087	arc_dead = TRUE;
4088
4089	if (arc_ksp != NULL) {
4090		kstat_delete(arc_ksp);
4091		arc_ksp = NULL;
4092	}
4093
4094	mutex_destroy(&arc_eviction_mtx);
4095	mutex_destroy(&arc_reclaim_thr_lock);
4096	cv_destroy(&arc_reclaim_thr_cv);
4097
4098	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4099		list_destroy(&arc_mru->arcs_lists[i]);
4100		list_destroy(&arc_mru_ghost->arcs_lists[i]);
4101		list_destroy(&arc_mfu->arcs_lists[i]);
4102		list_destroy(&arc_mfu_ghost->arcs_lists[i]);
4103		list_destroy(&arc_l2c_only->arcs_lists[i]);
4104
4105		mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
4106		mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
4107		mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
4108		mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
4109		mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
4110		mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
4111	}
4112
4113	mutex_destroy(&zfs_write_limit_lock);
4114
4115	buf_fini();
4116
4117	ASSERT(arc_loaned_bytes == 0);
4118
4119	mutex_destroy(&arc_lowmem_lock);
4120#ifdef _KERNEL
4121	if (arc_event_lowmem != NULL)
4122		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
4123#endif
4124}
4125
4126/*
4127 * Level 2 ARC
4128 *
4129 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
4130 * It uses dedicated storage devices to hold cached data, which are populated
4131 * using large infrequent writes.  The main role of this cache is to boost
4132 * the performance of random read workloads.  The intended L2ARC devices
4133 * include short-stroked disks, solid state disks, and other media with
4134 * substantially faster read latency than disk.
4135 *
4136 *                 +-----------------------+
4137 *                 |         ARC           |
4138 *                 +-----------------------+
4139 *                    |         ^     ^
4140 *                    |         |     |
4141 *      l2arc_feed_thread()    arc_read()
4142 *                    |         |     |
4143 *                    |  l2arc read   |
4144 *                    V         |     |
4145 *               +---------------+    |
4146 *               |     L2ARC     |    |
4147 *               +---------------+    |
4148 *                   |    ^           |
4149 *          l2arc_write() |           |
4150 *                   |    |           |
4151 *                   V    |           |
4152 *                 +-------+      +-------+
4153 *                 | vdev  |      | vdev  |
4154 *                 | cache |      | cache |
4155 *                 +-------+      +-------+
4156 *                 +=========+     .-----.
4157 *                 :  L2ARC  :    |-_____-|
4158 *                 : devices :    | Disks |
4159 *                 +=========+    `-_____-'
4160 *
4161 * Read requests are satisfied from the following sources, in order:
4162 *
4163 *	1) ARC
4164 *	2) vdev cache of L2ARC devices
4165 *	3) L2ARC devices
4166 *	4) vdev cache of disks
4167 *	5) disks
4168 *
4169 * Some L2ARC device types exhibit extremely slow write performance.
4170 * To accommodate this, there are some significant differences between
4171 * the L2ARC and traditional cache design:
4172 *
4173 * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
4174 * the ARC behave as usual, freeing buffers and placing headers on ghost
4175 * lists.  The ARC does not send buffers to the L2ARC during eviction as
4176 * this would add inflated write latencies for all ARC memory pressure.
4177 *
4178 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4179 * It does this by periodically scanning buffers from the eviction-end of
4180 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4181 * not already there.  It scans until a headroom of buffers is satisfied,
4182 * which itself is a buffer for ARC eviction.  The thread that does this is
4183 * l2arc_feed_thread(), illustrated below; example sizes are included to
4184 * provide a better sense of ratio than this diagram:
4185 *
4186 *	       head -->                        tail
4187 *	        +---------------------+----------+
4188 *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
4189 *	        +---------------------+----------+   |   o L2ARC eligible
4190 *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
4191 *	        +---------------------+----------+   |
4192 *	             15.9 Gbytes      ^ 32 Mbytes    |
4193 *	                           headroom          |
4194 *	                                      l2arc_feed_thread()
4195 *	                                             |
4196 *	                 l2arc write hand <--[oooo]--'
4197 *	                         |           8 Mbyte
4198 *	                         |          write max
4199 *	                         V
4200 *		  +==============================+
4201 *	L2ARC dev |####|#|###|###|    |####| ... |
4202 *	          +==============================+
4203 *	                     32 Gbytes
4204 *
4205 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4206 * evicted, then the L2ARC has cached a buffer much sooner than it probably
4207 * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
4208 * safe to say that this is an uncommon case, since buffers at the end of
4209 * the ARC lists have moved there due to inactivity.
4210 *
4211 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4212 * then the L2ARC simply misses copying some buffers.  This serves as a
4213 * pressure valve to prevent heavy read workloads from both stalling the ARC
4214 * with waits and clogging the L2ARC with writes.  This also helps prevent
4215 * the potential for the L2ARC to churn if it attempts to cache content too
4216 * quickly, such as during backups of the entire pool.
4217 *
4218 * 5. After system boot and before the ARC has filled main memory, there are
4219 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4220 * lists can remain mostly static.  Instead of searching from the tail of these
4221 * lists as pictured, the l2arc_feed_thread() will search from the list heads
4222 * for eligible buffers, greatly increasing its chance of finding them.
4223 *
4224 * The L2ARC device write speed is also boosted during this time so that
4225 * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
4226 * there are no L2ARC reads, and no fear of degrading read performance
4227 * through increased writes.
4228 *
4229 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4230 * the vdev queue can aggregate them into larger and fewer writes.  Each
4231 * device is written to in a rotor fashion, sweeping writes through
4232 * available space then repeating.
4233 *
4234 * 7. The L2ARC does not store dirty content.  It never needs to flush
4235 * write buffers back to disk based storage.
4236 *
4237 * 8. If an ARC buffer is written (and dirtied) which also exists in the
4238 * L2ARC, the now stale L2ARC buffer is immediately dropped.
4239 *
4240 * The performance of the L2ARC can be tweaked by a number of tunables, which
4241 * may be necessary for different workloads:
4242 *
4243 *	l2arc_write_max		max write bytes per interval
4244 *	l2arc_write_boost	extra write bytes during device warmup
4245 *	l2arc_noprefetch	skip caching prefetched buffers
4246 *	l2arc_headroom		number of max device writes to precache
4247 *	l2arc_feed_secs		seconds between L2ARC writing
4248 *
4249 * Tunables may be removed or added as future performance improvements are
4250 * integrated, and also may become zpool properties.
4251 *
4252 * There are three key functions that control how the L2ARC warms up:
4253 *
4254 *	l2arc_write_eligible()	check if a buffer is eligible to cache
4255 *	l2arc_write_size()	calculate how much to write
4256 *	l2arc_write_interval()	calculate sleep delay between writes
4257 *
4258 * These three functions determine what to write, how much, and how quickly
4259 * to send writes.
4260 */
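
/*
 * A sketch of how these might be tuned on FreeBSD through loader(8)
 * tunables.  The vfs.zfs.* names below are assumed to correspond to
 * the variables listed above, and the values are only examples:
 *
 *	# /boot/loader.conf
 *	vfs.zfs.l2arc_write_max="16777216"	# 16MB per interval
 *	vfs.zfs.l2arc_write_boost="33554432"	# extra 32MB during warmup
 *	vfs.zfs.l2arc_noprefetch="1"		# skip prefetched buffers
 *	vfs.zfs.l2arc_feed_secs="1"		# feed once per second
 */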
4261
4262static boolean_t
4263l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4264{
4265	/*
4266	 * A buffer is *not* eligible for the L2ARC if it:
4267	 * 1. belongs to a different spa.
4268	 * 2. is already cached on the L2ARC.
4269	 * 3. has an I/O in progress (it may be an incomplete read).
4270	 * 4. is flagged not eligible (zfs property).
4271	 */
4272	if (ab->b_spa != spa_guid) {
4273		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
4274		return (B_FALSE);
4275	}
4276	if (ab->b_l2hdr != NULL) {
4277		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
4278		return (B_FALSE);
4279	}
4280	if (HDR_IO_IN_PROGRESS(ab)) {
4281		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
4282		return (B_FALSE);
4283	}
4284	if (!HDR_L2CACHE(ab)) {
4285		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
4286		return (B_FALSE);
4287	}
4288
4289	return (B_TRUE);
4290}
4291
4292static uint64_t
4293l2arc_write_size(l2arc_dev_t *dev)
4294{
4295	uint64_t size;
4296
4297	size = dev->l2ad_write;
4298
4299	if (arc_warm == B_FALSE)
4300		size += dev->l2ad_boost;
4301
4302	return (size);
4304}
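
/*
 * For example (default values assumed, both tunable): with
 * l2arc_write_max at 8MB, matching the "8 Mbyte write max" in the
 * diagram above, and l2arc_write_boost also 8MB, each feed may write
 * up to 16MB until the ARC has warmed (arc_warm) and 8MB thereafter.
 */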
4305
4306static clock_t
4307l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4308{
4309	clock_t interval, next, now;
4310
4311	/*
4312	 * If the ARC lists are busy, increase our write rate; if the
4313	 * lists are stale, idle back.  This is achieved by checking
4314	 * how much we previously wrote - if it was more than half of
4315	 * what we wanted, schedule the next write much sooner.
4316	 */
4317	if (l2arc_feed_again && wrote > (wanted / 2))
4318		interval = (hz * l2arc_feed_min_ms) / 1000;
4319	else
4320		interval = hz * l2arc_feed_secs;
4321
4322	now = ddi_get_lbolt();
4323	next = MAX(now, MIN(now + interval, began + interval));
4324
4325	return (next);
4326}
4327
4328static void
4329l2arc_hdr_stat_add(void)
4330{
4331	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4332	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4333}
4334
4335static void
4336l2arc_hdr_stat_remove(void)
4337{
4338	ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4339	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4340}
4341
4342/*
4343 * Cycle through L2ARC devices.  This is how L2ARC load balances.
4344 * If a device is returned, this also returns holding the spa config lock.
4345 */
4346static l2arc_dev_t *
4347l2arc_dev_get_next(void)
4348{
4349	l2arc_dev_t *first, *next = NULL;
4350
4351	/*
4352	 * Lock out the removal of spas (spa_namespace_lock), then removal
4353	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4354	 * both locks will be dropped and a spa config lock held instead.
4355	 */
4356	mutex_enter(&spa_namespace_lock);
4357	mutex_enter(&l2arc_dev_mtx);
4358
4359	/* if there are no vdevs, there is nothing to do */
4360	if (l2arc_ndev == 0)
4361		goto out;
4362
4363	first = NULL;
4364	next = l2arc_dev_last;
4365	do {
4366		/* loop around the list looking for a non-faulted vdev */
4367		if (next == NULL) {
4368			next = list_head(l2arc_dev_list);
4369		} else {
4370			next = list_next(l2arc_dev_list, next);
4371			if (next == NULL)
4372				next = list_head(l2arc_dev_list);
4373		}
4374
4375		/* if we have come back to the start, bail out */
4376		if (first == NULL)
4377			first = next;
4378		else if (next == first)
4379			break;
4380
4381	} while (vdev_is_dead(next->l2ad_vdev));
4382
4383	/* if we were unable to find any usable vdevs, return NULL */
4384	if (vdev_is_dead(next->l2ad_vdev))
4385		next = NULL;
4386
4387	l2arc_dev_last = next;
4388
4389out:
4390	mutex_exit(&l2arc_dev_mtx);
4391
4392	/*
4393	 * Grab the config lock to prevent the 'next' device from being
4394	 * removed while we are writing to it.
4395	 */
4396	if (next != NULL)
4397		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4398	mutex_exit(&spa_namespace_lock);
4399
4400	return (next);
4401}
4402
4403/*
4404 * Free buffers that were tagged for destruction.
4405 */
4406static void
4407l2arc_do_free_on_write(void)
4408{
4409	list_t *buflist;
4410	l2arc_data_free_t *df, *df_prev;
4411
4412	mutex_enter(&l2arc_free_on_write_mtx);
4413	buflist = l2arc_free_on_write;
4414
4415	for (df = list_tail(buflist); df; df = df_prev) {
4416		df_prev = list_prev(buflist, df);
4417		ASSERT(df->l2df_data != NULL);
4418		ASSERT(df->l2df_func != NULL);
4419		df->l2df_func(df->l2df_data, df->l2df_size);
4420		list_remove(buflist, df);
4421		kmem_free(df, sizeof (l2arc_data_free_t));
4422	}
4423
4424	mutex_exit(&l2arc_free_on_write_mtx);
4425}
4426
4427/*
4428 * A write to a cache device has completed.  Update all headers to allow
4429 * reads from these buffers to begin.
4430 */
4431static void
4432l2arc_write_done(zio_t *zio)
4433{
4434	l2arc_write_callback_t *cb;
4435	l2arc_dev_t *dev;
4436	list_t *buflist;
4437	arc_buf_hdr_t *head, *ab, *ab_prev;
4438	l2arc_buf_hdr_t *abl2;
4439	kmutex_t *hash_lock;
4440
4441	cb = zio->io_private;
4442	ASSERT(cb != NULL);
4443	dev = cb->l2wcb_dev;
4444	ASSERT(dev != NULL);
4445	head = cb->l2wcb_head;
4446	ASSERT(head != NULL);
4447	buflist = dev->l2ad_buflist;
4448	ASSERT(buflist != NULL);
4449	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4450	    l2arc_write_callback_t *, cb);
4451
4452	if (zio->io_error != 0)
4453		ARCSTAT_BUMP(arcstat_l2_writes_error);
4454
4455	mutex_enter(&l2arc_buflist_mtx);
4456
4457	/*
4458	 * All writes completed, or an error was hit.
4459	 */
4460	for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4461		ab_prev = list_prev(buflist, ab);
4462
4463		hash_lock = HDR_LOCK(ab);
4464		if (!mutex_tryenter(hash_lock)) {
4465			/*
4466			 * This buffer misses out.  It may be in a stage
4467			 * of eviction.  Its ARC_L2_WRITING flag will be
4468			 * left set, denying reads to this buffer.
4469			 */
4470			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4471			continue;
4472		}
4473
4474		if (zio->io_error != 0) {
4475			/*
4476			 * Error - drop L2ARC entry.
4477			 */
4478			list_remove(buflist, ab);
4479			abl2 = ab->b_l2hdr;
4480			ab->b_l2hdr = NULL;
4481			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4482			ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4483		}
4484
4485		/*
4486		 * Allow ARC to begin reads to this L2ARC entry.
4487		 */
4488		ab->b_flags &= ~ARC_L2_WRITING;
4489
4490		mutex_exit(hash_lock);
4491	}
4492
4493	atomic_inc_64(&l2arc_writes_done);
4494	list_remove(buflist, head);
4495	kmem_cache_free(hdr_cache, head);
4496	mutex_exit(&l2arc_buflist_mtx);
4497
4498	l2arc_do_free_on_write();
4499
4500	kmem_free(cb, sizeof (l2arc_write_callback_t));
4501}
4502
4503/*
4504 * A read to a cache device completed.  Validate buffer contents before
4505 * handing over to the regular ARC routines.
4506 */
4507static void
4508l2arc_read_done(zio_t *zio)
4509{
4510	l2arc_read_callback_t *cb;
4511	arc_buf_hdr_t *hdr;
4512	arc_buf_t *buf;
4513	kmutex_t *hash_lock;
4514	int equal;
4515
4516	ASSERT(zio->io_vd != NULL);
4517	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4518
4519	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4520
4521	cb = zio->io_private;
4522	ASSERT(cb != NULL);
4523	buf = cb->l2rcb_buf;
4524	ASSERT(buf != NULL);
4525
4526	hash_lock = HDR_LOCK(buf->b_hdr);
4527	mutex_enter(hash_lock);
4528	hdr = buf->b_hdr;
4529	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4530
4531	/*
4532	 * Check this survived the L2ARC journey.
4533	 * Check that this buffer survived the L2ARC journey.
4534	equal = arc_cksum_equal(buf);
4535	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4536		mutex_exit(hash_lock);
4537		zio->io_private = buf;
4538		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
4539		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
4540		arc_read_done(zio);
4541	} else {
4542		mutex_exit(hash_lock);
4543		/*
4544		 * Buffer didn't survive caching.  Increment stats and
4545		 * reissue to the original storage device.
4546		 */
4547		if (zio->io_error != 0) {
4548			ARCSTAT_BUMP(arcstat_l2_io_error);
4549		} else {
4550			zio->io_error = EIO;
4551		}
4552		if (!equal)
4553			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4554
4555		/*
4556		 * If there's no waiter, issue an async i/o to the primary
4557		 * storage now.  If there *is* a waiter, the caller must
4558		 * issue the i/o in a context where it's OK to block.
4559		 */
4560		if (zio->io_waiter == NULL) {
4561			zio_t *pio = zio_unique_parent(zio);
4562
4563			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4564
4565			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4566			    buf->b_data, zio->io_size, arc_read_done, buf,
4567			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4568		}
4569	}
4570
4571	kmem_free(cb, sizeof (l2arc_read_callback_t));
4572}
4573
4574/*
4575 * This is the list priority from which the L2ARC will search for pages to
4576 * cache.  This is used within loops (0..3) to cycle through lists in the
4577 * desired order.  This order can have a significant effect on cache
4578 * performance.
4579 *
4580 * Currently the metadata lists are hit first, MFU then MRU, followed by
4581 * the data lists.  This function returns a locked list, and also returns
4582 * the lock pointer.
4583 */
4584static list_t *
4585l2arc_list_locked(int list_num, kmutex_t **lock)
4586{
4587	list_t *list;
4588	int idx;
4589
4590	ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
4591
4592	if (list_num < ARC_BUFC_NUMMETADATALISTS) {
4593		idx = list_num;
4594		list = &arc_mfu->arcs_lists[idx];
4595		*lock = ARCS_LOCK(arc_mfu, idx);
4596	} else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
4597		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4598		list = &arc_mru->arcs_lists[idx];
4599		*lock = ARCS_LOCK(arc_mru, idx);
4600	} else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
4601		ARC_BUFC_NUMDATALISTS)) {
4602		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4603		list = &arc_mfu->arcs_lists[idx];
4604		*lock = ARCS_LOCK(arc_mfu, idx);
4605	} else {
4606		idx = list_num - ARC_BUFC_NUMLISTS;
4607		list = &arc_mru->arcs_lists[idx];
4608		*lock = ARCS_LOCK(arc_mru, idx);
4609	}
4610
4611	ASSERT(!(MUTEX_HELD(*lock)));
4612	mutex_enter(*lock);
4613	return (list);
4614}
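
/*
 * Illustrative mapping for l2arc_list_locked() above, writing M for
 * ARC_BUFC_NUMMETADATALISTS and D for ARC_BUFC_NUMDATALISTS:
 *
 *	list_num 0 .. M-1		MFU metadata sublists
 *	list_num M .. 2M-1		MRU metadata sublists
 *	list_num 2M .. 2M+D-1		MFU data sublists
 *	list_num 2M+D .. 2M+2D-1	MRU data sublists
 *
 * which gives the metadata-first, MFU-before-MRU order described
 * above.
 */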
4615
4616/*
4617 * Evict buffers from the device write hand to the distance specified in
4618 * bytes.  This distance may span populated buffers, or it may span nothing.
4619 * This is clearing a region on the L2ARC device ready for writing.
4620 * If the 'all' boolean is set, every buffer is evicted.
4621 */
4622static void
4623l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4624{
4625	list_t *buflist;
4626	l2arc_buf_hdr_t *abl2;
4627	arc_buf_hdr_t *ab, *ab_prev;
4628	kmutex_t *hash_lock;
4629	uint64_t taddr;
4630
4631	buflist = dev->l2ad_buflist;
4632
4633	if (buflist == NULL)
4634		return;
4635
4636	if (!all && dev->l2ad_first) {
4637		/*
4638		 * This is the first sweep through the device.  There is
4639		 * nothing to evict.
4640		 */
4641		return;
4642	}
4643
4644	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4645		/*
4646		 * When nearing the end of the device, evict to the end
4647		 * before the device write hand jumps to the start.
4648		 */
4649		taddr = dev->l2ad_end;
4650	} else {
4651		taddr = dev->l2ad_hand + distance;
4652	}
4653	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4654	    uint64_t, taddr, boolean_t, all);
4655
4656top:
4657	mutex_enter(&l2arc_buflist_mtx);
4658	for (ab = list_tail(buflist); ab; ab = ab_prev) {
4659		ab_prev = list_prev(buflist, ab);
4660
4661		hash_lock = HDR_LOCK(ab);
4662		if (!mutex_tryenter(hash_lock)) {
4663			/*
4664			 * Missed the hash lock.  Retry.
4665			 */
4666			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4667			mutex_exit(&l2arc_buflist_mtx);
4668			mutex_enter(hash_lock);
4669			mutex_exit(hash_lock);
4670			goto top;
4671		}
4672
4673		if (HDR_L2_WRITE_HEAD(ab)) {
4674			/*
4675			 * We hit a write head node.  Leave it for
4676			 * l2arc_write_done().
4677			 */
4678			list_remove(buflist, ab);
4679			mutex_exit(hash_lock);
4680			continue;
4681		}
4682
4683		if (!all && ab->b_l2hdr != NULL &&
4684		    (ab->b_l2hdr->b_daddr > taddr ||
4685		    ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4686			/*
4687			 * We've evicted to the target address,
4688			 * or the end of the device.
4689			 */
4690			mutex_exit(hash_lock);
4691			break;
4692		}
4693
4694		if (HDR_FREE_IN_PROGRESS(ab)) {
4695			/*
4696			 * Already on the path to destruction.
4697			 */
4698			mutex_exit(hash_lock);
4699			continue;
4700		}
4701
4702		if (ab->b_state == arc_l2c_only) {
4703			ASSERT(!HDR_L2_READING(ab));
4704			/*
4705			 * This doesn't exist in the ARC.  Destroy.
4706			 * arc_hdr_destroy() will call list_remove()
4707			 * and decrement arcstat_l2_size.
4708			 */
4709			arc_change_state(arc_anon, ab, hash_lock);
4710			arc_hdr_destroy(ab);
4711		} else {
4712			/*
4713			 * Invalidate issued or about to be issued
4714			 * reads, since we may be about to write
4715			 * over this location.
4716			 */
4717			if (HDR_L2_READING(ab)) {
4718				ARCSTAT_BUMP(arcstat_l2_evict_reading);
4719				ab->b_flags |= ARC_L2_EVICTED;
4720			}
4721
4722			/*
4723			 * Tell ARC this no longer exists in L2ARC.
4724			 */
4725			if (ab->b_l2hdr != NULL) {
4726				abl2 = ab->b_l2hdr;
4727				ab->b_l2hdr = NULL;
4728				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4729				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4730			}
4731			list_remove(buflist, ab);
4732
4733			/*
4734			 * This may have been leftover after a
4735			 * failed write.
4736			 */
4737			ab->b_flags &= ~ARC_L2_WRITING;
4738		}
4739		mutex_exit(hash_lock);
4740	}
4741	mutex_exit(&l2arc_buflist_mtx);
4742
4743	vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4744	dev->l2ad_evict = taddr;
4745}
4746
4747/*
4748 * Find and write ARC buffers to the L2ARC device.
4749 *
4750 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4751 * for reading until they have completed writing.
4752 */
4753static uint64_t
4754l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
4755{
4756	arc_buf_hdr_t *ab, *ab_prev, *head;
4757	l2arc_buf_hdr_t *hdrl2;
4758	list_t *list;
4759	uint64_t passed_sz, write_sz, buf_sz, headroom;
4760	void *buf_data;
4761	kmutex_t *hash_lock, *list_lock;
4762	boolean_t have_lock, full;
4763	l2arc_write_callback_t *cb;
4764	zio_t *pio, *wzio;
4765	uint64_t guid = spa_load_guid(spa);
4766	int try;
4767
4768	ASSERT(dev->l2ad_vdev != NULL);
4769
4770	pio = NULL;
4771	write_sz = 0;
4772	full = B_FALSE;
4773	head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4774	head->b_flags |= ARC_L2_WRITE_HEAD;
4775
4776	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
4777	/*
4778	 * Copy buffers for L2ARC writing.
4779	 */
4780	mutex_enter(&l2arc_buflist_mtx);
4781	for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
4782		list = l2arc_list_locked(try, &list_lock);
4783		passed_sz = 0;
4784		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
4785
4786		/*
4787		 * L2ARC fast warmup.
4788		 *
4789		 * Until the ARC is warm and starts to evict, read from the
4790		 * head of the ARC lists rather than the tail.
4791		 */
4792		headroom = target_sz * l2arc_headroom;
4793		if (arc_warm == B_FALSE)
4794			ab = list_head(list);
4795		else
4796			ab = list_tail(list);
4797		if (ab == NULL)
4798			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
4799
4800		for (; ab; ab = ab_prev) {
4801			if (arc_warm == B_FALSE)
4802				ab_prev = list_next(list, ab);
4803			else
4804				ab_prev = list_prev(list, ab);
4805			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size);
4806
4807			hash_lock = HDR_LOCK(ab);
4808			have_lock = MUTEX_HELD(hash_lock);
4809			if (!have_lock && !mutex_tryenter(hash_lock)) {
4810				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
4811				/*
4812				 * Skip this buffer rather than waiting.
4813				 */
4814				continue;
4815			}
4816
4817			passed_sz += ab->b_size;
4818			if (passed_sz > headroom) {
4819				/*
4820				 * Searched too far.
4821				 */
4822				mutex_exit(hash_lock);
4823				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
4824				break;
4825			}
4826
4827			if (!l2arc_write_eligible(guid, ab)) {
4828				mutex_exit(hash_lock);
4829				continue;
4830			}
4831
4832			if ((write_sz + ab->b_size) > target_sz) {
4833				full = B_TRUE;
4834				mutex_exit(hash_lock);
4835				ARCSTAT_BUMP(arcstat_l2_write_full);
4836				break;
4837			}
4838
4839			if (pio == NULL) {
4840				/*
4841				 * Insert a dummy header on the buflist so
4842				 * l2arc_write_done() can find where the
4843				 * write buffers begin without searching.
4844				 */
4845				list_insert_head(dev->l2ad_buflist, head);
4846
4847				cb = kmem_alloc(
4848				    sizeof (l2arc_write_callback_t), KM_SLEEP);
4849				cb->l2wcb_dev = dev;
4850				cb->l2wcb_head = head;
4851				pio = zio_root(spa, l2arc_write_done, cb,
4852				    ZIO_FLAG_CANFAIL);
4853				ARCSTAT_BUMP(arcstat_l2_write_pios);
4854			}
4855
4856			/*
4857			 * Create and add a new L2ARC header.
4858			 */
4859			hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4860			hdrl2->b_dev = dev;
4861			hdrl2->b_daddr = dev->l2ad_hand;
4862
4863			ab->b_flags |= ARC_L2_WRITING;
4864			ab->b_l2hdr = hdrl2;
4865			list_insert_head(dev->l2ad_buflist, ab);
4866			buf_data = ab->b_buf->b_data;
4867			buf_sz = ab->b_size;
4868
4869			/*
4870			 * Compute and store the buffer cksum before
4871			 * writing.  On debug the cksum is verified first.
4872			 */
4873			arc_cksum_verify(ab->b_buf);
4874			arc_cksum_compute(ab->b_buf, B_TRUE);
4875
4876			mutex_exit(hash_lock);
4877
4878			wzio = zio_write_phys(pio, dev->l2ad_vdev,
4879			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4880			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4881			    ZIO_FLAG_CANFAIL, B_FALSE);
4882
4883			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4884			    zio_t *, wzio);
4885			(void) zio_nowait(wzio);
4886
4887			/*
4888			 * Keep the clock hand suitably device-aligned.
4889			 */
4890			buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4891
4892			write_sz += buf_sz;
4893			dev->l2ad_hand += buf_sz;
4894		}
4895
4896		mutex_exit(list_lock);
4897
4898		if (full == B_TRUE)
4899			break;
4900	}
4901	mutex_exit(&l2arc_buflist_mtx);
4902
4903	if (pio == NULL) {
4904		ASSERT0(write_sz);
4905		kmem_cache_free(hdr_cache, head);
4906		return (0);
4907	}
4908
4909	ASSERT3U(write_sz, <=, target_sz);
4910	ARCSTAT_BUMP(arcstat_l2_writes_sent);
4911	ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
4912	ARCSTAT_INCR(arcstat_l2_size, write_sz);
4913	vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
4914
4915	/*
4916	 * Bump device hand to the device start if it is approaching the end.
4917	 * l2arc_evict() will already have evicted ahead for this case.
4918	 */
4919	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4920		vdev_space_update(dev->l2ad_vdev,
4921		    dev->l2ad_end - dev->l2ad_hand, 0, 0);
4922		dev->l2ad_hand = dev->l2ad_start;
4923		dev->l2ad_evict = dev->l2ad_start;
4924		dev->l2ad_first = B_FALSE;
4925	}
4926
4927	dev->l2ad_writing = B_TRUE;
4928	(void) zio_wait(pio);
4929	dev->l2ad_writing = B_FALSE;
4930
4931	return (write_sz);
4932}
4933
4934/*
4935 * This thread feeds the L2ARC at regular intervals.  This is the beating
4936 * heart of the L2ARC.
4937 */
4938static void
4939l2arc_feed_thread(void *dummy __unused)
4940{
4941	callb_cpr_t cpr;
4942	l2arc_dev_t *dev;
4943	spa_t *spa;
4944	uint64_t size, wrote;
4945	clock_t begin, next = ddi_get_lbolt();
4946
4947	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4948
4949	mutex_enter(&l2arc_feed_thr_lock);
4950
4951	while (l2arc_thread_exit == 0) {
4952		CALLB_CPR_SAFE_BEGIN(&cpr);
4953		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
4954		    next - ddi_get_lbolt());
4955		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
4956		next = ddi_get_lbolt() + hz;
4957
4958		/*
4959		 * Quick check for L2ARC devices.
4960		 */
4961		mutex_enter(&l2arc_dev_mtx);
4962		if (l2arc_ndev == 0) {
4963			mutex_exit(&l2arc_dev_mtx);
4964			continue;
4965		}
4966		mutex_exit(&l2arc_dev_mtx);
4967		begin = ddi_get_lbolt();
4968
4969		/*
4970		 * This selects the next l2arc device to write to, and in
4971		 * doing so the next spa to feed from: dev->l2ad_spa.   This
4972		 * will return NULL if there are now no l2arc devices or if
4973		 * they are all faulted.
4974		 *
4975		 * If a device is returned, its spa's config lock is also
4976		 * held to prevent device removal.  l2arc_dev_get_next()
4977		 * will grab and release l2arc_dev_mtx.
4978		 */
4979		if ((dev = l2arc_dev_get_next()) == NULL)
4980			continue;
4981
4982		spa = dev->l2ad_spa;
4983		ASSERT(spa != NULL);
4984
4985		/*
4986		 * If the pool is read-only then force the feed thread to
4987		 * sleep a little longer.
4988		 */
4989		if (!spa_writeable(spa)) {
4990			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
4991			spa_config_exit(spa, SCL_L2ARC, dev);
4992			continue;
4993		}
4994
4995		/*
4996		 * Avoid contributing to memory pressure.
4997		 */
4998		if (arc_reclaim_needed()) {
4999			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5000			spa_config_exit(spa, SCL_L2ARC, dev);
5001			continue;
5002		}
5003
5004		ARCSTAT_BUMP(arcstat_l2_feeds);
5005
5006		size = l2arc_write_size(dev);
5007
5008		/*
5009		 * Evict L2ARC buffers that will be overwritten.
5010		 */
5011		l2arc_evict(dev, size, B_FALSE);
5012
5013		/*
5014		 * Write ARC buffers.
5015		 */
5016		wrote = l2arc_write_buffers(spa, dev, size);
5017
5018		/*
5019		 * Calculate interval between writes.
5020		 */
5021		next = l2arc_write_interval(begin, size, wrote);
5022		spa_config_exit(spa, SCL_L2ARC, dev);
5023	}
5024
5025	l2arc_thread_exit = 0;
5026	cv_broadcast(&l2arc_feed_thr_cv);
5027	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
5028	thread_exit();
5029}
5030
5031boolean_t
5032l2arc_vdev_present(vdev_t *vd)
5033{
5034	l2arc_dev_t *dev;
5035
5036	mutex_enter(&l2arc_dev_mtx);
5037	for (dev = list_head(l2arc_dev_list); dev != NULL;
5038	    dev = list_next(l2arc_dev_list, dev)) {
5039		if (dev->l2ad_vdev == vd)
5040			break;
5041	}
5042	mutex_exit(&l2arc_dev_mtx);
5043
5044	return (dev != NULL);
5045}
5046
5047/*
5048 * Add a vdev for use by the L2ARC.  By this point the spa has already
5049 * validated the vdev and opened it.
5050 */
5051void
5052l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5053{
5054	l2arc_dev_t *adddev;
5055
5056	ASSERT(!l2arc_vdev_present(vd));
5057
5058	/*
5059	 * Create a new l2arc device entry.
5060	 */
5061	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5062	adddev->l2ad_spa = spa;
5063	adddev->l2ad_vdev = vd;
5064	adddev->l2ad_write = l2arc_write_max;
5065	adddev->l2ad_boost = l2arc_write_boost;
5066	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5067	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5068	adddev->l2ad_hand = adddev->l2ad_start;
5069	adddev->l2ad_evict = adddev->l2ad_start;
5070	adddev->l2ad_first = B_TRUE;
5071	adddev->l2ad_writing = B_FALSE;
5072	ASSERT3U(adddev->l2ad_write, >, 0);
5073
5074	/*
5075	 * This is a list of all ARC buffers that are still valid on the
5076	 * device.
5077	 */
5078	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5079	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5080	    offsetof(arc_buf_hdr_t, b_l2node));
5081
5082	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5083
5084	/*
5085	 * Add device to global list
5086	 */
5087	mutex_enter(&l2arc_dev_mtx);
5088	list_insert_head(l2arc_dev_list, adddev);
5089	atomic_inc_64(&l2arc_ndev);
5090	mutex_exit(&l2arc_dev_mtx);
5091}
5092
5093/*
5094 * Remove a vdev from the L2ARC.
5095 */
5096void
5097l2arc_remove_vdev(vdev_t *vd)
5098{
5099	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5100
5101	/*
5102	 * Find the device by vdev
5103	 */
5104	mutex_enter(&l2arc_dev_mtx);
5105	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5106		nextdev = list_next(l2arc_dev_list, dev);
5107		if (vd == dev->l2ad_vdev) {
5108			remdev = dev;
5109			break;
5110		}
5111	}
5112	ASSERT(remdev != NULL);
5113
5114	/*
5115	 * Remove device from global list
5116	 */
5117	list_remove(l2arc_dev_list, remdev);
5118	l2arc_dev_last = NULL;		/* may have been invalidated */
5119	atomic_dec_64(&l2arc_ndev);
5120	mutex_exit(&l2arc_dev_mtx);
5121
5122	/*
5123	 * Clear all buflists and ARC references.  L2ARC device flush.
5124	 */
5125	l2arc_evict(remdev, 0, B_TRUE);
5126	list_destroy(remdev->l2ad_buflist);
5127	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5128	kmem_free(remdev, sizeof (l2arc_dev_t));
5129}
5130
5131void
5132l2arc_init(void)
5133{
5134	l2arc_thread_exit = 0;
5135	l2arc_ndev = 0;
5136	l2arc_writes_sent = 0;
5137	l2arc_writes_done = 0;
5138
5139	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5140	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5141	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5142	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5143	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5144
5145	l2arc_dev_list = &L2ARC_dev_list;
5146	l2arc_free_on_write = &L2ARC_free_on_write;
5147	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5148	    offsetof(l2arc_dev_t, l2ad_node));
5149	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5150	    offsetof(l2arc_data_free_t, l2df_list_node));
5151}
5152
5153void
5154l2arc_fini(void)
5155{
5156	/*
5157	 * This is called from dmu_fini(), which is called from spa_fini().
5158	 * Because of this, we can assume that all l2arc devices have
5159	 * already been removed when the pools themselves were removed.
5160	 */
5161
5162	l2arc_do_free_on_write();
5163
5164	mutex_destroy(&l2arc_feed_thr_lock);
5165	cv_destroy(&l2arc_feed_thr_cv);
5166	mutex_destroy(&l2arc_dev_mtx);
5167	mutex_destroy(&l2arc_buflist_mtx);
5168	mutex_destroy(&l2arc_free_on_write_mtx);
5169
5170	list_destroy(l2arc_dev_list);
5171	list_destroy(l2arc_free_on_write);
5172}
5173
5174void
5175l2arc_start(void)
5176{
5177	if (!(spa_mode_global & FWRITE))
5178		return;
5179
5180	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5181	    TS_RUN, minclsyspri);
5182}
5183
5184void
5185l2arc_stop(void)
5186{
5187	if (!(spa_mode_global & FWRITE))
5188		return;
5189
5190	mutex_enter(&l2arc_feed_thr_lock);
5191	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
5192	l2arc_thread_exit = 1;
5193	while (l2arc_thread_exit != 0)
5194		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5195	mutex_exit(&l2arc_feed_thr_lock);
5196}
5197