arc.c revision 228103
1244971Sjkim/*
2244971Sjkim * CDDL HEADER START
3244971Sjkim *
4244971Sjkim * The contents of this file are subject to the terms of the
5244971Sjkim * Common Development and Distribution License (the "License").
6244971Sjkim * You may not use this file except in compliance with the License.
7244971Sjkim *
8245582Sjkim * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9244971Sjkim * or http://www.opensolaris.org/os/licensing.
10244971Sjkim * See the License for the specific language governing permissions
11244971Sjkim * and limitations under the License.
12244971Sjkim *
13244971Sjkim * When distributing Covered Code, include this CDDL HEADER in each
14244971Sjkim * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15244971Sjkim * If applicable, add the following below this CDDL HEADER, with the
16244971Sjkim * fields enclosed by brackets "[]" replaced with your own identifying
17244971Sjkim * information: Portions Copyright [yyyy] [name of copyright owner]
18244971Sjkim *
19244971Sjkim * CDDL HEADER END
20244971Sjkim */
21244971Sjkim/*
22244971Sjkim * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23244971Sjkim * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24244971Sjkim * Copyright (c) 2011 by Delphix. All rights reserved.
25244971Sjkim */
26244971Sjkim
27244971Sjkim/*
28244971Sjkim * DVA-based Adjustable Replacement Cache
29244971Sjkim *
30244971Sjkim * While much of the theory of operation used here is
31244971Sjkim * based on the self-tuning, low overhead replacement cache
32244971Sjkim * presented by Megiddo and Modha at FAST 2003, there are some
33244971Sjkim * significant differences:
34244971Sjkim *
35244971Sjkim * 1. The Megiddo and Modha model assumes any page is evictable.
36244971Sjkim * Pages in its cache cannot be "locked" into memory.  This makes
37244971Sjkim * the eviction algorithm simple: evict the last page in the list.
38244971Sjkim * This also make the performance characteristics easy to reason
39244971Sjkim * about.  Our cache is not so simple.  At any given moment, some
40244971Sjkim * subset of the blocks in the cache are un-evictable because we
41244971Sjkim * have handed out a reference to them.  Blocks are only evictable
42244971Sjkim * when there are no external references active.  This makes
43244971Sjkim * eviction far more problematic:  we choose to evict the evictable
44245582Sjkim * blocks that are the "lowest" in the list.
45245582Sjkim *
46245582Sjkim * There are times when it is not possible to evict the requested
47245582Sjkim * space.  In these circumstances we are unable to adjust the cache
48244971Sjkim * size.  To prevent the cache growing unbounded at these times we
49244971Sjkim * implement a "cache throttle" that slows the flow of new data
50244971Sjkim * into the cache until we can make space available.
51244971Sjkim *
52244971Sjkim * 2. The Megiddo and Modha model assumes a fixed cache size.
53244971Sjkim * Pages are evicted when the cache is full and there is a cache
54244971Sjkim * miss.  Our model has a variable sized cache.  It grows with
55244971Sjkim * high use, but also tries to react to memory pressure from the
56244971Sjkim * operating system: decreasing its size when system memory is
57244971Sjkim * tight.
58249663Sjkim *
59244971Sjkim * 3. The Megiddo and Modha model assumes a fixed page size. All
60244971Sjkim * elements of the cache are therefor exactly the same size.  So
61244971Sjkim * when adjusting the cache size following a cache miss, its simply
62244971Sjkim * a matter of choosing a single page to evict.  In our model, we
63244971Sjkim * have variable sized cache blocks (rangeing from 512 bytes to
64244971Sjkim * 128K bytes).  We therefor choose a set of blocks to evict to make
65249663Sjkim * space for a cache miss that approximates as closely as possible
66244971Sjkim * the space used by the new block.
67244971Sjkim *
68244971Sjkim * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
69244971Sjkim * by N. Megiddo & D. Modha, FAST 2003
70244971Sjkim */
71244971Sjkim
72244971Sjkim/*
73244971Sjkim * The locking model:
74244971Sjkim *
75244971Sjkim * A new reference to a cache buffer can be obtained in two
76244971Sjkim * ways: 1) via a hash table lookup using the DVA as a key,
77244971Sjkim * or 2) via one of the ARC lists.  The arc_read() interface
78249663Sjkim * uses method 1, while the internal arc algorithms for
79244971Sjkim * adjusting the cache use method 2.  We therefor provide two
80244971Sjkim * types of locks: 1) the hash table lock array, and 2) the
81244971Sjkim * arc list locks.
82244971Sjkim *
83244971Sjkim * Buffers do not have their own mutexs, rather they rely on the
84244971Sjkim * hash table mutexs for the bulk of their protection (i.e. most
85244971Sjkim * fields in the arc_buf_hdr_t are protected by these mutexs).
86244971Sjkim *
87244971Sjkim * buf_hash_find() returns the appropriate mutex (held) when it
88244971Sjkim * locates the requested buffer in the hash table.  It returns
89244971Sjkim * NULL for the mutex if the buffer was not in the table.
90244971Sjkim *
91249663Sjkim * buf_hash_remove() expects the appropriate hash mutex to be
92244971Sjkim * already held before it is invoked.
93244971Sjkim *
94244971Sjkim * Each arc state also has a mutex which is used to protect the
95244971Sjkim * buffer list associated with the state.  When attempting to
96244971Sjkim * obtain a hash table lock while holding an arc list lock you
97244971Sjkim * must use: mutex_tryenter() to avoid deadlock.  Also note that
98244971Sjkim * the active state mutex must be held before the ghost state mutex.
99244971Sjkim *
100244971Sjkim * Arc buffers may have an associated eviction callback function.
101244971Sjkim * This function will be invoked prior to removing the buffer (e.g.
102244971Sjkim * in arc_do_user_evicts()).  Note however that the data associated
103244971Sjkim * with the buffer may be evicted prior to the callback.  The callback
104244971Sjkim * must be made with *no locks held* (to prevent deadlock).  Additionally,
105244971Sjkim * the users of callbacks must ensure that their private data is
106244971Sjkim * protected from simultaneous callbacks from arc_buf_evict()
107244971Sjkim * and arc_do_user_evicts().
108249663Sjkim *
109244971Sjkim * Note that the majority of the performance stats are manipulated
110244971Sjkim * with atomic operations.
111244971Sjkim *
112249663Sjkim * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
113249663Sjkim *
114244971Sjkim *	- L2ARC buflist creation
115244971Sjkim *	- L2ARC buflist eviction
116244971Sjkim *	- L2ARC write completion, which walks L2ARC buflists
117244971Sjkim *	- ARC header destruction, as it removes from L2ARC buflists
118244971Sjkim *	- ARC header release, as it removes from L2ARC buflists
119249663Sjkim */
120244971Sjkim
121244971Sjkim#include <sys/spa.h>
122244971Sjkim#include <sys/zio.h>
123244971Sjkim#include <sys/zfs_context.h>
124244971Sjkim#include <sys/arc.h>
125244971Sjkim#include <sys/refcount.h>
126246849Sjkim#include <sys/vdev.h>
127246849Sjkim#include <sys/vdev_impl.h>
128246849Sjkim#ifdef _KERNEL
129246849Sjkim#include <sys/dnlc.h>
130244971Sjkim#endif
131244971Sjkim#include <sys/callb.h>
132246849Sjkim#include <sys/kstat.h>
133246849Sjkim#include <zfs_fletcher.h>
134246849Sjkim#include <sys/sdt.h>
135246849Sjkim
136246849Sjkim#include <vm/vm_pageout.h>
137249663Sjkim
138244971Sjkimstatic kmutex_t		arc_reclaim_thr_lock;
139244971Sjkimstatic kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
140244971Sjkimstatic uint8_t		arc_thread_exit;
141244971Sjkim
142244971Sjkimextern int zfs_write_limit_shift;
143244971Sjkimextern uint64_t zfs_write_limit_max;
144244971Sjkimextern kmutex_t zfs_write_limit_lock;
145244971Sjkim
146244971Sjkim#define	ARC_REDUCE_DNLC_PERCENT	3
147244971Sjkimuint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
148244971Sjkim
149244971Sjkimtypedef enum arc_reclaim_strategy {
150244971Sjkim	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
151244971Sjkim	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
152244971Sjkim} arc_reclaim_strategy_t;
153244971Sjkim
154244971Sjkim/* number of seconds before growing cache again */
155244971Sjkimstatic int		arc_grow_retry = 60;
156244971Sjkim
157244971Sjkim/* shift of arc_c for calculating both min and max arc_p */
158244971Sjkimstatic int		arc_p_min_shift = 4;
159244971Sjkim
160244971Sjkim/* log2(fraction of arc to reclaim) */
161244971Sjkimstatic int		arc_shrink_shift = 5;
162244971Sjkim
163244971Sjkim/*
164244971Sjkim * minimum lifespan of a prefetch block in clock ticks
165244971Sjkim * (initialized in arc_init())
166244971Sjkim */
167244971Sjkimstatic int		arc_min_prefetch_lifespan;
168249663Sjkim
169244971Sjkimstatic int arc_dead;
170244971Sjkimextern int zfs_prefetch_disable;
171244971Sjkim
172244971Sjkim/*
173249663Sjkim * The arc has filled available memory and has now warmed up.
174244971Sjkim */
175244971Sjkimstatic boolean_t arc_warm;
176244971Sjkim
177244971Sjkim/*
178244971Sjkim * These tunables are for performance analysis.
179244971Sjkim */
180244971Sjkimuint64_t zfs_arc_max;
181244971Sjkimuint64_t zfs_arc_min;
182244971Sjkimuint64_t zfs_arc_meta_limit = 0;
183244971Sjkimint zfs_arc_grow_retry = 0;
184244971Sjkimint zfs_arc_shrink_shift = 0;
185249663Sjkimint zfs_arc_p_min_shift = 0;
186244971Sjkim
187244971SjkimTUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
188244971SjkimTUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
189244971SjkimTUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
190244971SjkimSYSCTL_DECL(_vfs_zfs);
191244971SjkimSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
192244971Sjkim    "Maximum ARC size");
193244971SjkimSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
194244971Sjkim    "Minimum ARC size");
195244971Sjkim
196244971Sjkim/*
197244971Sjkim * Note that buffers can be in one of 6 states:
198244971Sjkim *	ARC_anon	- anonymous (discussed below)
199244971Sjkim *	ARC_mru		- recently used, currently cached
200244971Sjkim *	ARC_mru_ghost	- recentely used, no longer in cache
201244971Sjkim *	ARC_mfu		- frequently used, currently cached
202244971Sjkim *	ARC_mfu_ghost	- frequently used, no longer in cache
203244971Sjkim *	ARC_l2c_only	- exists in L2ARC but not other states
204244971Sjkim * When there are no active references to the buffer, they are
205244971Sjkim * are linked onto a list in one of these arc states.  These are
206244971Sjkim * the only buffers that can be evicted or deleted.  Within each
207244971Sjkim * state there are multiple lists, one for meta-data and one for
208244971Sjkim * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
209244971Sjkim * etc.) is tracked separately so that it can be managed more
210244971Sjkim * explicitly: favored over data, limited explicitly.
211244971Sjkim *
212244971Sjkim * Anonymous buffers are buffers that are not associated with
213244971Sjkim * a DVA.  These are buffers that hold dirty block copies
214244971Sjkim * before they are written to stable storage.  By definition,
215244971Sjkim * they are "ref'd" and are considered part of arc_mru
216244971Sjkim * that cannot be freed.  Generally, they will aquire a DVA
217249663Sjkim * as they are written and migrate onto the arc_mru list.
218244971Sjkim *
219244971Sjkim * The ARC_l2c_only state is for buffers that are in the second
220244971Sjkim * level ARC but no longer in any of the ARC_m* lists.  The second
221244971Sjkim * level ARC itself may also contain buffers that are in any of
222244971Sjkim * the ARC_m* states - meaning that a buffer can exist in two
223244971Sjkim * places.  The reason for the ARC_l2c_only state is to keep the
224244971Sjkim * buffer header in the hash table, so that reads that hit the
225244971Sjkim * second level ARC benefit from these fast lookups.
226244971Sjkim */
227244971Sjkim
228249663Sjkim#define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
229244971Sjkimstruct arcs_lock {
230244971Sjkim	kmutex_t	arcs_lock;
231244971Sjkim#ifdef _KERNEL
232244971Sjkim	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
233244971Sjkim#endif
234244971Sjkim};
235244971Sjkim
236244971Sjkim/*
237244971Sjkim * must be power of two for mask use to work
238244971Sjkim *
239244971Sjkim */
240244971Sjkim#define ARC_BUFC_NUMDATALISTS		16
241244971Sjkim#define ARC_BUFC_NUMMETADATALISTS	16
242244971Sjkim#define ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
243249663Sjkim
244244971Sjkimtypedef struct arc_state {
245244971Sjkim	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
246244971Sjkim	uint64_t arcs_size;	/* total amount of data in this state */
247244971Sjkim	list_t	arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
248244971Sjkim	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
249244971Sjkim} arc_state_t;
250244971Sjkim
251244971Sjkim#define ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))
252244971Sjkim
253244971Sjkim/* The 6 states: */
254244971Sjkimstatic arc_state_t ARC_anon;
255249663Sjkimstatic arc_state_t ARC_mru;
256244971Sjkimstatic arc_state_t ARC_mru_ghost;
257244971Sjkimstatic arc_state_t ARC_mfu;
258244971Sjkimstatic arc_state_t ARC_mfu_ghost;
259244971Sjkimstatic arc_state_t ARC_l2c_only;
260244971Sjkim
261244971Sjkimtypedef struct arc_stats {
262249663Sjkim	kstat_named_t arcstat_hits;
263244971Sjkim	kstat_named_t arcstat_misses;
264244971Sjkim	kstat_named_t arcstat_demand_data_hits;
265244971Sjkim	kstat_named_t arcstat_demand_data_misses;
266244971Sjkim	kstat_named_t arcstat_demand_metadata_hits;
267244971Sjkim	kstat_named_t arcstat_demand_metadata_misses;
268244971Sjkim	kstat_named_t arcstat_prefetch_data_hits;
269244971Sjkim	kstat_named_t arcstat_prefetch_data_misses;
270244971Sjkim	kstat_named_t arcstat_prefetch_metadata_hits;
271244971Sjkim	kstat_named_t arcstat_prefetch_metadata_misses;
272244971Sjkim	kstat_named_t arcstat_mru_hits;
273244971Sjkim	kstat_named_t arcstat_mru_ghost_hits;
274244971Sjkim	kstat_named_t arcstat_mfu_hits;
275244971Sjkim	kstat_named_t arcstat_mfu_ghost_hits;
276244971Sjkim	kstat_named_t arcstat_allocated;
277244971Sjkim	kstat_named_t arcstat_deleted;
278244971Sjkim	kstat_named_t arcstat_stolen;
279244971Sjkim	kstat_named_t arcstat_recycle_miss;
280244971Sjkim	kstat_named_t arcstat_mutex_miss;
281244971Sjkim	kstat_named_t arcstat_evict_skip;
282244971Sjkim	kstat_named_t arcstat_evict_l2_cached;
283244971Sjkim	kstat_named_t arcstat_evict_l2_eligible;
284249663Sjkim	kstat_named_t arcstat_evict_l2_ineligible;
285244971Sjkim	kstat_named_t arcstat_hash_elements;
286244971Sjkim	kstat_named_t arcstat_hash_elements_max;
287244971Sjkim	kstat_named_t arcstat_hash_collisions;
288244971Sjkim	kstat_named_t arcstat_hash_chains;
289244971Sjkim	kstat_named_t arcstat_hash_chain_max;
290244971Sjkim	kstat_named_t arcstat_p;
291244971Sjkim	kstat_named_t arcstat_c;
292244971Sjkim	kstat_named_t arcstat_c_min;
293244971Sjkim	kstat_named_t arcstat_c_max;
294244971Sjkim	kstat_named_t arcstat_size;
295244971Sjkim	kstat_named_t arcstat_hdr_size;
296244971Sjkim	kstat_named_t arcstat_data_size;
297244971Sjkim	kstat_named_t arcstat_other_size;
298244971Sjkim	kstat_named_t arcstat_l2_hits;
299244971Sjkim	kstat_named_t arcstat_l2_misses;
300244971Sjkim	kstat_named_t arcstat_l2_feeds;
301244971Sjkim	kstat_named_t arcstat_l2_rw_clash;
302244971Sjkim	kstat_named_t arcstat_l2_read_bytes;
303244971Sjkim	kstat_named_t arcstat_l2_write_bytes;
304244971Sjkim	kstat_named_t arcstat_l2_writes_sent;
305244971Sjkim	kstat_named_t arcstat_l2_writes_done;
306249663Sjkim	kstat_named_t arcstat_l2_writes_error;
307244971Sjkim	kstat_named_t arcstat_l2_writes_hdr_miss;
308244971Sjkim	kstat_named_t arcstat_l2_evict_lock_retry;
309244971Sjkim	kstat_named_t arcstat_l2_evict_reading;
310244971Sjkim	kstat_named_t arcstat_l2_free_on_write;
311244971Sjkim	kstat_named_t arcstat_l2_abort_lowmem;
312244971Sjkim	kstat_named_t arcstat_l2_cksum_bad;
313244971Sjkim	kstat_named_t arcstat_l2_io_error;
314244971Sjkim	kstat_named_t arcstat_l2_size;
315244971Sjkim	kstat_named_t arcstat_l2_hdr_size;
316244971Sjkim	kstat_named_t arcstat_memory_throttle_count;
317244971Sjkim	kstat_named_t arcstat_l2_write_trylock_fail;
318244971Sjkim	kstat_named_t arcstat_l2_write_passed_headroom;
319244971Sjkim	kstat_named_t arcstat_l2_write_spa_mismatch;
320244971Sjkim	kstat_named_t arcstat_l2_write_in_l2;
321249663Sjkim	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
322244971Sjkim	kstat_named_t arcstat_l2_write_not_cacheable;
323244971Sjkim	kstat_named_t arcstat_l2_write_full;
324244971Sjkim	kstat_named_t arcstat_l2_write_buffer_iter;
325244971Sjkim	kstat_named_t arcstat_l2_write_pios;
326244971Sjkim	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
327244971Sjkim	kstat_named_t arcstat_l2_write_buffer_list_iter;
328249663Sjkim	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
329244971Sjkim} arc_stats_t;
330244971Sjkim
331244971Sjkimstatic arc_stats_t arc_stats = {
332244971Sjkim	{ "hits",			KSTAT_DATA_UINT64 },
333244971Sjkim	{ "misses",			KSTAT_DATA_UINT64 },
334244971Sjkim	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
335244971Sjkim	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
336244971Sjkim	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
337244971Sjkim	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
338244971Sjkim	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
339244971Sjkim	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
340244971Sjkim	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
341244971Sjkim	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
342249663Sjkim	{ "mru_hits",			KSTAT_DATA_UINT64 },
343244971Sjkim	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
344244971Sjkim	{ "mfu_hits",			KSTAT_DATA_UINT64 },
345244971Sjkim	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
346244971Sjkim	{ "allocated",			KSTAT_DATA_UINT64 },
347244971Sjkim	{ "deleted",			KSTAT_DATA_UINT64 },
348244971Sjkim	{ "stolen",			KSTAT_DATA_UINT64 },
349244971Sjkim	{ "recycle_miss",		KSTAT_DATA_UINT64 },
350244971Sjkim	{ "mutex_miss",			KSTAT_DATA_UINT64 },
351244971Sjkim	{ "evict_skip",			KSTAT_DATA_UINT64 },
352244971Sjkim	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
353244971Sjkim	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
354249663Sjkim	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
355244971Sjkim	{ "hash_elements",		KSTAT_DATA_UINT64 },
356244971Sjkim	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
357244971Sjkim	{ "hash_collisions",		KSTAT_DATA_UINT64 },
358244971Sjkim	{ "hash_chains",		KSTAT_DATA_UINT64 },
359244971Sjkim	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
360244971Sjkim	{ "p",				KSTAT_DATA_UINT64 },
361244971Sjkim	{ "c",				KSTAT_DATA_UINT64 },
362244971Sjkim	{ "c_min",			KSTAT_DATA_UINT64 },
363244971Sjkim	{ "c_max",			KSTAT_DATA_UINT64 },
364244971Sjkim	{ "size",			KSTAT_DATA_UINT64 },
365244971Sjkim	{ "hdr_size",			KSTAT_DATA_UINT64 },
366244971Sjkim	{ "data_size",			KSTAT_DATA_UINT64 },
367244971Sjkim	{ "other_size",			KSTAT_DATA_UINT64 },
368249663Sjkim	{ "l2_hits",			KSTAT_DATA_UINT64 },
369244971Sjkim	{ "l2_misses",			KSTAT_DATA_UINT64 },
370244971Sjkim	{ "l2_feeds",			KSTAT_DATA_UINT64 },
371244971Sjkim	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
372244971Sjkim	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
373244971Sjkim	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
374244971Sjkim	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
375244971Sjkim	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
376244971Sjkim	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
377244971Sjkim	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
378244971Sjkim	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
379244971Sjkim	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
380244971Sjkim	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
381244971Sjkim	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
382244971Sjkim	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
383244971Sjkim	{ "l2_io_error",		KSTAT_DATA_UINT64 },
384244971Sjkim	{ "l2_size",			KSTAT_DATA_UINT64 },
385244971Sjkim	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
386244971Sjkim	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
387244971Sjkim	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
388244971Sjkim	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
389244971Sjkim	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
390244971Sjkim	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
391244971Sjkim	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
392249663Sjkim	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
393244971Sjkim	{ "l2_write_full",		KSTAT_DATA_UINT64 },
394244971Sjkim	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
395244971Sjkim	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
396249663Sjkim	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
397244971Sjkim	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
398244971Sjkim	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }
399244971Sjkim};
400244971Sjkim
401244971Sjkim#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
402244971Sjkim
403244971Sjkim#define	ARCSTAT_INCR(stat, val) \
404244971Sjkim	atomic_add_64(&arc_stats.stat.value.ui64, (val));
405249663Sjkim
406244971Sjkim#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
407244971Sjkim#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
408244971Sjkim
409244971Sjkim#define	ARCSTAT_MAX(stat, val) {					\
410244971Sjkim	uint64_t m;							\
411244971Sjkim	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
412244971Sjkim	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
413244971Sjkim		continue;						\
414244971Sjkim}
415244971Sjkim
416244971Sjkim#define	ARCSTAT_MAXSTAT(stat) \
417244971Sjkim	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
418244971Sjkim
419244971Sjkim/*
420249663Sjkim * We define a macro to allow ARC hits/misses to be easily broken down by
421244971Sjkim * two separate conditions, giving a total of four different subtypes for
422244971Sjkim * each of hits and misses (so eight statistics total).
423244971Sjkim */
424244971Sjkim#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
425244971Sjkim	if (cond1) {							\
426244971Sjkim		if (cond2) {						\
427244971Sjkim			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
428244971Sjkim		} else {						\
429244971Sjkim			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
430244971Sjkim		}							\
431244971Sjkim	} else {							\
432244971Sjkim		if (cond2) {						\
433244971Sjkim			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
434244971Sjkim		} else {						\
435244971Sjkim			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
436244971Sjkim		}							\
437244971Sjkim	}
438244971Sjkim
439244971Sjkimkstat_t			*arc_ksp;
440244971Sjkimstatic arc_state_t	*arc_anon;
441244971Sjkimstatic arc_state_t	*arc_mru;
442249663Sjkimstatic arc_state_t	*arc_mru_ghost;
443244971Sjkimstatic arc_state_t	*arc_mfu;
444244971Sjkimstatic arc_state_t	*arc_mfu_ghost;
445244971Sjkimstatic arc_state_t	*arc_l2c_only;
446244971Sjkim
447244971Sjkim/*
448244971Sjkim * There are several ARC variables that are critical to export as kstats --
449244971Sjkim * but we don't want to have to grovel around in the kstat whenever we wish to
450244971Sjkim * manipulate them.  For these variables, we therefore define them to be in
451244971Sjkim * terms of the statistic variable.  This assures that we are not introducing
452244971Sjkim * the possibility of inconsistency by having shadow copies of the variables,
453244971Sjkim * while still allowing the code to be readable.
454244971Sjkim */
455244971Sjkim#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
456244971Sjkim#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
457244971Sjkim#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
458244971Sjkim#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
459244971Sjkim#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
460244971Sjkim
461244971Sjkimstatic int		arc_no_grow;	/* Don't try to grow cache size */
462244971Sjkimstatic uint64_t		arc_tempreserve;
463244971Sjkimstatic uint64_t		arc_loaned_bytes;
464244971Sjkimstatic uint64_t		arc_meta_used;
465244971Sjkimstatic uint64_t		arc_meta_limit;
466244971Sjkimstatic uint64_t		arc_meta_max = 0;
467249663SjkimSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RDTUN,
468244971Sjkim    &arc_meta_used, 0, "ARC metadata used");
469244971SjkimSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RDTUN,
470244971Sjkim    &arc_meta_limit, 0, "ARC metadata limit");
471244971Sjkim
472244971Sjkimtypedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
473244971Sjkim
474244971Sjkimtypedef struct arc_callback arc_callback_t;
475244971Sjkim
476244971Sjkimstruct arc_callback {
477244971Sjkim	void			*acb_private;
478244971Sjkim	arc_done_func_t		*acb_done;
479244971Sjkim	arc_buf_t		*acb_buf;
480244971Sjkim	zio_t			*acb_zio_dummy;
481244971Sjkim	arc_callback_t		*acb_next;
482244971Sjkim};
483244971Sjkim
484244971Sjkimtypedef struct arc_write_callback arc_write_callback_t;
485244971Sjkim
486244971Sjkimstruct arc_write_callback {
487244971Sjkim	void		*awcb_private;
488249663Sjkim	arc_done_func_t	*awcb_ready;
489244971Sjkim	arc_done_func_t	*awcb_done;
490244971Sjkim	arc_buf_t	*awcb_buf;
491244971Sjkim};
492244971Sjkim
493244971Sjkimstruct arc_buf_hdr {
494244971Sjkim	/* protected by hash lock */
495244971Sjkim	dva_t			b_dva;
496244971Sjkim	uint64_t		b_birth;
497244971Sjkim	uint64_t		b_cksum0;
498244971Sjkim
499244971Sjkim	kmutex_t		b_freeze_lock;
500244971Sjkim	zio_cksum_t		*b_freeze_cksum;
501244971Sjkim	void			*b_thawed;
502249663Sjkim
503244971Sjkim	arc_buf_hdr_t		*b_hash_next;
504244971Sjkim	arc_buf_t		*b_buf;
505244971Sjkim	uint32_t		b_flags;
506244971Sjkim	uint32_t		b_datacnt;
507244971Sjkim
508244971Sjkim	arc_callback_t		*b_acb;
509244971Sjkim	kcondvar_t		b_cv;
510244971Sjkim
511244971Sjkim	/* immutable */
512244971Sjkim	arc_buf_contents_t	b_type;
513244971Sjkim	uint64_t		b_size;
514244971Sjkim	uint64_t		b_spa;
515244971Sjkim
516244971Sjkim	/* protected by arc state mutex */
517244971Sjkim	arc_state_t		*b_state;
518244971Sjkim	list_node_t		b_arc_node;
519244971Sjkim
520244971Sjkim	/* updated atomically */
521244971Sjkim	clock_t			b_arc_access;
522244971Sjkim
523244971Sjkim	/* self protecting */
524244971Sjkim	refcount_t		b_refcnt;
525244971Sjkim
526244971Sjkim	l2arc_buf_hdr_t		*b_l2hdr;
527244971Sjkim	list_node_t		b_l2node;
528244971Sjkim};
529244971Sjkim
530244971Sjkimstatic arc_buf_t *arc_eviction_list;
531244971Sjkimstatic kmutex_t arc_eviction_mtx;
532244971Sjkimstatic arc_buf_hdr_t arc_eviction_hdr;
533244971Sjkimstatic void arc_get_data_buf(arc_buf_t *buf);
534244971Sjkimstatic void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
535244971Sjkimstatic int arc_evict_needed(arc_buf_contents_t type);
536244971Sjkimstatic void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
537249663Sjkim
538244971Sjkimstatic boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
539244971Sjkim
540244971Sjkim#define	GHOST_STATE(state)	\
541244971Sjkim	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
542244971Sjkim	(state) == arc_l2c_only)
543244971Sjkim
544244971Sjkim/*
545244971Sjkim * Private ARC flags.  These flags are private ARC only flags that will show up
546244971Sjkim * in b_flags in the arc_hdr_buf_t.  Some flags are publicly declared, and can
547244971Sjkim * be passed in as arc_flags in things like arc_read.  However, these flags
548244971Sjkim * should never be passed and should only be set by ARC code.  When adding new
549244971Sjkim * public flags, make sure not to smash the private ones.
550244971Sjkim */
551244971Sjkim
552244971Sjkim#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
553244971Sjkim#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
554244971Sjkim#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
555244971Sjkim#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
556244971Sjkim#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
557244971Sjkim#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
558244971Sjkim#define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
559244971Sjkim#define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
560244971Sjkim#define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
561249663Sjkim#define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */
562244971Sjkim
563244971Sjkim#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
564244971Sjkim#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
565244971Sjkim#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
566244971Sjkim#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_PREFETCH)
567244971Sjkim#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
568244971Sjkim#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
569244971Sjkim#define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
570244971Sjkim#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_L2CACHE)
571244971Sjkim#define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
572244971Sjkim				    (hdr)->b_l2hdr != NULL)
573249663Sjkim#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
574244971Sjkim#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
575244971Sjkim#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)
576244971Sjkim
577244971Sjkim/*
578244971Sjkim * Other sizes
579244971Sjkim */
580244971Sjkim
581244971Sjkim#define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
582244971Sjkim#define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
583244971Sjkim
584244971Sjkim/*
585244971Sjkim * Hash table routines
586244971Sjkim */
587244971Sjkim
588244971Sjkim#define	HT_LOCK_PAD	CACHE_LINE_SIZE
589244971Sjkim
590249663Sjkimstruct ht_lock {
591244971Sjkim	kmutex_t	ht_lock;
592244971Sjkim#ifdef _KERNEL
593244971Sjkim	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
594244971Sjkim#endif
595244971Sjkim};
596244971Sjkim
597244971Sjkim#define	BUF_LOCKS 256
598244971Sjkimtypedef struct buf_hash_table {
599244971Sjkim	uint64_t ht_mask;
600244971Sjkim	arc_buf_hdr_t **ht_table;
601244971Sjkim	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
602244971Sjkim} buf_hash_table_t;
603244971Sjkim
604244971Sjkimstatic buf_hash_table_t buf_hash_table;
605244971Sjkim
606244971Sjkim#define	BUF_HASH_INDEX(spa, dva, birth) \
607244971Sjkim	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
608244971Sjkim#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
609244971Sjkim#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
610249663Sjkim#define	HDR_LOCK(hdr) \
611244971Sjkim	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
612244971Sjkim
613244971Sjkimuint64_t zfs_crc64_table[256];
614244971Sjkim
615244971Sjkim/*
616244971Sjkim * Level 2 ARC
617244971Sjkim */
618244971Sjkim
619244971Sjkim#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
620244971Sjkim#define	L2ARC_HEADROOM		2		/* num of writes */
621249663Sjkim#define	L2ARC_FEED_SECS		1		/* caching interval secs */
622244971Sjkim#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
623244971Sjkim
624244971Sjkim#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
625244971Sjkim#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
626244971Sjkim
627244971Sjkim/*
628244971Sjkim * L2ARC Performance Tunables
629244971Sjkim */
630244971Sjkimuint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
631244971Sjkimuint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
632uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
633uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
634uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
635boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
636boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
637boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
638
639SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
640    &l2arc_write_max, 0, "max write size");
641SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
642    &l2arc_write_boost, 0, "extra write during warmup");
643SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
644    &l2arc_headroom, 0, "number of dev writes");
645SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
646    &l2arc_feed_secs, 0, "interval seconds");
647SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
648    &l2arc_feed_min_ms, 0, "min interval milliseconds");
649
650SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
651    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
652SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
653    &l2arc_feed_again, 0, "turbo warmup");
654SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
655    &l2arc_norw, 0, "no reads during writes");
656
657SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
658    &ARC_anon.arcs_size, 0, "size of anonymous state");
659SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
660    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
661SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
662    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");
663
664SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
665    &ARC_mru.arcs_size, 0, "size of mru state");
666SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
667    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
668SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
669    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
670
671SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
672    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
673SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
674    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
675    "size of metadata in mru ghost state");
676SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
677    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
678    "size of data in mru ghost state");
679
680SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
681    &ARC_mfu.arcs_size, 0, "size of mfu state");
682SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
683    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
684SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
685    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
686
687SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
688    &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
689SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
690    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
691    "size of metadata in mfu ghost state");
692SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
693    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
694    "size of data in mfu ghost state");
695
696SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
697    &ARC_l2c_only.arcs_size, 0, "size of mru state");
698
699/*
700 * L2ARC Internals
701 */
702typedef struct l2arc_dev {
703	vdev_t			*l2ad_vdev;	/* vdev */
704	spa_t			*l2ad_spa;	/* spa */
705	uint64_t		l2ad_hand;	/* next write location */
706	uint64_t		l2ad_write;	/* desired write size, bytes */
707	uint64_t		l2ad_boost;	/* warmup write boost, bytes */
708	uint64_t		l2ad_start;	/* first addr on device */
709	uint64_t		l2ad_end;	/* last addr on device */
710	uint64_t		l2ad_evict;	/* last addr eviction reached */
711	boolean_t		l2ad_first;	/* first sweep through */
712	boolean_t		l2ad_writing;	/* currently writing */
713	list_t			*l2ad_buflist;	/* buffer list */
714	list_node_t		l2ad_node;	/* device list node */
715} l2arc_dev_t;
716
717static list_t L2ARC_dev_list;			/* device list */
718static list_t *l2arc_dev_list;			/* device list pointer */
719static kmutex_t l2arc_dev_mtx;			/* device list mutex */
720static l2arc_dev_t *l2arc_dev_last;		/* last device used */
721static kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
722static list_t L2ARC_free_on_write;		/* free after write buf list */
723static list_t *l2arc_free_on_write;		/* free after write list ptr */
724static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
725static uint64_t l2arc_ndev;			/* number of devices */
726
727typedef struct l2arc_read_callback {
728	arc_buf_t	*l2rcb_buf;		/* read buffer */
729	spa_t		*l2rcb_spa;		/* spa */
730	blkptr_t	l2rcb_bp;		/* original blkptr */
731	zbookmark_t	l2rcb_zb;		/* original bookmark */
732	int		l2rcb_flags;		/* original flags */
733} l2arc_read_callback_t;
734
735typedef struct l2arc_write_callback {
736	l2arc_dev_t	*l2wcb_dev;		/* device info */
737	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
738} l2arc_write_callback_t;
739
740struct l2arc_buf_hdr {
741	/* protected by arc_buf_hdr  mutex */
742	l2arc_dev_t	*b_dev;			/* L2ARC device */
743	uint64_t	b_daddr;		/* disk address, offset byte */
744};
745
746typedef struct l2arc_data_free {
747	/* protected by l2arc_free_on_write_mtx */
748	void		*l2df_data;
749	size_t		l2df_size;
750	void		(*l2df_func)(void *, size_t);
751	list_node_t	l2df_list_node;
752} l2arc_data_free_t;
753
754static kmutex_t l2arc_feed_thr_lock;
755static kcondvar_t l2arc_feed_thr_cv;
756static uint8_t l2arc_thread_exit;
757
758static void l2arc_read_done(zio_t *zio);
759static void l2arc_hdr_stat_add(void);
760static void l2arc_hdr_stat_remove(void);
761
762static uint64_t
763buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
764{
765	uint8_t *vdva = (uint8_t *)dva;
766	uint64_t crc = -1ULL;
767	int i;
768
769	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
770
771	for (i = 0; i < sizeof (dva_t); i++)
772		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
773
774	crc ^= (spa>>8) ^ birth;
775
776	return (crc);
777}
778
779#define	BUF_EMPTY(buf)						\
780	((buf)->b_dva.dva_word[0] == 0 &&			\
781	(buf)->b_dva.dva_word[1] == 0 &&			\
782	(buf)->b_birth == 0)
783
784#define	BUF_EQUAL(spa, dva, birth, buf)				\
785	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
786	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
787	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
788
789static void
790buf_discard_identity(arc_buf_hdr_t *hdr)
791{
792	hdr->b_dva.dva_word[0] = 0;
793	hdr->b_dva.dva_word[1] = 0;
794	hdr->b_birth = 0;
795	hdr->b_cksum0 = 0;
796}
797
798static arc_buf_hdr_t *
799buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
800{
801	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
802	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
803	arc_buf_hdr_t *buf;
804
805	mutex_enter(hash_lock);
806	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
807	    buf = buf->b_hash_next) {
808		if (BUF_EQUAL(spa, dva, birth, buf)) {
809			*lockp = hash_lock;
810			return (buf);
811		}
812	}
813	mutex_exit(hash_lock);
814	*lockp = NULL;
815	return (NULL);
816}
817
818/*
819 * Insert an entry into the hash table.  If there is already an element
820 * equal to elem in the hash table, then the already existing element
821 * will be returned and the new element will not be inserted.
822 * Otherwise returns NULL.
823 */
824static arc_buf_hdr_t *
825buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
826{
827	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
828	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
829	arc_buf_hdr_t *fbuf;
830	uint32_t i;
831
832	ASSERT(!HDR_IN_HASH_TABLE(buf));
833	*lockp = hash_lock;
834	mutex_enter(hash_lock);
835	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
836	    fbuf = fbuf->b_hash_next, i++) {
837		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
838			return (fbuf);
839	}
840
841	buf->b_hash_next = buf_hash_table.ht_table[idx];
842	buf_hash_table.ht_table[idx] = buf;
843	buf->b_flags |= ARC_IN_HASH_TABLE;
844
845	/* collect some hash table performance data */
846	if (i > 0) {
847		ARCSTAT_BUMP(arcstat_hash_collisions);
848		if (i == 1)
849			ARCSTAT_BUMP(arcstat_hash_chains);
850
851		ARCSTAT_MAX(arcstat_hash_chain_max, i);
852	}
853
854	ARCSTAT_BUMP(arcstat_hash_elements);
855	ARCSTAT_MAXSTAT(arcstat_hash_elements);
856
857	return (NULL);
858}
859
860static void
861buf_hash_remove(arc_buf_hdr_t *buf)
862{
863	arc_buf_hdr_t *fbuf, **bufp;
864	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
865
866	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
867	ASSERT(HDR_IN_HASH_TABLE(buf));
868
869	bufp = &buf_hash_table.ht_table[idx];
870	while ((fbuf = *bufp) != buf) {
871		ASSERT(fbuf != NULL);
872		bufp = &fbuf->b_hash_next;
873	}
874	*bufp = buf->b_hash_next;
875	buf->b_hash_next = NULL;
876	buf->b_flags &= ~ARC_IN_HASH_TABLE;
877
878	/* collect some hash table performance data */
879	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
880
881	if (buf_hash_table.ht_table[idx] &&
882	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
883		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
884}
885
886/*
887 * Global data structures and functions for the buf kmem cache.
888 */
889static kmem_cache_t *hdr_cache;
890static kmem_cache_t *buf_cache;
891
892static void
893buf_fini(void)
894{
895	int i;
896
897	kmem_free(buf_hash_table.ht_table,
898	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
899	for (i = 0; i < BUF_LOCKS; i++)
900		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
901	kmem_cache_destroy(hdr_cache);
902	kmem_cache_destroy(buf_cache);
903}
904
905/*
906 * Constructor callback - called when the cache is empty
907 * and a new buf is requested.
908 */
909/* ARGSUSED */
910static int
911hdr_cons(void *vbuf, void *unused, int kmflag)
912{
913	arc_buf_hdr_t *buf = vbuf;
914
915	bzero(buf, sizeof (arc_buf_hdr_t));
916	refcount_create(&buf->b_refcnt);
917	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
918	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
919	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
920
921	return (0);
922}
923
924/* ARGSUSED */
925static int
926buf_cons(void *vbuf, void *unused, int kmflag)
927{
928	arc_buf_t *buf = vbuf;
929
930	bzero(buf, sizeof (arc_buf_t));
931	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
932	rw_init(&buf->b_data_lock, NULL, RW_DEFAULT, NULL);
933	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
934
935	return (0);
936}
937
938/*
939 * Destructor callback - called when a cached buf is
940 * no longer required.
941 */
942/* ARGSUSED */
943static void
944hdr_dest(void *vbuf, void *unused)
945{
946	arc_buf_hdr_t *buf = vbuf;
947
948	ASSERT(BUF_EMPTY(buf));
949	refcount_destroy(&buf->b_refcnt);
950	cv_destroy(&buf->b_cv);
951	mutex_destroy(&buf->b_freeze_lock);
952	arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
953}
954
955/* ARGSUSED */
956static void
957buf_dest(void *vbuf, void *unused)
958{
959	arc_buf_t *buf = vbuf;
960
961	mutex_destroy(&buf->b_evict_lock);
962	rw_destroy(&buf->b_data_lock);
963	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
964}
965
966/*
967 * Reclaim callback -- invoked when memory is low.
968 */
969/* ARGSUSED */
970static void
971hdr_recl(void *unused)
972{
973	dprintf("hdr_recl called\n");
974	/*
975	 * umem calls the reclaim func when we destroy the buf cache,
976	 * which is after we do arc_fini().
977	 */
978	if (!arc_dead)
979		cv_signal(&arc_reclaim_thr_cv);
980}
981
982static void
983buf_init(void)
984{
985	uint64_t *ct;
986	uint64_t hsize = 1ULL << 12;
987	int i, j;
988
989	/*
990	 * The hash table is big enough to fill all of physical memory
991	 * with an average 64K block size.  The table will take up
992	 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
993	 */
994	while (hsize * 65536 < (uint64_t)physmem * PAGESIZE)
995		hsize <<= 1;
996retry:
997	buf_hash_table.ht_mask = hsize - 1;
998	buf_hash_table.ht_table =
999	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1000	if (buf_hash_table.ht_table == NULL) {
1001		ASSERT(hsize > (1ULL << 8));
1002		hsize >>= 1;
1003		goto retry;
1004	}
1005
1006	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
1007	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
1008	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1009	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1010
1011	for (i = 0; i < 256; i++)
1012		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1013			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1014
1015	for (i = 0; i < BUF_LOCKS; i++) {
1016		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1017		    NULL, MUTEX_DEFAULT, NULL);
1018	}
1019}
1020
1021#define	ARC_MINTIME	(hz>>4) /* 62 ms */
1022
1023static void
1024arc_cksum_verify(arc_buf_t *buf)
1025{
1026	zio_cksum_t zc;
1027
1028	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1029		return;
1030
1031	mutex_enter(&buf->b_hdr->b_freeze_lock);
1032	if (buf->b_hdr->b_freeze_cksum == NULL ||
1033	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
1034		mutex_exit(&buf->b_hdr->b_freeze_lock);
1035		return;
1036	}
1037	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1038	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1039		panic("buffer modified while frozen!");
1040	mutex_exit(&buf->b_hdr->b_freeze_lock);
1041}
1042
1043static int
1044arc_cksum_equal(arc_buf_t *buf)
1045{
1046	zio_cksum_t zc;
1047	int equal;
1048
1049	mutex_enter(&buf->b_hdr->b_freeze_lock);
1050	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1051	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1052	mutex_exit(&buf->b_hdr->b_freeze_lock);
1053
1054	return (equal);
1055}
1056
1057static void
1058arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1059{
1060	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1061		return;
1062
1063	mutex_enter(&buf->b_hdr->b_freeze_lock);
1064	if (buf->b_hdr->b_freeze_cksum != NULL) {
1065		mutex_exit(&buf->b_hdr->b_freeze_lock);
1066		return;
1067	}
1068	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1069	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1070	    buf->b_hdr->b_freeze_cksum);
1071	mutex_exit(&buf->b_hdr->b_freeze_lock);
1072}
1073
1074void
1075arc_buf_thaw(arc_buf_t *buf)
1076{
1077	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1078		if (buf->b_hdr->b_state != arc_anon)
1079			panic("modifying non-anon buffer!");
1080		if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1081			panic("modifying buffer while i/o in progress!");
1082		arc_cksum_verify(buf);
1083	}
1084
1085	mutex_enter(&buf->b_hdr->b_freeze_lock);
1086	if (buf->b_hdr->b_freeze_cksum != NULL) {
1087		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1088		buf->b_hdr->b_freeze_cksum = NULL;
1089	}
1090
1091	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1092		if (buf->b_hdr->b_thawed)
1093			kmem_free(buf->b_hdr->b_thawed, 1);
1094		buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1095	}
1096
1097	mutex_exit(&buf->b_hdr->b_freeze_lock);
1098}
1099
1100void
1101arc_buf_freeze(arc_buf_t *buf)
1102{
1103	kmutex_t *hash_lock;
1104
1105	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1106		return;
1107
1108	hash_lock = HDR_LOCK(buf->b_hdr);
1109	mutex_enter(hash_lock);
1110
1111	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1112	    buf->b_hdr->b_state == arc_anon);
1113	arc_cksum_compute(buf, B_FALSE);
1114	mutex_exit(hash_lock);
1115}
1116
1117static void
1118get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock)
1119{
1120	uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth);
1121
1122	if (ab->b_type == ARC_BUFC_METADATA)
1123		buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
1124	else {
1125		buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
1126		buf_hashid += ARC_BUFC_NUMMETADATALISTS;
1127	}
1128
1129	*list = &state->arcs_lists[buf_hashid];
1130	*lock = ARCS_LOCK(state, buf_hashid);
1131}
1132
1133
1134static void
1135add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1136{
1137	ASSERT(MUTEX_HELD(hash_lock));
1138
1139	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1140	    (ab->b_state != arc_anon)) {
1141		uint64_t delta = ab->b_size * ab->b_datacnt;
1142		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1143		list_t *list;
1144		kmutex_t *lock;
1145
1146		get_buf_info(ab, ab->b_state, &list, &lock);
1147		ASSERT(!MUTEX_HELD(lock));
1148		mutex_enter(lock);
1149		ASSERT(list_link_active(&ab->b_arc_node));
1150		list_remove(list, ab);
1151		if (GHOST_STATE(ab->b_state)) {
1152			ASSERT3U(ab->b_datacnt, ==, 0);
1153			ASSERT3P(ab->b_buf, ==, NULL);
1154			delta = ab->b_size;
1155		}
1156		ASSERT(delta > 0);
1157		ASSERT3U(*size, >=, delta);
1158		atomic_add_64(size, -delta);
1159		mutex_exit(lock);
1160		/* remove the prefetch flag if we get a reference */
1161		if (ab->b_flags & ARC_PREFETCH)
1162			ab->b_flags &= ~ARC_PREFETCH;
1163	}
1164}
1165
1166static int
1167remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1168{
1169	int cnt;
1170	arc_state_t *state = ab->b_state;
1171
1172	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1173	ASSERT(!GHOST_STATE(state));
1174
1175	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1176	    (state != arc_anon)) {
1177		uint64_t *size = &state->arcs_lsize[ab->b_type];
1178		list_t *list;
1179		kmutex_t *lock;
1180
1181		get_buf_info(ab, state, &list, &lock);
1182		ASSERT(!MUTEX_HELD(lock));
1183		mutex_enter(lock);
1184		ASSERT(!list_link_active(&ab->b_arc_node));
1185		list_insert_head(list, ab);
1186		ASSERT(ab->b_datacnt > 0);
1187		atomic_add_64(size, ab->b_size * ab->b_datacnt);
1188		mutex_exit(lock);
1189	}
1190	return (cnt);
1191}
1192
1193/*
1194 * Move the supplied buffer to the indicated state.  The mutex
1195 * for the buffer must be held by the caller.
1196 */
1197static void
1198arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1199{
1200	arc_state_t *old_state = ab->b_state;
1201	int64_t refcnt = refcount_count(&ab->b_refcnt);
1202	uint64_t from_delta, to_delta;
1203	list_t *list;
1204	kmutex_t *lock;
1205
1206	ASSERT(MUTEX_HELD(hash_lock));
1207	ASSERT(new_state != old_state);
1208	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1209	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1210	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1211
1212	from_delta = to_delta = ab->b_datacnt * ab->b_size;
1213
1214	/*
1215	 * If this buffer is evictable, transfer it from the
1216	 * old state list to the new state list.
1217	 */
1218	if (refcnt == 0) {
1219		if (old_state != arc_anon) {
1220			int use_mutex;
1221			uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1222
1223			get_buf_info(ab, old_state, &list, &lock);
1224			use_mutex = !MUTEX_HELD(lock);
1225			if (use_mutex)
1226				mutex_enter(lock);
1227
1228			ASSERT(list_link_active(&ab->b_arc_node));
1229			list_remove(list, ab);
1230
1231			/*
1232			 * If prefetching out of the ghost cache,
1233			 * we will have a non-zero datacnt.
1234			 */
1235			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1236				/* ghost elements have a ghost size */
1237				ASSERT(ab->b_buf == NULL);
1238				from_delta = ab->b_size;
1239			}
1240			ASSERT3U(*size, >=, from_delta);
1241			atomic_add_64(size, -from_delta);
1242
1243			if (use_mutex)
1244				mutex_exit(lock);
1245		}
1246		if (new_state != arc_anon) {
1247			int use_mutex;
1248			uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1249
1250			get_buf_info(ab, new_state, &list, &lock);
1251			use_mutex = !MUTEX_HELD(lock);
1252			if (use_mutex)
1253				mutex_enter(lock);
1254
1255			list_insert_head(list, ab);
1256
1257			/* ghost elements have a ghost size */
1258			if (GHOST_STATE(new_state)) {
1259				ASSERT(ab->b_datacnt == 0);
1260				ASSERT(ab->b_buf == NULL);
1261				to_delta = ab->b_size;
1262			}
1263			atomic_add_64(size, to_delta);
1264
1265			if (use_mutex)
1266				mutex_exit(lock);
1267		}
1268	}
1269
1270	ASSERT(!BUF_EMPTY(ab));
1271	if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1272		buf_hash_remove(ab);
1273
1274	/* adjust state sizes */
1275	if (to_delta)
1276		atomic_add_64(&new_state->arcs_size, to_delta);
1277	if (from_delta) {
1278		ASSERT3U(old_state->arcs_size, >=, from_delta);
1279		atomic_add_64(&old_state->arcs_size, -from_delta);
1280	}
1281	ab->b_state = new_state;
1282
1283	/* adjust l2arc hdr stats */
1284	if (new_state == arc_l2c_only)
1285		l2arc_hdr_stat_add();
1286	else if (old_state == arc_l2c_only)
1287		l2arc_hdr_stat_remove();
1288}
1289
1290void
1291arc_space_consume(uint64_t space, arc_space_type_t type)
1292{
1293	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1294
1295	switch (type) {
1296	case ARC_SPACE_DATA:
1297		ARCSTAT_INCR(arcstat_data_size, space);
1298		break;
1299	case ARC_SPACE_OTHER:
1300		ARCSTAT_INCR(arcstat_other_size, space);
1301		break;
1302	case ARC_SPACE_HDRS:
1303		ARCSTAT_INCR(arcstat_hdr_size, space);
1304		break;
1305	case ARC_SPACE_L2HDRS:
1306		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1307		break;
1308	}
1309
1310	atomic_add_64(&arc_meta_used, space);
1311	atomic_add_64(&arc_size, space);
1312}
1313
1314void
1315arc_space_return(uint64_t space, arc_space_type_t type)
1316{
1317	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1318
1319	switch (type) {
1320	case ARC_SPACE_DATA:
1321		ARCSTAT_INCR(arcstat_data_size, -space);
1322		break;
1323	case ARC_SPACE_OTHER:
1324		ARCSTAT_INCR(arcstat_other_size, -space);
1325		break;
1326	case ARC_SPACE_HDRS:
1327		ARCSTAT_INCR(arcstat_hdr_size, -space);
1328		break;
1329	case ARC_SPACE_L2HDRS:
1330		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1331		break;
1332	}
1333
1334	ASSERT(arc_meta_used >= space);
1335	if (arc_meta_max < arc_meta_used)
1336		arc_meta_max = arc_meta_used;
1337	atomic_add_64(&arc_meta_used, -space);
1338	ASSERT(arc_size >= space);
1339	atomic_add_64(&arc_size, -space);
1340}
1341
1342void *
1343arc_data_buf_alloc(uint64_t size)
1344{
1345	if (arc_evict_needed(ARC_BUFC_DATA))
1346		cv_signal(&arc_reclaim_thr_cv);
1347	atomic_add_64(&arc_size, size);
1348	return (zio_data_buf_alloc(size));
1349}
1350
1351void
1352arc_data_buf_free(void *buf, uint64_t size)
1353{
1354	zio_data_buf_free(buf, size);
1355	ASSERT(arc_size >= size);
1356	atomic_add_64(&arc_size, -size);
1357}
1358
1359arc_buf_t *
1360arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1361{
1362	arc_buf_hdr_t *hdr;
1363	arc_buf_t *buf;
1364
1365	ASSERT3U(size, >, 0);
1366	hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1367	ASSERT(BUF_EMPTY(hdr));
1368	hdr->b_size = size;
1369	hdr->b_type = type;
1370	hdr->b_spa = spa_load_guid(spa);
1371	hdr->b_state = arc_anon;
1372	hdr->b_arc_access = 0;
1373	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1374	buf->b_hdr = hdr;
1375	buf->b_data = NULL;
1376	buf->b_efunc = NULL;
1377	buf->b_private = NULL;
1378	buf->b_next = NULL;
1379	hdr->b_buf = buf;
1380	arc_get_data_buf(buf);
1381	hdr->b_datacnt = 1;
1382	hdr->b_flags = 0;
1383	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1384	(void) refcount_add(&hdr->b_refcnt, tag);
1385
1386	return (buf);
1387}
1388
1389static char *arc_onloan_tag = "onloan";
1390
1391/*
1392 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1393 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1394 * buffers must be returned to the arc before they can be used by the DMU or
1395 * freed.
1396 */
1397arc_buf_t *
1398arc_loan_buf(spa_t *spa, int size)
1399{
1400	arc_buf_t *buf;
1401
1402	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1403
1404	atomic_add_64(&arc_loaned_bytes, size);
1405	return (buf);
1406}
1407
1408/*
1409 * Return a loaned arc buffer to the arc.
1410 */
1411void
1412arc_return_buf(arc_buf_t *buf, void *tag)
1413{
1414	arc_buf_hdr_t *hdr = buf->b_hdr;
1415
1416	ASSERT(buf->b_data != NULL);
1417	(void) refcount_add(&hdr->b_refcnt, tag);
1418	(void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1419
1420	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1421}
1422
1423/* Detach an arc_buf from a dbuf (tag) */
1424void
1425arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1426{
1427	arc_buf_hdr_t *hdr;
1428
1429	ASSERT(buf->b_data != NULL);
1430	hdr = buf->b_hdr;
1431	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1432	(void) refcount_remove(&hdr->b_refcnt, tag);
1433	buf->b_efunc = NULL;
1434	buf->b_private = NULL;
1435
1436	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1437}
1438
1439static arc_buf_t *
1440arc_buf_clone(arc_buf_t *from)
1441{
1442	arc_buf_t *buf;
1443	arc_buf_hdr_t *hdr = from->b_hdr;
1444	uint64_t size = hdr->b_size;
1445
1446	ASSERT(hdr->b_state != arc_anon);
1447
1448	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1449	buf->b_hdr = hdr;
1450	buf->b_data = NULL;
1451	buf->b_efunc = NULL;
1452	buf->b_private = NULL;
1453	buf->b_next = hdr->b_buf;
1454	hdr->b_buf = buf;
1455	arc_get_data_buf(buf);
1456	bcopy(from->b_data, buf->b_data, size);
1457	hdr->b_datacnt += 1;
1458	return (buf);
1459}
1460
1461void
1462arc_buf_add_ref(arc_buf_t *buf, void* tag)
1463{
1464	arc_buf_hdr_t *hdr;
1465	kmutex_t *hash_lock;
1466
1467	/*
1468	 * Check to see if this buffer is evicted.  Callers
1469	 * must verify b_data != NULL to know if the add_ref
1470	 * was successful.
1471	 */
1472	mutex_enter(&buf->b_evict_lock);
1473	if (buf->b_data == NULL) {
1474		mutex_exit(&buf->b_evict_lock);
1475		return;
1476	}
1477	hash_lock = HDR_LOCK(buf->b_hdr);
1478	mutex_enter(hash_lock);
1479	hdr = buf->b_hdr;
1480	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1481	mutex_exit(&buf->b_evict_lock);
1482
1483	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1484	add_reference(hdr, hash_lock, tag);
1485	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1486	arc_access(hdr, hash_lock);
1487	mutex_exit(hash_lock);
1488	ARCSTAT_BUMP(arcstat_hits);
1489	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1490	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1491	    data, metadata, hits);
1492}
1493
1494/*
1495 * Free the arc data buffer.  If it is an l2arc write in progress,
1496 * the buffer is placed on l2arc_free_on_write to be freed later.
1497 */
1498static void
1499arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
1500    void *data, size_t size)
1501{
1502	if (HDR_L2_WRITING(hdr)) {
1503		l2arc_data_free_t *df;
1504		df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1505		df->l2df_data = data;
1506		df->l2df_size = size;
1507		df->l2df_func = free_func;
1508		mutex_enter(&l2arc_free_on_write_mtx);
1509		list_insert_head(l2arc_free_on_write, df);
1510		mutex_exit(&l2arc_free_on_write_mtx);
1511		ARCSTAT_BUMP(arcstat_l2_free_on_write);
1512	} else {
1513		free_func(data, size);
1514	}
1515}
1516
1517static void
1518arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1519{
1520	arc_buf_t **bufp;
1521
1522	/* free up data associated with the buf */
1523	if (buf->b_data) {
1524		arc_state_t *state = buf->b_hdr->b_state;
1525		uint64_t size = buf->b_hdr->b_size;
1526		arc_buf_contents_t type = buf->b_hdr->b_type;
1527
1528		arc_cksum_verify(buf);
1529
1530		if (!recycle) {
1531			if (type == ARC_BUFC_METADATA) {
1532				arc_buf_data_free(buf->b_hdr, zio_buf_free,
1533				    buf->b_data, size);
1534				arc_space_return(size, ARC_SPACE_DATA);
1535			} else {
1536				ASSERT(type == ARC_BUFC_DATA);
1537				arc_buf_data_free(buf->b_hdr,
1538				    zio_data_buf_free, buf->b_data, size);
1539				ARCSTAT_INCR(arcstat_data_size, -size);
1540				atomic_add_64(&arc_size, -size);
1541			}
1542		}
1543		if (list_link_active(&buf->b_hdr->b_arc_node)) {
1544			uint64_t *cnt = &state->arcs_lsize[type];
1545
1546			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1547			ASSERT(state != arc_anon);
1548
1549			ASSERT3U(*cnt, >=, size);
1550			atomic_add_64(cnt, -size);
1551		}
1552		ASSERT3U(state->arcs_size, >=, size);
1553		atomic_add_64(&state->arcs_size, -size);
1554		buf->b_data = NULL;
1555		ASSERT(buf->b_hdr->b_datacnt > 0);
1556		buf->b_hdr->b_datacnt -= 1;
1557	}
1558
1559	/* only remove the buf if requested */
1560	if (!all)
1561		return;
1562
1563	/* remove the buf from the hdr list */
1564	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1565		continue;
1566	*bufp = buf->b_next;
1567	buf->b_next = NULL;
1568
1569	ASSERT(buf->b_efunc == NULL);
1570
1571	/* clean up the buf */
1572	buf->b_hdr = NULL;
1573	kmem_cache_free(buf_cache, buf);
1574}
1575
1576static void
1577arc_hdr_destroy(arc_buf_hdr_t *hdr)
1578{
1579	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1580	ASSERT3P(hdr->b_state, ==, arc_anon);
1581	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1582	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1583
1584	if (l2hdr != NULL) {
1585		boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1586		/*
1587		 * To prevent arc_free() and l2arc_evict() from
1588		 * attempting to free the same buffer at the same time,
1589		 * a FREE_IN_PROGRESS flag is given to arc_free() to
1590		 * give it priority.  l2arc_evict() can't destroy this
1591		 * header while we are waiting on l2arc_buflist_mtx.
1592		 *
1593		 * The hdr may be removed from l2ad_buflist before we
1594		 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1595		 */
1596		if (!buflist_held) {
1597			mutex_enter(&l2arc_buflist_mtx);
1598			l2hdr = hdr->b_l2hdr;
1599		}
1600
1601		if (l2hdr != NULL) {
1602			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1603			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1604			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1605			if (hdr->b_state == arc_l2c_only)
1606				l2arc_hdr_stat_remove();
1607			hdr->b_l2hdr = NULL;
1608		}
1609
1610		if (!buflist_held)
1611			mutex_exit(&l2arc_buflist_mtx);
1612	}
1613
1614	if (!BUF_EMPTY(hdr)) {
1615		ASSERT(!HDR_IN_HASH_TABLE(hdr));
1616		buf_discard_identity(hdr);
1617	}
1618	while (hdr->b_buf) {
1619		arc_buf_t *buf = hdr->b_buf;
1620
1621		if (buf->b_efunc) {
1622			mutex_enter(&arc_eviction_mtx);
1623			mutex_enter(&buf->b_evict_lock);
1624			ASSERT(buf->b_hdr != NULL);
1625			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1626			hdr->b_buf = buf->b_next;
1627			buf->b_hdr = &arc_eviction_hdr;
1628			buf->b_next = arc_eviction_list;
1629			arc_eviction_list = buf;
1630			mutex_exit(&buf->b_evict_lock);
1631			mutex_exit(&arc_eviction_mtx);
1632		} else {
1633			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1634		}
1635	}
1636	if (hdr->b_freeze_cksum != NULL) {
1637		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1638		hdr->b_freeze_cksum = NULL;
1639	}
1640	if (hdr->b_thawed) {
1641		kmem_free(hdr->b_thawed, 1);
1642		hdr->b_thawed = NULL;
1643	}
1644
1645	ASSERT(!list_link_active(&hdr->b_arc_node));
1646	ASSERT3P(hdr->b_hash_next, ==, NULL);
1647	ASSERT3P(hdr->b_acb, ==, NULL);
1648	kmem_cache_free(hdr_cache, hdr);
1649}
1650
1651void
1652arc_buf_free(arc_buf_t *buf, void *tag)
1653{
1654	arc_buf_hdr_t *hdr = buf->b_hdr;
1655	int hashed = hdr->b_state != arc_anon;
1656
1657	ASSERT(buf->b_efunc == NULL);
1658	ASSERT(buf->b_data != NULL);
1659
1660	if (hashed) {
1661		kmutex_t *hash_lock = HDR_LOCK(hdr);
1662
1663		mutex_enter(hash_lock);
1664		hdr = buf->b_hdr;
1665		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1666
1667		(void) remove_reference(hdr, hash_lock, tag);
1668		if (hdr->b_datacnt > 1) {
1669			arc_buf_destroy(buf, FALSE, TRUE);
1670		} else {
1671			ASSERT(buf == hdr->b_buf);
1672			ASSERT(buf->b_efunc == NULL);
1673			hdr->b_flags |= ARC_BUF_AVAILABLE;
1674		}
1675		mutex_exit(hash_lock);
1676	} else if (HDR_IO_IN_PROGRESS(hdr)) {
1677		int destroy_hdr;
1678		/*
1679		 * We are in the middle of an async write.  Don't destroy
1680		 * this buffer unless the write completes before we finish
1681		 * decrementing the reference count.
1682		 */
1683		mutex_enter(&arc_eviction_mtx);
1684		(void) remove_reference(hdr, NULL, tag);
1685		ASSERT(refcount_is_zero(&hdr->b_refcnt));
1686		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1687		mutex_exit(&arc_eviction_mtx);
1688		if (destroy_hdr)
1689			arc_hdr_destroy(hdr);
1690	} else {
1691		if (remove_reference(hdr, NULL, tag) > 0)
1692			arc_buf_destroy(buf, FALSE, TRUE);
1693		else
1694			arc_hdr_destroy(hdr);
1695	}
1696}
1697
1698int
1699arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1700{
1701	arc_buf_hdr_t *hdr = buf->b_hdr;
1702	kmutex_t *hash_lock = HDR_LOCK(hdr);
1703	int no_callback = (buf->b_efunc == NULL);
1704
1705	if (hdr->b_state == arc_anon) {
1706		ASSERT(hdr->b_datacnt == 1);
1707		arc_buf_free(buf, tag);
1708		return (no_callback);
1709	}
1710
1711	mutex_enter(hash_lock);
1712	hdr = buf->b_hdr;
1713	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1714	ASSERT(hdr->b_state != arc_anon);
1715	ASSERT(buf->b_data != NULL);
1716
1717	(void) remove_reference(hdr, hash_lock, tag);
1718	if (hdr->b_datacnt > 1) {
1719		if (no_callback)
1720			arc_buf_destroy(buf, FALSE, TRUE);
1721	} else if (no_callback) {
1722		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1723		ASSERT(buf->b_efunc == NULL);
1724		hdr->b_flags |= ARC_BUF_AVAILABLE;
1725	}
1726	ASSERT(no_callback || hdr->b_datacnt > 1 ||
1727	    refcount_is_zero(&hdr->b_refcnt));
1728	mutex_exit(hash_lock);
1729	return (no_callback);
1730}
1731
1732int
1733arc_buf_size(arc_buf_t *buf)
1734{
1735	return (buf->b_hdr->b_size);
1736}
1737
1738/*
1739 * Evict buffers from list until we've removed the specified number of
1740 * bytes.  Move the removed buffers to the appropriate evict state.
1741 * If the recycle flag is set, then attempt to "recycle" a buffer:
1742 * - look for a buffer to evict that is `bytes' long.
1743 * - return the data block from this buffer rather than freeing it.
1744 * This flag is used by callers that are trying to make space for a
1745 * new buffer in a full arc cache.
1746 *
1747 * This function makes a "best effort".  It skips over any buffers
1748 * it can't get a hash_lock on, and so may not catch all candidates.
1749 * It may also return without evicting as much space as requested.
1750 */
1751static void *
1752arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1753    arc_buf_contents_t type)
1754{
1755	arc_state_t *evicted_state;
1756	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1757	int64_t bytes_remaining;
1758	arc_buf_hdr_t *ab, *ab_prev = NULL;
1759	list_t *evicted_list, *list, *evicted_list_start, *list_start;
1760	kmutex_t *lock, *evicted_lock;
1761	kmutex_t *hash_lock;
1762	boolean_t have_lock;
1763	void *stolen = NULL;
1764	static int evict_metadata_offset, evict_data_offset;
1765	int i, idx, offset, list_count, count;
1766
1767	ASSERT(state == arc_mru || state == arc_mfu);
1768
1769	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1770
1771	if (type == ARC_BUFC_METADATA) {
1772		offset = 0;
1773		list_count = ARC_BUFC_NUMMETADATALISTS;
1774		list_start = &state->arcs_lists[0];
1775		evicted_list_start = &evicted_state->arcs_lists[0];
1776		idx = evict_metadata_offset;
1777	} else {
1778		offset = ARC_BUFC_NUMMETADATALISTS;
1779		list_start = &state->arcs_lists[offset];
1780		evicted_list_start = &evicted_state->arcs_lists[offset];
1781		list_count = ARC_BUFC_NUMDATALISTS;
1782		idx = evict_data_offset;
1783	}
1784	bytes_remaining = evicted_state->arcs_lsize[type];
1785	count = 0;
1786
1787evict_start:
1788	list = &list_start[idx];
1789	evicted_list = &evicted_list_start[idx];
1790	lock = ARCS_LOCK(state, (offset + idx));
1791	evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
1792
1793	mutex_enter(lock);
1794	mutex_enter(evicted_lock);
1795
1796	for (ab = list_tail(list); ab; ab = ab_prev) {
1797		ab_prev = list_prev(list, ab);
1798		bytes_remaining -= (ab->b_size * ab->b_datacnt);
1799		/* prefetch buffers have a minimum lifespan */
1800		if (HDR_IO_IN_PROGRESS(ab) ||
1801		    (spa && ab->b_spa != spa) ||
1802		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1803		    ddi_get_lbolt() - ab->b_arc_access <
1804		    arc_min_prefetch_lifespan)) {
1805			skipped++;
1806			continue;
1807		}
1808		/* "lookahead" for better eviction candidate */
1809		if (recycle && ab->b_size != bytes &&
1810		    ab_prev && ab_prev->b_size == bytes)
1811			continue;
1812		hash_lock = HDR_LOCK(ab);
1813		have_lock = MUTEX_HELD(hash_lock);
1814		if (have_lock || mutex_tryenter(hash_lock)) {
1815			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
1816			ASSERT(ab->b_datacnt > 0);
1817			while (ab->b_buf) {
1818				arc_buf_t *buf = ab->b_buf;
1819				if (!mutex_tryenter(&buf->b_evict_lock)) {
1820					missed += 1;
1821					break;
1822				}
1823				if (buf->b_data) {
1824					bytes_evicted += ab->b_size;
1825					if (recycle && ab->b_type == type &&
1826					    ab->b_size == bytes &&
1827					    !HDR_L2_WRITING(ab)) {
1828						stolen = buf->b_data;
1829						recycle = FALSE;
1830					}
1831				}
1832				if (buf->b_efunc) {
1833					mutex_enter(&arc_eviction_mtx);
1834					arc_buf_destroy(buf,
1835					    buf->b_data == stolen, FALSE);
1836					ab->b_buf = buf->b_next;
1837					buf->b_hdr = &arc_eviction_hdr;
1838					buf->b_next = arc_eviction_list;
1839					arc_eviction_list = buf;
1840					mutex_exit(&arc_eviction_mtx);
1841					mutex_exit(&buf->b_evict_lock);
1842				} else {
1843					mutex_exit(&buf->b_evict_lock);
1844					arc_buf_destroy(buf,
1845					    buf->b_data == stolen, TRUE);
1846				}
1847			}
1848
1849			if (ab->b_l2hdr) {
1850				ARCSTAT_INCR(arcstat_evict_l2_cached,
1851				    ab->b_size);
1852			} else {
1853				if (l2arc_write_eligible(ab->b_spa, ab)) {
1854					ARCSTAT_INCR(arcstat_evict_l2_eligible,
1855					    ab->b_size);
1856				} else {
1857					ARCSTAT_INCR(
1858					    arcstat_evict_l2_ineligible,
1859					    ab->b_size);
1860				}
1861			}
1862
1863			if (ab->b_datacnt == 0) {
1864				arc_change_state(evicted_state, ab, hash_lock);
1865				ASSERT(HDR_IN_HASH_TABLE(ab));
1866				ab->b_flags |= ARC_IN_HASH_TABLE;
1867				ab->b_flags &= ~ARC_BUF_AVAILABLE;
1868				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
1869			}
1870			if (!have_lock)
1871				mutex_exit(hash_lock);
1872			if (bytes >= 0 && bytes_evicted >= bytes)
1873				break;
1874			if (bytes_remaining > 0) {
1875				mutex_exit(evicted_lock);
1876				mutex_exit(lock);
1877				idx  = ((idx + 1) & (list_count - 1));
1878				count++;
1879				goto evict_start;
1880			}
1881		} else {
1882			missed += 1;
1883		}
1884	}
1885
1886	mutex_exit(evicted_lock);
1887	mutex_exit(lock);
1888
1889	idx  = ((idx + 1) & (list_count - 1));
1890	count++;
1891
1892	if (bytes_evicted < bytes) {
1893		if (count < list_count)
1894			goto evict_start;
1895		else
1896			dprintf("only evicted %lld bytes from %x",
1897			    (longlong_t)bytes_evicted, state);
1898	}
1899	if (type == ARC_BUFC_METADATA)
1900		evict_metadata_offset = idx;
1901	else
1902		evict_data_offset = idx;
1903
1904	if (skipped)
1905		ARCSTAT_INCR(arcstat_evict_skip, skipped);
1906
1907	if (missed)
1908		ARCSTAT_INCR(arcstat_mutex_miss, missed);
1909
1910	/*
1911	 * We have just evicted some date into the ghost state, make
1912	 * sure we also adjust the ghost state size if necessary.
1913	 */
1914	if (arc_no_grow &&
1915	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
1916		int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
1917		    arc_mru_ghost->arcs_size - arc_c;
1918
1919		if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
1920			int64_t todelete =
1921			    MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
1922			arc_evict_ghost(arc_mru_ghost, 0, todelete);
1923		} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
1924			int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
1925			    arc_mru_ghost->arcs_size +
1926			    arc_mfu_ghost->arcs_size - arc_c);
1927			arc_evict_ghost(arc_mfu_ghost, 0, todelete);
1928		}
1929	}
1930	if (stolen)
1931		ARCSTAT_BUMP(arcstat_stolen);
1932
1933	return (stolen);
1934}
1935
1936/*
1937 * Remove buffers from list until we've removed the specified number of
1938 * bytes.  Destroy the buffers that are removed.
1939 */
1940static void
1941arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
1942{
1943	arc_buf_hdr_t *ab, *ab_prev;
1944	arc_buf_hdr_t marker = { 0 };
1945	list_t *list, *list_start;
1946	kmutex_t *hash_lock, *lock;
1947	uint64_t bytes_deleted = 0;
1948	uint64_t bufs_skipped = 0;
1949	static int evict_offset;
1950	int list_count, idx = evict_offset;
1951	int offset, count = 0;
1952
1953	ASSERT(GHOST_STATE(state));
1954
1955	/*
1956	 * data lists come after metadata lists
1957	 */
1958	list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
1959	list_count = ARC_BUFC_NUMDATALISTS;
1960	offset = ARC_BUFC_NUMMETADATALISTS;
1961
1962evict_start:
1963	list = &list_start[idx];
1964	lock = ARCS_LOCK(state, idx + offset);
1965
1966	mutex_enter(lock);
1967	for (ab = list_tail(list); ab; ab = ab_prev) {
1968		ab_prev = list_prev(list, ab);
1969		if (spa && ab->b_spa != spa)
1970			continue;
1971
1972		/* ignore markers */
1973		if (ab->b_spa == 0)
1974			continue;
1975
1976		hash_lock = HDR_LOCK(ab);
1977		/* caller may be trying to modify this buffer, skip it */
1978		if (MUTEX_HELD(hash_lock))
1979			continue;
1980		if (mutex_tryenter(hash_lock)) {
1981			ASSERT(!HDR_IO_IN_PROGRESS(ab));
1982			ASSERT(ab->b_buf == NULL);
1983			ARCSTAT_BUMP(arcstat_deleted);
1984			bytes_deleted += ab->b_size;
1985
1986			if (ab->b_l2hdr != NULL) {
1987				/*
1988				 * This buffer is cached on the 2nd Level ARC;
1989				 * don't destroy the header.
1990				 */
1991				arc_change_state(arc_l2c_only, ab, hash_lock);
1992				mutex_exit(hash_lock);
1993			} else {
1994				arc_change_state(arc_anon, ab, hash_lock);
1995				mutex_exit(hash_lock);
1996				arc_hdr_destroy(ab);
1997			}
1998
1999			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2000			if (bytes >= 0 && bytes_deleted >= bytes)
2001				break;
2002		} else if (bytes < 0) {
2003			/*
2004			 * Insert a list marker and then wait for the
2005			 * hash lock to become available. Once its
2006			 * available, restart from where we left off.
2007			 */
2008			list_insert_after(list, ab, &marker);
2009			mutex_exit(lock);
2010			mutex_enter(hash_lock);
2011			mutex_exit(hash_lock);
2012			mutex_enter(lock);
2013			ab_prev = list_prev(list, &marker);
2014			list_remove(list, &marker);
2015		} else
2016			bufs_skipped += 1;
2017	}
2018	mutex_exit(lock);
2019	idx  = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
2020	count++;
2021
2022	if (count < list_count)
2023		goto evict_start;
2024
2025	evict_offset = idx;
2026	if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
2027	    (bytes < 0 || bytes_deleted < bytes)) {
2028		list_start = &state->arcs_lists[0];
2029		list_count = ARC_BUFC_NUMMETADATALISTS;
2030		offset = count = 0;
2031		goto evict_start;
2032	}
2033
2034	if (bufs_skipped) {
2035		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2036		ASSERT(bytes >= 0);
2037	}
2038
2039	if (bytes_deleted < bytes)
2040		dprintf("only deleted %lld bytes from %p",
2041		    (longlong_t)bytes_deleted, state);
2042}
2043
2044static void
2045arc_adjust(void)
2046{
2047	int64_t adjustment, delta;
2048
2049	/*
2050	 * Adjust MRU size
2051	 */
2052
2053	adjustment = MIN((int64_t)(arc_size - arc_c),
2054	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2055	    arc_p));
2056
2057	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2058		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2059		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
2060		adjustment -= delta;
2061	}
2062
2063	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2064		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2065		(void) arc_evict(arc_mru, 0, delta, FALSE,
2066		    ARC_BUFC_METADATA);
2067	}
2068
2069	/*
2070	 * Adjust MFU size
2071	 */
2072
2073	adjustment = arc_size - arc_c;
2074
2075	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2076		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2077		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2078		adjustment -= delta;
2079	}
2080
2081	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2082		int64_t delta = MIN(adjustment,
2083		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2084		(void) arc_evict(arc_mfu, 0, delta, FALSE,
2085		    ARC_BUFC_METADATA);
2086	}
2087
2088	/*
2089	 * Adjust ghost lists
2090	 */
2091
2092	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2093
2094	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2095		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2096		arc_evict_ghost(arc_mru_ghost, 0, delta);
2097	}
2098
2099	adjustment =
2100	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2101
2102	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2103		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2104		arc_evict_ghost(arc_mfu_ghost, 0, delta);
2105	}
2106}
2107
2108static void
2109arc_do_user_evicts(void)
2110{
2111	static arc_buf_t *tmp_arc_eviction_list;
2112
2113	/*
2114	 * Move list over to avoid LOR
2115	 */
2116restart:
2117	mutex_enter(&arc_eviction_mtx);
2118	tmp_arc_eviction_list = arc_eviction_list;
2119	arc_eviction_list = NULL;
2120	mutex_exit(&arc_eviction_mtx);
2121
2122	while (tmp_arc_eviction_list != NULL) {
2123		arc_buf_t *buf = tmp_arc_eviction_list;
2124		tmp_arc_eviction_list = buf->b_next;
2125		mutex_enter(&buf->b_evict_lock);
2126		buf->b_hdr = NULL;
2127		mutex_exit(&buf->b_evict_lock);
2128
2129		if (buf->b_efunc != NULL)
2130			VERIFY(buf->b_efunc(buf) == 0);
2131
2132		buf->b_efunc = NULL;
2133		buf->b_private = NULL;
2134		kmem_cache_free(buf_cache, buf);
2135	}
2136
2137	if (arc_eviction_list != NULL)
2138		goto restart;
2139}
2140
2141/*
2142 * Flush all *evictable* data from the cache for the given spa.
2143 * NOTE: this will not touch "active" (i.e. referenced) data.
2144 */
2145void
2146arc_flush(spa_t *spa)
2147{
2148	uint64_t guid = 0;
2149
2150	if (spa)
2151		guid = spa_load_guid(spa);
2152
2153	while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
2154		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2155		if (spa)
2156			break;
2157	}
2158	while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
2159		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2160		if (spa)
2161			break;
2162	}
2163	while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
2164		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2165		if (spa)
2166			break;
2167	}
2168	while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
2169		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2170		if (spa)
2171			break;
2172	}
2173
2174	arc_evict_ghost(arc_mru_ghost, guid, -1);
2175	arc_evict_ghost(arc_mfu_ghost, guid, -1);
2176
2177	mutex_enter(&arc_reclaim_thr_lock);
2178	arc_do_user_evicts();
2179	mutex_exit(&arc_reclaim_thr_lock);
2180	ASSERT(spa || arc_eviction_list == NULL);
2181}
2182
2183void
2184arc_shrink(void)
2185{
2186	if (arc_c > arc_c_min) {
2187		uint64_t to_free;
2188
2189#ifdef _KERNEL
2190		to_free = arc_c >> arc_shrink_shift;
2191#else
2192		to_free = arc_c >> arc_shrink_shift;
2193#endif
2194		if (arc_c > arc_c_min + to_free)
2195			atomic_add_64(&arc_c, -to_free);
2196		else
2197			arc_c = arc_c_min;
2198
2199		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2200		if (arc_c > arc_size)
2201			arc_c = MAX(arc_size, arc_c_min);
2202		if (arc_p > arc_c)
2203			arc_p = (arc_c >> 1);
2204		ASSERT(arc_c >= arc_c_min);
2205		ASSERT((int64_t)arc_p >= 0);
2206	}
2207
2208	if (arc_size > arc_c)
2209		arc_adjust();
2210}
2211
2212static int needfree = 0;
2213
2214static int
2215arc_reclaim_needed(void)
2216{
2217
2218#ifdef _KERNEL
2219
2220	if (needfree)
2221		return (1);
2222
2223	/*
2224	 * Cooperate with pagedaemon when it's time for it to scan
2225	 * and reclaim some pages.
2226	 */
2227	if (vm_paging_needed())
2228		return (1);
2229
2230#ifdef sun
2231	/*
2232	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2233	 */
2234	extra = desfree;
2235
2236	/*
2237	 * check that we're out of range of the pageout scanner.  It starts to
2238	 * schedule paging if freemem is less than lotsfree and needfree.
2239	 * lotsfree is the high-water mark for pageout, and needfree is the
2240	 * number of needed free pages.  We add extra pages here to make sure
2241	 * the scanner doesn't start up while we're freeing memory.
2242	 */
2243	if (freemem < lotsfree + needfree + extra)
2244		return (1);
2245
2246	/*
2247	 * check to make sure that swapfs has enough space so that anon
2248	 * reservations can still succeed. anon_resvmem() checks that the
2249	 * availrmem is greater than swapfs_minfree, and the number of reserved
2250	 * swap pages.  We also add a bit of extra here just to prevent
2251	 * circumstances from getting really dire.
2252	 */
2253	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2254		return (1);
2255
2256#if defined(__i386)
2257	/*
2258	 * If we're on an i386 platform, it's possible that we'll exhaust the
2259	 * kernel heap space before we ever run out of available physical
2260	 * memory.  Most checks of the size of the heap_area compare against
2261	 * tune.t_minarmem, which is the minimum available real memory that we
2262	 * can have in the system.  However, this is generally fixed at 25 pages
2263	 * which is so low that it's useless.  In this comparison, we seek to
2264	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2265	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
2266	 * free)
2267	 */
2268	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
2269	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
2270		return (1);
2271#endif
2272#else	/* !sun */
2273	if (kmem_used() > (kmem_size() * 3) / 4)
2274		return (1);
2275#endif	/* sun */
2276
2277#else
2278	if (spa_get_random(100) == 0)
2279		return (1);
2280#endif
2281	return (0);
2282}
2283
2284extern kmem_cache_t	*zio_buf_cache[];
2285extern kmem_cache_t	*zio_data_buf_cache[];
2286
2287static void
2288arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2289{
2290	size_t			i;
2291	kmem_cache_t		*prev_cache = NULL;
2292	kmem_cache_t		*prev_data_cache = NULL;
2293
2294#ifdef _KERNEL
2295	if (arc_meta_used >= arc_meta_limit) {
2296		/*
2297		 * We are exceeding our meta-data cache limit.
2298		 * Purge some DNLC entries to release holds on meta-data.
2299		 */
2300		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2301	}
2302#if defined(__i386)
2303	/*
2304	 * Reclaim unused memory from all kmem caches.
2305	 */
2306	kmem_reap();
2307#endif
2308#endif
2309
2310	/*
2311	 * An aggressive reclamation will shrink the cache size as well as
2312	 * reap free buffers from the arc kmem caches.
2313	 */
2314	if (strat == ARC_RECLAIM_AGGR)
2315		arc_shrink();
2316
2317	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2318		if (zio_buf_cache[i] != prev_cache) {
2319			prev_cache = zio_buf_cache[i];
2320			kmem_cache_reap_now(zio_buf_cache[i]);
2321		}
2322		if (zio_data_buf_cache[i] != prev_data_cache) {
2323			prev_data_cache = zio_data_buf_cache[i];
2324			kmem_cache_reap_now(zio_data_buf_cache[i]);
2325		}
2326	}
2327	kmem_cache_reap_now(buf_cache);
2328	kmem_cache_reap_now(hdr_cache);
2329}
2330
2331static void
2332arc_reclaim_thread(void *dummy __unused)
2333{
2334	clock_t			growtime = 0;
2335	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
2336	callb_cpr_t		cpr;
2337
2338	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2339
2340	mutex_enter(&arc_reclaim_thr_lock);
2341	while (arc_thread_exit == 0) {
2342		if (arc_reclaim_needed()) {
2343
2344			if (arc_no_grow) {
2345				if (last_reclaim == ARC_RECLAIM_CONS) {
2346					last_reclaim = ARC_RECLAIM_AGGR;
2347				} else {
2348					last_reclaim = ARC_RECLAIM_CONS;
2349				}
2350			} else {
2351				arc_no_grow = TRUE;
2352				last_reclaim = ARC_RECLAIM_AGGR;
2353				membar_producer();
2354			}
2355
2356			/* reset the growth delay for every reclaim */
2357			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2358
2359			if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
2360				/*
2361				 * If needfree is TRUE our vm_lowmem hook
2362				 * was called and in that case we must free some
2363				 * memory, so switch to aggressive mode.
2364				 */
2365				arc_no_grow = TRUE;
2366				last_reclaim = ARC_RECLAIM_AGGR;
2367			}
2368			arc_kmem_reap_now(last_reclaim);
2369			arc_warm = B_TRUE;
2370
2371		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2372			arc_no_grow = FALSE;
2373		}
2374
2375		arc_adjust();
2376
2377		if (arc_eviction_list != NULL)
2378			arc_do_user_evicts();
2379
2380#ifdef _KERNEL
2381		if (needfree) {
2382			needfree = 0;
2383			wakeup(&needfree);
2384		}
2385#endif
2386
2387		/* block until needed, or one second, whichever is shorter */
2388		CALLB_CPR_SAFE_BEGIN(&cpr);
2389		(void) cv_timedwait(&arc_reclaim_thr_cv,
2390		    &arc_reclaim_thr_lock, hz);
2391		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2392	}
2393
2394	arc_thread_exit = 0;
2395	cv_broadcast(&arc_reclaim_thr_cv);
2396	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
2397	thread_exit();
2398}
2399
2400/*
2401 * Adapt arc info given the number of bytes we are trying to add and
2402 * the state that we are comming from.  This function is only called
2403 * when we are adding new content to the cache.
2404 */
2405static void
2406arc_adapt(int bytes, arc_state_t *state)
2407{
2408	int mult;
2409	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2410
2411	if (state == arc_l2c_only)
2412		return;
2413
2414	ASSERT(bytes > 0);
2415	/*
2416	 * Adapt the target size of the MRU list:
2417	 *	- if we just hit in the MRU ghost list, then increase
2418	 *	  the target size of the MRU list.
2419	 *	- if we just hit in the MFU ghost list, then increase
2420	 *	  the target size of the MFU list by decreasing the
2421	 *	  target size of the MRU list.
2422	 */
2423	if (state == arc_mru_ghost) {
2424		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2425		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2426		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2427
2428		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2429	} else if (state == arc_mfu_ghost) {
2430		uint64_t delta;
2431
2432		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2433		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2434		mult = MIN(mult, 10);
2435
2436		delta = MIN(bytes * mult, arc_p);
2437		arc_p = MAX(arc_p_min, arc_p - delta);
2438	}
2439	ASSERT((int64_t)arc_p >= 0);
2440
2441	if (arc_reclaim_needed()) {
2442		cv_signal(&arc_reclaim_thr_cv);
2443		return;
2444	}
2445
2446	if (arc_no_grow)
2447		return;
2448
2449	if (arc_c >= arc_c_max)
2450		return;
2451
2452	/*
2453	 * If we're within (2 * maxblocksize) bytes of the target
2454	 * cache size, increment the target cache size
2455	 */
2456	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2457		atomic_add_64(&arc_c, (int64_t)bytes);
2458		if (arc_c > arc_c_max)
2459			arc_c = arc_c_max;
2460		else if (state == arc_anon)
2461			atomic_add_64(&arc_p, (int64_t)bytes);
2462		if (arc_p > arc_c)
2463			arc_p = arc_c;
2464	}
2465	ASSERT((int64_t)arc_p >= 0);
2466}
2467
2468/*
2469 * Check if the cache has reached its limits and eviction is required
2470 * prior to insert.
2471 */
2472static int
2473arc_evict_needed(arc_buf_contents_t type)
2474{
2475	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2476		return (1);
2477
2478#ifdef sun
2479#ifdef _KERNEL
2480	/*
2481	 * If zio data pages are being allocated out of a separate heap segment,
2482	 * then enforce that the size of available vmem for this area remains
2483	 * above about 1/32nd free.
2484	 */
2485	if (type == ARC_BUFC_DATA && zio_arena != NULL &&
2486	    vmem_size(zio_arena, VMEM_FREE) <
2487	    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
2488		return (1);
2489#endif
2490#endif	/* sun */
2491
2492	if (arc_reclaim_needed())
2493		return (1);
2494
2495	return (arc_size > arc_c);
2496}
2497
2498/*
2499 * The buffer, supplied as the first argument, needs a data block.
2500 * So, if we are at cache max, determine which cache should be victimized.
2501 * We have the following cases:
2502 *
2503 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2504 * In this situation if we're out of space, but the resident size of the MFU is
2505 * under the limit, victimize the MFU cache to satisfy this insertion request.
2506 *
2507 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2508 * Here, we've used up all of the available space for the MRU, so we need to
2509 * evict from our own cache instead.  Evict from the set of resident MRU
2510 * entries.
2511 *
2512 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2513 * c minus p represents the MFU space in the cache, since p is the size of the
2514 * cache that is dedicated to the MRU.  In this situation there's still space on
2515 * the MFU side, so the MRU side needs to be victimized.
2516 *
2517 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2518 * MFU's resident set is consuming more space than it has been allotted.  In
2519 * this situation, we must victimize our own cache, the MFU, for this insertion.
2520 */
2521static void
2522arc_get_data_buf(arc_buf_t *buf)
2523{
2524	arc_state_t		*state = buf->b_hdr->b_state;
2525	uint64_t		size = buf->b_hdr->b_size;
2526	arc_buf_contents_t	type = buf->b_hdr->b_type;
2527
2528	arc_adapt(size, state);
2529
2530	/*
2531	 * We have not yet reached cache maximum size,
2532	 * just allocate a new buffer.
2533	 */
2534	if (!arc_evict_needed(type)) {
2535		if (type == ARC_BUFC_METADATA) {
2536			buf->b_data = zio_buf_alloc(size);
2537			arc_space_consume(size, ARC_SPACE_DATA);
2538		} else {
2539			ASSERT(type == ARC_BUFC_DATA);
2540			buf->b_data = zio_data_buf_alloc(size);
2541			ARCSTAT_INCR(arcstat_data_size, size);
2542			atomic_add_64(&arc_size, size);
2543		}
2544		goto out;
2545	}
2546
2547	/*
2548	 * If we are prefetching from the mfu ghost list, this buffer
2549	 * will end up on the mru list; so steal space from there.
2550	 */
2551	if (state == arc_mfu_ghost)
2552		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2553	else if (state == arc_mru_ghost)
2554		state = arc_mru;
2555
2556	if (state == arc_mru || state == arc_anon) {
2557		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2558		state = (arc_mfu->arcs_lsize[type] >= size &&
2559		    arc_p > mru_used) ? arc_mfu : arc_mru;
2560	} else {
2561		/* MFU cases */
2562		uint64_t mfu_space = arc_c - arc_p;
2563		state =  (arc_mru->arcs_lsize[type] >= size &&
2564		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2565	}
2566	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
2567		if (type == ARC_BUFC_METADATA) {
2568			buf->b_data = zio_buf_alloc(size);
2569			arc_space_consume(size, ARC_SPACE_DATA);
2570		} else {
2571			ASSERT(type == ARC_BUFC_DATA);
2572			buf->b_data = zio_data_buf_alloc(size);
2573			ARCSTAT_INCR(arcstat_data_size, size);
2574			atomic_add_64(&arc_size, size);
2575		}
2576		ARCSTAT_BUMP(arcstat_recycle_miss);
2577	}
2578	ASSERT(buf->b_data != NULL);
2579out:
2580	/*
2581	 * Update the state size.  Note that ghost states have a
2582	 * "ghost size" and so don't need to be updated.
2583	 */
2584	if (!GHOST_STATE(buf->b_hdr->b_state)) {
2585		arc_buf_hdr_t *hdr = buf->b_hdr;
2586
2587		atomic_add_64(&hdr->b_state->arcs_size, size);
2588		if (list_link_active(&hdr->b_arc_node)) {
2589			ASSERT(refcount_is_zero(&hdr->b_refcnt));
2590			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2591		}
2592		/*
2593		 * If we are growing the cache, and we are adding anonymous
2594		 * data, and we have outgrown arc_p, update arc_p
2595		 */
2596		if (arc_size < arc_c && hdr->b_state == arc_anon &&
2597		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2598			arc_p = MIN(arc_c, arc_p + size);
2599	}
2600	ARCSTAT_BUMP(arcstat_allocated);
2601}
2602
2603/*
2604 * This routine is called whenever a buffer is accessed.
2605 * NOTE: the hash lock is dropped in this function.
2606 */
2607static void
2608arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2609{
2610	clock_t now;
2611
2612	ASSERT(MUTEX_HELD(hash_lock));
2613
2614	if (buf->b_state == arc_anon) {
2615		/*
2616		 * This buffer is not in the cache, and does not
2617		 * appear in our "ghost" list.  Add the new buffer
2618		 * to the MRU state.
2619		 */
2620
2621		ASSERT(buf->b_arc_access == 0);
2622		buf->b_arc_access = ddi_get_lbolt();
2623		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2624		arc_change_state(arc_mru, buf, hash_lock);
2625
2626	} else if (buf->b_state == arc_mru) {
2627		now = ddi_get_lbolt();
2628
2629		/*
2630		 * If this buffer is here because of a prefetch, then either:
2631		 * - clear the flag if this is a "referencing" read
2632		 *   (any subsequent access will bump this into the MFU state).
2633		 * or
2634		 * - move the buffer to the head of the list if this is
2635		 *   another prefetch (to make it less likely to be evicted).
2636		 */
2637		if ((buf->b_flags & ARC_PREFETCH) != 0) {
2638			if (refcount_count(&buf->b_refcnt) == 0) {
2639				ASSERT(list_link_active(&buf->b_arc_node));
2640			} else {
2641				buf->b_flags &= ~ARC_PREFETCH;
2642				ARCSTAT_BUMP(arcstat_mru_hits);
2643			}
2644			buf->b_arc_access = now;
2645			return;
2646		}
2647
2648		/*
2649		 * This buffer has been "accessed" only once so far,
2650		 * but it is still in the cache. Move it to the MFU
2651		 * state.
2652		 */
2653		if (now > buf->b_arc_access + ARC_MINTIME) {
2654			/*
2655			 * More than 125ms have passed since we
2656			 * instantiated this buffer.  Move it to the
2657			 * most frequently used state.
2658			 */
2659			buf->b_arc_access = now;
2660			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2661			arc_change_state(arc_mfu, buf, hash_lock);
2662		}
2663		ARCSTAT_BUMP(arcstat_mru_hits);
2664	} else if (buf->b_state == arc_mru_ghost) {
2665		arc_state_t	*new_state;
2666		/*
2667		 * This buffer has been "accessed" recently, but
2668		 * was evicted from the cache.  Move it to the
2669		 * MFU state.
2670		 */
2671
2672		if (buf->b_flags & ARC_PREFETCH) {
2673			new_state = arc_mru;
2674			if (refcount_count(&buf->b_refcnt) > 0)
2675				buf->b_flags &= ~ARC_PREFETCH;
2676			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2677		} else {
2678			new_state = arc_mfu;
2679			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2680		}
2681
2682		buf->b_arc_access = ddi_get_lbolt();
2683		arc_change_state(new_state, buf, hash_lock);
2684
2685		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2686	} else if (buf->b_state == arc_mfu) {
2687		/*
2688		 * This buffer has been accessed more than once and is
2689		 * still in the cache.  Keep it in the MFU state.
2690		 *
2691		 * NOTE: an add_reference() that occurred when we did
2692		 * the arc_read() will have kicked this off the list.
2693		 * If it was a prefetch, we will explicitly move it to
2694		 * the head of the list now.
2695		 */
2696		if ((buf->b_flags & ARC_PREFETCH) != 0) {
2697			ASSERT(refcount_count(&buf->b_refcnt) == 0);
2698			ASSERT(list_link_active(&buf->b_arc_node));
2699		}
2700		ARCSTAT_BUMP(arcstat_mfu_hits);
2701		buf->b_arc_access = ddi_get_lbolt();
2702	} else if (buf->b_state == arc_mfu_ghost) {
2703		arc_state_t	*new_state = arc_mfu;
2704		/*
2705		 * This buffer has been accessed more than once but has
2706		 * been evicted from the cache.  Move it back to the
2707		 * MFU state.
2708		 */
2709
2710		if (buf->b_flags & ARC_PREFETCH) {
2711			/*
2712			 * This is a prefetch access...
2713			 * move this block back to the MRU state.
2714			 */
2715			ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0);
2716			new_state = arc_mru;
2717		}
2718
2719		buf->b_arc_access = ddi_get_lbolt();
2720		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2721		arc_change_state(new_state, buf, hash_lock);
2722
2723		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2724	} else if (buf->b_state == arc_l2c_only) {
2725		/*
2726		 * This buffer is on the 2nd Level ARC.
2727		 */
2728
2729		buf->b_arc_access = ddi_get_lbolt();
2730		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2731		arc_change_state(arc_mfu, buf, hash_lock);
2732	} else {
2733		ASSERT(!"invalid arc state");
2734	}
2735}
2736
2737/* a generic arc_done_func_t which you can use */
2738/* ARGSUSED */
2739void
2740arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2741{
2742	if (zio == NULL || zio->io_error == 0)
2743		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2744	VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2745}
2746
2747/* a generic arc_done_func_t */
2748void
2749arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2750{
2751	arc_buf_t **bufp = arg;
2752	if (zio && zio->io_error) {
2753		VERIFY(arc_buf_remove_ref(buf, arg) == 1);
2754		*bufp = NULL;
2755	} else {
2756		*bufp = buf;
2757		ASSERT(buf->b_data);
2758	}
2759}
2760
2761static void
2762arc_read_done(zio_t *zio)
2763{
2764	arc_buf_hdr_t	*hdr, *found;
2765	arc_buf_t	*buf;
2766	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
2767	kmutex_t	*hash_lock;
2768	arc_callback_t	*callback_list, *acb;
2769	int		freeable = FALSE;
2770
2771	buf = zio->io_private;
2772	hdr = buf->b_hdr;
2773
2774	/*
2775	 * The hdr was inserted into hash-table and removed from lists
2776	 * prior to starting I/O.  We should find this header, since
2777	 * it's in the hash table, and it should be legit since it's
2778	 * not possible to evict it during the I/O.  The only possible
2779	 * reason for it not to be found is if we were freed during the
2780	 * read.
2781	 */
2782	found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
2783	    &hash_lock);
2784
2785	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2786	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2787	    (found == hdr && HDR_L2_READING(hdr)));
2788
2789	hdr->b_flags &= ~ARC_L2_EVICTED;
2790	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2791		hdr->b_flags &= ~ARC_L2CACHE;
2792
2793	/* byteswap if necessary */
2794	callback_list = hdr->b_acb;
2795	ASSERT(callback_list != NULL);
2796	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2797		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2798		    byteswap_uint64_array :
2799		    dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
2800		func(buf->b_data, hdr->b_size);
2801	}
2802
2803	arc_cksum_compute(buf, B_FALSE);
2804
2805	if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2806		/*
2807		 * Only call arc_access on anonymous buffers.  This is because
2808		 * if we've issued an I/O for an evicted buffer, we've already
2809		 * called arc_access (to prevent any simultaneous readers from
2810		 * getting confused).
2811		 */
2812		arc_access(hdr, hash_lock);
2813	}
2814
2815	/* create copies of the data buffer for the callers */
2816	abuf = buf;
2817	for (acb = callback_list; acb; acb = acb->acb_next) {
2818		if (acb->acb_done) {
2819			if (abuf == NULL)
2820				abuf = arc_buf_clone(buf);
2821			acb->acb_buf = abuf;
2822			abuf = NULL;
2823		}
2824	}
2825	hdr->b_acb = NULL;
2826	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
2827	ASSERT(!HDR_BUF_AVAILABLE(hdr));
2828	if (abuf == buf) {
2829		ASSERT(buf->b_efunc == NULL);
2830		ASSERT(hdr->b_datacnt == 1);
2831		hdr->b_flags |= ARC_BUF_AVAILABLE;
2832	}
2833
2834	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
2835
2836	if (zio->io_error != 0) {
2837		hdr->b_flags |= ARC_IO_ERROR;
2838		if (hdr->b_state != arc_anon)
2839			arc_change_state(arc_anon, hdr, hash_lock);
2840		if (HDR_IN_HASH_TABLE(hdr))
2841			buf_hash_remove(hdr);
2842		freeable = refcount_is_zero(&hdr->b_refcnt);
2843	}
2844
2845	/*
2846	 * Broadcast before we drop the hash_lock to avoid the possibility
2847	 * that the hdr (and hence the cv) might be freed before we get to
2848	 * the cv_broadcast().
2849	 */
2850	cv_broadcast(&hdr->b_cv);
2851
2852	if (hash_lock) {
2853		mutex_exit(hash_lock);
2854	} else {
2855		/*
2856		 * This block was freed while we waited for the read to
2857		 * complete.  It has been removed from the hash table and
2858		 * moved to the anonymous state (so that it won't show up
2859		 * in the cache).
2860		 */
2861		ASSERT3P(hdr->b_state, ==, arc_anon);
2862		freeable = refcount_is_zero(&hdr->b_refcnt);
2863	}
2864
2865	/* execute each callback and free its structure */
2866	while ((acb = callback_list) != NULL) {
2867		if (acb->acb_done)
2868			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2869
2870		if (acb->acb_zio_dummy != NULL) {
2871			acb->acb_zio_dummy->io_error = zio->io_error;
2872			zio_nowait(acb->acb_zio_dummy);
2873		}
2874
2875		callback_list = acb->acb_next;
2876		kmem_free(acb, sizeof (arc_callback_t));
2877	}
2878
2879	if (freeable)
2880		arc_hdr_destroy(hdr);
2881}
2882
2883/*
2884 * "Read" the block block at the specified DVA (in bp) via the
2885 * cache.  If the block is found in the cache, invoke the provided
2886 * callback immediately and return.  Note that the `zio' parameter
2887 * in the callback will be NULL in this case, since no IO was
2888 * required.  If the block is not in the cache pass the read request
2889 * on to the spa with a substitute callback function, so that the
2890 * requested block will be added to the cache.
2891 *
2892 * If a read request arrives for a block that has a read in-progress,
2893 * either wait for the in-progress read to complete (and return the
2894 * results); or, if this is a read with a "done" func, add a record
2895 * to the read to invoke the "done" func when the read completes,
2896 * and return; or just return.
2897 *
2898 * arc_read_done() will invoke all the requested "done" functions
2899 * for readers of this block.
2900 *
2901 * Normal callers should use arc_read and pass the arc buffer and offset
2902 * for the bp.  But if you know you don't need locking, you can use
2903 * arc_read_nolock.
2904 */
2905int
2906arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf,
2907    arc_done_func_t *done, void *private, int priority, int zio_flags,
2908    uint32_t *arc_flags, const zbookmark_t *zb)
2909{
2910	int err;
2911
2912	if (pbuf == NULL) {
2913		/*
2914		 * XXX This happens from traverse callback funcs, for
2915		 * the objset_phys_t block.
2916		 */
2917		return (arc_read_nolock(pio, spa, bp, done, private, priority,
2918		    zio_flags, arc_flags, zb));
2919	}
2920
2921	ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
2922	ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
2923	rw_enter(&pbuf->b_data_lock, RW_READER);
2924
2925	err = arc_read_nolock(pio, spa, bp, done, private, priority,
2926	    zio_flags, arc_flags, zb);
2927	rw_exit(&pbuf->b_data_lock);
2928
2929	return (err);
2930}
2931
2932int
2933arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp,
2934    arc_done_func_t *done, void *private, int priority, int zio_flags,
2935    uint32_t *arc_flags, const zbookmark_t *zb)
2936{
2937	arc_buf_hdr_t *hdr;
2938	arc_buf_t *buf;
2939	kmutex_t *hash_lock;
2940	zio_t *rzio;
2941	uint64_t guid = spa_load_guid(spa);
2942
2943top:
2944	hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
2945	    &hash_lock);
2946	if (hdr && hdr->b_datacnt > 0) {
2947
2948		*arc_flags |= ARC_CACHED;
2949
2950		if (HDR_IO_IN_PROGRESS(hdr)) {
2951
2952			if (*arc_flags & ARC_WAIT) {
2953				cv_wait(&hdr->b_cv, hash_lock);
2954				mutex_exit(hash_lock);
2955				goto top;
2956			}
2957			ASSERT(*arc_flags & ARC_NOWAIT);
2958
2959			if (done) {
2960				arc_callback_t	*acb = NULL;
2961
2962				acb = kmem_zalloc(sizeof (arc_callback_t),
2963				    KM_SLEEP);
2964				acb->acb_done = done;
2965				acb->acb_private = private;
2966				if (pio != NULL)
2967					acb->acb_zio_dummy = zio_null(pio,
2968					    spa, NULL, NULL, NULL, zio_flags);
2969
2970				ASSERT(acb->acb_done != NULL);
2971				acb->acb_next = hdr->b_acb;
2972				hdr->b_acb = acb;
2973				add_reference(hdr, hash_lock, private);
2974				mutex_exit(hash_lock);
2975				return (0);
2976			}
2977			mutex_exit(hash_lock);
2978			return (0);
2979		}
2980
2981		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2982
2983		if (done) {
2984			add_reference(hdr, hash_lock, private);
2985			/*
2986			 * If this block is already in use, create a new
2987			 * copy of the data so that we will be guaranteed
2988			 * that arc_release() will always succeed.
2989			 */
2990			buf = hdr->b_buf;
2991			ASSERT(buf);
2992			ASSERT(buf->b_data);
2993			if (HDR_BUF_AVAILABLE(hdr)) {
2994				ASSERT(buf->b_efunc == NULL);
2995				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
2996			} else {
2997				buf = arc_buf_clone(buf);
2998			}
2999
3000		} else if (*arc_flags & ARC_PREFETCH &&
3001		    refcount_count(&hdr->b_refcnt) == 0) {
3002			hdr->b_flags |= ARC_PREFETCH;
3003		}
3004		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3005		arc_access(hdr, hash_lock);
3006		if (*arc_flags & ARC_L2CACHE)
3007			hdr->b_flags |= ARC_L2CACHE;
3008		mutex_exit(hash_lock);
3009		ARCSTAT_BUMP(arcstat_hits);
3010		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3011		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3012		    data, metadata, hits);
3013
3014		if (done)
3015			done(NULL, buf, private);
3016	} else {
3017		uint64_t size = BP_GET_LSIZE(bp);
3018		arc_callback_t	*acb;
3019		vdev_t *vd = NULL;
3020		uint64_t addr;
3021		boolean_t devw = B_FALSE;
3022
3023		if (hdr == NULL) {
3024			/* this block is not in the cache */
3025			arc_buf_hdr_t	*exists;
3026			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3027			buf = arc_buf_alloc(spa, size, private, type);
3028			hdr = buf->b_hdr;
3029			hdr->b_dva = *BP_IDENTITY(bp);
3030			hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3031			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3032			exists = buf_hash_insert(hdr, &hash_lock);
3033			if (exists) {
3034				/* somebody beat us to the hash insert */
3035				mutex_exit(hash_lock);
3036				buf_discard_identity(hdr);
3037				(void) arc_buf_remove_ref(buf, private);
3038				goto top; /* restart the IO request */
3039			}
3040			/* if this is a prefetch, we don't have a reference */
3041			if (*arc_flags & ARC_PREFETCH) {
3042				(void) remove_reference(hdr, hash_lock,
3043				    private);
3044				hdr->b_flags |= ARC_PREFETCH;
3045			}
3046			if (*arc_flags & ARC_L2CACHE)
3047				hdr->b_flags |= ARC_L2CACHE;
3048			if (BP_GET_LEVEL(bp) > 0)
3049				hdr->b_flags |= ARC_INDIRECT;
3050		} else {
3051			/* this block is in the ghost cache */
3052			ASSERT(GHOST_STATE(hdr->b_state));
3053			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3054			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
3055			ASSERT(hdr->b_buf == NULL);
3056
3057			/* if this is a prefetch, we don't have a reference */
3058			if (*arc_flags & ARC_PREFETCH)
3059				hdr->b_flags |= ARC_PREFETCH;
3060			else
3061				add_reference(hdr, hash_lock, private);
3062			if (*arc_flags & ARC_L2CACHE)
3063				hdr->b_flags |= ARC_L2CACHE;
3064			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3065			buf->b_hdr = hdr;
3066			buf->b_data = NULL;
3067			buf->b_efunc = NULL;
3068			buf->b_private = NULL;
3069			buf->b_next = NULL;
3070			hdr->b_buf = buf;
3071			ASSERT(hdr->b_datacnt == 0);
3072			hdr->b_datacnt = 1;
3073			arc_get_data_buf(buf);
3074			arc_access(hdr, hash_lock);
3075		}
3076
3077		ASSERT(!GHOST_STATE(hdr->b_state));
3078
3079		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3080		acb->acb_done = done;
3081		acb->acb_private = private;
3082
3083		ASSERT(hdr->b_acb == NULL);
3084		hdr->b_acb = acb;
3085		hdr->b_flags |= ARC_IO_IN_PROGRESS;
3086
3087		if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
3088		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3089			devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3090			addr = hdr->b_l2hdr->b_daddr;
3091			/*
3092			 * Lock out device removal.
3093			 */
3094			if (vdev_is_dead(vd) ||
3095			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3096				vd = NULL;
3097		}
3098
3099		mutex_exit(hash_lock);
3100
3101		ASSERT3U(hdr->b_size, ==, size);
3102		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3103		    uint64_t, size, zbookmark_t *, zb);
3104		ARCSTAT_BUMP(arcstat_misses);
3105		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3106		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3107		    data, metadata, misses);
3108
3109		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3110			/*
3111			 * Read from the L2ARC if the following are true:
3112			 * 1. The L2ARC vdev was previously cached.
3113			 * 2. This buffer still has L2ARC metadata.
3114			 * 3. This buffer isn't currently writing to the L2ARC.
3115			 * 4. The L2ARC entry wasn't evicted, which may
3116			 *    also have invalidated the vdev.
3117			 * 5. This isn't prefetch and l2arc_noprefetch is set.
3118			 */
3119			if (hdr->b_l2hdr != NULL &&
3120			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3121			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3122				l2arc_read_callback_t *cb;
3123
3124				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3125				ARCSTAT_BUMP(arcstat_l2_hits);
3126
3127				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3128				    KM_SLEEP);
3129				cb->l2rcb_buf = buf;
3130				cb->l2rcb_spa = spa;
3131				cb->l2rcb_bp = *bp;
3132				cb->l2rcb_zb = *zb;
3133				cb->l2rcb_flags = zio_flags;
3134
3135				/*
3136				 * l2arc read.  The SCL_L2ARC lock will be
3137				 * released by l2arc_read_done().
3138				 */
3139				rzio = zio_read_phys(pio, vd, addr, size,
3140				    buf->b_data, ZIO_CHECKSUM_OFF,
3141				    l2arc_read_done, cb, priority, zio_flags |
3142				    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
3143				    ZIO_FLAG_DONT_PROPAGATE |
3144				    ZIO_FLAG_DONT_RETRY, B_FALSE);
3145				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3146				    zio_t *, rzio);
3147				ARCSTAT_INCR(arcstat_l2_read_bytes, size);
3148
3149				if (*arc_flags & ARC_NOWAIT) {
3150					zio_nowait(rzio);
3151					return (0);
3152				}
3153
3154				ASSERT(*arc_flags & ARC_WAIT);
3155				if (zio_wait(rzio) == 0)
3156					return (0);
3157
3158				/* l2arc read error; goto zio_read() */
3159			} else {
3160				DTRACE_PROBE1(l2arc__miss,
3161				    arc_buf_hdr_t *, hdr);
3162				ARCSTAT_BUMP(arcstat_l2_misses);
3163				if (HDR_L2_WRITING(hdr))
3164					ARCSTAT_BUMP(arcstat_l2_rw_clash);
3165				spa_config_exit(spa, SCL_L2ARC, vd);
3166			}
3167		} else {
3168			if (vd != NULL)
3169				spa_config_exit(spa, SCL_L2ARC, vd);
3170			if (l2arc_ndev != 0) {
3171				DTRACE_PROBE1(l2arc__miss,
3172				    arc_buf_hdr_t *, hdr);
3173				ARCSTAT_BUMP(arcstat_l2_misses);
3174			}
3175		}
3176
3177		rzio = zio_read(pio, spa, bp, buf->b_data, size,
3178		    arc_read_done, buf, priority, zio_flags, zb);
3179
3180		if (*arc_flags & ARC_WAIT)
3181			return (zio_wait(rzio));
3182
3183		ASSERT(*arc_flags & ARC_NOWAIT);
3184		zio_nowait(rzio);
3185	}
3186	return (0);
3187}
3188
3189void
3190arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3191{
3192	ASSERT(buf->b_hdr != NULL);
3193	ASSERT(buf->b_hdr->b_state != arc_anon);
3194	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3195	ASSERT(buf->b_efunc == NULL);
3196	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3197
3198	buf->b_efunc = func;
3199	buf->b_private = private;
3200}
3201
3202/*
3203 * This is used by the DMU to let the ARC know that a buffer is
3204 * being evicted, so the ARC should clean up.  If this arc buf
3205 * is not yet in the evicted state, it will be put there.
3206 */
3207int
3208arc_buf_evict(arc_buf_t *buf)
3209{
3210	arc_buf_hdr_t *hdr;
3211	kmutex_t *hash_lock;
3212	arc_buf_t **bufp;
3213	list_t *list, *evicted_list;
3214	kmutex_t *lock, *evicted_lock;
3215
3216	mutex_enter(&buf->b_evict_lock);
3217	hdr = buf->b_hdr;
3218	if (hdr == NULL) {
3219		/*
3220		 * We are in arc_do_user_evicts().
3221		 */
3222		ASSERT(buf->b_data == NULL);
3223		mutex_exit(&buf->b_evict_lock);
3224		return (0);
3225	} else if (buf->b_data == NULL) {
3226		arc_buf_t copy = *buf; /* structure assignment */
3227		/*
3228		 * We are on the eviction list; process this buffer now
3229		 * but let arc_do_user_evicts() do the reaping.
3230		 */
3231		buf->b_efunc = NULL;
3232		mutex_exit(&buf->b_evict_lock);
3233		VERIFY(copy.b_efunc(&copy) == 0);
3234		return (1);
3235	}
3236	hash_lock = HDR_LOCK(hdr);
3237	mutex_enter(hash_lock);
3238	hdr = buf->b_hdr;
3239	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3240
3241	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3242	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3243
3244	/*
3245	 * Pull this buffer off of the hdr
3246	 */
3247	bufp = &hdr->b_buf;
3248	while (*bufp != buf)
3249		bufp = &(*bufp)->b_next;
3250	*bufp = buf->b_next;
3251
3252	ASSERT(buf->b_data != NULL);
3253	arc_buf_destroy(buf, FALSE, FALSE);
3254
3255	if (hdr->b_datacnt == 0) {
3256		arc_state_t *old_state = hdr->b_state;
3257		arc_state_t *evicted_state;
3258
3259		ASSERT(hdr->b_buf == NULL);
3260		ASSERT(refcount_is_zero(&hdr->b_refcnt));
3261
3262		evicted_state =
3263		    (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3264
3265		get_buf_info(hdr, old_state, &list, &lock);
3266		get_buf_info(hdr, evicted_state, &evicted_list, &evicted_lock);
3267		mutex_enter(lock);
3268		mutex_enter(evicted_lock);
3269
3270		arc_change_state(evicted_state, hdr, hash_lock);
3271		ASSERT(HDR_IN_HASH_TABLE(hdr));
3272		hdr->b_flags |= ARC_IN_HASH_TABLE;
3273		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3274
3275		mutex_exit(evicted_lock);
3276		mutex_exit(lock);
3277	}
3278	mutex_exit(hash_lock);
3279	mutex_exit(&buf->b_evict_lock);
3280
3281	VERIFY(buf->b_efunc(buf) == 0);
3282	buf->b_efunc = NULL;
3283	buf->b_private = NULL;
3284	buf->b_hdr = NULL;
3285	buf->b_next = NULL;
3286	kmem_cache_free(buf_cache, buf);
3287	return (1);
3288}
3289
3290/*
3291 * Release this buffer from the cache.  This must be done
3292 * after a read and prior to modifying the buffer contents.
3293 * If the buffer has more than one reference, we must make
3294 * a new hdr for the buffer.
3295 */
3296void
3297arc_release(arc_buf_t *buf, void *tag)
3298{
3299	arc_buf_hdr_t *hdr;
3300	kmutex_t *hash_lock = NULL;
3301	l2arc_buf_hdr_t *l2hdr;
3302	uint64_t buf_size;
3303
3304	/*
3305	 * It would be nice to assert that if it's DMU metadata (level >
3306	 * 0 || it's the dnode file), then it must be syncing context.
3307	 * But we don't know that information at this level.
3308	 */
3309
3310	mutex_enter(&buf->b_evict_lock);
3311	hdr = buf->b_hdr;
3312
3313	/* this buffer is not on any list */
3314	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3315
3316	if (hdr->b_state == arc_anon) {
3317		/* this buffer is already released */
3318		ASSERT(buf->b_efunc == NULL);
3319	} else {
3320		hash_lock = HDR_LOCK(hdr);
3321		mutex_enter(hash_lock);
3322		hdr = buf->b_hdr;
3323		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3324	}
3325
3326	l2hdr = hdr->b_l2hdr;
3327	if (l2hdr) {
3328		mutex_enter(&l2arc_buflist_mtx);
3329		hdr->b_l2hdr = NULL;
3330		buf_size = hdr->b_size;
3331	}
3332
3333	/*
3334	 * Do we have more than one buf?
3335	 */
3336	if (hdr->b_datacnt > 1) {
3337		arc_buf_hdr_t *nhdr;
3338		arc_buf_t **bufp;
3339		uint64_t blksz = hdr->b_size;
3340		uint64_t spa = hdr->b_spa;
3341		arc_buf_contents_t type = hdr->b_type;
3342		uint32_t flags = hdr->b_flags;
3343
3344		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3345		/*
3346		 * Pull the data off of this hdr and attach it to
3347		 * a new anonymous hdr.
3348		 */
3349		(void) remove_reference(hdr, hash_lock, tag);
3350		bufp = &hdr->b_buf;
3351		while (*bufp != buf)
3352			bufp = &(*bufp)->b_next;
3353		*bufp = buf->b_next;
3354		buf->b_next = NULL;
3355
3356		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3357		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3358		if (refcount_is_zero(&hdr->b_refcnt)) {
3359			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3360			ASSERT3U(*size, >=, hdr->b_size);
3361			atomic_add_64(size, -hdr->b_size);
3362		}
3363		hdr->b_datacnt -= 1;
3364		arc_cksum_verify(buf);
3365
3366		mutex_exit(hash_lock);
3367
3368		nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3369		nhdr->b_size = blksz;
3370		nhdr->b_spa = spa;
3371		nhdr->b_type = type;
3372		nhdr->b_buf = buf;
3373		nhdr->b_state = arc_anon;
3374		nhdr->b_arc_access = 0;
3375		nhdr->b_flags = flags & ARC_L2_WRITING;
3376		nhdr->b_l2hdr = NULL;
3377		nhdr->b_datacnt = 1;
3378		nhdr->b_freeze_cksum = NULL;
3379		(void) refcount_add(&nhdr->b_refcnt, tag);
3380		buf->b_hdr = nhdr;
3381		mutex_exit(&buf->b_evict_lock);
3382		atomic_add_64(&arc_anon->arcs_size, blksz);
3383	} else {
3384		mutex_exit(&buf->b_evict_lock);
3385		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3386		ASSERT(!list_link_active(&hdr->b_arc_node));
3387		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3388		if (hdr->b_state != arc_anon)
3389			arc_change_state(arc_anon, hdr, hash_lock);
3390		hdr->b_arc_access = 0;
3391		if (hash_lock)
3392			mutex_exit(hash_lock);
3393
3394		buf_discard_identity(hdr);
3395		arc_buf_thaw(buf);
3396	}
3397	buf->b_efunc = NULL;
3398	buf->b_private = NULL;
3399
3400	if (l2hdr) {
3401		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3402		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3403		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3404		mutex_exit(&l2arc_buflist_mtx);
3405	}
3406}
3407
3408/*
3409 * Release this buffer.  If it does not match the provided BP, fill it
3410 * with that block's contents.
3411 */
3412/* ARGSUSED */
3413int
3414arc_release_bp(arc_buf_t *buf, void *tag, blkptr_t *bp, spa_t *spa,
3415    zbookmark_t *zb)
3416{
3417	arc_release(buf, tag);
3418	return (0);
3419}
3420
3421int
3422arc_released(arc_buf_t *buf)
3423{
3424	int released;
3425
3426	mutex_enter(&buf->b_evict_lock);
3427	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3428	mutex_exit(&buf->b_evict_lock);
3429	return (released);
3430}
3431
3432int
3433arc_has_callback(arc_buf_t *buf)
3434{
3435	int callback;
3436
3437	mutex_enter(&buf->b_evict_lock);
3438	callback = (buf->b_efunc != NULL);
3439	mutex_exit(&buf->b_evict_lock);
3440	return (callback);
3441}
3442
3443#ifdef ZFS_DEBUG
3444int
3445arc_referenced(arc_buf_t *buf)
3446{
3447	int referenced;
3448
3449	mutex_enter(&buf->b_evict_lock);
3450	referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3451	mutex_exit(&buf->b_evict_lock);
3452	return (referenced);
3453}
3454#endif
3455
3456static void
3457arc_write_ready(zio_t *zio)
3458{
3459	arc_write_callback_t *callback = zio->io_private;
3460	arc_buf_t *buf = callback->awcb_buf;
3461	arc_buf_hdr_t *hdr = buf->b_hdr;
3462
3463	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3464	callback->awcb_ready(zio, buf, callback->awcb_private);
3465
3466	/*
3467	 * If the IO is already in progress, then this is a re-write
3468	 * attempt, so we need to thaw and re-compute the cksum.
3469	 * It is the responsibility of the callback to handle the
3470	 * accounting for any re-write attempt.
3471	 */
3472	if (HDR_IO_IN_PROGRESS(hdr)) {
3473		mutex_enter(&hdr->b_freeze_lock);
3474		if (hdr->b_freeze_cksum != NULL) {
3475			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3476			hdr->b_freeze_cksum = NULL;
3477		}
3478		mutex_exit(&hdr->b_freeze_lock);
3479	}
3480	arc_cksum_compute(buf, B_FALSE);
3481	hdr->b_flags |= ARC_IO_IN_PROGRESS;
3482}
3483
3484static void
3485arc_write_done(zio_t *zio)
3486{
3487	arc_write_callback_t *callback = zio->io_private;
3488	arc_buf_t *buf = callback->awcb_buf;
3489	arc_buf_hdr_t *hdr = buf->b_hdr;
3490
3491	ASSERT(hdr->b_acb == NULL);
3492
3493	if (zio->io_error == 0) {
3494		hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3495		hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3496		hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3497	} else {
3498		ASSERT(BUF_EMPTY(hdr));
3499	}
3500
3501	/*
3502	 * If the block to be written was all-zero, we may have
3503	 * compressed it away.  In this case no write was performed
3504	 * so there will be no dva/birth/checksum.  The buffer must
3505	 * therefore remain anonymous (and uncached).
3506	 */
3507	if (!BUF_EMPTY(hdr)) {
3508		arc_buf_hdr_t *exists;
3509		kmutex_t *hash_lock;
3510
3511		ASSERT(zio->io_error == 0);
3512
3513		arc_cksum_verify(buf);
3514
3515		exists = buf_hash_insert(hdr, &hash_lock);
3516		if (exists) {
3517			/*
3518			 * This can only happen if we overwrite for
3519			 * sync-to-convergence, because we remove
3520			 * buffers from the hash table when we arc_free().
3521			 */
3522			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3523				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3524					panic("bad overwrite, hdr=%p exists=%p",
3525					    (void *)hdr, (void *)exists);
3526				ASSERT(refcount_is_zero(&exists->b_refcnt));
3527				arc_change_state(arc_anon, exists, hash_lock);
3528				mutex_exit(hash_lock);
3529				arc_hdr_destroy(exists);
3530				exists = buf_hash_insert(hdr, &hash_lock);
3531				ASSERT3P(exists, ==, NULL);
3532			} else {
3533				/* Dedup */
3534				ASSERT(hdr->b_datacnt == 1);
3535				ASSERT(hdr->b_state == arc_anon);
3536				ASSERT(BP_GET_DEDUP(zio->io_bp));
3537				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3538			}
3539		}
3540		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3541		/* if it's not anon, we are doing a scrub */
3542		if (!exists && hdr->b_state == arc_anon)
3543			arc_access(hdr, hash_lock);
3544		mutex_exit(hash_lock);
3545	} else {
3546		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3547	}
3548
3549	ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3550	callback->awcb_done(zio, buf, callback->awcb_private);
3551
3552	kmem_free(callback, sizeof (arc_write_callback_t));
3553}
3554
3555zio_t *
3556arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3557    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
3558    arc_done_func_t *ready, arc_done_func_t *done, void *private,
3559    int priority, int zio_flags, const zbookmark_t *zb)
3560{
3561	arc_buf_hdr_t *hdr = buf->b_hdr;
3562	arc_write_callback_t *callback;
3563	zio_t *zio;
3564
3565	ASSERT(ready != NULL);
3566	ASSERT(done != NULL);
3567	ASSERT(!HDR_IO_ERROR(hdr));
3568	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3569	ASSERT(hdr->b_acb == NULL);
3570	if (l2arc)
3571		hdr->b_flags |= ARC_L2CACHE;
3572	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3573	callback->awcb_ready = ready;
3574	callback->awcb_done = done;
3575	callback->awcb_private = private;
3576	callback->awcb_buf = buf;
3577
3578	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3579	    arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3580
3581	return (zio);
3582}
3583
3584static int
3585arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
3586{
3587#ifdef _KERNEL
3588	uint64_t available_memory =
3589	    ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count);
3590	static uint64_t page_load = 0;
3591	static uint64_t last_txg = 0;
3592
3593#ifdef sun
3594#if defined(__i386)
3595	available_memory =
3596	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3597#endif
3598#endif	/* sun */
3599	if (available_memory >= zfs_write_limit_max)
3600		return (0);
3601
3602	if (txg > last_txg) {
3603		last_txg = txg;
3604		page_load = 0;
3605	}
3606	/*
3607	 * If we are in pageout, we know that memory is already tight,
3608	 * the arc is already going to be evicting, so we just want to
3609	 * continue to let page writes occur as quickly as possible.
3610	 */
3611	if (curproc == pageproc) {
3612		if (page_load > available_memory / 4)
3613			return (ERESTART);
3614		/* Note: reserve is inflated, so we deflate */
3615		page_load += reserve / 8;
3616		return (0);
3617	} else if (page_load > 0 && arc_reclaim_needed()) {
3618		/* memory is low, delay before restarting */
3619		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3620		return (EAGAIN);
3621	}
3622	page_load = 0;
3623
3624	if (arc_size > arc_c_min) {
3625		uint64_t evictable_memory =
3626		    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3627		    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3628		    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3629		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3630		available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3631	}
3632
3633	if (inflight_data > available_memory / 4) {
3634		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3635		return (ERESTART);
3636	}
3637#endif
3638	return (0);
3639}
3640
3641void
3642arc_tempreserve_clear(uint64_t reserve)
3643{
3644	atomic_add_64(&arc_tempreserve, -reserve);
3645	ASSERT((int64_t)arc_tempreserve >= 0);
3646}
3647
3648int
3649arc_tempreserve_space(uint64_t reserve, uint64_t txg)
3650{
3651	int error;
3652	uint64_t anon_size;
3653
3654#ifdef ZFS_DEBUG
3655	/*
3656	 * Once in a while, fail for no reason.  Everything should cope.
3657	 */
3658	if (spa_get_random(10000) == 0) {
3659		dprintf("forcing random failure\n");
3660		return (ERESTART);
3661	}
3662#endif
3663	if (reserve > arc_c/4 && !arc_no_grow)
3664		arc_c = MIN(arc_c_max, reserve * 4);
3665	if (reserve > arc_c)
3666		return (ENOMEM);
3667
3668	/*
3669	 * Don't count loaned bufs as in flight dirty data to prevent long
3670	 * network delays from blocking transactions that are ready to be
3671	 * assigned to a txg.
3672	 */
3673	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3674
3675	/*
3676	 * Writes will, almost always, require additional memory allocations
3677	 * in order to compress/encrypt/etc the data.  We therefor need to
3678	 * make sure that there is sufficient available memory for this.
3679	 */
3680	if (error = arc_memory_throttle(reserve, anon_size, txg))
3681		return (error);
3682
3683	/*
3684	 * Throttle writes when the amount of dirty data in the cache
3685	 * gets too large.  We try to keep the cache less than half full
3686	 * of dirty blocks so that our sync times don't grow too large.
3687	 * Note: if two requests come in concurrently, we might let them
3688	 * both succeed, when one of them should fail.  Not a huge deal.
3689	 */
3690
3691	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3692	    anon_size > arc_c / 4) {
3693		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3694		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3695		    arc_tempreserve>>10,
3696		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3697		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3698		    reserve>>10, arc_c>>10);
3699		return (ERESTART);
3700	}
3701	atomic_add_64(&arc_tempreserve, reserve);
3702	return (0);
3703}
3704
3705static kmutex_t arc_lowmem_lock;
3706#ifdef _KERNEL
3707static eventhandler_tag arc_event_lowmem = NULL;
3708
3709static void
3710arc_lowmem(void *arg __unused, int howto __unused)
3711{
3712
3713	/* Serialize access via arc_lowmem_lock. */
3714	mutex_enter(&arc_lowmem_lock);
3715	mutex_enter(&arc_reclaim_thr_lock);
3716	needfree = 1;
3717	cv_signal(&arc_reclaim_thr_cv);
3718	while (needfree)
3719		msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
3720	mutex_exit(&arc_reclaim_thr_lock);
3721	mutex_exit(&arc_lowmem_lock);
3722}
3723#endif
3724
3725void
3726arc_init(void)
3727{
3728	int i, prefetch_tunable_set = 0;
3729
3730	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3731	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3732	mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
3733
3734	/* Convert seconds to clock ticks */
3735	arc_min_prefetch_lifespan = 1 * hz;
3736
3737	/* Start out with 1/8 of all memory */
3738	arc_c = kmem_size() / 8;
3739
3740#ifdef sun
3741#ifdef _KERNEL
3742	/*
3743	 * On architectures where the physical memory can be larger
3744	 * than the addressable space (intel in 32-bit mode), we may
3745	 * need to limit the cache to 1/8 of VM size.
3746	 */
3747	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3748#endif
3749#endif	/* sun */
3750	/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
3751	arc_c_min = MAX(arc_c / 4, 64<<18);
3752	/* set max to 1/2 of all memory, or all but 1GB, whichever is more */
3753	if (arc_c * 8 >= 1<<30)
3754		arc_c_max = (arc_c * 8) - (1<<30);
3755	else
3756		arc_c_max = arc_c_min;
3757	arc_c_max = MAX(arc_c * 5, arc_c_max);
3758
3759#ifdef _KERNEL
3760	/*
3761	 * Allow the tunables to override our calculations if they are
3762	 * reasonable (ie. over 16MB)
3763	 */
3764	if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size())
3765		arc_c_max = zfs_arc_max;
3766	if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max)
3767		arc_c_min = zfs_arc_min;
3768#endif
3769
3770	arc_c = arc_c_max;
3771	arc_p = (arc_c >> 1);
3772
3773	/* limit meta-data to 1/4 of the arc capacity */
3774	arc_meta_limit = arc_c_max / 4;
3775
3776	/* Allow the tunable to override if it is reasonable */
3777	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
3778		arc_meta_limit = zfs_arc_meta_limit;
3779
3780	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
3781		arc_c_min = arc_meta_limit / 2;
3782
3783	if (zfs_arc_grow_retry > 0)
3784		arc_grow_retry = zfs_arc_grow_retry;
3785
3786	if (zfs_arc_shrink_shift > 0)
3787		arc_shrink_shift = zfs_arc_shrink_shift;
3788
3789	if (zfs_arc_p_min_shift > 0)
3790		arc_p_min_shift = zfs_arc_p_min_shift;
3791
3792	/* if kmem_flags are set, lets try to use less memory */
3793	if (kmem_debugging())
3794		arc_c = arc_c / 2;
3795	if (arc_c < arc_c_min)
3796		arc_c = arc_c_min;
3797
3798	zfs_arc_min = arc_c_min;
3799	zfs_arc_max = arc_c_max;
3800
3801	arc_anon = &ARC_anon;
3802	arc_mru = &ARC_mru;
3803	arc_mru_ghost = &ARC_mru_ghost;
3804	arc_mfu = &ARC_mfu;
3805	arc_mfu_ghost = &ARC_mfu_ghost;
3806	arc_l2c_only = &ARC_l2c_only;
3807	arc_size = 0;
3808
3809	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
3810		mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
3811		    NULL, MUTEX_DEFAULT, NULL);
3812		mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
3813		    NULL, MUTEX_DEFAULT, NULL);
3814		mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
3815		    NULL, MUTEX_DEFAULT, NULL);
3816		mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
3817		    NULL, MUTEX_DEFAULT, NULL);
3818		mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
3819		    NULL, MUTEX_DEFAULT, NULL);
3820		mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
3821		    NULL, MUTEX_DEFAULT, NULL);
3822
3823		list_create(&arc_mru->arcs_lists[i],
3824		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3825		list_create(&arc_mru_ghost->arcs_lists[i],
3826		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3827		list_create(&arc_mfu->arcs_lists[i],
3828		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3829		list_create(&arc_mfu_ghost->arcs_lists[i],
3830		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3831		list_create(&arc_mfu_ghost->arcs_lists[i],
3832		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3833		list_create(&arc_l2c_only->arcs_lists[i],
3834		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
3835	}
3836
3837	buf_init();
3838
3839	arc_thread_exit = 0;
3840	arc_eviction_list = NULL;
3841	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
3842	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
3843
3844	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
3845	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3846
3847	if (arc_ksp != NULL) {
3848		arc_ksp->ks_data = &arc_stats;
3849		kstat_install(arc_ksp);
3850	}
3851
3852	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
3853	    TS_RUN, minclsyspri);
3854
3855#ifdef _KERNEL
3856	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
3857	    EVENTHANDLER_PRI_FIRST);
3858#endif
3859
3860	arc_dead = FALSE;
3861	arc_warm = B_FALSE;
3862
3863	if (zfs_write_limit_max == 0)
3864		zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
3865	else
3866		zfs_write_limit_shift = 0;
3867	mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
3868
3869#ifdef _KERNEL
3870	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
3871		prefetch_tunable_set = 1;
3872
3873#ifdef __i386__
3874	if (prefetch_tunable_set == 0) {
3875		printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
3876		    "-- to enable,\n");
3877		printf("            add \"vfs.zfs.prefetch_disable=0\" "
3878		    "to /boot/loader.conf.\n");
3879		zfs_prefetch_disable = 1;
3880	}
3881#else
3882	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
3883	    prefetch_tunable_set == 0) {
3884		printf("ZFS NOTICE: Prefetch is disabled by default if less "
3885		    "than 4GB of RAM is present;\n"
3886		    "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
3887		    "to /boot/loader.conf.\n");
3888		zfs_prefetch_disable = 1;
3889	}
3890#endif
3891	/* Warn about ZFS memory and address space requirements. */
3892	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
3893		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
3894		    "expect unstable behavior.\n");
3895	}
3896	if (kmem_size() < 512 * (1 << 20)) {
3897		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
3898		    "expect unstable behavior.\n");
3899		printf("             Consider tuning vm.kmem_size and "
3900		    "vm.kmem_size_max\n");
3901		printf("             in /boot/loader.conf.\n");
3902	}
3903#endif
3904}
3905
3906void
3907arc_fini(void)
3908{
3909	int i;
3910
3911	mutex_enter(&arc_reclaim_thr_lock);
3912	arc_thread_exit = 1;
3913	cv_signal(&arc_reclaim_thr_cv);
3914	while (arc_thread_exit != 0)
3915		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
3916	mutex_exit(&arc_reclaim_thr_lock);
3917
3918	arc_flush(NULL);
3919
3920	arc_dead = TRUE;
3921
3922	if (arc_ksp != NULL) {
3923		kstat_delete(arc_ksp);
3924		arc_ksp = NULL;
3925	}
3926
3927	mutex_destroy(&arc_eviction_mtx);
3928	mutex_destroy(&arc_reclaim_thr_lock);
3929	cv_destroy(&arc_reclaim_thr_cv);
3930
3931	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
3932		list_destroy(&arc_mru->arcs_lists[i]);
3933		list_destroy(&arc_mru_ghost->arcs_lists[i]);
3934		list_destroy(&arc_mfu->arcs_lists[i]);
3935		list_destroy(&arc_mfu_ghost->arcs_lists[i]);
3936		list_destroy(&arc_l2c_only->arcs_lists[i]);
3937
3938		mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
3939		mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
3940		mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
3941		mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
3942		mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
3943		mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
3944	}
3945
3946	mutex_destroy(&zfs_write_limit_lock);
3947
3948	buf_fini();
3949
3950	ASSERT(arc_loaned_bytes == 0);
3951
3952	mutex_destroy(&arc_lowmem_lock);
3953#ifdef _KERNEL
3954	if (arc_event_lowmem != NULL)
3955		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
3956#endif
3957}
3958
3959/*
3960 * Level 2 ARC
3961 *
3962 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
3963 * It uses dedicated storage devices to hold cached data, which are populated
3964 * using large infrequent writes.  The main role of this cache is to boost
3965 * the performance of random read workloads.  The intended L2ARC devices
3966 * include short-stroked disks, solid state disks, and other media with
3967 * substantially faster read latency than disk.
3968 *
3969 *                 +-----------------------+
3970 *                 |         ARC           |
3971 *                 +-----------------------+
3972 *                    |         ^     ^
3973 *                    |         |     |
3974 *      l2arc_feed_thread()    arc_read()
3975 *                    |         |     |
3976 *                    |  l2arc read   |
3977 *                    V         |     |
3978 *               +---------------+    |
3979 *               |     L2ARC     |    |
3980 *               +---------------+    |
3981 *                   |    ^           |
3982 *          l2arc_write() |           |
3983 *                   |    |           |
3984 *                   V    |           |
3985 *                 +-------+      +-------+
3986 *                 | vdev  |      | vdev  |
3987 *                 | cache |      | cache |
3988 *                 +-------+      +-------+
3989 *                 +=========+     .-----.
3990 *                 :  L2ARC  :    |-_____-|
3991 *                 : devices :    | Disks |
3992 *                 +=========+    `-_____-'
3993 *
3994 * Read requests are satisfied from the following sources, in order:
3995 *
3996 *	1) ARC
3997 *	2) vdev cache of L2ARC devices
3998 *	3) L2ARC devices
3999 *	4) vdev cache of disks
4000 *	5) disks
4001 *
4002 * Some L2ARC device types exhibit extremely slow write performance.
4003 * To accommodate for this there are some significant differences between
4004 * the L2ARC and traditional cache design:
4005 *
4006 * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
4007 * the ARC behave as usual, freeing buffers and placing headers on ghost
4008 * lists.  The ARC does not send buffers to the L2ARC during eviction as
4009 * this would add inflated write latencies for all ARC memory pressure.
4010 *
4011 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4012 * It does this by periodically scanning buffers from the eviction-end of
4013 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4014 * not already there.  It scans until a headroom of buffers is satisfied,
4015 * which itself is a buffer for ARC eviction.  The thread that does this is
4016 * l2arc_feed_thread(), illustrated below; example sizes are included to
4017 * provide a better sense of ratio than this diagram:
4018 *
4019 *	       head -->                        tail
4020 *	        +---------------------+----------+
4021 *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
4022 *	        +---------------------+----------+   |   o L2ARC eligible
4023 *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
4024 *	        +---------------------+----------+   |
4025 *	             15.9 Gbytes      ^ 32 Mbytes    |
4026 *	                           headroom          |
4027 *	                                      l2arc_feed_thread()
4028 *	                                             |
4029 *	                 l2arc write hand <--[oooo]--'
4030 *	                         |           8 Mbyte
4031 *	                         |          write max
4032 *	                         V
4033 *		  +==============================+
4034 *	L2ARC dev |####|#|###|###|    |####| ... |
4035 *	          +==============================+
4036 *	                     32 Gbytes
4037 *
4038 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4039 * evicted, then the L2ARC has cached a buffer much sooner than it probably
4040 * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
4041 * safe to say that this is an uncommon case, since buffers at the end of
4042 * the ARC lists have moved there due to inactivity.
4043 *
4044 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4045 * then the L2ARC simply misses copying some buffers.  This serves as a
4046 * pressure valve to prevent heavy read workloads from both stalling the ARC
4047 * with waits and clogging the L2ARC with writes.  This also helps prevent
4048 * the potential for the L2ARC to churn if it attempts to cache content too
4049 * quickly, such as during backups of the entire pool.
4050 *
4051 * 5. After system boot and before the ARC has filled main memory, there are
4052 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4053 * lists can remain mostly static.  Instead of searching from tail of these
4054 * lists as pictured, the l2arc_feed_thread() will search from the list heads
4055 * for eligible buffers, greatly increasing its chance of finding them.
4056 *
4057 * The L2ARC device write speed is also boosted during this time so that
4058 * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
4059 * there are no L2ARC reads, and no fear of degrading read performance
4060 * through increased writes.
4061 *
4062 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4063 * the vdev queue can aggregate them into larger and fewer writes.  Each
4064 * device is written to in a rotor fashion, sweeping writes through
4065 * available space then repeating.
4066 *
4067 * 7. The L2ARC does not store dirty content.  It never needs to flush
4068 * write buffers back to disk based storage.
4069 *
4070 * 8. If an ARC buffer is written (and dirtied) which also exists in the
4071 * L2ARC, the now stale L2ARC buffer is immediately dropped.
4072 *
4073 * The performance of the L2ARC can be tweaked by a number of tunables, which
4074 * may be necessary for different workloads:
4075 *
4076 *	l2arc_write_max		max write bytes per interval
4077 *	l2arc_write_boost	extra write bytes during device warmup
4078 *	l2arc_noprefetch	skip caching prefetched buffers
4079 *	l2arc_headroom		number of max device writes to precache
4080 *	l2arc_feed_secs		seconds between L2ARC writing
4081 *
4082 * Tunables may be removed or added as future performance improvements are
4083 * integrated, and also may become zpool properties.
4084 *
4085 * There are three key functions that control how the L2ARC warms up:
4086 *
4087 *	l2arc_write_eligible()	check if a buffer is eligible to cache
4088 *	l2arc_write_size()	calculate how much to write
4089 *	l2arc_write_interval()	calculate sleep delay between writes
4090 *
4091 * These three functions determine what to write, how much, and how quickly
4092 * to send writes.
4093 */
4094
4095static boolean_t
4096l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4097{
4098	/*
4099	 * A buffer is *not* eligible for the L2ARC if it:
4100	 * 1. belongs to a different spa.
4101	 * 2. is already cached on the L2ARC.
4102	 * 3. has an I/O in progress (it may be an incomplete read).
4103	 * 4. is flagged not eligible (zfs property).
4104	 */
4105	if (ab->b_spa != spa_guid) {
4106		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
4107		return (B_FALSE);
4108	}
4109	if (ab->b_l2hdr != NULL) {
4110		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
4111		return (B_FALSE);
4112	}
4113	if (HDR_IO_IN_PROGRESS(ab)) {
4114		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
4115		return (B_FALSE);
4116	}
4117	if (!HDR_L2CACHE(ab)) {
4118		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
4119		return (B_FALSE);
4120	}
4121
4122	return (B_TRUE);
4123}
4124
4125static uint64_t
4126l2arc_write_size(l2arc_dev_t *dev)
4127{
4128	uint64_t size;
4129
4130	size = dev->l2ad_write;
4131
4132	if (arc_warm == B_FALSE)
4133		size += dev->l2ad_boost;
4134
4135	return (size);
4136
4137}
4138
4139static clock_t
4140l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4141{
4142	clock_t interval, next, now;
4143
4144	/*
4145	 * If the ARC lists are busy, increase our write rate; if the
4146	 * lists are stale, idle back.  This is achieved by checking
4147	 * how much we previously wrote - if it was more than half of
4148	 * what we wanted, schedule the next write much sooner.
4149	 */
4150	if (l2arc_feed_again && wrote > (wanted / 2))
4151		interval = (hz * l2arc_feed_min_ms) / 1000;
4152	else
4153		interval = hz * l2arc_feed_secs;
4154
4155	now = ddi_get_lbolt();
4156	next = MAX(now, MIN(now + interval, began + interval));
4157
4158	return (next);
4159}
4160
4161static void
4162l2arc_hdr_stat_add(void)
4163{
4164	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4165	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4166}
4167
4168static void
4169l2arc_hdr_stat_remove(void)
4170{
4171	ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4172	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4173}
4174
4175/*
4176 * Cycle through L2ARC devices.  This is how L2ARC load balances.
4177 * If a device is returned, this also returns holding the spa config lock.
4178 */
4179static l2arc_dev_t *
4180l2arc_dev_get_next(void)
4181{
4182	l2arc_dev_t *first, *next = NULL;
4183
4184	/*
4185	 * Lock out the removal of spas (spa_namespace_lock), then removal
4186	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4187	 * both locks will be dropped and a spa config lock held instead.
4188	 */
4189	mutex_enter(&spa_namespace_lock);
4190	mutex_enter(&l2arc_dev_mtx);
4191
4192	/* if there are no vdevs, there is nothing to do */
4193	if (l2arc_ndev == 0)
4194		goto out;
4195
4196	first = NULL;
4197	next = l2arc_dev_last;
4198	do {
4199		/* loop around the list looking for a non-faulted vdev */
4200		if (next == NULL) {
4201			next = list_head(l2arc_dev_list);
4202		} else {
4203			next = list_next(l2arc_dev_list, next);
4204			if (next == NULL)
4205				next = list_head(l2arc_dev_list);
4206		}
4207
4208		/* if we have come back to the start, bail out */
4209		if (first == NULL)
4210			first = next;
4211		else if (next == first)
4212			break;
4213
4214	} while (vdev_is_dead(next->l2ad_vdev));
4215
4216	/* if we were unable to find any usable vdevs, return NULL */
4217	if (vdev_is_dead(next->l2ad_vdev))
4218		next = NULL;
4219
4220	l2arc_dev_last = next;
4221
4222out:
4223	mutex_exit(&l2arc_dev_mtx);
4224
4225	/*
4226	 * Grab the config lock to prevent the 'next' device from being
4227	 * removed while we are writing to it.
4228	 */
4229	if (next != NULL)
4230		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4231	mutex_exit(&spa_namespace_lock);
4232
4233	return (next);
4234}
4235
4236/*
4237 * Free buffers that were tagged for destruction.
4238 */
4239static void
4240l2arc_do_free_on_write()
4241{
4242	list_t *buflist;
4243	l2arc_data_free_t *df, *df_prev;
4244
4245	mutex_enter(&l2arc_free_on_write_mtx);
4246	buflist = l2arc_free_on_write;
4247
4248	for (df = list_tail(buflist); df; df = df_prev) {
4249		df_prev = list_prev(buflist, df);
4250		ASSERT(df->l2df_data != NULL);
4251		ASSERT(df->l2df_func != NULL);
4252		df->l2df_func(df->l2df_data, df->l2df_size);
4253		list_remove(buflist, df);
4254		kmem_free(df, sizeof (l2arc_data_free_t));
4255	}
4256
4257	mutex_exit(&l2arc_free_on_write_mtx);
4258}
4259
4260/*
4261 * A write to a cache device has completed.  Update all headers to allow
4262 * reads from these buffers to begin.
4263 */
4264static void
4265l2arc_write_done(zio_t *zio)
4266{
4267	l2arc_write_callback_t *cb;
4268	l2arc_dev_t *dev;
4269	list_t *buflist;
4270	arc_buf_hdr_t *head, *ab, *ab_prev;
4271	l2arc_buf_hdr_t *abl2;
4272	kmutex_t *hash_lock;
4273
4274	cb = zio->io_private;
4275	ASSERT(cb != NULL);
4276	dev = cb->l2wcb_dev;
4277	ASSERT(dev != NULL);
4278	head = cb->l2wcb_head;
4279	ASSERT(head != NULL);
4280	buflist = dev->l2ad_buflist;
4281	ASSERT(buflist != NULL);
4282	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4283	    l2arc_write_callback_t *, cb);
4284
4285	if (zio->io_error != 0)
4286		ARCSTAT_BUMP(arcstat_l2_writes_error);
4287
4288	mutex_enter(&l2arc_buflist_mtx);
4289
4290	/*
4291	 * All writes completed, or an error was hit.
4292	 */
4293	for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4294		ab_prev = list_prev(buflist, ab);
4295
4296		hash_lock = HDR_LOCK(ab);
4297		if (!mutex_tryenter(hash_lock)) {
4298			/*
4299			 * This buffer misses out.  It may be in a stage
4300			 * of eviction.  Its ARC_L2_WRITING flag will be
4301			 * left set, denying reads to this buffer.
4302			 */
4303			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4304			continue;
4305		}
4306
4307		if (zio->io_error != 0) {
4308			/*
4309			 * Error - drop L2ARC entry.
4310			 */
4311			list_remove(buflist, ab);
4312			abl2 = ab->b_l2hdr;
4313			ab->b_l2hdr = NULL;
4314			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4315			ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4316		}
4317
4318		/*
4319		 * Allow ARC to begin reads to this L2ARC entry.
4320		 */
4321		ab->b_flags &= ~ARC_L2_WRITING;
4322
4323		mutex_exit(hash_lock);
4324	}
4325
4326	atomic_inc_64(&l2arc_writes_done);
4327	list_remove(buflist, head);
4328	kmem_cache_free(hdr_cache, head);
4329	mutex_exit(&l2arc_buflist_mtx);
4330
4331	l2arc_do_free_on_write();
4332
4333	kmem_free(cb, sizeof (l2arc_write_callback_t));
4334}
4335
4336/*
4337 * A read to a cache device completed.  Validate buffer contents before
4338 * handing over to the regular ARC routines.
4339 */
4340static void
4341l2arc_read_done(zio_t *zio)
4342{
4343	l2arc_read_callback_t *cb;
4344	arc_buf_hdr_t *hdr;
4345	arc_buf_t *buf;
4346	kmutex_t *hash_lock;
4347	int equal;
4348
4349	ASSERT(zio->io_vd != NULL);
4350	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4351
4352	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4353
4354	cb = zio->io_private;
4355	ASSERT(cb != NULL);
4356	buf = cb->l2rcb_buf;
4357	ASSERT(buf != NULL);
4358
4359	hash_lock = HDR_LOCK(buf->b_hdr);
4360	mutex_enter(hash_lock);
4361	hdr = buf->b_hdr;
4362	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4363
4364	/*
4365	 * Check this survived the L2ARC journey.
4366	 */
4367	equal = arc_cksum_equal(buf);
4368	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4369		mutex_exit(hash_lock);
4370		zio->io_private = buf;
4371		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
4372		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
4373		arc_read_done(zio);
4374	} else {
4375		mutex_exit(hash_lock);
4376		/*
4377		 * Buffer didn't survive caching.  Increment stats and
4378		 * reissue to the original storage device.
4379		 */
4380		if (zio->io_error != 0) {
4381			ARCSTAT_BUMP(arcstat_l2_io_error);
4382		} else {
4383			zio->io_error = EIO;
4384		}
4385		if (!equal)
4386			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4387
4388		/*
4389		 * If there's no waiter, issue an async i/o to the primary
4390		 * storage now.  If there *is* a waiter, the caller must
4391		 * issue the i/o in a context where it's OK to block.
4392		 */
4393		if (zio->io_waiter == NULL) {
4394			zio_t *pio = zio_unique_parent(zio);
4395
4396			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4397
4398			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4399			    buf->b_data, zio->io_size, arc_read_done, buf,
4400			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4401		}
4402	}
4403
4404	kmem_free(cb, sizeof (l2arc_read_callback_t));
4405}
4406
4407/*
4408 * This is the list priority from which the L2ARC will search for pages to
4409 * cache.  This is used within loops (0..3) to cycle through lists in the
4410 * desired order.  This order can have a significant effect on cache
4411 * performance.
4412 *
4413 * Currently the metadata lists are hit first, MFU then MRU, followed by
4414 * the data lists.  This function returns a locked list, and also returns
4415 * the lock pointer.
4416 */
4417static list_t *
4418l2arc_list_locked(int list_num, kmutex_t **lock)
4419{
4420	list_t *list;
4421	int idx;
4422
4423	ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
4424
4425	if (list_num < ARC_BUFC_NUMMETADATALISTS) {
4426		idx = list_num;
4427		list = &arc_mfu->arcs_lists[idx];
4428		*lock = ARCS_LOCK(arc_mfu, idx);
4429	} else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
4430		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4431		list = &arc_mru->arcs_lists[idx];
4432		*lock = ARCS_LOCK(arc_mru, idx);
4433	} else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
4434		ARC_BUFC_NUMDATALISTS)) {
4435		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4436		list = &arc_mfu->arcs_lists[idx];
4437		*lock = ARCS_LOCK(arc_mfu, idx);
4438	} else {
4439		idx = list_num - ARC_BUFC_NUMLISTS;
4440		list = &arc_mru->arcs_lists[idx];
4441		*lock = ARCS_LOCK(arc_mru, idx);
4442	}
4443
4444	ASSERT(!(MUTEX_HELD(*lock)));
4445	mutex_enter(*lock);
4446	return (list);
4447}
4448
4449/*
4450 * Evict buffers from the device write hand to the distance specified in
4451 * bytes.  This distance may span populated buffers, it may span nothing.
4452 * This is clearing a region on the L2ARC device ready for writing.
4453 * If the 'all' boolean is set, every buffer is evicted.
4454 */
4455static void
4456l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4457{
4458	list_t *buflist;
4459	l2arc_buf_hdr_t *abl2;
4460	arc_buf_hdr_t *ab, *ab_prev;
4461	kmutex_t *hash_lock;
4462	uint64_t taddr;
4463
4464	buflist = dev->l2ad_buflist;
4465
4466	if (buflist == NULL)
4467		return;
4468
4469	if (!all && dev->l2ad_first) {
4470		/*
4471		 * This is the first sweep through the device.  There is
4472		 * nothing to evict.
4473		 */
4474		return;
4475	}
4476
4477	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4478		/*
4479		 * When nearing the end of the device, evict to the end
4480		 * before the device write hand jumps to the start.
4481		 */
4482		taddr = dev->l2ad_end;
4483	} else {
4484		taddr = dev->l2ad_hand + distance;
4485	}
4486	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4487	    uint64_t, taddr, boolean_t, all);
4488
4489top:
4490	mutex_enter(&l2arc_buflist_mtx);
4491	for (ab = list_tail(buflist); ab; ab = ab_prev) {
4492		ab_prev = list_prev(buflist, ab);
4493
4494		hash_lock = HDR_LOCK(ab);
4495		if (!mutex_tryenter(hash_lock)) {
4496			/*
4497			 * Missed the hash lock.  Retry.
4498			 */
4499			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4500			mutex_exit(&l2arc_buflist_mtx);
4501			mutex_enter(hash_lock);
4502			mutex_exit(hash_lock);
4503			goto top;
4504		}
4505
4506		if (HDR_L2_WRITE_HEAD(ab)) {
4507			/*
4508			 * We hit a write head node.  Leave it for
4509			 * l2arc_write_done().
4510			 */
4511			list_remove(buflist, ab);
4512			mutex_exit(hash_lock);
4513			continue;
4514		}
4515
4516		if (!all && ab->b_l2hdr != NULL &&
4517		    (ab->b_l2hdr->b_daddr > taddr ||
4518		    ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4519			/*
4520			 * We've evicted to the target address,
4521			 * or the end of the device.
4522			 */
4523			mutex_exit(hash_lock);
4524			break;
4525		}
4526
4527		if (HDR_FREE_IN_PROGRESS(ab)) {
4528			/*
4529			 * Already on the path to destruction.
4530			 */
4531			mutex_exit(hash_lock);
4532			continue;
4533		}
4534
4535		if (ab->b_state == arc_l2c_only) {
4536			ASSERT(!HDR_L2_READING(ab));
4537			/*
4538			 * This doesn't exist in the ARC.  Destroy.
4539			 * arc_hdr_destroy() will call list_remove()
4540			 * and decrement arcstat_l2_size.
4541			 */
4542			arc_change_state(arc_anon, ab, hash_lock);
4543			arc_hdr_destroy(ab);
4544		} else {
4545			/*
4546			 * Invalidate issued or about to be issued
4547			 * reads, since we may be about to write
4548			 * over this location.
4549			 */
4550			if (HDR_L2_READING(ab)) {
4551				ARCSTAT_BUMP(arcstat_l2_evict_reading);
4552				ab->b_flags |= ARC_L2_EVICTED;
4553			}
4554
4555			/*
4556			 * Tell ARC this no longer exists in L2ARC.
4557			 */
4558			if (ab->b_l2hdr != NULL) {
4559				abl2 = ab->b_l2hdr;
4560				ab->b_l2hdr = NULL;
4561				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4562				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4563			}
4564			list_remove(buflist, ab);
4565
4566			/*
4567			 * This may have been leftover after a
4568			 * failed write.
4569			 */
4570			ab->b_flags &= ~ARC_L2_WRITING;
4571		}
4572		mutex_exit(hash_lock);
4573	}
4574	mutex_exit(&l2arc_buflist_mtx);
4575
4576	vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4577	dev->l2ad_evict = taddr;
4578}
4579
4580/*
4581 * Find and write ARC buffers to the L2ARC device.
4582 *
4583 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4584 * for reading until they have completed writing.
4585 */
4586static uint64_t
4587l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
4588{
4589	arc_buf_hdr_t *ab, *ab_prev, *head;
4590	l2arc_buf_hdr_t *hdrl2;
4591	list_t *list;
4592	uint64_t passed_sz, write_sz, buf_sz, headroom;
4593	void *buf_data;
4594	kmutex_t *hash_lock, *list_lock;
4595	boolean_t have_lock, full;
4596	l2arc_write_callback_t *cb;
4597	zio_t *pio, *wzio;
4598	uint64_t guid = spa_load_guid(spa);
4599	int try;
4600
4601	ASSERT(dev->l2ad_vdev != NULL);
4602
4603	pio = NULL;
4604	write_sz = 0;
4605	full = B_FALSE;
4606	head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4607	head->b_flags |= ARC_L2_WRITE_HEAD;
4608
4609	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
4610	/*
4611	 * Copy buffers for L2ARC writing.
4612	 */
4613	mutex_enter(&l2arc_buflist_mtx);
4614	for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
4615		list = l2arc_list_locked(try, &list_lock);
4616		passed_sz = 0;
4617		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
4618
4619		/*
4620		 * L2ARC fast warmup.
4621		 *
4622		 * Until the ARC is warm and starts to evict, read from the
4623		 * head of the ARC lists rather than the tail.
4624		 */
4625		headroom = target_sz * l2arc_headroom;
4626		if (arc_warm == B_FALSE)
4627			ab = list_head(list);
4628		else
4629			ab = list_tail(list);
4630		if (ab == NULL)
4631			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
4632
4633		for (; ab; ab = ab_prev) {
4634			if (arc_warm == B_FALSE)
4635				ab_prev = list_next(list, ab);
4636			else
4637				ab_prev = list_prev(list, ab);
4638			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size);
4639
4640			hash_lock = HDR_LOCK(ab);
4641			have_lock = MUTEX_HELD(hash_lock);
4642			if (!have_lock && !mutex_tryenter(hash_lock)) {
4643				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
4644				/*
4645				 * Skip this buffer rather than waiting.
4646				 */
4647				continue;
4648			}
4649
4650			passed_sz += ab->b_size;
4651			if (passed_sz > headroom) {
4652				/*
4653				 * Searched too far.
4654				 */
4655				mutex_exit(hash_lock);
4656				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
4657				break;
4658			}
4659
4660			if (!l2arc_write_eligible(guid, ab)) {
4661				mutex_exit(hash_lock);
4662				continue;
4663			}
4664
4665			if ((write_sz + ab->b_size) > target_sz) {
4666				full = B_TRUE;
4667				mutex_exit(hash_lock);
4668				ARCSTAT_BUMP(arcstat_l2_write_full);
4669				break;
4670			}
4671
4672			if (pio == NULL) {
4673				/*
4674				 * Insert a dummy header on the buflist so
4675				 * l2arc_write_done() can find where the
4676				 * write buffers begin without searching.
4677				 */
4678				list_insert_head(dev->l2ad_buflist, head);
4679
4680				cb = kmem_alloc(
4681				    sizeof (l2arc_write_callback_t), KM_SLEEP);
4682				cb->l2wcb_dev = dev;
4683				cb->l2wcb_head = head;
4684				pio = zio_root(spa, l2arc_write_done, cb,
4685				    ZIO_FLAG_CANFAIL);
4686				ARCSTAT_BUMP(arcstat_l2_write_pios);
4687			}
4688
4689			/*
4690			 * Create and add a new L2ARC header.
4691			 */
4692			hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4693			hdrl2->b_dev = dev;
4694			hdrl2->b_daddr = dev->l2ad_hand;
4695
4696			ab->b_flags |= ARC_L2_WRITING;
4697			ab->b_l2hdr = hdrl2;
4698			list_insert_head(dev->l2ad_buflist, ab);
4699			buf_data = ab->b_buf->b_data;
4700			buf_sz = ab->b_size;
4701
4702			/*
4703			 * Compute and store the buffer cksum before
4704			 * writing.  On debug the cksum is verified first.
4705			 */
4706			arc_cksum_verify(ab->b_buf);
4707			arc_cksum_compute(ab->b_buf, B_TRUE);
4708
4709			mutex_exit(hash_lock);
4710
4711			wzio = zio_write_phys(pio, dev->l2ad_vdev,
4712			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
4713			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
4714			    ZIO_FLAG_CANFAIL, B_FALSE);
4715
4716			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
4717			    zio_t *, wzio);
4718			(void) zio_nowait(wzio);
4719
4720			/*
4721			 * Keep the clock hand suitably device-aligned.
4722			 */
4723			buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
4724
4725			write_sz += buf_sz;
4726			dev->l2ad_hand += buf_sz;
4727		}
4728
4729		mutex_exit(list_lock);
4730
4731		if (full == B_TRUE)
4732			break;
4733	}
4734	mutex_exit(&l2arc_buflist_mtx);
4735
4736	if (pio == NULL) {
4737		ASSERT3U(write_sz, ==, 0);
4738		kmem_cache_free(hdr_cache, head);
4739		return (0);
4740	}
4741
4742	ASSERT3U(write_sz, <=, target_sz);
4743	ARCSTAT_BUMP(arcstat_l2_writes_sent);
4744	ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
4745	ARCSTAT_INCR(arcstat_l2_size, write_sz);
4746	vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
4747
4748	/*
4749	 * Bump device hand to the device start if it is approaching the end.
4750	 * l2arc_evict() will already have evicted ahead for this case.
4751	 */
4752	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
4753		vdev_space_update(dev->l2ad_vdev,
4754		    dev->l2ad_end - dev->l2ad_hand, 0, 0);
4755		dev->l2ad_hand = dev->l2ad_start;
4756		dev->l2ad_evict = dev->l2ad_start;
4757		dev->l2ad_first = B_FALSE;
4758	}
4759
4760	dev->l2ad_writing = B_TRUE;
4761	(void) zio_wait(pio);
4762	dev->l2ad_writing = B_FALSE;
4763
4764	return (write_sz);
4765}
4766
4767/*
4768 * This thread feeds the L2ARC at regular intervals.  This is the beating
4769 * heart of the L2ARC.
4770 */
4771static void
4772l2arc_feed_thread(void *dummy __unused)
4773{
4774	callb_cpr_t cpr;
4775	l2arc_dev_t *dev;
4776	spa_t *spa;
4777	uint64_t size, wrote;
4778	clock_t begin, next = ddi_get_lbolt();
4779
4780	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
4781
4782	mutex_enter(&l2arc_feed_thr_lock);
4783
4784	while (l2arc_thread_exit == 0) {
4785		CALLB_CPR_SAFE_BEGIN(&cpr);
4786		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
4787		    next - ddi_get_lbolt());
4788		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
4789		next = ddi_get_lbolt() + hz;
4790
4791		/*
4792		 * Quick check for L2ARC devices.
4793		 */
4794		mutex_enter(&l2arc_dev_mtx);
4795		if (l2arc_ndev == 0) {
4796			mutex_exit(&l2arc_dev_mtx);
4797			continue;
4798		}
4799		mutex_exit(&l2arc_dev_mtx);
4800		begin = ddi_get_lbolt();
4801
4802		/*
4803		 * This selects the next l2arc device to write to, and in
4804		 * doing so the next spa to feed from: dev->l2ad_spa.   This
4805		 * will return NULL if there are now no l2arc devices or if
4806		 * they are all faulted.
4807		 *
4808		 * If a device is returned, its spa's config lock is also
4809		 * held to prevent device removal.  l2arc_dev_get_next()
4810		 * will grab and release l2arc_dev_mtx.
4811		 */
4812		if ((dev = l2arc_dev_get_next()) == NULL)
4813			continue;
4814
4815		spa = dev->l2ad_spa;
4816		ASSERT(spa != NULL);
4817
4818		/*
4819		 * If the pool is read-only then force the feed thread to
4820		 * sleep a little longer.
4821		 */
4822		if (!spa_writeable(spa)) {
4823			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
4824			spa_config_exit(spa, SCL_L2ARC, dev);
4825			continue;
4826		}
4827
4828		/*
4829		 * Avoid contributing to memory pressure.
4830		 */
4831		if (arc_reclaim_needed()) {
4832			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
4833			spa_config_exit(spa, SCL_L2ARC, dev);
4834			continue;
4835		}
4836
4837		ARCSTAT_BUMP(arcstat_l2_feeds);
4838
4839		size = l2arc_write_size(dev);
4840
4841		/*
4842		 * Evict L2ARC buffers that will be overwritten.
4843		 */
4844		l2arc_evict(dev, size, B_FALSE);
4845
4846		/*
4847		 * Write ARC buffers.
4848		 */
4849		wrote = l2arc_write_buffers(spa, dev, size);
4850
4851		/*
4852		 * Calculate interval between writes.
4853		 */
4854		next = l2arc_write_interval(begin, size, wrote);
4855		spa_config_exit(spa, SCL_L2ARC, dev);
4856	}
4857
4858	l2arc_thread_exit = 0;
4859	cv_broadcast(&l2arc_feed_thr_cv);
4860	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
4861	thread_exit();
4862}
4863
4864boolean_t
4865l2arc_vdev_present(vdev_t *vd)
4866{
4867	l2arc_dev_t *dev;
4868
4869	mutex_enter(&l2arc_dev_mtx);
4870	for (dev = list_head(l2arc_dev_list); dev != NULL;
4871	    dev = list_next(l2arc_dev_list, dev)) {
4872		if (dev->l2ad_vdev == vd)
4873			break;
4874	}
4875	mutex_exit(&l2arc_dev_mtx);
4876
4877	return (dev != NULL);
4878}
4879
4880/*
4881 * Add a vdev for use by the L2ARC.  By this point the spa has already
4882 * validated the vdev and opened it.
4883 */
4884void
4885l2arc_add_vdev(spa_t *spa, vdev_t *vd)
4886{
4887	l2arc_dev_t *adddev;
4888
4889	ASSERT(!l2arc_vdev_present(vd));
4890
4891	/*
4892	 * Create a new l2arc device entry.
4893	 */
4894	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
4895	adddev->l2ad_spa = spa;
4896	adddev->l2ad_vdev = vd;
4897	adddev->l2ad_write = l2arc_write_max;
4898	adddev->l2ad_boost = l2arc_write_boost;
4899	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
4900	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
4901	adddev->l2ad_hand = adddev->l2ad_start;
4902	adddev->l2ad_evict = adddev->l2ad_start;
4903	adddev->l2ad_first = B_TRUE;
4904	adddev->l2ad_writing = B_FALSE;
4905	ASSERT3U(adddev->l2ad_write, >, 0);
4906
4907	/*
4908	 * This is a list of all ARC buffers that are still valid on the
4909	 * device.
4910	 */
4911	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
4912	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
4913	    offsetof(arc_buf_hdr_t, b_l2node));
4914
4915	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
4916
4917	/*
4918	 * Add device to global list
4919	 */
4920	mutex_enter(&l2arc_dev_mtx);
4921	list_insert_head(l2arc_dev_list, adddev);
4922	atomic_inc_64(&l2arc_ndev);
4923	mutex_exit(&l2arc_dev_mtx);
4924}
4925
4926/*
4927 * Remove a vdev from the L2ARC.
4928 */
4929void
4930l2arc_remove_vdev(vdev_t *vd)
4931{
4932	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
4933
4934	/*
4935	 * Find the device by vdev
4936	 */
4937	mutex_enter(&l2arc_dev_mtx);
4938	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
4939		nextdev = list_next(l2arc_dev_list, dev);
4940		if (vd == dev->l2ad_vdev) {
4941			remdev = dev;
4942			break;
4943		}
4944	}
4945	ASSERT(remdev != NULL);
4946
4947	/*
4948	 * Remove device from global list
4949	 */
4950	list_remove(l2arc_dev_list, remdev);
4951	l2arc_dev_last = NULL;		/* may have been invalidated */
4952	atomic_dec_64(&l2arc_ndev);
4953	mutex_exit(&l2arc_dev_mtx);
4954
4955	/*
4956	 * Clear all buflists and ARC references.  L2ARC device flush.
4957	 */
4958	l2arc_evict(remdev, 0, B_TRUE);
4959	list_destroy(remdev->l2ad_buflist);
4960	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
4961	kmem_free(remdev, sizeof (l2arc_dev_t));
4962}
4963
4964void
4965l2arc_init(void)
4966{
4967	l2arc_thread_exit = 0;
4968	l2arc_ndev = 0;
4969	l2arc_writes_sent = 0;
4970	l2arc_writes_done = 0;
4971
4972	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4973	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
4974	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
4975	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
4976	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
4977
4978	l2arc_dev_list = &L2ARC_dev_list;
4979	l2arc_free_on_write = &L2ARC_free_on_write;
4980	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
4981	    offsetof(l2arc_dev_t, l2ad_node));
4982	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
4983	    offsetof(l2arc_data_free_t, l2df_list_node));
4984}
4985
4986void
4987l2arc_fini(void)
4988{
4989	/*
4990	 * This is called from dmu_fini(), which is called from spa_fini();
4991	 * Because of this, we can assume that all l2arc devices have
4992	 * already been removed when the pools themselves were removed.
4993	 */
4994
4995	l2arc_do_free_on_write();
4996
4997	mutex_destroy(&l2arc_feed_thr_lock);
4998	cv_destroy(&l2arc_feed_thr_cv);
4999	mutex_destroy(&l2arc_dev_mtx);
5000	mutex_destroy(&l2arc_buflist_mtx);
5001	mutex_destroy(&l2arc_free_on_write_mtx);
5002
5003	list_destroy(l2arc_dev_list);
5004	list_destroy(l2arc_free_on_write);
5005}
5006
5007void
5008l2arc_start(void)
5009{
5010	if (!(spa_mode_global & FWRITE))
5011		return;
5012
5013	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5014	    TS_RUN, minclsyspri);
5015}
5016
5017void
5018l2arc_stop(void)
5019{
5020	if (!(spa_mode_global & FWRITE))
5021		return;
5022
5023	mutex_enter(&l2arc_feed_thr_lock);
5024	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
5025	l2arc_thread_exit = 1;
5026	while (l2arc_thread_exit != 0)
5027		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5028	mutex_exit(&l2arc_feed_thr_lock);
5029}
5030