arc.c revision 277826
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
24 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
25 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
26 * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
27 */
28
29/*
30 * DVA-based Adjustable Replacement Cache
3198937Sdes *
32 * While much of the theory of operation used here is
33 * based on the self-tuning, low overhead replacement cache
34 * presented by Megiddo and Modha at FAST 2003, there are some
35 * significant differences:
36 *
37 * 1. The Megiddo and Modha model assumes any page is evictable.
38 * Pages in its cache cannot be "locked" into memory.  This makes
39 * the eviction algorithm simple: evict the last page in the list.
40 * This also makes the performance characteristics easy to reason
41 * about.  Our cache is not so simple.  At any given moment, some
42 * subset of the blocks in the cache are un-evictable because we
43 * have handed out a reference to them.  Blocks are only evictable
44 * when there are no external references active.  This makes
45 * eviction far more problematic:  we choose to evict the evictable
46 * blocks that are the "lowest" in the list.
47 *
48 * There are times when it is not possible to evict the requested
49 * space.  In these circumstances we are unable to adjust the cache
50 * size.  To prevent the cache from growing unbounded at these times, we
51 * implement a "cache throttle" that slows the flow of new data
52 * into the cache until we can make space available.
53 *
54 * 2. The Megiddo and Modha model assumes a fixed cache size.
55 * Pages are evicted when the cache is full and there is a cache
56 * miss.  Our model has a variable sized cache.  It grows with
57 * high use, but also tries to react to memory pressure from the
58 * operating system: decreasing its size when system memory is
59 * tight.
60 *
61 * 3. The Megiddo and Modha model assumes a fixed page size. All
62 * elements of the cache are therefore exactly the same size.  So
63 * when adjusting the cache size following a cache miss, it's simply
64 * a matter of choosing a single page to evict.  In our model, we
65 * have variable sized cache blocks (ranging from 512 bytes to
66 * 128K bytes).  We therefore choose a set of blocks to evict to make
67 * space for a cache miss that approximates as closely as possible
68 * the space used by the new block.
69 *
70 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
71 * by N. Megiddo & D. Modha, FAST 2003
72 */
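
/*
 * Illustrative sketch only -- not the actual eviction code, which lives
 * further down in this file.  Difference (3) above boils down to walking an
 * eviction list from the tail and accumulating unreferenced buffers until
 * the reclaimed bytes cover the incoming block:
 *
 *	uint64_t reclaimed = 0;
 *	for (hdr = list_tail(list); hdr != NULL && reclaimed < needed;
 *	    hdr = prev) {
 *		prev = list_prev(list, hdr);
 *		if (buffer_is_referenced(hdr))	(hypothetical check)
 *			continue;
 *		reclaimed += hdr->b_size;
 *		evict_buffer(hdr);		(hypothetical helper)
 *	}
 *
 * The real code must also re-check references under the hash lock and skip
 * buffers whose I/O is still in progress.
 */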
73
74/*
75 * The locking model:
76 *
77 * A new reference to a cache buffer can be obtained in two
78 * ways: 1) via a hash table lookup using the DVA as a key,
79 * or 2) via one of the ARC lists.  The arc_read() interface
80 * uses method 1, while the internal arc algorithms for
81 * adjusting the cache use method 2.  We therefore provide two
82 * types of locks: 1) the hash table lock array, and 2) the
83 * arc list locks.
84 *
85 * Buffers do not have their own mutexes; rather, they rely on the
86 * hash table mutexes for the bulk of their protection (i.e. most
87 * fields in the arc_buf_hdr_t are protected by these mutexes).
88 *
89 * buf_hash_find() returns the appropriate mutex (held) when it
90 * locates the requested buffer in the hash table.  It returns
91 * NULL for the mutex if the buffer was not in the table.
92 *
93 * buf_hash_remove() expects the appropriate hash mutex to be
94 * already held before it is invoked.
95 *
96 * Each arc state also has a mutex which is used to protect the
97 * buffer list associated with the state.  When attempting to
98 * obtain a hash table lock while holding an arc list lock, you
99 * must use mutex_tryenter() to avoid deadlock.  Also note that
100 * the active state mutex must be held before the ghost state mutex.
101 *
102 * Arc buffers may have an associated eviction callback function.
103 * This function will be invoked prior to removing the buffer (e.g.
104 * in arc_do_user_evicts()).  Note however that the data associated
105 * with the buffer may be evicted prior to the callback.  The callback
106 * must be made with *no locks held* (to prevent deadlock).  Additionally,
107 * the users of callbacks must ensure that their private data is
108 * protected from simultaneous callbacks from arc_clear_callback()
109 * and arc_do_user_evicts().
110 *
111 * Note that the majority of the performance stats are manipulated
112 * with atomic operations.
113 *
114 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
115 *
116 *	- L2ARC buflist creation
117 *	- L2ARC buflist eviction
118 *	- L2ARC write completion, which walks L2ARC buflists
119 *	- ARC header destruction, as it removes from L2ARC buflists
120 *	- ARC header release, as it removes from L2ARC buflists
121 */
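
/*
 * A minimal sketch of the list-lock vs. hash-lock rule above (illustrative
 * only; HDR_LOCK() is defined further down, and the skip policy here is a
 * simplification of what the eviction code actually does):
 *
 *	mutex_enter(list_lock);
 *	for each hdr on the list {
 *		hash_lock = HDR_LOCK(hdr);
 *		if (!mutex_tryenter(hash_lock)) {
 *			(skip this buffer rather than block while
 *			 holding the list lock)
 *			continue;
 *		}
 *		... operate on hdr with both locks held ...
 *		mutex_exit(hash_lock);
 *	}
 *	mutex_exit(list_lock);
 */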
122
123#include <sys/spa.h>
124#include <sys/zio.h>
125#include <sys/zio_compress.h>
126#include <sys/zfs_context.h>
127#include <sys/arc.h>
128#include <sys/refcount.h>
129#include <sys/vdev.h>
130#include <sys/vdev_impl.h>
131#include <sys/dsl_pool.h>
132#ifdef _KERNEL
133#include <sys/dnlc.h>
134#endif
135#include <sys/callb.h>
136#include <sys/kstat.h>
137#include <sys/trim_map.h>
138#include <zfs_fletcher.h>
139#include <sys/sdt.h>
140
141#include <vm/vm_pageout.h>
142#include <machine/vmparam.h>
143
144#ifdef illumos
145#ifndef _KERNEL
146/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
147boolean_t arc_watch = B_FALSE;
148int arc_procfd;
149#endif
150#endif /* illumos */
151
152static kmutex_t		arc_reclaim_thr_lock;
153static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
154static uint8_t		arc_thread_exit;
155
156#define	ARC_REDUCE_DNLC_PERCENT	3
157uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
158
159typedef enum arc_reclaim_strategy {
160	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
161	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
162} arc_reclaim_strategy_t;
163
164/*
165 * The number of iterations through arc_evict_*() before we
166 * drop & reacquire the lock.
167 */
168int arc_evict_iterations = 100;
169
170/* number of seconds before growing cache again */
171static int		arc_grow_retry = 60;
172
173/* shift of arc_c for calculating both min and max arc_p */
174static int		arc_p_min_shift = 4;
175
176/* log2(fraction of arc to reclaim) */
177static int		arc_shrink_shift = 5;
178
179/*
180 * minimum lifespan of a prefetch block in clock ticks
181 * (initialized in arc_init())
182 */
183static int		arc_min_prefetch_lifespan;
184
185/*
186 * If this percent of memory is free, don't throttle.
187 */
188int arc_lotsfree_percent = 10;
189
190static int arc_dead;
191extern int zfs_prefetch_disable;
192
193/*
194 * The arc has filled available memory and has now warmed up.
195 */
196static boolean_t arc_warm;
197
198uint64_t zfs_arc_max;
199uint64_t zfs_arc_min;
200uint64_t zfs_arc_meta_limit = 0;
201uint64_t zfs_arc_meta_min = 0;
202int zfs_arc_grow_retry = 0;
203int zfs_arc_shrink_shift = 0;
204int zfs_arc_p_min_shift = 0;
205int zfs_disable_dup_eviction = 0;
206uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
207u_int zfs_arc_free_target = 0;
208
209static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
210static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
211
212#ifdef _KERNEL
213static void
214arc_free_target_init(void *unused __unused)
215{
216
217	zfs_arc_free_target = vm_pageout_wakeup_thresh;
218}
219SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
220    arc_free_target_init, NULL);
221
222TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
223TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
224TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
225SYSCTL_DECL(_vfs_zfs);
226SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
227    "Maximum ARC size");
228SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
229    "Minimum ARC size");
230SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
231    &zfs_arc_average_blocksize, 0,
232    "ARC average blocksize");
233SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
234    &arc_shrink_shift, 0,
235    "log2(fraction of arc to reclaim)");
236
237/*
238 * We don't have a tunable for arc_free_target due to the dependency on
239 * pagedaemon initialisation.
240 */
241SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
242    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
243    sysctl_vfs_zfs_arc_free_target, "IU",
244    "Desired number of free pages below which ARC triggers reclaim");
245
246static int
247sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
248{
249	u_int val;
250	int err;
251
252	val = zfs_arc_free_target;
253	err = sysctl_handle_int(oidp, &val, 0, req);
254	if (err != 0 || req->newptr == NULL)
255		return (err);
256
257	if (val < minfree)
258		return (EINVAL);
259	if (val > vm_cnt.v_page_count)
260		return (EINVAL);
261
262	zfs_arc_free_target = val;
263
264	return (0);
265}
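
/*
 * Example of tuning this at runtime (value is in pages); the handler above
 * rejects values below minfree or above the physical page count:
 *
 *	# sysctl vfs.zfs.arc_free_target=131072
 */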
266
267/*
268 * Must be declared here, before the definition of the corresponding kstat
269 * macro, which uses the same names and would otherwise confuse the compiler.
270 */
271SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
272    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
273    sysctl_vfs_zfs_arc_meta_limit, "QU",
274    "ARC metadata limit");
275#endif
276
277/*
278 * Note that buffers can be in one of 6 states:
279 *	ARC_anon	- anonymous (discussed below)
280 *	ARC_mru		- recently used, currently cached
281 *	ARC_mru_ghost	- recently used, no longer in cache
282 *	ARC_mfu		- frequently used, currently cached
283 *	ARC_mfu_ghost	- frequently used, no longer in cache
284 *	ARC_l2c_only	- exists in L2ARC but not other states
285 * When there are no active references to the buffer, it is
286 * linked onto a list in one of these arc states.  These are
287 * the only buffers that can be evicted or deleted.  Within each
288 * state there are multiple lists, one for meta-data and one for
289 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
290 * etc.) is tracked separately so that it can be managed more
291 * explicitly: favored over data, limited explicitly.
292 *
293 * Anonymous buffers are buffers that are not associated with
294 * a DVA.  These are buffers that hold dirty block copies
295 * before they are written to stable storage.  By definition,
296 * they are "ref'd" and are considered part of arc_mru
297 * that cannot be freed.  Generally, they will acquire a DVA
298 * as they are written and migrate onto the arc_mru list.
299 *
300 * The ARC_l2c_only state is for buffers that are in the second
301 * level ARC but no longer in any of the ARC_m* lists.  The second
302 * level ARC itself may also contain buffers that are in any of
303 * the ARC_m* states - meaning that a buffer can exist in two
304 * places.  The reason for the ARC_l2c_only state is to keep the
305 * buffer header in the hash table, so that reads that hit the
306 * second level ARC benefit from these fast lookups.
307 */
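
/*
 * Rough sketch of the life cycle implied by these states (a simplification
 * of what arc_access() and the eviction code actually do):
 *
 *	new dirty buffer:	ARC_anon
 *	written out (gets DVA):	ARC_anon -> ARC_mru
 *	accessed again:		ARC_mru -> ARC_mfu
 *	data evicted:		ARC_mru/ARC_mfu -> ARC_mru_ghost/ARC_mfu_ghost
 *	ghost hit:		header re-enters an in-core state and the
 *				MRU/MFU balance is adapted in its favor
 */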
308
309#define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
310struct arcs_lock {
311	kmutex_t	arcs_lock;
312#ifdef _KERNEL
313	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
314#endif
315};
316
317/*
318 * must be a power of two for mask use to work
319 *
320 */
321#define ARC_BUFC_NUMDATALISTS		16
322#define ARC_BUFC_NUMMETADATALISTS	16
323#define ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
324
325typedef struct arc_state {
326	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
327	uint64_t arcs_size;	/* total amount of data in this state */
328	list_t	arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
329	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
330} arc_state_t;
331
332#define ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))
333
334/* The 6 states: */
335static arc_state_t ARC_anon;
336static arc_state_t ARC_mru;
337static arc_state_t ARC_mru_ghost;
338static arc_state_t ARC_mfu;
339static arc_state_t ARC_mfu_ghost;
340static arc_state_t ARC_l2c_only;
341
342typedef struct arc_stats {
343	kstat_named_t arcstat_hits;
344	kstat_named_t arcstat_misses;
345	kstat_named_t arcstat_demand_data_hits;
346	kstat_named_t arcstat_demand_data_misses;
347	kstat_named_t arcstat_demand_metadata_hits;
348	kstat_named_t arcstat_demand_metadata_misses;
349	kstat_named_t arcstat_prefetch_data_hits;
350	kstat_named_t arcstat_prefetch_data_misses;
351	kstat_named_t arcstat_prefetch_metadata_hits;
352	kstat_named_t arcstat_prefetch_metadata_misses;
353	kstat_named_t arcstat_mru_hits;
354	kstat_named_t arcstat_mru_ghost_hits;
355	kstat_named_t arcstat_mfu_hits;
356	kstat_named_t arcstat_mfu_ghost_hits;
357	kstat_named_t arcstat_allocated;
358	kstat_named_t arcstat_deleted;
359	kstat_named_t arcstat_stolen;
360	kstat_named_t arcstat_recycle_miss;
361	/*
362	 * Number of buffers that could not be evicted because the hash lock
363	 * was held by another thread.  The lock may not necessarily be held
364	 * by something using the same buffer, since hash locks are shared
365	 * by multiple buffers.
366	 */
367	kstat_named_t arcstat_mutex_miss;
368	/*
369	 * Number of buffers skipped because they have I/O in progress, are
370	 * indirect prefetch buffers that have not lived long enough, or are
371	 * not from the spa we're trying to evict from.
372	 */
373	kstat_named_t arcstat_evict_skip;
374	kstat_named_t arcstat_evict_l2_cached;
375	kstat_named_t arcstat_evict_l2_eligible;
376	kstat_named_t arcstat_evict_l2_ineligible;
377	kstat_named_t arcstat_hash_elements;
378	kstat_named_t arcstat_hash_elements_max;
379	kstat_named_t arcstat_hash_collisions;
380	kstat_named_t arcstat_hash_chains;
381	kstat_named_t arcstat_hash_chain_max;
382	kstat_named_t arcstat_p;
383	kstat_named_t arcstat_c;
384	kstat_named_t arcstat_c_min;
385	kstat_named_t arcstat_c_max;
386	kstat_named_t arcstat_size;
387	kstat_named_t arcstat_hdr_size;
388	kstat_named_t arcstat_data_size;
389	kstat_named_t arcstat_other_size;
390	kstat_named_t arcstat_l2_hits;
391	kstat_named_t arcstat_l2_misses;
392	kstat_named_t arcstat_l2_feeds;
393	kstat_named_t arcstat_l2_rw_clash;
394	kstat_named_t arcstat_l2_read_bytes;
395	kstat_named_t arcstat_l2_write_bytes;
396	kstat_named_t arcstat_l2_writes_sent;
397	kstat_named_t arcstat_l2_writes_done;
398	kstat_named_t arcstat_l2_writes_error;
399	kstat_named_t arcstat_l2_writes_hdr_miss;
400	kstat_named_t arcstat_l2_evict_lock_retry;
401	kstat_named_t arcstat_l2_evict_reading;
402	kstat_named_t arcstat_l2_free_on_write;
403	kstat_named_t arcstat_l2_cdata_free_on_write;
404	kstat_named_t arcstat_l2_abort_lowmem;
405	kstat_named_t arcstat_l2_cksum_bad;
406	kstat_named_t arcstat_l2_io_error;
407	kstat_named_t arcstat_l2_size;
408	kstat_named_t arcstat_l2_asize;
409	kstat_named_t arcstat_l2_hdr_size;
410	kstat_named_t arcstat_l2_compress_successes;
411	kstat_named_t arcstat_l2_compress_zeros;
412	kstat_named_t arcstat_l2_compress_failures;
413	kstat_named_t arcstat_l2_write_trylock_fail;
414	kstat_named_t arcstat_l2_write_passed_headroom;
415	kstat_named_t arcstat_l2_write_spa_mismatch;
416	kstat_named_t arcstat_l2_write_in_l2;
417	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
418	kstat_named_t arcstat_l2_write_not_cacheable;
419	kstat_named_t arcstat_l2_write_full;
420	kstat_named_t arcstat_l2_write_buffer_iter;
421	kstat_named_t arcstat_l2_write_pios;
422	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
423	kstat_named_t arcstat_l2_write_buffer_list_iter;
424	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
425	kstat_named_t arcstat_memory_throttle_count;
426	kstat_named_t arcstat_duplicate_buffers;
427	kstat_named_t arcstat_duplicate_buffers_size;
428	kstat_named_t arcstat_duplicate_reads;
429	kstat_named_t arcstat_meta_used;
430	kstat_named_t arcstat_meta_limit;
431	kstat_named_t arcstat_meta_max;
432	kstat_named_t arcstat_meta_min;
433} arc_stats_t;
434
435static arc_stats_t arc_stats = {
436	{ "hits",			KSTAT_DATA_UINT64 },
437	{ "misses",			KSTAT_DATA_UINT64 },
438	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
439	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
440	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
441	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
442	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
443	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
444	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
445	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
446	{ "mru_hits",			KSTAT_DATA_UINT64 },
447	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
448	{ "mfu_hits",			KSTAT_DATA_UINT64 },
449	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
450	{ "allocated",			KSTAT_DATA_UINT64 },
451	{ "deleted",			KSTAT_DATA_UINT64 },
452	{ "stolen",			KSTAT_DATA_UINT64 },
453	{ "recycle_miss",		KSTAT_DATA_UINT64 },
454	{ "mutex_miss",			KSTAT_DATA_UINT64 },
455	{ "evict_skip",			KSTAT_DATA_UINT64 },
456	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
457	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
458	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
459	{ "hash_elements",		KSTAT_DATA_UINT64 },
460	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
461	{ "hash_collisions",		KSTAT_DATA_UINT64 },
462	{ "hash_chains",		KSTAT_DATA_UINT64 },
463	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
464	{ "p",				KSTAT_DATA_UINT64 },
465	{ "c",				KSTAT_DATA_UINT64 },
466	{ "c_min",			KSTAT_DATA_UINT64 },
467	{ "c_max",			KSTAT_DATA_UINT64 },
468	{ "size",			KSTAT_DATA_UINT64 },
469	{ "hdr_size",			KSTAT_DATA_UINT64 },
470	{ "data_size",			KSTAT_DATA_UINT64 },
471	{ "other_size",			KSTAT_DATA_UINT64 },
472	{ "l2_hits",			KSTAT_DATA_UINT64 },
473	{ "l2_misses",			KSTAT_DATA_UINT64 },
474	{ "l2_feeds",			KSTAT_DATA_UINT64 },
475	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
476	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
477	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
478	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
479	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
480	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
481	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
482	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
483	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
484	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
485	{ "l2_cdata_free_on_write",	KSTAT_DATA_UINT64 },
486	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
487	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
488	{ "l2_io_error",		KSTAT_DATA_UINT64 },
489	{ "l2_size",			KSTAT_DATA_UINT64 },
490	{ "l2_asize",			KSTAT_DATA_UINT64 },
491	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
492	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
493	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
494	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
495	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
496	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
497	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
498	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
499	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
500	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
501	{ "l2_write_full",		KSTAT_DATA_UINT64 },
502	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
503	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
504	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
505	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
506	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
507	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
508	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
509	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
510	{ "duplicate_reads",		KSTAT_DATA_UINT64 },
511	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
512	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
513	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
514	{ "arc_meta_min",		KSTAT_DATA_UINT64 }
515};
516
517#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
518
519#define	ARCSTAT_INCR(stat, val) \
520	atomic_add_64(&arc_stats.stat.value.ui64, (val))
521
522#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
523#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
524
525#define	ARCSTAT_MAX(stat, val) {					\
526	uint64_t m;							\
527	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
528	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
529		continue;						\
530}
531
532#define	ARCSTAT_MAXSTAT(stat) \
533	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
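
/*
 * ARCSTAT_MAX() is a lock-free "record the maximum" update: it retries the
 * compare-and-swap until either the current value is already >= val or the
 * swap succeeds.  For example, the hash insert path below tracks the longest
 * collision chain seen with ARCSTAT_MAX(arcstat_hash_chain_max, i).
 */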
534
535/*
536 * We define a macro to allow ARC hits/misses to be easily broken down by
537 * two separate conditions, giving a total of four different subtypes for
538 * each of hits and misses (so eight statistics total).
539 */
540#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
541	if (cond1) {							\
542		if (cond2) {						\
543			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
544		} else {						\
545			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
546		}							\
547	} else {							\
548		if (cond2) {						\
549			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
550		} else {						\
551			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
552		}							\
553	}
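
/*
 * For example, the buffer hit path later in this file classifies each hit
 * as demand vs. prefetch and data vs. metadata in a single statement:
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 *
 * which bumps exactly one of the arcstat_*_{data,metadata}_hits counters.
 */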
554
555kstat_t			*arc_ksp;
556static arc_state_t	*arc_anon;
557static arc_state_t	*arc_mru;
558static arc_state_t	*arc_mru_ghost;
559static arc_state_t	*arc_mfu;
560static arc_state_t	*arc_mfu_ghost;
561static arc_state_t	*arc_l2c_only;
562
563/*
564 * There are several ARC variables that are critical to export as kstats --
565 * but we don't want to have to grovel around in the kstat whenever we wish to
566 * manipulate them.  For these variables, we therefore define them to be in
567 * terms of the statistic variable.  This assures that we are not introducing
568 * the possibility of inconsistency by having shadow copies of the variables,
569 * while still allowing the code to be readable.
570 */
571#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
572#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
573#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
574#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
575#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
576#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
577#define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
578#define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
579#define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */
580
581#define	L2ARC_IS_VALID_COMPRESS(_c_) \
582	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
583
584static int		arc_no_grow;	/* Don't try to grow cache size */
585static uint64_t		arc_tempreserve;
586static uint64_t		arc_loaned_bytes;
587
588typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
589
590typedef struct arc_callback arc_callback_t;
591
592struct arc_callback {
593	void			*acb_private;
594	arc_done_func_t		*acb_done;
595	arc_buf_t		*acb_buf;
596	zio_t			*acb_zio_dummy;
597	arc_callback_t		*acb_next;
598};
599
600typedef struct arc_write_callback arc_write_callback_t;
601
602struct arc_write_callback {
603	void		*awcb_private;
604	arc_done_func_t	*awcb_ready;
605	arc_done_func_t	*awcb_physdone;
606	arc_done_func_t	*awcb_done;
607	arc_buf_t	*awcb_buf;
608};
609
610struct arc_buf_hdr {
611	/* protected by hash lock */
612	dva_t			b_dva;
613	uint64_t		b_birth;
614	uint64_t		b_cksum0;
615
616	kmutex_t		b_freeze_lock;
617	zio_cksum_t		*b_freeze_cksum;
618	void			*b_thawed;
619
620	arc_buf_hdr_t		*b_hash_next;
621	arc_buf_t		*b_buf;
622	arc_flags_t		b_flags;
623	uint32_t		b_datacnt;
624
625	arc_callback_t		*b_acb;
626	kcondvar_t		b_cv;
627
628	/* immutable */
629	arc_buf_contents_t	b_type;
630	uint64_t		b_size;
631	uint64_t		b_spa;
632
633	/* protected by arc state mutex */
634	arc_state_t		*b_state;
635	list_node_t		b_arc_node;
636
637	/* updated atomically */
638	clock_t			b_arc_access;
639
640	/* self protecting */
641	refcount_t		b_refcnt;
642
643	l2arc_buf_hdr_t		*b_l2hdr;
644	list_node_t		b_l2node;
645};
646
647#ifdef _KERNEL
648static int
649sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
650{
651	uint64_t val;
652	int err;
653
654	val = arc_meta_limit;
655	err = sysctl_handle_64(oidp, &val, 0, req);
656	if (err != 0 || req->newptr == NULL)
657		return (err);
658
659	if (val <= 0 || val > arc_c_max)
660		return (EINVAL);
661
662	arc_meta_limit = val;
663	return (0);
664}
665#endif
666
667static arc_buf_t *arc_eviction_list;
668static kmutex_t arc_eviction_mtx;
669static arc_buf_hdr_t arc_eviction_hdr;
670
671#define	GHOST_STATE(state)	\
672	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
673	(state) == arc_l2c_only)
674
675#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
676#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
677#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
678#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
679#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
680#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
681#define	HDR_FREE_IN_PROGRESS(hdr)	\
682	((hdr)->b_flags & ARC_FLAG_FREE_IN_PROGRESS)
683#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
684#define	HDR_L2_READING(hdr)	\
685	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS &&	\
686	    (hdr)->b_l2hdr != NULL)
687#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
688#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
689#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
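
/*
 * Illustrative sketch only (not referenced by the code below): how the flag
 * macros, GHOST_STATE() and the reference count combine when asking whether
 * a header could be evicted at this instant.  The real policy lives in the
 * eviction code; this hypothetical helper merely strings the checks together.
 */
static boolean_t
arc_hdr_evictable_sketch(arc_buf_hdr_t *hdr)
{
	/* Active I/O or an in-progress free pins the header. */
	if (HDR_IO_IN_PROGRESS(hdr) || HDR_FREE_IN_PROGRESS(hdr))
		return (B_FALSE);
	/* External references make a buffer un-evictable. */
	if (refcount_count(&hdr->b_refcnt) != 0)
		return (B_FALSE);
	/* Ghost-state headers carry identity only, no data. */
	if (GHOST_STATE(hdr->b_state))
		return (B_TRUE);
	/* Anonymous buffers are not on an eviction list. */
	return (hdr->b_state != arc_anon);
}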
690
691/*
692 * Other sizes
693 */
694
695#define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
696#define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
697
698/*
699 * Hash table routines
700 */
701
702#define	HT_LOCK_PAD	CACHE_LINE_SIZE
703
704struct ht_lock {
705	kmutex_t	ht_lock;
706#ifdef _KERNEL
707	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
708#endif
709};
710
711#define	BUF_LOCKS 256
712typedef struct buf_hash_table {
713	uint64_t ht_mask;
714	arc_buf_hdr_t **ht_table;
715	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
716} buf_hash_table_t;
717
718static buf_hash_table_t buf_hash_table;
719
720#define	BUF_HASH_INDEX(spa, dva, birth) \
721	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
722#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
723#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
724#define	HDR_LOCK(hdr) \
725	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
726
727uint64_t zfs_crc64_table[256];
728
729/*
730 * Level 2 ARC
731 */
732
733#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
734#define	L2ARC_HEADROOM		2			/* num of writes */
735/*
736 * If we discover during ARC scan any buffers to be compressed, we boost
737 * our headroom for the next scanning cycle by this percentage multiple.
738 */
739#define	L2ARC_HEADROOM_BOOST	200
740#define	L2ARC_FEED_SECS		1		/* caching interval secs */
741#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
742
743#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
744#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
745
746/* L2ARC Performance Tunables */
747uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
748uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
749uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
750uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
751uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
752uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
753boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
754boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
755boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
756
757SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
758    &l2arc_write_max, 0, "max write size");
759SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
760    &l2arc_write_boost, 0, "extra write during warmup");
761SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
762    &l2arc_headroom, 0, "number of dev writes");
763SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
764    &l2arc_feed_secs, 0, "interval seconds");
765SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
766    &l2arc_feed_min_ms, 0, "min interval milliseconds");
767
768SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
769    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
770SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
771    &l2arc_feed_again, 0, "turbo warmup");
772SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
773    &l2arc_norw, 0, "no reads during writes");
774
775SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
776    &ARC_anon.arcs_size, 0, "size of anonymous state");
777SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
778    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in anonymous state");
779SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
780    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in anonymous state");
781
782SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
783    &ARC_mru.arcs_size, 0, "size of mru state");
784SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
785    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
786SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
787    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
788
789SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
790    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
791SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
792    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
793    "size of metadata in mru ghost state");
794SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
795    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
796    "size of data in mru ghost state");
797
798SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
799    &ARC_mfu.arcs_size, 0, "size of mfu state");
800SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
801    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
802SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
803    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
804
805SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
806    &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
807SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
808    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
809    "size of metadata in mfu ghost state");
810SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
811    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
812    "size of data in mfu ghost state");
813
814SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
815    &ARC_l2c_only.arcs_size, 0, "size of mru state");
816
817/*
818 * L2ARC Internals
819 */
820typedef struct l2arc_dev {
821	vdev_t			*l2ad_vdev;	/* vdev */
822	spa_t			*l2ad_spa;	/* spa */
823	uint64_t		l2ad_hand;	/* next write location */
824	uint64_t		l2ad_start;	/* first addr on device */
825	uint64_t		l2ad_end;	/* last addr on device */
826	uint64_t		l2ad_evict;	/* last addr eviction reached */
827	boolean_t		l2ad_first;	/* first sweep through */
828	boolean_t		l2ad_writing;	/* currently writing */
829	list_t			*l2ad_buflist;	/* buffer list */
830	list_node_t		l2ad_node;	/* device list node */
831} l2arc_dev_t;
832
833static list_t L2ARC_dev_list;			/* device list */
834static list_t *l2arc_dev_list;			/* device list pointer */
835static kmutex_t l2arc_dev_mtx;			/* device list mutex */
836static l2arc_dev_t *l2arc_dev_last;		/* last device used */
837static kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
838static list_t L2ARC_free_on_write;		/* free after write buf list */
839static list_t *l2arc_free_on_write;		/* free after write list ptr */
840static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
841static uint64_t l2arc_ndev;			/* number of devices */
842
843typedef struct l2arc_read_callback {
844	arc_buf_t		*l2rcb_buf;		/* read buffer */
845	spa_t			*l2rcb_spa;		/* spa */
846	blkptr_t		l2rcb_bp;		/* original blkptr */
847	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
848	int			l2rcb_flags;		/* original flags */
849	enum zio_compress	l2rcb_compress;		/* applied compress */
850} l2arc_read_callback_t;
851
852typedef struct l2arc_write_callback {
853	l2arc_dev_t	*l2wcb_dev;		/* device info */
854	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
855} l2arc_write_callback_t;
856
857struct l2arc_buf_hdr {
858	/* protected by arc_buf_hdr  mutex */
859	l2arc_dev_t		*b_dev;		/* L2ARC device */
860	uint64_t		b_daddr;	/* disk address, offset byte */
861	/* compression applied to buffer data */
862	enum zio_compress	b_compress;
863	/* real alloc'd buffer size depending on b_compress applied */
864	int			b_asize;
865	/* temporary buffer holder for in-flight compressed data */
866	void			*b_tmp_cdata;
867};
868
869typedef struct l2arc_data_free {
870	/* protected by l2arc_free_on_write_mtx */
871	void		*l2df_data;
872	size_t		l2df_size;
873	void		(*l2df_func)(void *, size_t);
874	list_node_t	l2df_list_node;
875} l2arc_data_free_t;
876
877static kmutex_t l2arc_feed_thr_lock;
878static kcondvar_t l2arc_feed_thr_cv;
879static uint8_t l2arc_thread_exit;
880
881static void arc_get_data_buf(arc_buf_t *);
882static void arc_access(arc_buf_hdr_t *, kmutex_t *);
883static int arc_evict_needed(arc_buf_contents_t);
884static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t);
885static void arc_buf_watch(arc_buf_t *);
886
887static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
888static void l2arc_read_done(zio_t *);
889static void l2arc_hdr_stat_add(void);
890static void l2arc_hdr_stat_remove(void);
891
892static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *);
893static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
894static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
895
896static uint64_t
897buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
898{
899	uint8_t *vdva = (uint8_t *)dva;
900	uint64_t crc = -1ULL;
901	int i;
902
903	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
904
905	for (i = 0; i < sizeof (dva_t); i++)
906		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
907
908	crc ^= (spa>>8) ^ birth;
909
910	return (crc);
911}
912
913#define	BUF_EMPTY(buf)						\
914	((buf)->b_dva.dva_word[0] == 0 &&			\
915	(buf)->b_dva.dva_word[1] == 0 &&			\
916	(buf)->b_cksum0 == 0)
917
918#define	BUF_EQUAL(spa, dva, birth, buf)				\
919	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
920	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
921	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
922
923static void
924buf_discard_identity(arc_buf_hdr_t *hdr)
925{
926	hdr->b_dva.dva_word[0] = 0;
927	hdr->b_dva.dva_word[1] = 0;
928	hdr->b_birth = 0;
929	hdr->b_cksum0 = 0;
930}
931
932static arc_buf_hdr_t *
933buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
934{
935	const dva_t *dva = BP_IDENTITY(bp);
936	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
937	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
938	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
939	arc_buf_hdr_t *hdr;
940
941	mutex_enter(hash_lock);
942	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
943	    hdr = hdr->b_hash_next) {
944		if (BUF_EQUAL(spa, dva, birth, hdr)) {
945			*lockp = hash_lock;
946			return (hdr);
947		}
948	}
949	mutex_exit(hash_lock);
950	*lockp = NULL;
951	return (NULL);
952}
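
/*
 * Typical use of buf_hash_find(), following the locking notes at the top of
 * this file (sketch only):
 *
 *	hdr = buf_hash_find(guid, bp, &hash_lock);
 *	if (hdr != NULL) {
 *		... examine or transition hdr while hash_lock is held ...
 *		mutex_exit(hash_lock);
 *	}
 *
 * On a miss, *lockp is set to NULL and no lock is left held.
 */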
953
954/*
955 * Insert an entry into the hash table.  If there is already an element
956 * equal to elem in the hash table, then the already existing element
957 * will be returned and the new element will not be inserted.
958 * Otherwise returns NULL.
959 */
960static arc_buf_hdr_t *
961buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
962{
963	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
964	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
965	arc_buf_hdr_t *fhdr;
966	uint32_t i;
967
968	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
969	ASSERT(hdr->b_birth != 0);
970	ASSERT(!HDR_IN_HASH_TABLE(hdr));
971	*lockp = hash_lock;
972	mutex_enter(hash_lock);
973	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
974	    fhdr = fhdr->b_hash_next, i++) {
975		if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
976			return (fhdr);
977	}
978
979	hdr->b_hash_next = buf_hash_table.ht_table[idx];
980	buf_hash_table.ht_table[idx] = hdr;
981	hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
982
983	/* collect some hash table performance data */
984	if (i > 0) {
985		ARCSTAT_BUMP(arcstat_hash_collisions);
986		if (i == 1)
987			ARCSTAT_BUMP(arcstat_hash_chains);
988
989		ARCSTAT_MAX(arcstat_hash_chain_max, i);
990	}
991
992	ARCSTAT_BUMP(arcstat_hash_elements);
993	ARCSTAT_MAXSTAT(arcstat_hash_elements);
994
995	return (NULL);
996}
997
998static void
999buf_hash_remove(arc_buf_hdr_t *hdr)
1000{
1001	arc_buf_hdr_t *fhdr, **hdrp;
1002	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1003
1004	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
1005	ASSERT(HDR_IN_HASH_TABLE(hdr));
1006
1007	hdrp = &buf_hash_table.ht_table[idx];
1008	while ((fhdr = *hdrp) != hdr) {
1009		ASSERT(fhdr != NULL);
1010		hdrp = &fhdr->b_hash_next;
1011	}
1012	*hdrp = hdr->b_hash_next;
1013	hdr->b_hash_next = NULL;
1014	hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;
1015
1016	/* collect some hash table performance data */
1017	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1018
1019	if (buf_hash_table.ht_table[idx] &&
1020	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1021		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1022}
1023
1024/*
1025 * Global data structures and functions for the buf kmem cache.
1026 */
1027static kmem_cache_t *hdr_cache;
1028static kmem_cache_t *buf_cache;
1029
1030static void
1031buf_fini(void)
1032{
1033	int i;
1034
1035	kmem_free(buf_hash_table.ht_table,
1036	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
1037	for (i = 0; i < BUF_LOCKS; i++)
1038		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1039	kmem_cache_destroy(hdr_cache);
1040	kmem_cache_destroy(buf_cache);
1041}
1042
1043/*
1044 * Constructor callback - called when the cache is empty
1045 * and a new buf is requested.
1046 */
1047/* ARGSUSED */
1048static int
1049hdr_cons(void *vbuf, void *unused, int kmflag)
1050{
1051	arc_buf_hdr_t *hdr = vbuf;
1052
1053	bzero(hdr, sizeof (arc_buf_hdr_t));
1054	refcount_create(&hdr->b_refcnt);
1055	cv_init(&hdr->b_cv, NULL, CV_DEFAULT, NULL);
1056	mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1057	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1058
1059	return (0);
1060}
1061
1062/* ARGSUSED */
1063static int
1064buf_cons(void *vbuf, void *unused, int kmflag)
1065{
1066	arc_buf_t *buf = vbuf;
1067
1068	bzero(buf, sizeof (arc_buf_t));
1069	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1070	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1071
1072	return (0);
1073}
1074
1075/*
1076 * Destructor callback - called when a cached buf is
1077 * no longer required.
1078 */
1079/* ARGSUSED */
1080static void
1081hdr_dest(void *vbuf, void *unused)
1082{
1083	arc_buf_hdr_t *hdr = vbuf;
1084
1085	ASSERT(BUF_EMPTY(hdr));
1086	refcount_destroy(&hdr->b_refcnt);
1087	cv_destroy(&hdr->b_cv);
1088	mutex_destroy(&hdr->b_freeze_lock);
1089	arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1090}
1091
1092/* ARGSUSED */
1093static void
1094buf_dest(void *vbuf, void *unused)
1095{
1096	arc_buf_t *buf = vbuf;
1097
1098	mutex_destroy(&buf->b_evict_lock);
1099	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1100}
1101
1102/*
1103 * Reclaim callback -- invoked when memory is low.
1104 */
1105/* ARGSUSED */
1106static void
1107hdr_recl(void *unused)
1108{
1109	dprintf("hdr_recl called\n");
1110	/*
1111	 * umem calls the reclaim func when we destroy the buf cache,
1112	 * which is after we do arc_fini().
1113	 */
1114	if (!arc_dead)
1115		cv_signal(&arc_reclaim_thr_cv);
1116}
1117
1118static void
1119buf_init(void)
1120{
1121	uint64_t *ct;
1122	uint64_t hsize = 1ULL << 12;
1123	int i, j;
1124
1125	/*
1126	 * The hash table is big enough to fill all of physical memory
1127	 * with an average block size of zfs_arc_average_blocksize (default 8K).
1128	 * By default, the table will take up
1129	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1130	 */
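	/*
	 * Worked example (assuming 8-byte pointers and the default 8K
	 * average block size): with 16 GB of physical memory the loop
	 * below grows hsize from 2^12 to 2^21 buckets, since
	 * 2^21 * 8K == 16 GB; the bucket array then occupies
	 * 2^21 * 8 bytes == 16 MB, i.e. 1 MB per GB as noted above.
	 */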
1131	while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
1132		hsize <<= 1;
1133retry:
1134	buf_hash_table.ht_mask = hsize - 1;
1135	buf_hash_table.ht_table =
1136	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1137	if (buf_hash_table.ht_table == NULL) {
1138		ASSERT(hsize > (1ULL << 8));
1139		hsize >>= 1;
1140		goto retry;
1141	}
1142
1143	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
1144	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
1145	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1146	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1147
1148	for (i = 0; i < 256; i++)
1149		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1150			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1151
1152	for (i = 0; i < BUF_LOCKS; i++) {
1153		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1154		    NULL, MUTEX_DEFAULT, NULL);
1155	}
1156}
1157
1158#define	ARC_MINTIME	(hz>>4) /* 62 ms */
1159
1160static void
1161arc_cksum_verify(arc_buf_t *buf)
1162{
1163	zio_cksum_t zc;
1164
1165	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1166		return;
1167
1168	mutex_enter(&buf->b_hdr->b_freeze_lock);
1169	if (buf->b_hdr->b_freeze_cksum == NULL ||
1170	    (buf->b_hdr->b_flags & ARC_FLAG_IO_ERROR)) {
1171		mutex_exit(&buf->b_hdr->b_freeze_lock);
1172		return;
1173	}
1174	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1175	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1176		panic("buffer modified while frozen!");
1177	mutex_exit(&buf->b_hdr->b_freeze_lock);
1178}
1179
1180static int
1181arc_cksum_equal(arc_buf_t *buf)
1182{
1183	zio_cksum_t zc;
1184	int equal;
1185
1186	mutex_enter(&buf->b_hdr->b_freeze_lock);
1187	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1188	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1189	mutex_exit(&buf->b_hdr->b_freeze_lock);
1190
1191	return (equal);
1192}
1193
1194static void
1195arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1196{
1197	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1198		return;
1199
1200	mutex_enter(&buf->b_hdr->b_freeze_lock);
1201	if (buf->b_hdr->b_freeze_cksum != NULL) {
1202		mutex_exit(&buf->b_hdr->b_freeze_lock);
1203		return;
1204	}
1205	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1206	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1207	    buf->b_hdr->b_freeze_cksum);
1208	mutex_exit(&buf->b_hdr->b_freeze_lock);
1209#ifdef illumos
1210	arc_buf_watch(buf);
1211#endif
1212}
1213
1214#ifdef illumos
1215#ifndef _KERNEL
1216typedef struct procctl {
1217	long cmd;
1218	prwatch_t prwatch;
1219} procctl_t;
1220#endif
1221
1222/* ARGSUSED */
1223static void
1224arc_buf_unwatch(arc_buf_t *buf)
1225{
1226#ifndef _KERNEL
1227	if (arc_watch) {
1228		int result;
1229		procctl_t ctl;
1230		ctl.cmd = PCWATCH;
1231		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1232		ctl.prwatch.pr_size = 0;
1233		ctl.prwatch.pr_wflags = 0;
1234		result = write(arc_procfd, &ctl, sizeof (ctl));
1235		ASSERT3U(result, ==, sizeof (ctl));
1236	}
1237#endif
1238}
1239
1240/* ARGSUSED */
1241static void
1242arc_buf_watch(arc_buf_t *buf)
1243{
1244#ifndef _KERNEL
1245	if (arc_watch) {
1246		int result;
1247		procctl_t ctl;
1248		ctl.cmd = PCWATCH;
1249		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1250		ctl.prwatch.pr_size = buf->b_hdr->b_size;
1251		ctl.prwatch.pr_wflags = WA_WRITE;
1252		result = write(arc_procfd, &ctl, sizeof (ctl));
1253		ASSERT3U(result, ==, sizeof (ctl));
1254	}
1255#endif
1256}
1257#endif /* illumos */
1258
1259void
1260arc_buf_thaw(arc_buf_t *buf)
1261{
1262	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1263		if (buf->b_hdr->b_state != arc_anon)
1264			panic("modifying non-anon buffer!");
1265		if (buf->b_hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS)
1266			panic("modifying buffer while i/o in progress!");
1267		arc_cksum_verify(buf);
1268	}
1269
1270	mutex_enter(&buf->b_hdr->b_freeze_lock);
1271	if (buf->b_hdr->b_freeze_cksum != NULL) {
1272		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1273		buf->b_hdr->b_freeze_cksum = NULL;
1274	}
1275
1276	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1277		if (buf->b_hdr->b_thawed)
1278			kmem_free(buf->b_hdr->b_thawed, 1);
1279		buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1280	}
1281
1282	mutex_exit(&buf->b_hdr->b_freeze_lock);
1283
1284#ifdef illumos
1285	arc_buf_unwatch(buf);
1286#endif
1287}
1288
1289void
1290arc_buf_freeze(arc_buf_t *buf)
1291{
1292	kmutex_t *hash_lock;
1293
1294	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1295		return;
1296
1297	hash_lock = HDR_LOCK(buf->b_hdr);
1298	mutex_enter(hash_lock);
1299
1300	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1301	    buf->b_hdr->b_state == arc_anon);
1302	arc_cksum_compute(buf, B_FALSE);
1303	mutex_exit(hash_lock);
1304
1305}
1306
1307static void
1308get_buf_info(arc_buf_hdr_t *hdr, arc_state_t *state, list_t **list, kmutex_t **lock)
1309{
1310	uint64_t buf_hashid = buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1311
1312	if (hdr->b_type == ARC_BUFC_METADATA)
1313		buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
1314	else {
1315		buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
1316		buf_hashid += ARC_BUFC_NUMMETADATALISTS;
1317	}
1318
1319	*list = &state->arcs_lists[buf_hashid];
1320	*lock = ARCS_LOCK(state, buf_hashid);
1321}
1322
1323
1324static void
1325add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1326{
1327	ASSERT(MUTEX_HELD(hash_lock));
1328
1329	if ((refcount_add(&hdr->b_refcnt, tag) == 1) &&
1330	    (hdr->b_state != arc_anon)) {
1331		uint64_t delta = hdr->b_size * hdr->b_datacnt;
1332		uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
1333		list_t *list;
1334		kmutex_t *lock;
1335
1336		get_buf_info(hdr, hdr->b_state, &list, &lock);
1337		ASSERT(!MUTEX_HELD(lock));
1338		mutex_enter(lock);
1339		ASSERT(list_link_active(&hdr->b_arc_node));
1340		list_remove(list, hdr);
1341		if (GHOST_STATE(hdr->b_state)) {
1342			ASSERT0(hdr->b_datacnt);
1343			ASSERT3P(hdr->b_buf, ==, NULL);
1344			delta = hdr->b_size;
1345		}
1346		ASSERT(delta > 0);
1347		ASSERT3U(*size, >=, delta);
1348		atomic_add_64(size, -delta);
1349		mutex_exit(lock);
1350		/* remove the prefetch flag if we get a reference */
1351		if (hdr->b_flags & ARC_FLAG_PREFETCH)
1352			hdr->b_flags &= ~ARC_FLAG_PREFETCH;
1353	}
1354}
1355
1356static int
1357remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1358{
1359	int cnt;
1360	arc_state_t *state = hdr->b_state;
1361
1362	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1363	ASSERT(!GHOST_STATE(state));
1364
1365	if (((cnt = refcount_remove(&hdr->b_refcnt, tag)) == 0) &&
1366	    (state != arc_anon)) {
1367		uint64_t *size = &state->arcs_lsize[hdr->b_type];
1368		list_t *list;
1369		kmutex_t *lock;
1370
1371		get_buf_info(hdr, state, &list, &lock);
1372		ASSERT(!MUTEX_HELD(lock));
1373		mutex_enter(lock);
1374		ASSERT(!list_link_active(&hdr->b_arc_node));
1375		list_insert_head(list, hdr);
1376		ASSERT(hdr->b_datacnt > 0);
1377		atomic_add_64(size, hdr->b_size * hdr->b_datacnt);
1378		mutex_exit(lock);
1379	}
1380	return (cnt);
1381}
1382
1383/*
1384 * Move the supplied buffer to the indicated state.  The mutex
1385 * for the buffer must be held by the caller.
1386 */
1387static void
1388arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
1389    kmutex_t *hash_lock)
1390{
1391	arc_state_t *old_state = hdr->b_state;
1392	int64_t refcnt = refcount_count(&hdr->b_refcnt);
1393	uint64_t from_delta, to_delta;
1394	list_t *list;
1395	kmutex_t *lock;
1396
1397	ASSERT(MUTEX_HELD(hash_lock));
1398	ASSERT3P(new_state, !=, old_state);
1399	ASSERT(refcnt == 0 || hdr->b_datacnt > 0);
1400	ASSERT(hdr->b_datacnt == 0 || !GHOST_STATE(new_state));
1401	ASSERT(hdr->b_datacnt <= 1 || old_state != arc_anon);
1402
1403	from_delta = to_delta = hdr->b_datacnt * hdr->b_size;
1404
1405	/*
1406	 * If this buffer is evictable, transfer it from the
1407	 * old state list to the new state list.
1408	 */
1409	if (refcnt == 0) {
1410		if (old_state != arc_anon) {
1411			int use_mutex;
1412			uint64_t *size = &old_state->arcs_lsize[hdr->b_type];
1413
1414			get_buf_info(hdr, old_state, &list, &lock);
1415			use_mutex = !MUTEX_HELD(lock);
1416			if (use_mutex)
1417				mutex_enter(lock);
1418
1419			ASSERT(list_link_active(&hdr->b_arc_node));
1420			list_remove(list, hdr);
1421
1422			/*
1423			 * If prefetching out of the ghost cache,
1424			 * we will have a non-zero datacnt.
1425			 */
1426			if (GHOST_STATE(old_state) && hdr->b_datacnt == 0) {
1427				/* ghost elements have a ghost size */
1428				ASSERT(hdr->b_buf == NULL);
1429				from_delta = hdr->b_size;
1430			}
1431			ASSERT3U(*size, >=, from_delta);
1432			atomic_add_64(size, -from_delta);
1433
1434			if (use_mutex)
1435				mutex_exit(lock);
1436		}
1437		if (new_state != arc_anon) {
1438			int use_mutex;
1439			uint64_t *size = &new_state->arcs_lsize[hdr->b_type];
1440
1441			get_buf_info(hdr, new_state, &list, &lock);
1442			use_mutex = !MUTEX_HELD(lock);
1443			if (use_mutex)
1444				mutex_enter(lock);
1445
1446			list_insert_head(list, hdr);
1447
1448			/* ghost elements have a ghost size */
1449			if (GHOST_STATE(new_state)) {
1450				ASSERT(hdr->b_datacnt == 0);
1451				ASSERT(hdr->b_buf == NULL);
1452				to_delta = hdr->b_size;
1453			}
1454			atomic_add_64(size, to_delta);
1455
1456			if (use_mutex)
1457				mutex_exit(lock);
1458		}
1459	}
1460
1461	ASSERT(!BUF_EMPTY(hdr));
1462	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
1463		buf_hash_remove(hdr);
1464
1465	/* adjust state sizes */
1466	if (to_delta)
1467		atomic_add_64(&new_state->arcs_size, to_delta);
1468	if (from_delta) {
1469		ASSERT3U(old_state->arcs_size, >=, from_delta);
1470		atomic_add_64(&old_state->arcs_size, -from_delta);
1471	}
1472	hdr->b_state = new_state;
1473
1474	/* adjust l2arc hdr stats */
1475	if (new_state == arc_l2c_only)
1476		l2arc_hdr_stat_add();
1477	else if (old_state == arc_l2c_only)
1478		l2arc_hdr_stat_remove();
1479}
1480
1481void
1482arc_space_consume(uint64_t space, arc_space_type_t type)
1483{
1484	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1485
1486	switch (type) {
1487	case ARC_SPACE_DATA:
1488		ARCSTAT_INCR(arcstat_data_size, space);
1489		break;
1490	case ARC_SPACE_OTHER:
1491		ARCSTAT_INCR(arcstat_other_size, space);
1492		break;
1493	case ARC_SPACE_HDRS:
1494		ARCSTAT_INCR(arcstat_hdr_size, space);
1495		break;
1496	case ARC_SPACE_L2HDRS:
1497		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1498		break;
1499	}
1500
1501	ARCSTAT_INCR(arcstat_meta_used, space);
1502	atomic_add_64(&arc_size, space);
1503}
1504
1505void
1506arc_space_return(uint64_t space, arc_space_type_t type)
1507{
1508	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1509
1510	switch (type) {
1511	case ARC_SPACE_DATA:
1512		ARCSTAT_INCR(arcstat_data_size, -space);
1513		break;
1514	case ARC_SPACE_OTHER:
1515		ARCSTAT_INCR(arcstat_other_size, -space);
1516		break;
1517	case ARC_SPACE_HDRS:
1518		ARCSTAT_INCR(arcstat_hdr_size, -space);
1519		break;
1520	case ARC_SPACE_L2HDRS:
1521		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1522		break;
1523	}
1524
1525	ASSERT(arc_meta_used >= space);
1526	if (arc_meta_max < arc_meta_used)
1527		arc_meta_max = arc_meta_used;
1528	ARCSTAT_INCR(arcstat_meta_used, -space);
1529	ASSERT(arc_size >= space);
1530	atomic_add_64(&arc_size, -space);
1531}
1532
1533arc_buf_t *
1534arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1535{
1536	arc_buf_hdr_t *hdr;
1537	arc_buf_t *buf;
1538
1539	ASSERT3U(size, >, 0);
1540	hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1541	ASSERT(BUF_EMPTY(hdr));
1542	hdr->b_size = size;
1543	hdr->b_type = type;
1544	hdr->b_spa = spa_load_guid(spa);
1545	hdr->b_state = arc_anon;
1546	hdr->b_arc_access = 0;
1547	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1548	buf->b_hdr = hdr;
1549	buf->b_data = NULL;
1550	buf->b_efunc = NULL;
1551	buf->b_private = NULL;
1552	buf->b_next = NULL;
1553	hdr->b_buf = buf;
1554	arc_get_data_buf(buf);
1555	hdr->b_datacnt = 1;
1556	hdr->b_flags = 0;
1557	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1558	(void) refcount_add(&hdr->b_refcnt, tag);
1559
1560	return (buf);
1561}
1562
1563static char *arc_onloan_tag = "onloan";
1564
1565/*
1566 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1567 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1568 * buffers must be returned to the arc before they can be used by the DMU or
1569 * freed.
1570 */
1571arc_buf_t *
1572arc_loan_buf(spa_t *spa, int size)
1573{
1574	arc_buf_t *buf;
1575
1576	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1577
1578	atomic_add_64(&arc_loaned_bytes, size);
1579	return (buf);
1580}
1581
1582/*
1583 * Return a loaned arc buffer to the arc.
1584 */
1585void
1586arc_return_buf(arc_buf_t *buf, void *tag)
1587{
1588	arc_buf_hdr_t *hdr = buf->b_hdr;
1589
1590	ASSERT(buf->b_data != NULL);
1591	(void) refcount_add(&hdr->b_refcnt, tag);
1592	(void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1593
1594	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1595}
1596
1597/* Detach an arc_buf from a dbuf (tag) */
1598void
1599arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1600{
1601	arc_buf_hdr_t *hdr;
1602
1603	ASSERT(buf->b_data != NULL);
1604	hdr = buf->b_hdr;
1605	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1606	(void) refcount_remove(&hdr->b_refcnt, tag);
1607	buf->b_efunc = NULL;
1608	buf->b_private = NULL;
1609
1610	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1611}
1612
1613static arc_buf_t *
1614arc_buf_clone(arc_buf_t *from)
1615{
1616	arc_buf_t *buf;
1617	arc_buf_hdr_t *hdr = from->b_hdr;
1618	uint64_t size = hdr->b_size;
1619
1620	ASSERT(hdr->b_state != arc_anon);
1621
1622	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1623	buf->b_hdr = hdr;
1624	buf->b_data = NULL;
1625	buf->b_efunc = NULL;
1626	buf->b_private = NULL;
1627	buf->b_next = hdr->b_buf;
1628	hdr->b_buf = buf;
1629	arc_get_data_buf(buf);
1630	bcopy(from->b_data, buf->b_data, size);
1631
1632	/*
1633	 * This buffer already exists in the arc so create a duplicate
1634	 * copy for the caller.  If the buffer is associated with user data
1635	 * then track the size and number of duplicates.  These stats will be
1636	 * updated as duplicate buffers are created and destroyed.
1637	 */
1638	if (hdr->b_type == ARC_BUFC_DATA) {
1639		ARCSTAT_BUMP(arcstat_duplicate_buffers);
1640		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1641	}
1642	hdr->b_datacnt += 1;
1643	return (buf);
1644}
1645
1646void
1647arc_buf_add_ref(arc_buf_t *buf, void* tag)
1648{
1649	arc_buf_hdr_t *hdr;
1650	kmutex_t *hash_lock;
1651
1652	/*
1653	 * Check to see if this buffer is evicted.  Callers
1654	 * must verify b_data != NULL to know if the add_ref
1655	 * was successful.
1656	 */
1657	mutex_enter(&buf->b_evict_lock);
1658	if (buf->b_data == NULL) {
1659		mutex_exit(&buf->b_evict_lock);
1660		return;
1661	}
1662	hash_lock = HDR_LOCK(buf->b_hdr);
1663	mutex_enter(hash_lock);
1664	hdr = buf->b_hdr;
1665	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1666	mutex_exit(&buf->b_evict_lock);
1667
1668	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1669	add_reference(hdr, hash_lock, tag);
1670	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1671	arc_access(hdr, hash_lock);
1672	mutex_exit(hash_lock);
1673	ARCSTAT_BUMP(arcstat_hits);
1674	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH),
1675	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1676	    data, metadata, hits);
1677}
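
/*
 * A minimal (hypothetical) caller pattern, showing the b_data check that
 * the comment above requires:
 *
 *	arc_buf_add_ref(buf, tag);
 *	if (buf->b_data == NULL) {
 *		... the buffer was evicted; fall back to re-reading it ...
 *	}
 */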
1678
1679static void
1680arc_buf_free_on_write(void *data, size_t size,
1681    void (*free_func)(void *, size_t))
1682{
1683	l2arc_data_free_t *df;
1684
1685	df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1686	df->l2df_data = data;
1687	df->l2df_size = size;
1688	df->l2df_func = free_func;
1689	mutex_enter(&l2arc_free_on_write_mtx);
1690	list_insert_head(l2arc_free_on_write, df);
1691	mutex_exit(&l2arc_free_on_write_mtx);
1692}
1693
1694/*
1695 * Free the arc data buffer.  If it is an l2arc write in progress,
1696 * the buffer is placed on l2arc_free_on_write to be freed later.
1697 */
1698static void
1699arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1700{
1701	arc_buf_hdr_t *hdr = buf->b_hdr;
1702
1703	if (HDR_L2_WRITING(hdr)) {
1704		arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
1705		ARCSTAT_BUMP(arcstat_l2_free_on_write);
1706	} else {
1707		free_func(buf->b_data, hdr->b_size);
1708	}
1709}
1710
1711/*
1712 * Free the temporary compressed buffer (b_tmp_cdata) held for this
1713 * header's in-flight L2ARC write via the l2arc free-on-write list.
1714 */
1715static void
1716arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
1717{
1718	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1719
1720	ASSERT(MUTEX_HELD(&l2arc_buflist_mtx));
1721
1722	if (l2hdr->b_tmp_cdata == NULL)
1723		return;
1724
1725	ASSERT(HDR_L2_WRITING(hdr));
1726	arc_buf_free_on_write(l2hdr->b_tmp_cdata, hdr->b_size,
1727	    zio_data_buf_free);
1728	ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
1729	l2hdr->b_tmp_cdata = NULL;
1730}
1731
1732static void
1733arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
1734{
1735	arc_buf_t **bufp;
1736
1737	/* free up data associated with the buf */
1738	if (buf->b_data) {
1739		arc_state_t *state = buf->b_hdr->b_state;
1740		uint64_t size = buf->b_hdr->b_size;
1741		arc_buf_contents_t type = buf->b_hdr->b_type;
1742
1743		arc_cksum_verify(buf);
1744#ifdef illumos
1745		arc_buf_unwatch(buf);
1746#endif
1747
1748		if (!recycle) {
1749			if (type == ARC_BUFC_METADATA) {
1750				arc_buf_data_free(buf, zio_buf_free);
1751				arc_space_return(size, ARC_SPACE_DATA);
1752			} else {
1753				ASSERT(type == ARC_BUFC_DATA);
1754				arc_buf_data_free(buf, zio_data_buf_free);
1755				ARCSTAT_INCR(arcstat_data_size, -size);
1756				atomic_add_64(&arc_size, -size);
1757			}
1758		}
1759		if (list_link_active(&buf->b_hdr->b_arc_node)) {
1760			uint64_t *cnt = &state->arcs_lsize[type];
1761
1762			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1763			ASSERT(state != arc_anon);
1764
1765			ASSERT3U(*cnt, >=, size);
1766			atomic_add_64(cnt, -size);
1767		}
1768		ASSERT3U(state->arcs_size, >=, size);
1769		atomic_add_64(&state->arcs_size, -size);
1770		buf->b_data = NULL;
1771
1772		/*
1773		 * If we're destroying a duplicate buffer make sure
1774		 * that the appropriate statistics are updated.
1775		 */
1776		if (buf->b_hdr->b_datacnt > 1 &&
1777		    buf->b_hdr->b_type == ARC_BUFC_DATA) {
1778			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1779			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1780		}
1781		ASSERT(buf->b_hdr->b_datacnt > 0);
1782		buf->b_hdr->b_datacnt -= 1;
1783	}
1784
1785	/* only remove the buf if requested */
1786	if (!remove)
1787		return;
1788
1789	/* remove the buf from the hdr list */
1790	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1791		continue;
1792	*bufp = buf->b_next;
1793	buf->b_next = NULL;
1794
1795	ASSERT(buf->b_efunc == NULL);
1796
1797	/* clean up the buf */
1798	buf->b_hdr = NULL;
1799	kmem_cache_free(buf_cache, buf);
1800}
1801
1802static void
1803arc_hdr_destroy(arc_buf_hdr_t *hdr)
1804{
1805	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1806	ASSERT3P(hdr->b_state, ==, arc_anon);
1807	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1808	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1809
1810	if (l2hdr != NULL) {
1811		boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1812		/*
1813		 * To prevent arc_free() and l2arc_evict() from
1814		 * attempting to free the same buffer at the same time,
1815		 * a FREE_IN_PROGRESS flag is given to arc_free() to
1816		 * give it priority.  l2arc_evict() can't destroy this
1817		 * header while we are waiting on l2arc_buflist_mtx.
1818		 *
1819		 * The hdr may be removed from l2ad_buflist before we
1820		 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1821		 */
1822		if (!buflist_held) {
1823			mutex_enter(&l2arc_buflist_mtx);
1824			l2hdr = hdr->b_l2hdr;
1825		}
1826
1827		if (l2hdr != NULL) {
1828			trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
1829			    hdr->b_size, 0);
1830			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1831			arc_buf_l2_cdata_free(hdr);
1832			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1833			ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1834			vdev_space_update(l2hdr->b_dev->l2ad_vdev,
1835			    -l2hdr->b_asize, 0, 0);
1836			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1837			if (hdr->b_state == arc_l2c_only)
1838				l2arc_hdr_stat_remove();
1839			hdr->b_l2hdr = NULL;
1840		}
1841
1842		if (!buflist_held)
1843			mutex_exit(&l2arc_buflist_mtx);
1844	}
1845
1846	if (!BUF_EMPTY(hdr)) {
1847		ASSERT(!HDR_IN_HASH_TABLE(hdr));
1848		buf_discard_identity(hdr);
1849	}
1850	while (hdr->b_buf) {
1851		arc_buf_t *buf = hdr->b_buf;
1852
1853		if (buf->b_efunc) {
1854			mutex_enter(&arc_eviction_mtx);
1855			mutex_enter(&buf->b_evict_lock);
1856			ASSERT(buf->b_hdr != NULL);
1857			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1858			hdr->b_buf = buf->b_next;
1859			buf->b_hdr = &arc_eviction_hdr;
1860			buf->b_next = arc_eviction_list;
1861			arc_eviction_list = buf;
1862			mutex_exit(&buf->b_evict_lock);
1863			mutex_exit(&arc_eviction_mtx);
1864		} else {
1865			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1866		}
1867	}
1868	if (hdr->b_freeze_cksum != NULL) {
1869		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1870		hdr->b_freeze_cksum = NULL;
1871	}
1872	if (hdr->b_thawed) {
1873		kmem_free(hdr->b_thawed, 1);
1874		hdr->b_thawed = NULL;
1875	}
1876
1877	ASSERT(!list_link_active(&hdr->b_arc_node));
1878	ASSERT3P(hdr->b_hash_next, ==, NULL);
1879	ASSERT3P(hdr->b_acb, ==, NULL);
1880	kmem_cache_free(hdr_cache, hdr);
1881}
1882
1883void
1884arc_buf_free(arc_buf_t *buf, void *tag)
1885{
1886	arc_buf_hdr_t *hdr = buf->b_hdr;
1887	int hashed = hdr->b_state != arc_anon;
1888
1889	ASSERT(buf->b_efunc == NULL);
1890	ASSERT(buf->b_data != NULL);
1891
1892	if (hashed) {
1893		kmutex_t *hash_lock = HDR_LOCK(hdr);
1894
1895		mutex_enter(hash_lock);
1896		hdr = buf->b_hdr;
1897		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1898
1899		(void) remove_reference(hdr, hash_lock, tag);
1900		if (hdr->b_datacnt > 1) {
1901			arc_buf_destroy(buf, FALSE, TRUE);
1902		} else {
1903			ASSERT(buf == hdr->b_buf);
1904			ASSERT(buf->b_efunc == NULL);
1905			hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
1906		}
1907		mutex_exit(hash_lock);
1908	} else if (HDR_IO_IN_PROGRESS(hdr)) {
1909		int destroy_hdr;
1910		/*
1911		 * We are in the middle of an async write.  Don't destroy
1912		 * this buffer unless the write completes before we finish
1913		 * decrementing the reference count.
1914		 */
1915		mutex_enter(&arc_eviction_mtx);
1916		(void) remove_reference(hdr, NULL, tag);
1917		ASSERT(refcount_is_zero(&hdr->b_refcnt));
1918		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1919		mutex_exit(&arc_eviction_mtx);
1920		if (destroy_hdr)
1921			arc_hdr_destroy(hdr);
1922	} else {
1923		if (remove_reference(hdr, NULL, tag) > 0)
1924			arc_buf_destroy(buf, FALSE, TRUE);
1925		else
1926			arc_hdr_destroy(hdr);
1927	}
1928}
1929
1930boolean_t
1931arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1932{
1933	arc_buf_hdr_t *hdr = buf->b_hdr;
1934	kmutex_t *hash_lock = HDR_LOCK(hdr);
1935	boolean_t no_callback = (buf->b_efunc == NULL);
1936
1937	if (hdr->b_state == arc_anon) {
1938		ASSERT(hdr->b_datacnt == 1);
1939		arc_buf_free(buf, tag);
1940		return (no_callback);
1941	}
1942
1943	mutex_enter(hash_lock);
1944	hdr = buf->b_hdr;
1945	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1946	ASSERT(hdr->b_state != arc_anon);
1947	ASSERT(buf->b_data != NULL);
1948
1949	(void) remove_reference(hdr, hash_lock, tag);
1950	if (hdr->b_datacnt > 1) {
1951		if (no_callback)
1952			arc_buf_destroy(buf, FALSE, TRUE);
1953	} else if (no_callback) {
1954		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1955		ASSERT(buf->b_efunc == NULL);
1956		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
1957	}
1958	ASSERT(no_callback || hdr->b_datacnt > 1 ||
1959	    refcount_is_zero(&hdr->b_refcnt));
1960	mutex_exit(hash_lock);
1961	return (no_callback);
1962}
1963
1964int
1965arc_buf_size(arc_buf_t *buf)
1966{
1967	return (buf->b_hdr->b_size);
1968}
1969
1970/*
1971 * Called from the DMU to determine if the current buffer should be
1972 * evicted. In order to ensure proper locking, the eviction must be initiated
1973 * from the DMU. Return true if the buffer is associated with user data and
1974 * duplicate buffers still exist.
1975 */
1976boolean_t
1977arc_buf_eviction_needed(arc_buf_t *buf)
1978{
1979	arc_buf_hdr_t *hdr;
1980	boolean_t evict_needed = B_FALSE;
1981
1982	if (zfs_disable_dup_eviction)
1983		return (B_FALSE);
1984
1985	mutex_enter(&buf->b_evict_lock);
1986	hdr = buf->b_hdr;
1987	if (hdr == NULL) {
1988		/*
1989		 * We are in arc_do_user_evicts(); let that function
1990		 * perform the eviction.
1991		 */
1992		ASSERT(buf->b_data == NULL);
1993		mutex_exit(&buf->b_evict_lock);
1994		return (B_FALSE);
1995	} else if (buf->b_data == NULL) {
1996		/*
1997		 * We have already been added to the arc eviction list;
1998		 * recommend eviction.
1999		 */
2000		ASSERT3P(hdr, ==, &arc_eviction_hdr);
2001		mutex_exit(&buf->b_evict_lock);
2002		return (B_TRUE);
2003	}
2004
2005	if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
2006		evict_needed = B_TRUE;
2007
2008	mutex_exit(&buf->b_evict_lock);
2009	return (evict_needed);
2010}
2011
2012/*
2013 * Evict buffers from list until we've removed the specified number of
2014 * bytes.  Move the removed buffers to the appropriate evict state.
2015 * If the recycle flag is set, then attempt to "recycle" a buffer:
2016 * - look for a buffer to evict that is `bytes' long.
2017 * - return the data block from this buffer rather than freeing it.
2018 * This flag is used by callers that are trying to make space for a
2019 * new buffer in a full arc cache.
2020 *
2021 * This function makes a "best effort".  It skips over any buffers
2022 * it can't get a hash_lock on, and so may not catch all candidates.
2023 * It may also return without evicting as much space as requested.
2024 */
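/*
 * For example (illustrative numbers): a caller that needs a 16K
 * ARC_BUFC_DATA buffer in a full cache may call
 * arc_evict(state, 0, 16384, TRUE, ARC_BUFC_DATA); if a 16K data buffer
 * can be evicted, its b_data block is handed back ("stolen") instead of
 * being freed and reallocated, and a NULL return means nothing suitable
 * could be recycled so the caller must allocate normally.
 */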
2025static void *
2026arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
2027    arc_buf_contents_t type)
2028{
2029	arc_state_t *evicted_state;
2030	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
2031	int64_t bytes_remaining;
2032	arc_buf_hdr_t *hdr, *hdr_prev = NULL;
2033	list_t *evicted_list, *list, *evicted_list_start, *list_start;
2034	kmutex_t *lock, *evicted_lock;
2035	kmutex_t *hash_lock;
2036	boolean_t have_lock;
2037	void *stolen = NULL;
2038	arc_buf_hdr_t marker = { 0 };
2039	int count = 0;
2040	static int evict_metadata_offset, evict_data_offset;
2041	int i, idx, offset, list_count, lists;
2042
2043	ASSERT(state == arc_mru || state == arc_mfu);
2044
2045	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2046
2047	/*
2048	 * Decide which "type" (data vs metadata) to recycle from.
2049	 *
2050	 * If we are over the metadata limit, recycle from metadata.
2051	 * If we are under the metadata minimum, recycle from data.
2052	 * Otherwise, recycle from whichever type has the oldest (least
2053	 * recently accessed) header.  This is not yet implemented.
2054	 */
2055	if (recycle) {
2056		arc_buf_contents_t realtype;
2057		if (state->arcs_lsize[ARC_BUFC_DATA] == 0) {
2058			realtype = ARC_BUFC_METADATA;
2059		} else if (state->arcs_lsize[ARC_BUFC_METADATA] == 0) {
2060			realtype = ARC_BUFC_DATA;
2061		} else if (arc_meta_used >= arc_meta_limit) {
2062			realtype = ARC_BUFC_METADATA;
2063		} else if (arc_meta_used <= arc_meta_min) {
2064			realtype = ARC_BUFC_DATA;
2065		} else {
2066#ifdef illumos
2067			if (data_hdr->b_arc_access <
2068			    metadata_hdr->b_arc_access) {
2069				realtype = ARC_BUFC_DATA;
2070			} else {
2071				realtype = ARC_BUFC_METADATA;
2072			}
2073#else
2074			/* TODO */
2075			realtype = type;
2076#endif
2077		}
2078		if (realtype != type) {
2079			/*
2080			 * If we want to evict from a different list,
2081			 * we can not recycle, because DATA vs METADATA
2082			 * buffers are segregated into different kmem
2083			 * caches (and vmem arenas).
2084			 */
2085			type = realtype;
2086			recycle = B_FALSE;
2087		}
2088	}
2089
2090	if (type == ARC_BUFC_METADATA) {
2091		offset = 0;
2092		list_count = ARC_BUFC_NUMMETADATALISTS;
2093		list_start = &state->arcs_lists[0];
2094		evicted_list_start = &evicted_state->arcs_lists[0];
2095		idx = evict_metadata_offset;
2096	} else {
2097		offset = ARC_BUFC_NUMMETADATALISTS;
2098		list_start = &state->arcs_lists[offset];
2099		evicted_list_start = &evicted_state->arcs_lists[offset];
2100		list_count = ARC_BUFC_NUMDATALISTS;
2101		idx = evict_data_offset;
2102	}
2103	bytes_remaining = evicted_state->arcs_lsize[type];
2104	lists = 0;
2105
2106evict_start:
2107	list = &list_start[idx];
2108	evicted_list = &evicted_list_start[idx];
2109	lock = ARCS_LOCK(state, (offset + idx));
2110	evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
2111
2112	mutex_enter(lock);
2113	mutex_enter(evicted_lock);
2114
2115	for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
2116		hdr_prev = list_prev(list, hdr);
2117		bytes_remaining -= (hdr->b_size * hdr->b_datacnt);
2118		/* prefetch buffers have a minimum lifespan */
2119		if (HDR_IO_IN_PROGRESS(hdr) ||
2120		    (spa && hdr->b_spa != spa) ||
2121		    (hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT) &&
2122		    ddi_get_lbolt() - hdr->b_arc_access <
2123		    arc_min_prefetch_lifespan)) {
2124			skipped++;
2125			continue;
2126		}
2127		/* "lookahead" for better eviction candidate */
2128		if (recycle && hdr->b_size != bytes &&
2129		    hdr_prev && hdr_prev->b_size == bytes)
2130			continue;
2131
2132		/* ignore markers */
2133		if (hdr->b_spa == 0)
2134			continue;
2135
2136		/*
2137		 * It may take a long time to evict all the bufs requested.
2138		 * To avoid blocking all arc activity, periodically drop
2139		 * the arcs_mtx and give other threads a chance to run
2140		 * before reacquiring the lock.
2141		 *
2142		 * If we are looking for a buffer to recycle, we are in
2143		 * the hot code path, so don't sleep.
2144		 */
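		/*
		 * The marker is a stack-allocated header with b_spa == 0;
		 * iterations over this list skip it (see the "ignore
		 * markers" check above), and it gives us a stable point
		 * to resume from once the locks are reacquired.
		 */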
2145		if (!recycle && count++ > arc_evict_iterations) {
2146			list_insert_after(list, hdr, &marker);
2147			mutex_exit(evicted_lock);
2148			mutex_exit(lock);
2149			kpreempt(KPREEMPT_SYNC);
2150			mutex_enter(lock);
2151			mutex_enter(evicted_lock);
2152			hdr_prev = list_prev(list, &marker);
2153			list_remove(list, &marker);
2154			count = 0;
2155			continue;
2156		}
2157
2158		hash_lock = HDR_LOCK(hdr);
2159		have_lock = MUTEX_HELD(hash_lock);
2160		if (have_lock || mutex_tryenter(hash_lock)) {
2161			ASSERT0(refcount_count(&hdr->b_refcnt));
2162			ASSERT(hdr->b_datacnt > 0);
2163			while (hdr->b_buf) {
2164				arc_buf_t *buf = hdr->b_buf;
2165				if (!mutex_tryenter(&buf->b_evict_lock)) {
2166					missed += 1;
2167					break;
2168				}
2169				if (buf->b_data) {
2170					bytes_evicted += hdr->b_size;
2171					if (recycle && hdr->b_type == type &&
2172					    hdr->b_size == bytes &&
2173					    !HDR_L2_WRITING(hdr)) {
2174						stolen = buf->b_data;
2175						recycle = FALSE;
2176					}
2177				}
2178				if (buf->b_efunc) {
2179					mutex_enter(&arc_eviction_mtx);
2180					arc_buf_destroy(buf,
2181					    buf->b_data == stolen, FALSE);
2182					hdr->b_buf = buf->b_next;
2183					buf->b_hdr = &arc_eviction_hdr;
2184					buf->b_next = arc_eviction_list;
2185					arc_eviction_list = buf;
2186					mutex_exit(&arc_eviction_mtx);
2187					mutex_exit(&buf->b_evict_lock);
2188				} else {
2189					mutex_exit(&buf->b_evict_lock);
2190					arc_buf_destroy(buf,
2191					    buf->b_data == stolen, TRUE);
2192				}
2193			}
2194
2195			if (hdr->b_l2hdr) {
2196				ARCSTAT_INCR(arcstat_evict_l2_cached,
2197				    hdr->b_size);
2198			} else {
2199				if (l2arc_write_eligible(hdr->b_spa, hdr)) {
2200					ARCSTAT_INCR(arcstat_evict_l2_eligible,
2201					    hdr->b_size);
2202				} else {
2203					ARCSTAT_INCR(
2204					    arcstat_evict_l2_ineligible,
2205					    hdr->b_size);
2206				}
2207			}
2208
2209			if (hdr->b_datacnt == 0) {
2210				arc_change_state(evicted_state, hdr, hash_lock);
2211				ASSERT(HDR_IN_HASH_TABLE(hdr));
2212				hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
2213				hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
2214				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
2215			}
2216			if (!have_lock)
2217				mutex_exit(hash_lock);
2218			if (bytes >= 0 && bytes_evicted >= bytes)
2219				break;
2220			if (bytes_remaining > 0) {
2221				mutex_exit(evicted_lock);
2222				mutex_exit(lock);
2223				idx  = ((idx + 1) & (list_count - 1));
2224				lists++;
2225				goto evict_start;
2226			}
2227		} else {
2228			missed += 1;
2229		}
2230	}
2231
2232	mutex_exit(evicted_lock);
2233	mutex_exit(lock);
2234
2235	idx  = ((idx + 1) & (list_count - 1));
2236	lists++;
2237
2238	if (bytes_evicted < bytes) {
2239		if (lists < list_count)
2240			goto evict_start;
2241		else
2242			dprintf("only evicted %lld bytes from %x",
2243			    (longlong_t)bytes_evicted, state);
2244	}
2245	if (type == ARC_BUFC_METADATA)
2246		evict_metadata_offset = idx;
2247	else
2248		evict_data_offset = idx;
2249
2250	if (skipped)
2251		ARCSTAT_INCR(arcstat_evict_skip, skipped);
2252
2253	if (missed)
2254		ARCSTAT_INCR(arcstat_mutex_miss, missed);
2255
2256	/*
2257	 * Note: we have just evicted some data into the ghost state,
2258	 * potentially putting the ghost size over the desired size.  Rather
2259	 * than evicting from the ghost list in this hot code path, leave
2260	 * this chore to the arc_reclaim_thread().
2261	 */
2262
2263	if (stolen)
2264		ARCSTAT_BUMP(arcstat_stolen);
2265	return (stolen);
2266}
2267
2268/*
2269 * Remove buffers from list until we've removed the specified number of
2270 * bytes.  Destroy the buffers that are removed.
2271 */
2272static void
2273arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
2274{
2275	arc_buf_hdr_t *hdr, *hdr_prev;
2276	arc_buf_hdr_t marker = { 0 };
2277	list_t *list, *list_start;
2278	kmutex_t *hash_lock, *lock;
2279	uint64_t bytes_deleted = 0;
2280	uint64_t bufs_skipped = 0;
2281	int count = 0;
2282	static int evict_offset;
2283	int list_count, idx = evict_offset;
2284	int offset, lists = 0;
2285
2286	ASSERT(GHOST_STATE(state));
2287
2288	/*
2289	 * data lists come after metadata lists
2290	 */
2291	list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
2292	list_count = ARC_BUFC_NUMDATALISTS;
2293	offset = ARC_BUFC_NUMMETADATALISTS;
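	/*
	 * arcs_lists[] holds the ARC_BUFC_NUMMETADATALISTS metadata lists
	 * first, followed by the ARC_BUFC_NUMDATALISTS data lists.  The
	 * pass below starts with the data lists and falls back to the
	 * metadata lists if not enough bytes were deleted.
	 */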
2294
2295evict_start:
2296	list = &list_start[idx];
2297	lock = ARCS_LOCK(state, idx + offset);
2298
2299	mutex_enter(lock);
2300	for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
2301		hdr_prev = list_prev(list, hdr);
2302		if (hdr->b_type > ARC_BUFC_NUMTYPES)
2303			panic("invalid hdr=%p", (void *)hdr);
2304		if (spa && hdr->b_spa != spa)
2305			continue;
2306
2307		/* ignore markers */
2308		if (hdr->b_spa == 0)
2309			continue;
2310
2311		hash_lock = HDR_LOCK(hdr);
2312		/* caller may be trying to modify this buffer, skip it */
2313		if (MUTEX_HELD(hash_lock))
2314			continue;
2315
2316		/*
2317		 * It may take a long time to evict all the bufs requested.
2318		 * To avoid blocking all arc activity, periodically drop
2319		 * the arcs_mtx and give other threads a chance to run
2320		 * before reacquiring the lock.
2321		 */
2322		if (count++ > arc_evict_iterations) {
2323			list_insert_after(list, hdr, &marker);
2324			mutex_exit(lock);
2325			kpreempt(KPREEMPT_SYNC);
2326			mutex_enter(lock);
2327			hdr_prev = list_prev(list, &marker);
2328			list_remove(list, &marker);
2329			count = 0;
2330			continue;
2331		}
2332		if (mutex_tryenter(hash_lock)) {
2333			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2334			ASSERT(hdr->b_buf == NULL);
2335			ARCSTAT_BUMP(arcstat_deleted);
2336			bytes_deleted += hdr->b_size;
2337
2338			if (hdr->b_l2hdr != NULL) {
2339				/*
2340				 * This buffer is cached on the 2nd Level ARC;
2341				 * don't destroy the header.
2342				 */
2343				arc_change_state(arc_l2c_only, hdr, hash_lock);
2344				mutex_exit(hash_lock);
2345			} else {
2346				arc_change_state(arc_anon, hdr, hash_lock);
2347				mutex_exit(hash_lock);
2348				arc_hdr_destroy(hdr);
2349			}
2350
2351			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
2352			if (bytes >= 0 && bytes_deleted >= bytes)
2353				break;
2354		} else if (bytes < 0) {
2355			/*
2356			 * Insert a list marker and then wait for the
2357			 * hash lock to become available. Once it's
2358			 * available, restart from where we left off.
2359			 */
2360			list_insert_after(list, hdr, &marker);
2361			mutex_exit(lock);
2362			mutex_enter(hash_lock);
2363			mutex_exit(hash_lock);
2364			mutex_enter(lock);
2365			hdr_prev = list_prev(list, &marker);
2366			list_remove(list, &marker);
2367		} else {
2368			bufs_skipped += 1;
2369		}
2370
2371	}
2372	mutex_exit(lock);
2373	idx  = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
2374	lists++;
2375
2376	if (lists < list_count)
2377		goto evict_start;
2378
2379	evict_offset = idx;
2380	if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
2381	    (bytes < 0 || bytes_deleted < bytes)) {
2382		list_start = &state->arcs_lists[0];
2383		list_count = ARC_BUFC_NUMMETADATALISTS;
2384		offset = lists = 0;
2385		goto evict_start;
2386	}
2387
2388	if (bufs_skipped) {
2389		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2390		ASSERT(bytes >= 0);
2391	}
2392
2393	if (bytes_deleted < bytes)
2394		dprintf("only deleted %lld bytes from %p",
2395		    (longlong_t)bytes_deleted, state);
2396}
2397
2398static void
2399arc_adjust(void)
2400{
2401	int64_t adjustment, delta;
2402
2403	/*
2404	 * Adjust MRU size
2405	 */
2406
2407	adjustment = MIN((int64_t)(arc_size - arc_c),
2408	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2409	    arc_p));
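	/*
	 * For example (illustrative numbers): with arc_size = 10GB,
	 * arc_c = 8GB, anon + mru + meta_used = 5GB and arc_p = 4GB,
	 * adjustment = MIN(2GB, 1GB) = 1GB, so at most 1GB is evicted
	 * from the MRU lists (data first, then metadata) below.
	 */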
2410
2411	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2412		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2413		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
2414		adjustment -= delta;
2415	}
2416
2417	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2418		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2419		(void) arc_evict(arc_mru, 0, delta, FALSE,
2420		    ARC_BUFC_METADATA);
2421	}
2422
2423	/*
2424	 * Adjust MFU size
2425	 */
2426
2427	adjustment = arc_size - arc_c;
2428
2429	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2430		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2431		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2432		adjustment -= delta;
2433	}
2434
2435	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2436		int64_t delta = MIN(adjustment,
2437		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2438		(void) arc_evict(arc_mfu, 0, delta, FALSE,
2439		    ARC_BUFC_METADATA);
2440	}
2441
2442	/*
2443	 * Adjust ghost lists
2444	 */
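	/*
	 * The two passes below aim to keep mru + mru_ghost within arc_c
	 * and mru_ghost + mfu_ghost within arc_c, so the ghost lists never
	 * track much more than one cache's worth of evicted headers.
	 */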
2445
2446	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2447
2448	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2449		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2450		arc_evict_ghost(arc_mru_ghost, 0, delta);
2451	}
2452
2453	adjustment =
2454	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2455
2456	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2457		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2458		arc_evict_ghost(arc_mfu_ghost, 0, delta);
2459	}
2460}
2461
2462static void
2463arc_do_user_evicts(void)
2464{
2465	static arc_buf_t *tmp_arc_eviction_list;
2466
2467	/*
2468	 * Move list over to avoid LOR
2469	 */
2470restart:
2471	mutex_enter(&arc_eviction_mtx);
2472	tmp_arc_eviction_list = arc_eviction_list;
2473	arc_eviction_list = NULL;
2474	mutex_exit(&arc_eviction_mtx);
2475
2476	while (tmp_arc_eviction_list != NULL) {
2477		arc_buf_t *buf = tmp_arc_eviction_list;
2478		tmp_arc_eviction_list = buf->b_next;
2479		mutex_enter(&buf->b_evict_lock);
2480		buf->b_hdr = NULL;
2481		mutex_exit(&buf->b_evict_lock);
2482
2483		if (buf->b_efunc != NULL)
2484			VERIFY0(buf->b_efunc(buf->b_private));
2485
2486		buf->b_efunc = NULL;
2487		buf->b_private = NULL;
2488		kmem_cache_free(buf_cache, buf);
2489	}
2490
2491	if (arc_eviction_list != NULL)
2492		goto restart;
2493}
2494
2495/*
2496 * Flush all *evictable* data from the cache for the given spa.
2497 * NOTE: this will not touch "active" (i.e. referenced) data.
2498 */
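/*
 * When spa is NULL, every list is drained in a loop until it is empty.
 * When a specific spa is given, a single arc_evict() pass with a -1 byte
 * count is made per list for that pool's guid (note the "if (spa) break"
 * below), evicting everything evictable that can be locked for that pool.
 */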
2499void
2500arc_flush(spa_t *spa)
2501{
2502	uint64_t guid = 0;
2503
2504	if (spa)
2505		guid = spa_load_guid(spa);
2506
2507	while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
2508		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2509		if (spa)
2510			break;
2511	}
2512	while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
2513		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2514		if (spa)
2515			break;
2516	}
2517	while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
2518		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2519		if (spa)
2520			break;
2521	}
2522	while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
2523		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2524		if (spa)
2525			break;
2526	}
2527
2528	arc_evict_ghost(arc_mru_ghost, guid, -1);
2529	arc_evict_ghost(arc_mfu_ghost, guid, -1);
2530
2531	mutex_enter(&arc_reclaim_thr_lock);
2532	arc_do_user_evicts();
2533	mutex_exit(&arc_reclaim_thr_lock);
2534	ASSERT(spa || arc_eviction_list == NULL);
2535}
2536
2537void
2538arc_shrink(void)
2539{
2540
2541	if (arc_c > arc_c_min) {
2542		uint64_t to_free;
2543
2544		to_free = arc_c >> arc_shrink_shift;
2545		DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
2546			arc_c_min, uint64_t, arc_p, uint64_t, to_free);
2547		if (arc_c > arc_c_min + to_free)
2548			atomic_add_64(&arc_c, -to_free);
2549		else
2550			arc_c = arc_c_min;
2551
2552		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2553		if (arc_c > arc_size)
2554			arc_c = MAX(arc_size, arc_c_min);
2555		if (arc_p > arc_c)
2556			arc_p = (arc_c >> 1);
2557
2558		DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
2559			arc_p);
2560
2561		ASSERT(arc_c >= arc_c_min);
2562		ASSERT((int64_t)arc_p >= 0);
2563	}
2564
2565	if (arc_size > arc_c) {
2566		DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
2567			uint64_t, arc_c);
2568		arc_adjust();
2569	}
2570}
2571
2572static int needfree = 0;
2573
2574static int
2575arc_reclaim_needed(void)
2576{
2577
2578#ifdef _KERNEL
2579
2580	if (needfree) {
2581		DTRACE_PROBE(arc__reclaim_needfree);
2582		return (1);
2583	}
2584
2585	/*
2586	 * Cooperate with pagedaemon when it's time for it to scan
2587	 * and reclaim some pages.
2588	 */
2589	if (freemem < zfs_arc_free_target) {
2590		DTRACE_PROBE2(arc__reclaim_freemem, uint64_t,
2591		    freemem, uint64_t, zfs_arc_free_target);
2592		return (1);
2593	}
2594
2595#ifdef illumos
2596	/*
2597	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2598	 */
2599	extra = desfree;
2600
2601	/*
2602	 * check that we're out of range of the pageout scanner.  It starts to
2603	 * schedule paging if freemem is less than lotsfree and needfree.
2604	 * lotsfree is the high-water mark for pageout, and needfree is the
2605	 * number of needed free pages.  We add extra pages here to make sure
2606	 * the scanner doesn't start up while we're freeing memory.
2607	 */
2608	if (freemem < lotsfree + needfree + extra)
2609		return (1);
2610
2611	/*
2612	 * check to make sure that swapfs has enough space so that anon
2613	 * reservations can still succeed. anon_resvmem() checks that the
2614	 * availrmem is greater than swapfs_minfree, and the number of reserved
2615	 * swap pages.  We also add a bit of extra here just to prevent
2616	 * circumstances from getting really dire.
2617	 */
2618	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2619		return (1);
2620
2621	/*
2622	 * Check that we have enough availrmem that memory locking (e.g., via
2623	 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
2624	 * stores the number of pages that cannot be locked; when availrmem
2625	 * drops below pages_pp_maximum, page locking mechanisms such as
2626	 * page_pp_lock() will fail.)
2627	 */
2628	if (availrmem <= pages_pp_maximum)
2629		return (1);
2630
2631#endif	/* illumos */
2632#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
2633	/*
2634	 * If we're on an i386 platform, it's possible that we'll exhaust the
2635	 * kernel heap space before we ever run out of available physical
2636	 * memory.  Most checks of the size of the heap_area compare against
2637	 * tune.t_minarmem, which is the minimum available real memory that we
2638	 * can have in the system.  However, this is generally fixed at 25 pages
2639	 * which is so low that it's useless.  In this comparison, we seek to
2640	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2641	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
2642	 * free)
2643	 */
2644	if (vmem_size(heap_arena, VMEM_FREE) <
2645	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) {
2646		DTRACE_PROBE2(arc__reclaim_used, uint64_t,
2647		    vmem_size(heap_arena, VMEM_FREE), uint64_t,
2648		    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2);
2649		return (1);
2650	}
2651#endif
2652#ifdef illumos
2653	/*
2654	 * If zio data pages are being allocated out of a separate heap segment,
2655	 * then enforce that the size of available vmem for this arena remains
2656	 * above about 1/16th free.
2657	 *
2658	 * Note: The 1/16th arena free requirement was put in place
2659	 * to aggressively evict memory from the arc in order to avoid
2660	 * memory fragmentation issues.
2661	 */
2662	if (zio_arena != NULL &&
2663	    vmem_size(zio_arena, VMEM_FREE) <
2664	    (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2665		return (1);
2666#endif	/* illumos */
2667#else	/* _KERNEL */
2668	if (spa_get_random(100) == 0)
2669		return (1);
2670#endif	/* _KERNEL */
2671	DTRACE_PROBE(arc__reclaim_no);
2672
2673	return (0);
2674}
2675
2676extern kmem_cache_t	*zio_buf_cache[];
2677extern kmem_cache_t	*zio_data_buf_cache[];
2678extern kmem_cache_t	*range_seg_cache;
2679
2680static void __noinline
2681arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2682{
2683	size_t			i;
2684	kmem_cache_t		*prev_cache = NULL;
2685	kmem_cache_t		*prev_data_cache = NULL;
2686
2687	DTRACE_PROBE(arc__kmem_reap_start);
2688#ifdef _KERNEL
2689	if (arc_meta_used >= arc_meta_limit) {
2690		/*
2691		 * We are exceeding our meta-data cache limit.
2692		 * Purge some DNLC entries to release holds on meta-data.
2693		 */
2694		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2695	}
2696#if defined(__i386)
2697	/*
2698	 * Reclaim unused memory from all kmem caches.
2699	 */
2700	kmem_reap();
2701#endif
2702#endif
2703
2704	/*
2705	 * An aggressive reclamation will shrink the cache size as well as
2706	 * reap free buffers from the arc kmem caches.
2707	 */
2708	if (strat == ARC_RECLAIM_AGGR)
2709		arc_shrink();
2710
2711	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2712		if (zio_buf_cache[i] != prev_cache) {
2713			prev_cache = zio_buf_cache[i];
2714			kmem_cache_reap_now(zio_buf_cache[i]);
2715		}
2716		if (zio_data_buf_cache[i] != prev_data_cache) {
2717			prev_data_cache = zio_data_buf_cache[i];
2718			kmem_cache_reap_now(zio_data_buf_cache[i]);
2719		}
2720	}
2721	kmem_cache_reap_now(buf_cache);
2722	kmem_cache_reap_now(hdr_cache);
2723	kmem_cache_reap_now(range_seg_cache);
2724
2725#ifdef illumos
2726	/*
2727	 * Ask the vmem arena to reclaim unused memory from its
2728	 * quantum caches.
2729	 */
2730	if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2731		vmem_qcache_reap(zio_arena);
2732#endif
2733	DTRACE_PROBE(arc__kmem_reap_end);
2734}
2735
2736static void
2737arc_reclaim_thread(void *dummy __unused)
2738{
2739	clock_t			growtime = 0;
2740	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
2741	callb_cpr_t		cpr;
2742
2743	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2744
2745	mutex_enter(&arc_reclaim_thr_lock);
2746	while (arc_thread_exit == 0) {
2747		if (arc_reclaim_needed()) {
2748
2749			if (arc_no_grow) {
2750				if (last_reclaim == ARC_RECLAIM_CONS) {
2751					DTRACE_PROBE(arc__reclaim_aggr_no_grow);
2752					last_reclaim = ARC_RECLAIM_AGGR;
2753				} else {
2754					last_reclaim = ARC_RECLAIM_CONS;
2755				}
2756			} else {
2757				arc_no_grow = TRUE;
2758				last_reclaim = ARC_RECLAIM_AGGR;
2759				DTRACE_PROBE(arc__reclaim_aggr);
2760				membar_producer();
2761			}
2762
2763			/* reset the growth delay for every reclaim */
2764			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2765
2766			if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
2767				/*
2768				 * If needfree is TRUE our vm_lowmem hook
2769				 * was called and in that case we must free some
2770				 * memory, so switch to aggressive mode.
2771				 */
2772				arc_no_grow = TRUE;
2773				last_reclaim = ARC_RECLAIM_AGGR;
2774			}
2775			arc_kmem_reap_now(last_reclaim);
2776			arc_warm = B_TRUE;
2777
2778		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2779			arc_no_grow = FALSE;
2780		}
2781
2782		arc_adjust();
2783
2784		if (arc_eviction_list != NULL)
2785			arc_do_user_evicts();
2786
2787#ifdef _KERNEL
2788		if (needfree) {
2789			needfree = 0;
2790			wakeup(&needfree);
2791		}
2792#endif
2793
2794		/* block until needed, or one second, whichever is shorter */
2795		CALLB_CPR_SAFE_BEGIN(&cpr);
2796		(void) cv_timedwait(&arc_reclaim_thr_cv,
2797		    &arc_reclaim_thr_lock, hz);
2798		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2799	}
2800
2801	arc_thread_exit = 0;
2802	cv_broadcast(&arc_reclaim_thr_cv);
2803	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
2804	thread_exit();
2805}
2806
2807/*
2808 * Adapt arc info given the number of bytes we are trying to add and
2809 * the state that we are coming from.  This function is only called
2810 * when we are adding new content to the cache.
2811 */
2812static void
2813arc_adapt(int bytes, arc_state_t *state)
2814{
2815	int mult;
2816	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2817
2818	if (state == arc_l2c_only)
2819		return;
2820
2821	ASSERT(bytes > 0);
2822	/*
2823	 * Adapt the target size of the MRU list:
2824	 *	- if we just hit in the MRU ghost list, then increase
2825	 *	  the target size of the MRU list.
2826	 *	- if we just hit in the MFU ghost list, then increase
2827	 *	  the target size of the MFU list by decreasing the
2828	 *	  target size of the MRU list.
2829	 */
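	/*
	 * For example (illustrative sizes): a hit in the MFU ghost list
	 * while arc_mru_ghost holds 4GB and arc_mfu_ghost holds 1GB gives
	 * mult = 4, so arc_p shrinks by MIN(4 * bytes, arc_p), clamped to
	 * arc_p_min below, shifting target space from the MRU to the MFU.
	 */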
2830	if (state == arc_mru_ghost) {
2831		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2832		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2833		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2834
2835		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2836	} else if (state == arc_mfu_ghost) {
2837		uint64_t delta;
2838
2839		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2840		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2841		mult = MIN(mult, 10);
2842
2843		delta = MIN(bytes * mult, arc_p);
2844		arc_p = MAX(arc_p_min, arc_p - delta);
2845	}
2846	ASSERT((int64_t)arc_p >= 0);
2847
2848	if (arc_reclaim_needed()) {
2849		cv_signal(&arc_reclaim_thr_cv);
2850		return;
2851	}
2852
2853	if (arc_no_grow)
2854		return;
2855
2856	if (arc_c >= arc_c_max)
2857		return;
2858
2859	/*
2860	 * If we're within (2 * maxblocksize) bytes of the target
2861	 * cache size, increment the target cache size
2862	 */
2863	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2864		DTRACE_PROBE1(arc__inc_adapt, int, bytes);
2865		atomic_add_64(&arc_c, (int64_t)bytes);
2866		if (arc_c > arc_c_max)
2867			arc_c = arc_c_max;
2868		else if (state == arc_anon)
2869			atomic_add_64(&arc_p, (int64_t)bytes);
2870		if (arc_p > arc_c)
2871			arc_p = arc_c;
2872	}
2873	ASSERT((int64_t)arc_p >= 0);
2874}
2875
2876/*
2877 * Check if the cache has reached its limits and eviction is required
2878 * prior to insert.
2879 */
2880static int
2881arc_evict_needed(arc_buf_contents_t type)
2882{
2883	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2884		return (1);
2885
2886	if (arc_reclaim_needed())
2887		return (1);
2888
2889	return (arc_size > arc_c);
2890}
2891
2892/*
2893 * The buffer, supplied as the first argument, needs a data block.
2894 * So, if we are at cache max, determine which cache should be victimized.
2895 * We have the following cases:
2896 *
2897 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2898 * In this situation if we're out of space, but the resident size of the MFU is
2899 * under the limit, victimize the MFU cache to satisfy this insertion request.
2900 *
2901 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2902 * Here, we've used up all of the available space for the MRU, so we need to
2903 * evict from our own cache instead.  Evict from the set of resident MRU
2904 * entries.
2905 *
2906 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2907 * c minus p represents the MFU space in the cache, since p is the size of the
2908 * cache that is dedicated to the MRU.  In this situation there's still space on
2909 * the MFU side, so the MRU side needs to be victimized.
2910 *
2911 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2912 * MFU's resident set is consuming more space than it has been allotted.  In
2913 * this situation, we must victimize our own cache, the MFU, for this insertion.
2914 */
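/*
 * Concretely (illustrative numbers): with arc_c = 8GB and arc_p = 5GB,
 * an MRU insertion while arc_anon + arc_mru already hold 6GB falls under
 * case 2, so the eviction below is taken from the MRU itself; if they
 * held only 4GB it would be case 1, and the MFU would be victimized
 * instead (provided the MFU has enough evictable data of the right type).
 */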
2915static void
2916arc_get_data_buf(arc_buf_t *buf)
2917{
2918	arc_state_t		*state = buf->b_hdr->b_state;
2919	uint64_t		size = buf->b_hdr->b_size;
2920	arc_buf_contents_t	type = buf->b_hdr->b_type;
2921
2922	arc_adapt(size, state);
2923
2924	/*
2925	 * We have not yet reached cache maximum size,
2926	 * just allocate a new buffer.
2927	 */
2928	if (!arc_evict_needed(type)) {
2929		if (type == ARC_BUFC_METADATA) {
2930			buf->b_data = zio_buf_alloc(size);
2931			arc_space_consume(size, ARC_SPACE_DATA);
2932		} else {
2933			ASSERT(type == ARC_BUFC_DATA);
2934			buf->b_data = zio_data_buf_alloc(size);
2935			ARCSTAT_INCR(arcstat_data_size, size);
2936			atomic_add_64(&arc_size, size);
2937		}
2938		goto out;
2939	}
2940
2941	/*
2942	 * If we are prefetching from the mfu ghost list, this buffer
2943	 * will end up on the mru list; so steal space from there.
2944	 */
2945	if (state == arc_mfu_ghost)
2946		state = buf->b_hdr->b_flags & ARC_FLAG_PREFETCH ?
2947		    arc_mru : arc_mfu;
2948	else if (state == arc_mru_ghost)
2949		state = arc_mru;
2950
2951	if (state == arc_mru || state == arc_anon) {
2952		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2953		state = (arc_mfu->arcs_lsize[type] >= size &&
2954		    arc_p > mru_used) ? arc_mfu : arc_mru;
2955	} else {
2956		/* MFU cases */
2957		uint64_t mfu_space = arc_c - arc_p;
2958		state =  (arc_mru->arcs_lsize[type] >= size &&
2959		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2960	}
2961	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
2962		if (type == ARC_BUFC_METADATA) {
2963			buf->b_data = zio_buf_alloc(size);
2964			arc_space_consume(size, ARC_SPACE_DATA);
2965		} else {
2966			ASSERT(type == ARC_BUFC_DATA);
2967			buf->b_data = zio_data_buf_alloc(size);
2968			ARCSTAT_INCR(arcstat_data_size, size);
2969			atomic_add_64(&arc_size, size);
2970		}
2971		ARCSTAT_BUMP(arcstat_recycle_miss);
2972	}
2973	ASSERT(buf->b_data != NULL);
2974out:
2975	/*
2976	 * Update the state size.  Note that ghost states have a
2977	 * "ghost size" and so don't need to be updated.
2978	 */
2979	if (!GHOST_STATE(buf->b_hdr->b_state)) {
2980		arc_buf_hdr_t *hdr = buf->b_hdr;
2981
2982		atomic_add_64(&hdr->b_state->arcs_size, size);
2983		if (list_link_active(&hdr->b_arc_node)) {
2984			ASSERT(refcount_is_zero(&hdr->b_refcnt));
2985			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2986		}
2987		/*
2988		 * If we are growing the cache, and we are adding anonymous
2989		 * data, and we have outgrown arc_p, update arc_p
2990		 */
2991		if (arc_size < arc_c && hdr->b_state == arc_anon &&
2992		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2993			arc_p = MIN(arc_c, arc_p + size);
2994	}
2995	ARCSTAT_BUMP(arcstat_allocated);
2996}
2997
2998/*
2999 * This routine is called whenever a buffer is accessed.
3000 * NOTE: the hash lock is dropped in this function.
3001 */
3002static void
3003arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
3004{
3005	clock_t now;
3006
3007	ASSERT(MUTEX_HELD(hash_lock));
3008
3009	if (hdr->b_state == arc_anon) {
3010		/*
3011		 * This buffer is not in the cache, and does not
3012		 * appear in our "ghost" list.  Add the new buffer
3013		 * to the MRU state.
3014		 */
3015
3016		ASSERT(hdr->b_arc_access == 0);
3017		hdr->b_arc_access = ddi_get_lbolt();
3018		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
3019		arc_change_state(arc_mru, hdr, hash_lock);
3020
3021	} else if (hdr->b_state == arc_mru) {
3022		now = ddi_get_lbolt();
3023
3024		/*
3025		 * If this buffer is here because of a prefetch, then either:
3026		 * - clear the flag if this is a "referencing" read
3027		 *   (any subsequent access will bump this into the MFU state).
3028		 * or
3029		 * - move the buffer to the head of the list if this is
3030		 *   another prefetch (to make it less likely to be evicted).
3031		 */
3032		if ((hdr->b_flags & ARC_FLAG_PREFETCH) != 0) {
3033			if (refcount_count(&hdr->b_refcnt) == 0) {
3034				ASSERT(list_link_active(&hdr->b_arc_node));
3035			} else {
3036				hdr->b_flags &= ~ARC_FLAG_PREFETCH;
3037				ARCSTAT_BUMP(arcstat_mru_hits);
3038			}
3039			hdr->b_arc_access = now;
3040			return;
3041		}
3042
3043		/*
3044		 * This buffer has been "accessed" only once so far,
3045		 * but it is still in the cache. Move it to the MFU
3046		 * state.
3047		 */
3048		if (now > hdr->b_arc_access + ARC_MINTIME) {
3049			/*
3050			 * More than 125ms have passed since we
3051			 * instantiated this buffer.  Move it to the
3052			 * most frequently used state.
3053			 */
3054			hdr->b_arc_access = now;
3055			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3056			arc_change_state(arc_mfu, hdr, hash_lock);
3057		}
3058		ARCSTAT_BUMP(arcstat_mru_hits);
3059	} else if (hdr->b_state == arc_mru_ghost) {
3060		arc_state_t	*new_state;
3061		/*
3062		 * This buffer has been "accessed" recently, but
3063		 * was evicted from the cache.  Move it to the
3064		 * MFU state.
3065		 */
3066
3067		if (hdr->b_flags & ARC_FLAG_PREFETCH) {
3068			new_state = arc_mru;
3069			if (refcount_count(&hdr->b_refcnt) > 0)
3070				hdr->b_flags &= ~ARC_FLAG_PREFETCH;
3071			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
3072		} else {
3073			new_state = arc_mfu;
3074			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3075		}
3076
3077		hdr->b_arc_access = ddi_get_lbolt();
3078		arc_change_state(new_state, hdr, hash_lock);
3079
3080		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
3081	} else if (hdr->b_state == arc_mfu) {
3082		/*
3083		 * This buffer has been accessed more than once and is
3084		 * still in the cache.  Keep it in the MFU state.
3085		 *
3086		 * NOTE: an add_reference() that occurred when we did
3087		 * the arc_read() will have kicked this off the list.
3088		 * If it was a prefetch, we will explicitly move it to
3089		 * the head of the list now.
3090		 */
3091		if ((hdr->b_flags & ARC_FLAG_PREFETCH) != 0) {
3092			ASSERT(refcount_count(&hdr->b_refcnt) == 0);
3093			ASSERT(list_link_active(&hdr->b_arc_node));
3094		}
3095		ARCSTAT_BUMP(arcstat_mfu_hits);
3096		hdr->b_arc_access = ddi_get_lbolt();
3097	} else if (hdr->b_state == arc_mfu_ghost) {
3098		arc_state_t	*new_state = arc_mfu;
3099		/*
3100		 * This buffer has been accessed more than once but has
3101		 * been evicted from the cache.  Move it back to the
3102		 * MFU state.
3103		 */
3104
3105		if (hdr->b_flags & ARC_FLAG_PREFETCH) {
3106			/*
3107			 * This is a prefetch access...
3108			 * move this block back to the MRU state.
3109			 */
3110			ASSERT0(refcount_count(&hdr->b_refcnt));
3111			new_state = arc_mru;
3112		}
3113
3114		hdr->b_arc_access = ddi_get_lbolt();
3115		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3116		arc_change_state(new_state, hdr, hash_lock);
3117
3118		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
3119	} else if (hdr->b_state == arc_l2c_only) {
3120		/*
3121		 * This buffer is on the 2nd Level ARC.
3122		 */
3123
3124		hdr->b_arc_access = ddi_get_lbolt();
3125		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3126		arc_change_state(arc_mfu, hdr, hash_lock);
3127	} else {
3128		ASSERT(!"invalid arc state");
3129	}
3130}
3131
3132/* a generic arc_done_func_t which you can use */
3133/* ARGSUSED */
3134void
3135arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
3136{
3137	if (zio == NULL || zio->io_error == 0)
3138		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
3139	VERIFY(arc_buf_remove_ref(buf, arg));
3140}
3141
3142/* a generic arc_done_func_t */
3143void
3144arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
3145{
3146	arc_buf_t **bufp = arg;
3147	if (zio && zio->io_error) {
3148		VERIFY(arc_buf_remove_ref(buf, arg));
3149		*bufp = NULL;
3150	} else {
3151		*bufp = buf;
3152		ASSERT(buf->b_data);
3153	}
3154}
3155
3156static void
3157arc_read_done(zio_t *zio)
3158{
3159	arc_buf_hdr_t	*hdr;
3160	arc_buf_t	*buf;
3161	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
3162	kmutex_t	*hash_lock = NULL;
3163	arc_callback_t	*callback_list, *acb;
3164	int		freeable = FALSE;
3165
3166	buf = zio->io_private;
3167	hdr = buf->b_hdr;
3168
3169	/*
3170	 * The hdr was inserted into hash-table and removed from lists
3171	 * prior to starting I/O.  We should find this header, since
3172	 * it's in the hash table, and it should be legit since it's
3173	 * not possible to evict it during the I/O.  The only possible
3174	 * reason for it not to be found is if we were freed during the
3175	 * read.
3176	 */
3177	if (HDR_IN_HASH_TABLE(hdr)) {
3178		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
3179		ASSERT3U(hdr->b_dva.dva_word[0], ==,
3180		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
3181		ASSERT3U(hdr->b_dva.dva_word[1], ==,
3182		    BP_IDENTITY(zio->io_bp)->dva_word[1]);
3183
3184		arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
3185		    &hash_lock);
3186
3187		ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
3188		    hash_lock == NULL) ||
3189		    (found == hdr &&
3190		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
3191		    (found == hdr && HDR_L2_READING(hdr)));
3192	}
3193
3194	hdr->b_flags &= ~ARC_FLAG_L2_EVICTED;
3195	if (l2arc_noprefetch && (hdr->b_flags & ARC_FLAG_PREFETCH))
3196		hdr->b_flags &= ~ARC_FLAG_L2CACHE;
3197
3198	/* byteswap if necessary */
3199	callback_list = hdr->b_acb;
3200	ASSERT(callback_list != NULL);
3201	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
3202		dmu_object_byteswap_t bswap =
3203		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
3204		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
3205		    byteswap_uint64_array :
3206		    dmu_ot_byteswap[bswap].ob_func;
3207		func(buf->b_data, hdr->b_size);
3208	}
3209
3210	arc_cksum_compute(buf, B_FALSE);
3211#ifdef illumos
3212	arc_buf_watch(buf);
3213#endif
3214
3215	if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
3216		/*
3217		 * Only call arc_access on anonymous buffers.  This is because
3218		 * if we've issued an I/O for an evicted buffer, we've already
3219		 * called arc_access (to prevent any simultaneous readers from
3220		 * getting confused).
3221		 */
3222		arc_access(hdr, hash_lock);
3223	}
3224
3225	/* create copies of the data buffer for the callers */
3226	abuf = buf;
3227	for (acb = callback_list; acb; acb = acb->acb_next) {
3228		if (acb->acb_done) {
3229			if (abuf == NULL) {
3230				ARCSTAT_BUMP(arcstat_duplicate_reads);
3231				abuf = arc_buf_clone(buf);
3232			}
3233			acb->acb_buf = abuf;
3234			abuf = NULL;
3235		}
3236	}
3237	hdr->b_acb = NULL;
3238	hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
3239	ASSERT(!HDR_BUF_AVAILABLE(hdr));
3240	if (abuf == buf) {
3241		ASSERT(buf->b_efunc == NULL);
3242		ASSERT(hdr->b_datacnt == 1);
3243		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
3244	}
3245
3246	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
3247
3248	if (zio->io_error != 0) {
3249		hdr->b_flags |= ARC_FLAG_IO_ERROR;
3250		if (hdr->b_state != arc_anon)
3251			arc_change_state(arc_anon, hdr, hash_lock);
3252		if (HDR_IN_HASH_TABLE(hdr))
3253			buf_hash_remove(hdr);
3254		freeable = refcount_is_zero(&hdr->b_refcnt);
3255	}
3256
3257	/*
3258	 * Broadcast before we drop the hash_lock to avoid the possibility
3259	 * that the hdr (and hence the cv) might be freed before we get to
3260	 * the cv_broadcast().
3261	 */
3262	cv_broadcast(&hdr->b_cv);
3263
3264	if (hash_lock) {
3265		mutex_exit(hash_lock);
3266	} else {
3267		/*
3268		 * This block was freed while we waited for the read to
3269		 * complete.  It has been removed from the hash table and
3270		 * moved to the anonymous state (so that it won't show up
3271		 * in the cache).
3272		 */
3273		ASSERT3P(hdr->b_state, ==, arc_anon);
3274		freeable = refcount_is_zero(&hdr->b_refcnt);
3275	}
3276
3277	/* execute each callback and free its structure */
3278	while ((acb = callback_list) != NULL) {
3279		if (acb->acb_done)
3280			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3281
3282		if (acb->acb_zio_dummy != NULL) {
3283			acb->acb_zio_dummy->io_error = zio->io_error;
3284			zio_nowait(acb->acb_zio_dummy);
3285		}
3286
3287		callback_list = acb->acb_next;
3288		kmem_free(acb, sizeof (arc_callback_t));
3289	}
3290
3291	if (freeable)
3292		arc_hdr_destroy(hdr);
3293}
3294
3295/*
3296 * "Read" the block at the specified DVA (in bp) via the
3297 * cache.  If the block is found in the cache, invoke the provided
3298 * callback immediately and return.  Note that the `zio' parameter
3299 * in the callback will be NULL in this case, since no IO was
3300 * required.  If the block is not in the cache pass the read request
3301 * on to the spa with a substitute callback function, so that the
3302 * requested block will be added to the cache.
3303 *
3304 * If a read request arrives for a block that has a read in-progress,
3305 * either wait for the in-progress read to complete (and return the
3306 * results); or, if this is a read with a "done" func, add a record
3307 * to the read to invoke the "done" func when the read completes,
3308 * and return; or just return.
3309 *
3310 * arc_read_done() will invoke all the requested "done" functions
3311 * for readers of this block.
3312 */
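/*
 * A minimal synchronous caller, sketched assuming the surrounding code
 * already has a suitable spa, bp and zb in hand (names here are
 * illustrative):
 *
 *	arc_flags_t aflags = ARC_FLAG_WAIT;
 *	arc_buf_t *abuf = NULL;
 *	int err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 *
 * On success arc_getbuf_func() stores the buffer in abuf, and the caller
 * is responsible for dropping that reference with
 * arc_buf_remove_ref(abuf, &abuf) when it is done with the data.
 */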
3313int
3314arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3315    void *private, zio_priority_t priority, int zio_flags,
3316    arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
3317{
3318	arc_buf_hdr_t *hdr = NULL;
3319	arc_buf_t *buf = NULL;
3320	kmutex_t *hash_lock = NULL;
3321	zio_t *rzio;
3322	uint64_t guid = spa_load_guid(spa);
3323
3324	ASSERT(!BP_IS_EMBEDDED(bp) ||
3325	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
3326
3327top:
3328	if (!BP_IS_EMBEDDED(bp)) {
3329		/*
3330		 * Embedded BP's have no DVA and require no I/O to "read".
3331		 * Create an anonymous arc buf to back it.
3332		 */
3333		hdr = buf_hash_find(guid, bp, &hash_lock);
3334	}
3335
3336	if (hdr != NULL && hdr->b_datacnt > 0) {
3337
3338		*arc_flags |= ARC_FLAG_CACHED;
3339
3340		if (HDR_IO_IN_PROGRESS(hdr)) {
3341
3342			if (*arc_flags & ARC_FLAG_WAIT) {
3343				cv_wait(&hdr->b_cv, hash_lock);
3344				mutex_exit(hash_lock);
3345				goto top;
3346			}
3347			ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
3348
3349			if (done) {
3350				arc_callback_t	*acb = NULL;
3351
3352				acb = kmem_zalloc(sizeof (arc_callback_t),
3353				    KM_SLEEP);
3354				acb->acb_done = done;
3355				acb->acb_private = private;
3356				if (pio != NULL)
3357					acb->acb_zio_dummy = zio_null(pio,
3358					    spa, NULL, NULL, NULL, zio_flags);
3359
3360				ASSERT(acb->acb_done != NULL);
3361				acb->acb_next = hdr->b_acb;
3362				hdr->b_acb = acb;
3363				add_reference(hdr, hash_lock, private);
3364				mutex_exit(hash_lock);
3365				return (0);
3366			}
3367			mutex_exit(hash_lock);
3368			return (0);
3369		}
3370
3371		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3372
3373		if (done) {
3374			add_reference(hdr, hash_lock, private);
3375			/*
3376			 * If this block is already in use, create a new
3377			 * copy of the data so that we will be guaranteed
3378			 * that arc_release() will always succeed.
3379			 */
3380			buf = hdr->b_buf;
3381			ASSERT(buf);
3382			ASSERT(buf->b_data);
3383			if (HDR_BUF_AVAILABLE(hdr)) {
3384				ASSERT(buf->b_efunc == NULL);
3385				hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
3386			} else {
3387				buf = arc_buf_clone(buf);
3388			}
3389
3390		} else if (*arc_flags & ARC_FLAG_PREFETCH &&
3391		    refcount_count(&hdr->b_refcnt) == 0) {
3392			hdr->b_flags |= ARC_FLAG_PREFETCH;
3393		}
3394		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3395		arc_access(hdr, hash_lock);
3396		if (*arc_flags & ARC_FLAG_L2CACHE)
3397			hdr->b_flags |= ARC_FLAG_L2CACHE;
3398		if (*arc_flags & ARC_FLAG_L2COMPRESS)
3399			hdr->b_flags |= ARC_FLAG_L2COMPRESS;
3400		mutex_exit(hash_lock);
3401		ARCSTAT_BUMP(arcstat_hits);
3402		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH),
3403		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3404		    data, metadata, hits);
3405
3406		if (done)
3407			done(NULL, buf, private);
3408	} else {
3409		uint64_t size = BP_GET_LSIZE(bp);
3410		arc_callback_t *acb;
3411		vdev_t *vd = NULL;
3412		uint64_t addr = 0;
3413		boolean_t devw = B_FALSE;
3414		enum zio_compress b_compress = ZIO_COMPRESS_OFF;
3415		uint64_t b_asize = 0;
3416
3417		if (hdr == NULL) {
3418			/* this block is not in the cache */
3419			arc_buf_hdr_t *exists = NULL;
3420			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3421			buf = arc_buf_alloc(spa, size, private, type);
3422			hdr = buf->b_hdr;
3423			if (!BP_IS_EMBEDDED(bp)) {
3424				hdr->b_dva = *BP_IDENTITY(bp);
3425				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3426				hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3427				exists = buf_hash_insert(hdr, &hash_lock);
3428			}
3429			if (exists != NULL) {
3430				/* somebody beat us to the hash insert */
3431				mutex_exit(hash_lock);
3432				buf_discard_identity(hdr);
3433				(void) arc_buf_remove_ref(buf, private);
3434				goto top; /* restart the IO request */
3435			}
3436
3437			/* if this is a prefetch, we don't have a reference */
3438			if (*arc_flags & ARC_FLAG_PREFETCH) {
3439				(void) remove_reference(hdr, hash_lock,
3440				    private);
3441				hdr->b_flags |= ARC_FLAG_PREFETCH;
3442			}
3443			if (*arc_flags & ARC_FLAG_L2CACHE)
3444				hdr->b_flags |= ARC_FLAG_L2CACHE;
3445			if (*arc_flags & ARC_FLAG_L2COMPRESS)
3446				hdr->b_flags |= ARC_FLAG_L2COMPRESS;
3447			if (BP_GET_LEVEL(bp) > 0)
3448				hdr->b_flags |= ARC_FLAG_INDIRECT;
3449		} else {
3450			/* this block is in the ghost cache */
3451			ASSERT(GHOST_STATE(hdr->b_state));
3452			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3453			ASSERT0(refcount_count(&hdr->b_refcnt));
3454			ASSERT(hdr->b_buf == NULL);
3455
3456			/* if this is a prefetch, we don't have a reference */
3457			if (*arc_flags & ARC_FLAG_PREFETCH)
3458				hdr->b_flags |= ARC_FLAG_PREFETCH;
3459			else
3460				add_reference(hdr, hash_lock, private);
3461			if (*arc_flags & ARC_FLAG_L2CACHE)
3462				hdr->b_flags |= ARC_FLAG_L2CACHE;
3463			if (*arc_flags & ARC_FLAG_L2COMPRESS)
3464				hdr->b_flags |= ARC_FLAG_L2COMPRESS;
3465			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3466			buf->b_hdr = hdr;
3467			buf->b_data = NULL;
3468			buf->b_efunc = NULL;
3469			buf->b_private = NULL;
3470			buf->b_next = NULL;
3471			hdr->b_buf = buf;
3472			ASSERT(hdr->b_datacnt == 0);
3473			hdr->b_datacnt = 1;
3474			arc_get_data_buf(buf);
3475			arc_access(hdr, hash_lock);
3476		}
3477
3478		ASSERT(!GHOST_STATE(hdr->b_state));
3479
3480		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3481		acb->acb_done = done;
3482		acb->acb_private = private;
3483
3484		ASSERT(hdr->b_acb == NULL);
3485		hdr->b_acb = acb;
3486		hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
3487
3488		if (hdr->b_l2hdr != NULL &&
3489		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3490			devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3491			addr = hdr->b_l2hdr->b_daddr;
3492			b_compress = hdr->b_l2hdr->b_compress;
3493			b_asize = hdr->b_l2hdr->b_asize;
3494			/*
3495			 * Lock out device removal.
3496			 */
3497			if (vdev_is_dead(vd) ||
3498			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3499				vd = NULL;
3500		}
3501
3502		if (hash_lock != NULL)
3503			mutex_exit(hash_lock);
3504
3505		/*
3506		 * At this point, we have a level 1 cache miss.  Try again in
3507		 * L2ARC if possible.
3508		 */
3509		ASSERT3U(hdr->b_size, ==, size);
3510		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3511		    uint64_t, size, zbookmark_phys_t *, zb);
3512		ARCSTAT_BUMP(arcstat_misses);
3513		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_FLAG_PREFETCH),
3514		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3515		    data, metadata, misses);
3516#ifdef _KERNEL
3517		curthread->td_ru.ru_inblock++;
3518#endif
3519
3520		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3521			/*
3522			 * Read from the L2ARC if the following are true:
3523			 * 1. The L2ARC vdev was previously cached.
3524			 * 2. This buffer still has L2ARC metadata.
3525			 * 3. This buffer isn't currently writing to the L2ARC.
3526			 * 4. The L2ARC entry wasn't evicted, which may
3527			 *    also have invalidated the vdev.
3528			 * 5. This isn't a prefetch that l2arc_noprefetch excludes.
3529			 */
3530			if (hdr->b_l2hdr != NULL &&
3531			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3532			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3533				l2arc_read_callback_t *cb;
3534
3535				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3536				ARCSTAT_BUMP(arcstat_l2_hits);
3537
3538				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3539				    KM_SLEEP);
3540				cb->l2rcb_buf = buf;
3541				cb->l2rcb_spa = spa;
3542				cb->l2rcb_bp = *bp;
3543				cb->l2rcb_zb = *zb;
3544				cb->l2rcb_flags = zio_flags;
3545				cb->l2rcb_compress = b_compress;
3546
3547				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3548				    addr + size < vd->vdev_psize -
3549				    VDEV_LABEL_END_SIZE);
3550
3551				/*
3552				 * l2arc read.  The SCL_L2ARC lock will be
3553				 * released by l2arc_read_done().
3554				 * Issue a null zio if the underlying buffer
3555				 * was squashed to zero size by compression.
3556				 */
3557				if (b_compress == ZIO_COMPRESS_EMPTY) {
3558					rzio = zio_null(pio, spa, vd,
3559					    l2arc_read_done, cb,
3560					    zio_flags | ZIO_FLAG_DONT_CACHE |
3561					    ZIO_FLAG_CANFAIL |
3562					    ZIO_FLAG_DONT_PROPAGATE |
3563					    ZIO_FLAG_DONT_RETRY);
3564				} else {
3565					rzio = zio_read_phys(pio, vd, addr,
3566					    b_asize, buf->b_data,
3567					    ZIO_CHECKSUM_OFF,
3568					    l2arc_read_done, cb, priority,
3569					    zio_flags | ZIO_FLAG_DONT_CACHE |
3570					    ZIO_FLAG_CANFAIL |
3571					    ZIO_FLAG_DONT_PROPAGATE |
3572					    ZIO_FLAG_DONT_RETRY, B_FALSE);
3573				}
3574				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3575				    zio_t *, rzio);
3576				ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
3577
3578				if (*arc_flags & ARC_FLAG_NOWAIT) {
3579					zio_nowait(rzio);
3580					return (0);
3581				}
3582
3583				ASSERT(*arc_flags & ARC_FLAG_WAIT);
3584				if (zio_wait(rzio) == 0)
3585					return (0);
3586
3587				/* l2arc read error; goto zio_read() */
3588			} else {
3589				DTRACE_PROBE1(l2arc__miss,
3590				    arc_buf_hdr_t *, hdr);
3591				ARCSTAT_BUMP(arcstat_l2_misses);
3592				if (HDR_L2_WRITING(hdr))
3593					ARCSTAT_BUMP(arcstat_l2_rw_clash);
3594				spa_config_exit(spa, SCL_L2ARC, vd);
3595			}
3596		} else {
3597			if (vd != NULL)
3598				spa_config_exit(spa, SCL_L2ARC, vd);
3599			if (l2arc_ndev != 0) {
3600				DTRACE_PROBE1(l2arc__miss,
3601				    arc_buf_hdr_t *, hdr);
3602				ARCSTAT_BUMP(arcstat_l2_misses);
3603			}
3604		}
3605
3606		rzio = zio_read(pio, spa, bp, buf->b_data, size,
3607		    arc_read_done, buf, priority, zio_flags, zb);
3608
3609		if (*arc_flags & ARC_FLAG_WAIT)
3610			return (zio_wait(rzio));
3611
3612		ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
3613		zio_nowait(rzio);
3614	}
3615	return (0);
3616}
3617
3618void
3619arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3620{
3621	ASSERT(buf->b_hdr != NULL);
3622	ASSERT(buf->b_hdr->b_state != arc_anon);
3623	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3624	ASSERT(buf->b_efunc == NULL);
3625	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3626
3627	buf->b_efunc = func;
3628	buf->b_private = private;
3629}
3630
3631/*
3632 * Notify the arc that a block was freed, and thus will never be used again.
3633 */
3634void
3635arc_freed(spa_t *spa, const blkptr_t *bp)
3636{
3637	arc_buf_hdr_t *hdr;
3638	kmutex_t *hash_lock;
3639	uint64_t guid = spa_load_guid(spa);
3640
3641	ASSERT(!BP_IS_EMBEDDED(bp));
3642
3643	hdr = buf_hash_find(guid, bp, &hash_lock);
3644	if (hdr == NULL)
3645		return;
3646	if (HDR_BUF_AVAILABLE(hdr)) {
3647		arc_buf_t *buf = hdr->b_buf;
3648		add_reference(hdr, hash_lock, FTAG);
3649		hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
3650		mutex_exit(hash_lock);
3651
3652		arc_release(buf, FTAG);
3653		(void) arc_buf_remove_ref(buf, FTAG);
3654	} else {
3655		mutex_exit(hash_lock);
3656	}
3657
3658}
3659
3660/*
3661 * Clear the user eviction callback set by arc_set_callback(), first calling
3662 * it if it exists.  Because the presence of a callback keeps an arc_buf cached
3663 * it if it exists.  Because the presence of a callback keeps an arc_buf cached,
3664 * it will not result in the *last* arc_buf being destroyed, hence the data
3665 * will remain cached in the ARC. We make a copy of the arc buffer here so
3666 * that we can process the callback without holding any locks.
3667 *
3668 * It's possible that the callback is already in the process of being cleared
3669 * by another thread.  In this case we can not clear the callback.
3670 *
3671 * Returns B_TRUE if the callback was successfully called and cleared.
3672 */
3673boolean_t
3674arc_clear_callback(arc_buf_t *buf)
3675{
3676	arc_buf_hdr_t *hdr;
3677	kmutex_t *hash_lock;
3678	arc_evict_func_t *efunc = buf->b_efunc;
3679	void *private = buf->b_private;
3680	list_t *list, *evicted_list;
3681	kmutex_t *lock, *evicted_lock;
3682
3683	mutex_enter(&buf->b_evict_lock);
3684	hdr = buf->b_hdr;
3685	if (hdr == NULL) {
3686		/*
3687		 * We are in arc_do_user_evicts().
3688		 */
3689		ASSERT(buf->b_data == NULL);
3690		mutex_exit(&buf->b_evict_lock);
3691		return (B_FALSE);
3692	} else if (buf->b_data == NULL) {
3693		/*
3694		 * We are on the eviction list; process this buffer now
3695		 * but let arc_do_user_evicts() do the reaping.
3696		 */
3697		buf->b_efunc = NULL;
3698		mutex_exit(&buf->b_evict_lock);
3699		VERIFY0(efunc(private));
3700		return (B_TRUE);
3701	}
3702	hash_lock = HDR_LOCK(hdr);
3703	mutex_enter(hash_lock);
3704	hdr = buf->b_hdr;
3705	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3706
3707	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3708	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3709
3710	buf->b_efunc = NULL;
3711	buf->b_private = NULL;
3712
3713	if (hdr->b_datacnt > 1) {
3714		mutex_exit(&buf->b_evict_lock);
3715		arc_buf_destroy(buf, FALSE, TRUE);
3716	} else {
3717		ASSERT(buf == hdr->b_buf);
3718		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
3719		mutex_exit(&buf->b_evict_lock);
3720	}
3721
3722	mutex_exit(hash_lock);
3723	VERIFY0(efunc(private));
3724	return (B_TRUE);
3725}
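
/*
 * Illustrative sketch of the eviction-callback life cycle (hypothetical
 * consumer; "my_evict", "my_state_t" and "ms" are placeholders, not names
 * from this file).  The callback takes the private argument registered with
 * arc_set_callback(), returns 0 (its return value is VERIFY0()'d above), and
 * runs either when the ARC evicts the buffer or from arc_clear_callback():
 *
 *	static int
 *	my_evict(void *arg)
 *	{
 *		my_state_t *ms = arg;
 *		ms->ms_buf = NULL;
 *		return (0);
 *	}
 *
 *	arc_set_callback(buf, my_evict, ms);
 *	...
 *	if (!arc_clear_callback(buf))
 *		;	(another thread is already processing the eviction)
 */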
3726
3727/*
3728 * Release this buffer from the cache, making it an anonymous buffer.  This
3729 * must be done after a read and prior to modifying the buffer contents.
3730 * If the buffer has more than one reference, we must make
3731 * a new hdr for the buffer.
3732 */
3733void
3734arc_release(arc_buf_t *buf, void *tag)
3735{
3736	arc_buf_hdr_t *hdr;
3737	kmutex_t *hash_lock = NULL;
3738	l2arc_buf_hdr_t *l2hdr;
3739	uint64_t buf_size;
3740
3741	/*
3742	 * It would be nice to assert that if it's DMU metadata (level >
3743	 * 0 || it's the dnode file), then it must be syncing context.
3744	 * But we don't know that information at this level.
3745	 */
3746
3747	mutex_enter(&buf->b_evict_lock);
3748	hdr = buf->b_hdr;
3749
3750	/* this buffer is not on any list */
3751	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3752
3753	if (hdr->b_state == arc_anon) {
3754		/* this buffer is already released */
3755		ASSERT(buf->b_efunc == NULL);
3756	} else {
3757		hash_lock = HDR_LOCK(hdr);
3758		mutex_enter(hash_lock);
3759		hdr = buf->b_hdr;
3760		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3761	}
3762
3763	l2hdr = hdr->b_l2hdr;
3764	if (l2hdr) {
3765		mutex_enter(&l2arc_buflist_mtx);
3766		arc_buf_l2_cdata_free(hdr);
3767		hdr->b_l2hdr = NULL;
3768		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3769	}
3770	buf_size = hdr->b_size;
3771
3772	/*
3773	 * Do we have more than one buf?
3774	 */
3775	if (hdr->b_datacnt > 1) {
3776		arc_buf_hdr_t *nhdr;
3777		arc_buf_t **bufp;
3778		uint64_t blksz = hdr->b_size;
3779		uint64_t spa = hdr->b_spa;
3780		arc_buf_contents_t type = hdr->b_type;
3781		uint32_t flags = hdr->b_flags;
3782
3783		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3784		/*
3785		 * Pull the data off of this hdr and attach it to
3786		 * a new anonymous hdr.
3787		 */
3788		(void) remove_reference(hdr, hash_lock, tag);
3789		bufp = &hdr->b_buf;
3790		while (*bufp != buf)
3791			bufp = &(*bufp)->b_next;
3792		*bufp = buf->b_next;
3793		buf->b_next = NULL;
3794
3795		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3796		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3797		if (refcount_is_zero(&hdr->b_refcnt)) {
3798			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3799			ASSERT3U(*size, >=, hdr->b_size);
3800			atomic_add_64(size, -hdr->b_size);
3801		}
3802
3803		/*
3804		 * We're releasing a duplicate user data buffer, so update
3805		 * our statistics accordingly.
3806		 */
3807		if (hdr->b_type == ARC_BUFC_DATA) {
3808			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3809			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3810			    -hdr->b_size);
3811		}
3812		hdr->b_datacnt -= 1;
3813		arc_cksum_verify(buf);
3814#ifdef illumos
3815		arc_buf_unwatch(buf);
3816#endif
3817
3818		mutex_exit(hash_lock);
3819
3820		nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3821		nhdr->b_size = blksz;
3822		nhdr->b_spa = spa;
3823		nhdr->b_type = type;
3824		nhdr->b_buf = buf;
3825		nhdr->b_state = arc_anon;
3826		nhdr->b_arc_access = 0;
3827		nhdr->b_flags = flags & ARC_FLAG_L2_WRITING;
3828		nhdr->b_l2hdr = NULL;
3829		nhdr->b_datacnt = 1;
3830		nhdr->b_freeze_cksum = NULL;
3831		(void) refcount_add(&nhdr->b_refcnt, tag);
3832		buf->b_hdr = nhdr;
3833		mutex_exit(&buf->b_evict_lock);
3834		atomic_add_64(&arc_anon->arcs_size, blksz);
3835	} else {
3836		mutex_exit(&buf->b_evict_lock);
3837		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3838		ASSERT(!list_link_active(&hdr->b_arc_node));
3839		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3840		if (hdr->b_state != arc_anon)
3841			arc_change_state(arc_anon, hdr, hash_lock);
3842		hdr->b_arc_access = 0;
3843		if (hash_lock)
3844			mutex_exit(hash_lock);
3845
3846		buf_discard_identity(hdr);
3847		arc_buf_thaw(buf);
3848	}
3849	buf->b_efunc = NULL;
3850	buf->b_private = NULL;
3851
3852	if (l2hdr) {
3853		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3854		vdev_space_update(l2hdr->b_dev->l2ad_vdev,
3855		    -l2hdr->b_asize, 0, 0);
3856		trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
3857		    hdr->b_size, 0);
3858		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3859		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3860		mutex_exit(&l2arc_buflist_mtx);
3861	}
3862}
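
/*
 * Illustrative sketch of the release pattern described in the comment above
 * arc_release() (hypothetical caller; "new_data" and "tag" are placeholders).
 * A buffer handed out by arc_read() is released before its contents are
 * modified, leaving the writer with a private, anonymous copy:
 *
 *	if (!arc_released(buf))
 *		arc_release(buf, tag);
 *	bcopy(new_data, buf->b_data, buf->b_hdr->b_size);
 *
 * A later arc_write() of the buffer then assigns it a new on-disk identity.
 */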
3863
3864int
3865arc_released(arc_buf_t *buf)
3866{
3867	int released;
3868
3869	mutex_enter(&buf->b_evict_lock);
3870	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3871	mutex_exit(&buf->b_evict_lock);
3872	return (released);
3873}
3874
3875#ifdef ZFS_DEBUG
3876int
3877arc_referenced(arc_buf_t *buf)
3878{
3879	int referenced;
3880
3881	mutex_enter(&buf->b_evict_lock);
3882	referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3883	mutex_exit(&buf->b_evict_lock);
3884	return (referenced);
3885}
3886#endif
3887
3888static void
3889arc_write_ready(zio_t *zio)
3890{
3891	arc_write_callback_t *callback = zio->io_private;
3892	arc_buf_t *buf = callback->awcb_buf;
3893	arc_buf_hdr_t *hdr = buf->b_hdr;
3894
3895	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3896	callback->awcb_ready(zio, buf, callback->awcb_private);
3897
3898	/*
3899	 * If the IO is already in progress, then this is a re-write
3900	 * attempt, so we need to thaw and re-compute the cksum.
3901	 * It is the responsibility of the callback to handle the
3902	 * accounting for any re-write attempt.
3903	 */
3904	if (HDR_IO_IN_PROGRESS(hdr)) {
3905		mutex_enter(&hdr->b_freeze_lock);
3906		if (hdr->b_freeze_cksum != NULL) {
3907			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3908			hdr->b_freeze_cksum = NULL;
3909		}
3910		mutex_exit(&hdr->b_freeze_lock);
3911	}
3912	arc_cksum_compute(buf, B_FALSE);
3913	hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
3914}
3915
3916/*
3917 * The SPA calls this callback for each physical write that happens on behalf
3918 * of a logical write.  See the comment in dbuf_write_physdone() for details.
3919 */
3920static void
3921arc_write_physdone(zio_t *zio)
3922{
3923	arc_write_callback_t *cb = zio->io_private;
3924	if (cb->awcb_physdone != NULL)
3925		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
3926}
3927
3928static void
3929arc_write_done(zio_t *zio)
3930{
3931	arc_write_callback_t *callback = zio->io_private;
3932	arc_buf_t *buf = callback->awcb_buf;
3933	arc_buf_hdr_t *hdr = buf->b_hdr;
3934
3935	ASSERT(hdr->b_acb == NULL);
3936
3937	if (zio->io_error == 0) {
3938		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
3939			buf_discard_identity(hdr);
3940		} else {
3941			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3942			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3943			hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3944		}
3945	} else {
3946		ASSERT(BUF_EMPTY(hdr));
3947	}
3948
3949	/*
3950	 * If the block to be written was all-zero or compressed enough to be
3951	 * embedded in the BP, no write was performed so there will be no
3952	 * dva/birth/checksum.  The buffer must therefore remain anonymous
3953	 * (and uncached).
3954	 */
3955	if (!BUF_EMPTY(hdr)) {
3956		arc_buf_hdr_t *exists;
3957		kmutex_t *hash_lock;
3958
3959		ASSERT(zio->io_error == 0);
3960
3961		arc_cksum_verify(buf);
3962
3963		exists = buf_hash_insert(hdr, &hash_lock);
3964		if (exists) {
3965			/*
3966			 * This can only happen if we overwrite for
3967			 * sync-to-convergence, because we remove
3968			 * buffers from the hash table when we arc_free().
3969			 */
3970			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3971				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3972					panic("bad overwrite, hdr=%p exists=%p",
3973					    (void *)hdr, (void *)exists);
3974				ASSERT(refcount_is_zero(&exists->b_refcnt));
3975				arc_change_state(arc_anon, exists, hash_lock);
3976				mutex_exit(hash_lock);
3977				arc_hdr_destroy(exists);
3978				exists = buf_hash_insert(hdr, &hash_lock);
3979				ASSERT3P(exists, ==, NULL);
3980			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3981				/* nopwrite */
3982				ASSERT(zio->io_prop.zp_nopwrite);
3983				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3984					panic("bad nopwrite, hdr=%p exists=%p",
3985					    (void *)hdr, (void *)exists);
3986			} else {
3987				/* Dedup */
3988				ASSERT(hdr->b_datacnt == 1);
3989				ASSERT(hdr->b_state == arc_anon);
3990				ASSERT(BP_GET_DEDUP(zio->io_bp));
3991				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3992			}
3993		}
3994		hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
3995		/* if it's not anon, we are doing a scrub */
3996		if (!exists && hdr->b_state == arc_anon)
3997			arc_access(hdr, hash_lock);
3998		mutex_exit(hash_lock);
3999	} else {
4000		hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
4001	}
4002
4003	ASSERT(!refcount_is_zero(&hdr->b_refcnt));
4004	callback->awcb_done(zio, buf, callback->awcb_private);
4005
4006	kmem_free(callback, sizeof (arc_write_callback_t));
4007}
4008
4009zio_t *
4010arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
4011    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
4012    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
4013    arc_done_func_t *done, void *private, zio_priority_t priority,
4014    int zio_flags, const zbookmark_phys_t *zb)
4015{
4016	arc_buf_hdr_t *hdr = buf->b_hdr;
4017	arc_write_callback_t *callback;
4018	zio_t *zio;
4019
4020	ASSERT(ready != NULL);
4021	ASSERT(done != NULL);
4022	ASSERT(!HDR_IO_ERROR(hdr));
4023	ASSERT((hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS) == 0);
4024	ASSERT(hdr->b_acb == NULL);
4025	if (l2arc)
4026		hdr->b_flags |= ARC_FLAG_L2CACHE;
4027	if (l2arc_compress)
4028		hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4029	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
4030	callback->awcb_ready = ready;
4031	callback->awcb_physdone = physdone;
4032	callback->awcb_done = done;
4033	callback->awcb_private = private;
4034	callback->awcb_buf = buf;
4035
4036	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
4037	    arc_write_ready, arc_write_physdone, arc_write_done, callback,
4038	    priority, zio_flags, zb);
4039
4040	return (zio);
4041}
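
/*
 * Minimal usage sketch for arc_write() (hypothetical caller; "my_ready",
 * "my_done" and "my_arg" are placeholder arc_done_func_t callbacks and
 * callback argument, not names from this file).  The physdone callback may
 * be NULL, as checked in arc_write_physdone() above:
 *
 *	zio_t *zio = arc_write(pio, spa, txg, bp, buf,
 *	    B_TRUE, B_FALSE,		(L2ARC-eligible, no L2ARC compression)
 *	    &zp, my_ready, NULL, my_done, my_arg,
 *	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, &zb);
 *	(void) zio_nowait(zio);
 */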
4042
4043static int
4044arc_memory_throttle(uint64_t reserve, uint64_t txg)
4045{
4046#ifdef _KERNEL
4047	uint64_t available_memory = ptob(freemem);
4048	static uint64_t page_load = 0;
4049	static uint64_t last_txg = 0;
4050
4051#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
4052	available_memory =
4053	    MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE)));
4054#endif
4055
4056	if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
4057		return (0);
4058
4059	if (txg > last_txg) {
4060		last_txg = txg;
4061		page_load = 0;
4062	}
4063	/*
4064	 * If we are in pageout, we know that memory is already tight and
4065	 * the ARC is already going to be evicting, so we just want to
4066	 * continue to let page writes occur as quickly as possible.
4067	 */
4068	if (curproc == pageproc) {
4069		if (page_load > MAX(ptob(minfree), available_memory) / 4)
4070			return (SET_ERROR(ERESTART));
4071		/* Note: reserve is inflated, so we deflate */
4072		page_load += reserve / 8;
4073		return (0);
4074	} else if (page_load > 0 && arc_reclaim_needed()) {
4075		/* memory is low, delay before restarting */
4076		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
4077		return (SET_ERROR(EAGAIN));
4078	}
4079	page_load = 0;
4080#endif
4081	return (0);
4082}
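
/*
 * Worked example of the throttle above (numbers are illustrative and assume
 * arc_lotsfree_percent at its usual value of 10): on a machine with 16GB of
 * physical memory the check is bypassed until free memory falls below about
 * 1.6GB.  Below that, a non-pageout thread is told to retry (EAGAIN) while
 * reclaim is still needed and pageout has queued writes this txg, and
 * pageout itself is paused (ERESTART) only once its accumulated page_load
 * exceeds a quarter of the larger of minfree and the remaining free memory.
 */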
4083
4084void
4085arc_tempreserve_clear(uint64_t reserve)
4086{
4087	atomic_add_64(&arc_tempreserve, -reserve);
4088	ASSERT((int64_t)arc_tempreserve >= 0);
4089}
4090
4091int
4092arc_tempreserve_space(uint64_t reserve, uint64_t txg)
4093{
4094	int error;
4095	uint64_t anon_size;
4096
4097	if (reserve > arc_c/4 && !arc_no_grow) {
4098		arc_c = MIN(arc_c_max, reserve * 4);
4099		DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
4100	}
4101	if (reserve > arc_c)
4102		return (SET_ERROR(ENOMEM));
4103
4104	/*
4105	 * Don't count loaned bufs as in flight dirty data to prevent long
4106	 * network delays from blocking transactions that are ready to be
4107	 * assigned to a txg.
4108	 */
4109	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
4110
4111	/*
4112	 * Writes will, almost always, require additional memory allocations
4113	 * in order to compress/encrypt/etc the data.  We therefore need to
4114	 * make sure that there is sufficient available memory for this.
4115	 */
4116	error = arc_memory_throttle(reserve, txg);
4117	if (error != 0)
4118		return (error);
4119
4120	/*
4121	 * Throttle writes when the amount of dirty data in the cache
4122	 * gets too large.  We try to keep the cache less than half full
4123	 * of dirty blocks so that our sync times don't grow too large.
4124	 * Note: if two requests come in concurrently, we might let them
4125	 * both succeed, when one of them should fail.  Not a huge deal.
4126	 */
4127
4128	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
4129	    anon_size > arc_c / 4) {
4130		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
4131		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
4132		    arc_tempreserve>>10,
4133		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
4134		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
4135		    reserve>>10, arc_c>>10);
4136		return (SET_ERROR(ERESTART));
4137	}
4138	atomic_add_64(&arc_tempreserve, reserve);
4139	return (0);
4140}
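
/*
 * Worked example of the dirty-data throttle above (illustrative numbers):
 * with arc_c at 1GB, a reservation fails with ERESTART once anonymous
 * (in-flight dirty) data exceeds 256MB (arc_c / 4) and the new reservation
 * plus arc_tempreserve plus that anonymous data together exceed 512MB
 * (arc_c / 2).  Otherwise the reservation is added to arc_tempreserve and
 * the write is allowed to proceed.
 */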
4141
4142static kmutex_t arc_lowmem_lock;
4143#ifdef _KERNEL
4144static eventhandler_tag arc_event_lowmem = NULL;
4145
4146static void
4147arc_lowmem(void *arg __unused, int howto __unused)
4148{
4149
4150	/* Serialize access via arc_lowmem_lock. */
4151	mutex_enter(&arc_lowmem_lock);
4152	mutex_enter(&arc_reclaim_thr_lock);
4153	needfree = 1;
4154	DTRACE_PROBE(arc__needfree);
4155	cv_signal(&arc_reclaim_thr_cv);
4156
4157	/*
4158	 * It is unsafe to block here in arbitrary threads, because we can come
4159	 * here from the ARC itself and may hold ARC locks and thus risk a deadlock
4160	 * with the ARC reclaim thread.
4161	 */
4162	if (curproc == pageproc) {
4163		while (needfree)
4164			msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
4165	}
4166	mutex_exit(&arc_reclaim_thr_lock);
4167	mutex_exit(&arc_lowmem_lock);
4168}
4169#endif
4170
4171void
4172arc_init(void)
4173{
4174	int i, prefetch_tunable_set = 0;
4175
4176	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4177	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
4178	mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
4179
4180	/* Convert seconds to clock ticks */
4181	arc_min_prefetch_lifespan = 1 * hz;
4182
4183	/* Start out with 1/8 of all memory */
4184	arc_c = kmem_size() / 8;
4185
4186#ifdef illumos
4187#ifdef _KERNEL
4188	/*
4189	 * On architectures where the physical memory can be larger
4190	 * than the addressable space (intel in 32-bit mode), we may
4191	 * need to limit the cache to 1/8 of VM size.
4192	 */
4193	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
4194#endif
4195#endif	/* illumos */
4196	/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
4197	arc_c_min = MAX(arc_c / 4, 64<<18);
4198	/* set max to 5/8 of all memory, or all but 1GB, whichever is more */
4199	if (arc_c * 8 >= 1<<30)
4200		arc_c_max = (arc_c * 8) - (1<<30);
4201	else
4202		arc_c_max = arc_c_min;
4203	arc_c_max = MAX(arc_c * 5, arc_c_max);
4204
4205#ifdef _KERNEL
4206	/*
4207	 * Allow the tunables to override our calculations if they are
4208	 * reasonable (i.e. over 16MB)
4209	 */
4210	if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size())
4211		arc_c_max = zfs_arc_max;
4212	if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max)
4213		arc_c_min = zfs_arc_min;
4214#endif
4215
4216	arc_c = arc_c_max;
4217	arc_p = (arc_c >> 1);
4218
4219	/* limit meta-data to 1/4 of the arc capacity */
4220	arc_meta_limit = arc_c_max / 4;
4221
4222	/* Allow the tunable to override if it is reasonable */
4223	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4224		arc_meta_limit = zfs_arc_meta_limit;
4225
4226	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
4227		arc_c_min = arc_meta_limit / 2;
4228
4229	if (zfs_arc_meta_min > 0) {
4230		arc_meta_min = zfs_arc_meta_min;
4231	} else {
4232		arc_meta_min = arc_c_min / 2;
4233	}
4234
4235	if (zfs_arc_grow_retry > 0)
4236		arc_grow_retry = zfs_arc_grow_retry;
4237
4238	if (zfs_arc_shrink_shift > 0)
4239		arc_shrink_shift = zfs_arc_shrink_shift;
4240
4241	if (zfs_arc_p_min_shift > 0)
4242		arc_p_min_shift = zfs_arc_p_min_shift;
4243
4244	/* if kmem_flags are set, lets try to use less memory */
4245	if (kmem_debugging())
4246		arc_c = arc_c / 2;
4247	if (arc_c < arc_c_min)
4248		arc_c = arc_c_min;
4249
4250	zfs_arc_min = arc_c_min;
4251	zfs_arc_max = arc_c_max;
4252
4253	arc_anon = &ARC_anon;
4254	arc_mru = &ARC_mru;
4255	arc_mru_ghost = &ARC_mru_ghost;
4256	arc_mfu = &ARC_mfu;
4257	arc_mfu_ghost = &ARC_mfu_ghost;
4258	arc_l2c_only = &ARC_l2c_only;
4259	arc_size = 0;
4260
4261	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4262		mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
4263		    NULL, MUTEX_DEFAULT, NULL);
4264		mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
4265		    NULL, MUTEX_DEFAULT, NULL);
4266		mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
4267		    NULL, MUTEX_DEFAULT, NULL);
4268		mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
4269		    NULL, MUTEX_DEFAULT, NULL);
4270		mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
4271		    NULL, MUTEX_DEFAULT, NULL);
4272		mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
4273		    NULL, MUTEX_DEFAULT, NULL);
4274
4275		list_create(&arc_mru->arcs_lists[i],
4276		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4277		list_create(&arc_mru_ghost->arcs_lists[i],
4278		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4279		list_create(&arc_mfu->arcs_lists[i],
4280		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4281		list_create(&arc_mfu_ghost->arcs_lists[i],
4282		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4285		list_create(&arc_l2c_only->arcs_lists[i],
4286		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4287	}
4288
4289	buf_init();
4290
4291	arc_thread_exit = 0;
4292	arc_eviction_list = NULL;
4293	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4294	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4295
4296	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4297	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4298
4299	if (arc_ksp != NULL) {
4300		arc_ksp->ks_data = &arc_stats;
4301		kstat_install(arc_ksp);
4302	}
4303
4304	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4305	    TS_RUN, minclsyspri);
4306
4307#ifdef _KERNEL
4308	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
4309	    EVENTHANDLER_PRI_FIRST);
4310#endif
4311
4312	arc_dead = FALSE;
4313	arc_warm = B_FALSE;
4314
4315	/*
4316	 * Calculate maximum amount of dirty data per pool.
4317	 *
4318	 * If it has been set by /etc/system, take that.
4319	 * Otherwise, use a percentage of physical memory defined by
4320	 * zfs_dirty_data_max_percent (default 10%) with a cap at
4321	 * zfs_dirty_data_max_max (default 4GB).
4322	 */
4323	if (zfs_dirty_data_max == 0) {
4324		zfs_dirty_data_max = ptob(physmem) *
4325		    zfs_dirty_data_max_percent / 100;
4326		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
4327		    zfs_dirty_data_max_max);
4328	}
4329
4330#ifdef _KERNEL
4331	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
4332		prefetch_tunable_set = 1;
4333
4334#ifdef __i386__
4335	if (prefetch_tunable_set == 0) {
4336		printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
4337		    "-- to enable,\n");
4338		printf("            add \"vfs.zfs.prefetch_disable=0\" "
4339		    "to /boot/loader.conf.\n");
4340		zfs_prefetch_disable = 1;
4341	}
4342#else
4343	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
4344	    prefetch_tunable_set == 0) {
4345		printf("ZFS NOTICE: Prefetch is disabled by default if less "
4346		    "than 4GB of RAM is present;\n"
4347		    "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
4348		    "to /boot/loader.conf.\n");
4349		zfs_prefetch_disable = 1;
4350	}
4351#endif
4352	/* Warn about ZFS memory and address space requirements. */
4353	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
4354		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
4355		    "expect unstable behavior.\n");
4356	}
4357	if (kmem_size() < 512 * (1 << 20)) {
4358		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
4359		    "expect unstable behavior.\n");
4360		printf("             Consider tuning vm.kmem_size and "
4361		    "vm.kmem_size_max\n");
4362		printf("             in /boot/loader.conf.\n");
4363	}
4364#endif
4365}
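
/*
 * Illustrative /boot/loader.conf fragment (values are examples only) showing
 * how the tunables consulted above override the automatic sizing:
 *
 *	vfs.zfs.arc_max="4G"		(upper bound applied to arc_c_max)
 *	vfs.zfs.arc_min="512M"		(lower bound applied to arc_c_min)
 *	vfs.zfs.prefetch_disable=0	(keep prefetch enabled regardless of RAM)
 *
 * Min/max settings of 16MB or less fail the sanity checks above and are
 * silently ignored.
 */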
4366
4367void
4368arc_fini(void)
4369{
4370	int i;
4371
4372	mutex_enter(&arc_reclaim_thr_lock);
4373	arc_thread_exit = 1;
4374	cv_signal(&arc_reclaim_thr_cv);
4375	while (arc_thread_exit != 0)
4376		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4377	mutex_exit(&arc_reclaim_thr_lock);
4378
4379	arc_flush(NULL);
4380
4381	arc_dead = TRUE;
4382
4383	if (arc_ksp != NULL) {
4384		kstat_delete(arc_ksp);
4385		arc_ksp = NULL;
4386	}
4387
4388	mutex_destroy(&arc_eviction_mtx);
4389	mutex_destroy(&arc_reclaim_thr_lock);
4390	cv_destroy(&arc_reclaim_thr_cv);
4391
4392	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4393		list_destroy(&arc_mru->arcs_lists[i]);
4394		list_destroy(&arc_mru_ghost->arcs_lists[i]);
4395		list_destroy(&arc_mfu->arcs_lists[i]);
4396		list_destroy(&arc_mfu_ghost->arcs_lists[i]);
4397		list_destroy(&arc_l2c_only->arcs_lists[i]);
4398
4399		mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
4400		mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
4401		mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
4402		mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
4403		mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
4404		mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
4405	}
4406
4407	buf_fini();
4408
4409	ASSERT(arc_loaned_bytes == 0);
4410
4411	mutex_destroy(&arc_lowmem_lock);
4412#ifdef _KERNEL
4413	if (arc_event_lowmem != NULL)
4414		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
4415#endif
4416}
4417
4418/*
4419 * Level 2 ARC
4420 *
4421 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
4422 * It uses dedicated storage devices to hold cached data, which are populated
4423 * using large infrequent writes.  The main role of this cache is to boost
4424 * the performance of random read workloads.  The intended L2ARC devices
4425 * include short-stroked disks, solid state disks, and other media with
4426 * substantially faster read latency than disk.
4427 *
4428 *                 +-----------------------+
4429 *                 |         ARC           |
4430 *                 +-----------------------+
4431 *                    |         ^     ^
4432 *                    |         |     |
4433 *      l2arc_feed_thread()    arc_read()
4434 *                    |         |     |
4435 *                    |  l2arc read   |
4436 *                    V         |     |
4437 *               +---------------+    |
4438 *               |     L2ARC     |    |
4439 *               +---------------+    |
4440 *                   |    ^           |
4441 *          l2arc_write() |           |
4442 *                   |    |           |
4443 *                   V    |           |
4444 *                 +-------+      +-------+
4445 *                 | vdev  |      | vdev  |
4446 *                 | cache |      | cache |
4447 *                 +-------+      +-------+
4448 *                 +=========+     .-----.
4449 *                 :  L2ARC  :    |-_____-|
4450 *                 : devices :    | Disks |
4451 *                 +=========+    `-_____-'
4452 *
4453 * Read requests are satisfied from the following sources, in order:
4454 *
4455 *	1) ARC
4456 *	2) vdev cache of L2ARC devices
4457 *	3) L2ARC devices
4458 *	4) vdev cache of disks
4459 *	5) disks
4460 *
4461 * Some L2ARC device types exhibit extremely slow write performance.
4462 * To accommodate this, there are some significant differences between
4463 * the L2ARC and traditional cache design:
4464 *
4465 * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
4466 * the ARC behave as usual, freeing buffers and placing headers on ghost
4467 * lists.  The ARC does not send buffers to the L2ARC during eviction as
4468 * this would add inflated write latencies for all ARC memory pressure.
4469 *
4470 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4471 * It does this by periodically scanning buffers from the eviction-end of
4472 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4473 * not already there. It scans until a headroom of buffers is satisfied,
4474 * which itself is a buffer for ARC eviction. If a compressible buffer is
4475 * found during scanning and selected for writing to an L2ARC device, we
4476 * temporarily boost scanning headroom during the next scan cycle to make
4477 * sure we adapt to compression effects (which might significantly reduce
4478 * the data volume we write to L2ARC). The thread that does this is
4479 * l2arc_feed_thread(), illustrated below; example sizes are included to
4480 * provide a better sense of ratio than this diagram:
4481 *
4482 *	       head -->                        tail
4483 *	        +---------------------+----------+
4484 *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
4485 *	        +---------------------+----------+   |   o L2ARC eligible
4486 *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
4487 *	        +---------------------+----------+   |
4488 *	             15.9 Gbytes      ^ 32 Mbytes    |
4489 *	                           headroom          |
4490 *	                                      l2arc_feed_thread()
4491 *	                                             |
4492 *	                 l2arc write hand <--[oooo]--'
4493 *	                         |           8 Mbyte
4494 *	                         |          write max
4495 *	                         V
4496 *		  +==============================+
4497 *	L2ARC dev |####|#|###|###|    |####| ... |
4498 *	          +==============================+
4499 *	                     32 Gbytes
4500 *
4501 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4502 * evicted, then the L2ARC has cached a buffer much sooner than it probably
4503 * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
4504 * safe to say that this is an uncommon case, since buffers at the end of
4505 * the ARC lists have moved there due to inactivity.
4506 *
4507 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4508 * then the L2ARC simply misses copying some buffers.  This serves as a
4509 * pressure valve to prevent heavy read workloads from both stalling the ARC
4510 * with waits and clogging the L2ARC with writes.  This also helps prevent
4511 * the potential for the L2ARC to churn if it attempts to cache content too
4512 * quickly, such as during backups of the entire pool.
4513 *
4514 * 5. After system boot and before the ARC has filled main memory, there are
4515 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4516 * lists can remain mostly static.  Instead of searching from tail of these
4517 * lists as pictured, the l2arc_feed_thread() will search from the list heads
4518 * for eligible buffers, greatly increasing its chance of finding them.
4519 *
4520 * The L2ARC device write speed is also boosted during this time so that
4521 * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
4522 * there are no L2ARC reads, and no fear of degrading read performance
4523 * through increased writes.
4524 *
4525 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4526 * the vdev queue can aggregate them into larger and fewer writes.  Each
4527 * device is written to in a rotor fashion, sweeping writes through
4528 * available space then repeating.
4529 *
4530 * 7. The L2ARC does not store dirty content.  It never needs to flush
4531 * write buffers back to disk based storage.
4532 *
4533 * 8. If an ARC buffer is written (and dirtied) which also exists in the
4534 * L2ARC, the now stale L2ARC buffer is immediately dropped.
4535 *
4536 * The performance of the L2ARC can be tweaked by a number of tunables, which
4537 * may be necessary for different workloads:
4538 *
4539 *	l2arc_write_max		max write bytes per interval
4540 *	l2arc_write_boost	extra write bytes during device warmup
4541 *	l2arc_noprefetch	skip caching prefetched buffers
4542 *	l2arc_headroom		number of max device writes to precache
4543 *	l2arc_headroom_boost	when we find compressed buffers during ARC
4544 *				scanning, we multiply headroom by this
4545 *				percentage factor for the next scan cycle,
4546 *				since more compressed buffers are likely to
4547 *				be present
4548 *	l2arc_feed_secs		seconds between L2ARC writing
4549 *
4550 * Tunables may be removed or added as future performance improvements are
4551 * integrated, and also may become zpool properties.
4552 *
4553 * There are three key functions that control how the L2ARC warms up:
4554 *
4555 *	l2arc_write_eligible()	check if a buffer is eligible to cache
4556 *	l2arc_write_size()	calculate how much to write
4557 *	l2arc_write_interval()	calculate sleep delay between writes
4558 *
4559 * These three functions determine what to write, how much, and how quickly
4560 * to send writes.
4561 */
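
/*
 * Simplified sketch of a single pass of l2arc_feed_thread() (not the actual
 * implementation, which also selects the device via l2arc_dev_get_next() and
 * handles shutdown); it ties the three helpers listed above together:
 *
 *	begin = ddi_get_lbolt();
 *	size = l2arc_write_size();
 *	l2arc_evict(dev, size, B_FALSE);
 *	wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
 *	next = l2arc_write_interval(begin, size, wrote);
 *
 * The thread then sleeps until 'next' and repeats.
 */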
4562
4563static boolean_t
4564l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
4565{
4566	/*
4567	 * A buffer is *not* eligible for the L2ARC if it:
4568	 * 1. belongs to a different spa.
4569	 * 2. is already cached on the L2ARC.
4570	 * 3. has an I/O in progress (it may be an incomplete read).
4571	 * 4. is flagged not eligible (zfs property).
4572	 */
4573	if (hdr->b_spa != spa_guid) {
4574		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
4575		return (B_FALSE);
4576	}
4577	if (hdr->b_l2hdr != NULL) {
4578		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
4579		return (B_FALSE);
4580	}
4581	if (HDR_IO_IN_PROGRESS(hdr)) {
4582		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
4583		return (B_FALSE);
4584	}
4585	if (!HDR_L2CACHE(hdr)) {
4586		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
4587		return (B_FALSE);
4588	}
4589
4590	return (B_TRUE);
4591}
4592
4593static uint64_t
4594l2arc_write_size(void)
4595{
4596	uint64_t size;
4597
4598	/*
4599	 * Make sure our globals have meaningful values in case the user
4600	 * altered them.
4601	 */
4602	size = l2arc_write_max;
4603	if (size == 0) {
4604		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4605		    "be greater than zero, resetting it to the default (%d)",
4606		    L2ARC_WRITE_SIZE);
4607		size = l2arc_write_max = L2ARC_WRITE_SIZE;
4608	}
4609
4610	if (arc_warm == B_FALSE)
4611		size += l2arc_write_boost;
4612
4613	return (size);
4614
4615}
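
/*
 * Worked example (assuming the usual defaults of 8MB for both l2arc_write_max
 * and l2arc_write_boost): each feed interval may write up to 8MB per cache
 * device, or up to 16MB while arc_warm is still B_FALSE shortly after boot.
 */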
4616
4617static clock_t
4618l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4619{
4620	clock_t interval, next, now;
4621
4622	/*
4623	 * If the ARC lists are busy, increase our write rate; if the
4624	 * lists are stale, idle back.  This is achieved by checking
4625	 * how much we previously wrote - if it was more than half of
4626	 * what we wanted, schedule the next write much sooner.
4627	 */
4628	if (l2arc_feed_again && wrote > (wanted / 2))
4629		interval = (hz * l2arc_feed_min_ms) / 1000;
4630	else
4631		interval = hz * l2arc_feed_secs;
4632
4633	now = ddi_get_lbolt();
4634	next = MAX(now, MIN(now + interval, began + interval));
4635
4636	return (next);
4637}
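
/*
 * Worked example (assuming the usual defaults of l2arc_feed_secs = 1 and
 * l2arc_feed_min_ms = 200): a pass that wrote more than half of what it
 * wanted schedules the next pass 200ms after the previous one began (or
 * immediately, if the pass itself took longer than that), while a pass that
 * wrote little idles for the full second.
 */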
4638
4639static void
4640l2arc_hdr_stat_add(void)
4641{
4642	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4643	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4644}
4645
4646static void
4647l2arc_hdr_stat_remove(void)
4648{
4649	ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4650	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4651}
4652
4653/*
4654 * Cycle through L2ARC devices.  This is how L2ARC load balances.
4655 * If a device is returned, this also returns holding the spa config lock.
4656 */
4657static l2arc_dev_t *
4658l2arc_dev_get_next(void)
4659{
4660	l2arc_dev_t *first, *next = NULL;
4661
4662	/*
4663	 * Lock out the removal of spas (spa_namespace_lock), then removal
4664	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4665	 * both locks will be dropped and a spa config lock held instead.
4666	 */
4667	mutex_enter(&spa_namespace_lock);
4668	mutex_enter(&l2arc_dev_mtx);
4669
4670	/* if there are no vdevs, there is nothing to do */
4671	if (l2arc_ndev == 0)
4672		goto out;
4673
4674	first = NULL;
4675	next = l2arc_dev_last;
4676	do {
4677		/* loop around the list looking for a non-faulted vdev */
4678		if (next == NULL) {
4679			next = list_head(l2arc_dev_list);
4680		} else {
4681			next = list_next(l2arc_dev_list, next);
4682			if (next == NULL)
4683				next = list_head(l2arc_dev_list);
4684		}
4685
4686		/* if we have come back to the start, bail out */
4687		if (first == NULL)
4688			first = next;
4689		else if (next == first)
4690			break;
4691
4692	} while (vdev_is_dead(next->l2ad_vdev));
4693
4694	/* if we were unable to find any usable vdevs, return NULL */
4695	if (vdev_is_dead(next->l2ad_vdev))
4696		next = NULL;
4697
4698	l2arc_dev_last = next;
4699
4700out:
4701	mutex_exit(&l2arc_dev_mtx);
4702
4703	/*
4704	 * Grab the config lock to prevent the 'next' device from being
4705	 * removed while we are writing to it.
4706	 */
4707	if (next != NULL)
4708		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4709	mutex_exit(&spa_namespace_lock);
4710
4711	return (next);
4712}
4713
4714/*
4715 * Free buffers that were tagged for destruction.
4716 */
4717static void
4718l2arc_do_free_on_write()
4719{
4720	list_t *buflist;
4721	l2arc_data_free_t *df, *df_prev;
4722
4723	mutex_enter(&l2arc_free_on_write_mtx);
4724	buflist = l2arc_free_on_write;
4725
4726	for (df = list_tail(buflist); df; df = df_prev) {
4727		df_prev = list_prev(buflist, df);
4728		ASSERT(df->l2df_data != NULL);
4729		ASSERT(df->l2df_func != NULL);
4730		df->l2df_func(df->l2df_data, df->l2df_size);
4731		list_remove(buflist, df);
4732		kmem_free(df, sizeof (l2arc_data_free_t));
4733	}
4734
4735	mutex_exit(&l2arc_free_on_write_mtx);
4736}
4737
4738/*
4739 * A write to a cache device has completed.  Update all headers to allow
4740 * reads from these buffers to begin.
4741 */
4742static void
4743l2arc_write_done(zio_t *zio)
4744{
4745	l2arc_write_callback_t *cb;
4746	l2arc_dev_t *dev;
4747	list_t *buflist;
4748	arc_buf_hdr_t *head, *hdr, *hdr_prev;
4749	l2arc_buf_hdr_t *abl2;
4750	kmutex_t *hash_lock;
4751	int64_t bytes_dropped = 0;
4752
4753	cb = zio->io_private;
4754	ASSERT(cb != NULL);
4755	dev = cb->l2wcb_dev;
4756	ASSERT(dev != NULL);
4757	head = cb->l2wcb_head;
4758	ASSERT(head != NULL);
4759	buflist = dev->l2ad_buflist;
4760	ASSERT(buflist != NULL);
4761	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4762	    l2arc_write_callback_t *, cb);
4763
4764	if (zio->io_error != 0)
4765		ARCSTAT_BUMP(arcstat_l2_writes_error);
4766
4767	mutex_enter(&l2arc_buflist_mtx);
4768
4769	/*
4770	 * All writes completed, or an error was hit.
4771	 */
4772	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
4773		hdr_prev = list_prev(buflist, hdr);
4774		abl2 = hdr->b_l2hdr;
4775
4776		/*
4777		 * Release the temporary compressed buffer as soon as possible.
4778		 */
4779		if (abl2->b_compress != ZIO_COMPRESS_OFF)
4780			l2arc_release_cdata_buf(hdr);
4781
4782		hash_lock = HDR_LOCK(hdr);
4783		if (!mutex_tryenter(hash_lock)) {
4784			/*
4785			 * This buffer misses out.  It may be in a stage
4786			 * of eviction.  Its ARC_L2_WRITING flag will be
4787			 * left set, denying reads to this buffer.
4788			 */
4789			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4790			continue;
4791		}
4792
4793		if (zio->io_error != 0) {
4794			/*
4795			 * Error - drop L2ARC entry.
4796			 */
4797			list_remove(buflist, hdr);
4798			ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4799			bytes_dropped += abl2->b_asize;
4800			hdr->b_l2hdr = NULL;
4801			trim_map_free(abl2->b_dev->l2ad_vdev, abl2->b_daddr,
4802			    hdr->b_size, 0);
4803			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4804			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
4805		}
4806
4807		/*
4808		 * Allow ARC to begin reads to this L2ARC entry.
4809		 */
4810		hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
4811
4812		mutex_exit(hash_lock);
4813	}
4814
4815	atomic_inc_64(&l2arc_writes_done);
4816	list_remove(buflist, head);
4817	kmem_cache_free(hdr_cache, head);
4818	mutex_exit(&l2arc_buflist_mtx);
4819
4820	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
4821
4822	l2arc_do_free_on_write();
4823
4824	kmem_free(cb, sizeof (l2arc_write_callback_t));
4825}
4826
4827/*
4828 * A read to a cache device completed.  Validate buffer contents before
4829 * handing over to the regular ARC routines.
4830 */
4831static void
4832l2arc_read_done(zio_t *zio)
4833{
4834	l2arc_read_callback_t *cb;
4835	arc_buf_hdr_t *hdr;
4836	arc_buf_t *buf;
4837	kmutex_t *hash_lock;
4838	int equal;
4839
4840	ASSERT(zio->io_vd != NULL);
4841	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4842
4843	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4844
4845	cb = zio->io_private;
4846	ASSERT(cb != NULL);
4847	buf = cb->l2rcb_buf;
4848	ASSERT(buf != NULL);
4849
4850	hash_lock = HDR_LOCK(buf->b_hdr);
4851	mutex_enter(hash_lock);
4852	hdr = buf->b_hdr;
4853	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4854
4855	/*
4856	 * If the buffer was compressed, decompress it first.
4857	 */
4858	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4859		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4860	ASSERT(zio->io_data != NULL);
4861
4862	/*
4863	 * Check this survived the L2ARC journey.
4864	 */
4865	equal = arc_cksum_equal(buf);
4866	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4867		mutex_exit(hash_lock);
4868		zio->io_private = buf;
4869		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
4870		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
4871		arc_read_done(zio);
4872	} else {
4873		mutex_exit(hash_lock);
4874		/*
4875		 * Buffer didn't survive caching.  Increment stats and
4876		 * reissue to the original storage device.
4877		 */
4878		if (zio->io_error != 0) {
4879			ARCSTAT_BUMP(arcstat_l2_io_error);
4880		} else {
4881			zio->io_error = SET_ERROR(EIO);
4882		}
4883		if (!equal)
4884			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4885
4886		/*
4887		 * If there's no waiter, issue an async i/o to the primary
4888		 * storage now.  If there *is* a waiter, the caller must
4889		 * issue the i/o in a context where it's OK to block.
4890		 */
4891		if (zio->io_waiter == NULL) {
4892			zio_t *pio = zio_unique_parent(zio);
4893
4894			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4895
4896			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4897			    buf->b_data, zio->io_size, arc_read_done, buf,
4898			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4899		}
4900	}
4901
4902	kmem_free(cb, sizeof (l2arc_read_callback_t));
4903}
4904
4905/*
4906 * This is the list priority from which the L2ARC will search for pages to
4907 * cache.  This is used within loops (0 .. 2 * ARC_BUFC_NUMLISTS - 1) to
4908 * cycle through lists in the desired order.  This order can have a
4909 * significant effect on cache performance.
4910 *
4911 * Currently the metadata lists are hit first, MFU then MRU, followed by
4912 * the data lists.  This function returns a locked list, and also returns
4913 * the lock pointer.
4914 */
4915static list_t *
4916l2arc_list_locked(int list_num, kmutex_t **lock)
4917{
4918	list_t *list = NULL;
4919	int idx;
4920
4921	ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
4922
4923	if (list_num < ARC_BUFC_NUMMETADATALISTS) {
4924		idx = list_num;
4925		list = &arc_mfu->arcs_lists[idx];
4926		*lock = ARCS_LOCK(arc_mfu, idx);
4927	} else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
4928		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4929		list = &arc_mru->arcs_lists[idx];
4930		*lock = ARCS_LOCK(arc_mru, idx);
4931	} else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
4932	    ARC_BUFC_NUMDATALISTS)) {
4933		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4934		list = &arc_mfu->arcs_lists[idx];
4935		*lock = ARCS_LOCK(arc_mfu, idx);
4936	} else {
4937		idx = list_num - ARC_BUFC_NUMLISTS;
4938		list = &arc_mru->arcs_lists[idx];
4939		*lock = ARCS_LOCK(arc_mru, idx);
4940	}
4941
4942	ASSERT(!(MUTEX_HELD(*lock)));
4943	mutex_enter(*lock);
4944	return (list);
4945}
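
/*
 * Worked example of the mapping above, assuming (for illustration only) that
 * ARC_BUFC_NUMMETADATALISTS and ARC_BUFC_NUMDATALISTS are both 16, so that
 * ARC_BUFC_NUMLISTS is 32:
 *
 *	list_num  0..15  ->  arc_mfu->arcs_lists[0..15]    (MFU metadata)
 *	list_num 16..31  ->  arc_mru->arcs_lists[0..15]    (MRU metadata)
 *	list_num 32..47  ->  arc_mfu->arcs_lists[16..31]   (MFU data)
 *	list_num 48..63  ->  arc_mru->arcs_lists[16..31]   (MRU data)
 */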
4946
4947/*
4948 * Evict buffers from the device write hand to the distance specified in
4949 * bytes.  This distance may span populated buffers, or it may span nothing.
4950 * This is clearing a region on the L2ARC device ready for writing.
4951 * If the 'all' boolean is set, every buffer is evicted.
4952 */
4953static void
4954l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4955{
4956	list_t *buflist;
4957	l2arc_buf_hdr_t *abl2;
4958	arc_buf_hdr_t *hdr, *hdr_prev;
4959	kmutex_t *hash_lock;
4960	uint64_t taddr;
4961	int64_t bytes_evicted = 0;
4962
4963	buflist = dev->l2ad_buflist;
4964
4965	if (buflist == NULL)
4966		return;
4967
4968	if (!all && dev->l2ad_first) {
4969		/*
4970		 * This is the first sweep through the device.  There is
4971		 * nothing to evict.
4972		 */
4973		return;
4974	}
4975
4976	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4977		/*
4978		 * When nearing the end of the device, evict to the end
4979		 * before the device write hand jumps to the start.
4980		 */
4981		taddr = dev->l2ad_end;
4982	} else {
4983		taddr = dev->l2ad_hand + distance;
4984	}
4985	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4986	    uint64_t, taddr, boolean_t, all);
4987
4988top:
4989	mutex_enter(&l2arc_buflist_mtx);
4990	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
4991		hdr_prev = list_prev(buflist, hdr);
4992
4993		hash_lock = HDR_LOCK(hdr);
4994		if (!mutex_tryenter(hash_lock)) {
4995			/*
4996			 * Missed the hash lock.  Retry.
4997			 */
4998			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4999			mutex_exit(&l2arc_buflist_mtx);
5000			mutex_enter(hash_lock);
5001			mutex_exit(hash_lock);
5002			goto top;
5003		}
5004
5005		if (HDR_L2_WRITE_HEAD(hdr)) {
5006			/*
5007			 * We hit a write head node.  Leave it for
5008			 * l2arc_write_done().
5009			 */
5010			list_remove(buflist, hdr);
5011			mutex_exit(hash_lock);
5012			continue;
5013		}
5014
5015		if (!all && hdr->b_l2hdr != NULL &&
5016		    (hdr->b_l2hdr->b_daddr > taddr ||
5017		    hdr->b_l2hdr->b_daddr < dev->l2ad_hand)) {
5018			/*
5019			 * We've evicted to the target address,
5020			 * or the end of the device.
5021			 */
5022			mutex_exit(hash_lock);
5023			break;
5024		}
5025
5026		if (HDR_FREE_IN_PROGRESS(hdr)) {
5027			/*
5028			 * Already on the path to destruction.
5029			 */
5030			mutex_exit(hash_lock);
5031			continue;
5032		}
5033
5034		if (hdr->b_state == arc_l2c_only) {
5035			ASSERT(!HDR_L2_READING(hdr));
5036			/*
5037			 * This doesn't exist in the ARC.  Destroy.
5038			 * arc_hdr_destroy() will call list_remove()
5039			 * and decrement arcstat_l2_size.
5040			 */
5041			arc_change_state(arc_anon, hdr, hash_lock);
5042			arc_hdr_destroy(hdr);
5043		} else {
5044			/*
5045			 * Invalidate issued or about to be issued
5046			 * reads, since we may be about to write
5047			 * over this location.
5048			 */
5049			if (HDR_L2_READING(hdr)) {
5050				ARCSTAT_BUMP(arcstat_l2_evict_reading);
5051				hdr->b_flags |= ARC_FLAG_L2_EVICTED;
5052			}
5053
5054			/*
5055			 * Tell ARC this no longer exists in L2ARC.
5056			 */
5057			if (hdr->b_l2hdr != NULL) {
5058				abl2 = hdr->b_l2hdr;
5059				ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
5060				bytes_evicted += abl2->b_asize;
5061				hdr->b_l2hdr = NULL;
5062				/*
5063				 * We are destroying l2hdr, so ensure that
5064				 * its compressed buffer, if any, is not leaked.
5065				 */
5066				ASSERT(abl2->b_tmp_cdata == NULL);
5067				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
5068				ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
5069			}
5070			list_remove(buflist, hdr);
5071
5072			/*
5073			 * This may have been leftover after a
5074			 * failed write.
5075			 */
5076			hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
5077		}
5078		mutex_exit(hash_lock);
5079	}
5080	mutex_exit(&l2arc_buflist_mtx);
5081
5082	vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0);
5083	dev->l2ad_evict = taddr;
5084}
5085
5086/*
5087 * Find and write ARC buffers to the L2ARC device.
5088 *
5089 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
5090 * for reading until they have completed writing.
5091 * The headroom_boost is an in-out parameter used to maintain headroom boost
5092 * state between calls to this function.
5093 *
5094 * Returns the number of bytes actually written (which may be smaller than
5095 * the delta by which the device hand has changed due to alignment).
5096 */
5097static uint64_t
5098l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
5099    boolean_t *headroom_boost)
5100{
5101	arc_buf_hdr_t *hdr, *hdr_prev, *head;
5102	list_t *list;
5103	uint64_t write_asize, write_psize, write_sz, headroom,
5104	    buf_compress_minsz;
5105	void *buf_data;
5106	kmutex_t *list_lock;
5107	boolean_t full;
5108	l2arc_write_callback_t *cb;
5109	zio_t *pio, *wzio;
5110	uint64_t guid = spa_load_guid(spa);
5111	const boolean_t do_headroom_boost = *headroom_boost;
5112	int try;
5113
5114	ASSERT(dev->l2ad_vdev != NULL);
5115
5116	/* Lower the flag now, we might want to raise it again later. */
5117	*headroom_boost = B_FALSE;
5118
5119	pio = NULL;
5120	write_sz = write_asize = write_psize = 0;
5121	full = B_FALSE;
5122	head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
5123	head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
5124
5125	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
5126	/*
5127	 * We will want to try to compress buffers that are at least 2x the
5128	 * device sector size.
5129	 */
5130	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
5131
5132	/*
5133	 * Copy buffers for L2ARC writing.
5134	 */
5135	mutex_enter(&l2arc_buflist_mtx);
5136	for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
5137		uint64_t passed_sz = 0;
5138
5139		list = l2arc_list_locked(try, &list_lock);
5140		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
5141
5142		/*
5143		 * L2ARC fast warmup.
5144		 *
5145		 * Until the ARC is warm and starts to evict, read from the
5146		 * head of the ARC lists rather than the tail.
5147		 */
5148		if (arc_warm == B_FALSE)
5149			hdr = list_head(list);
5150		else
5151			hdr = list_tail(list);
5152		if (hdr == NULL)
5153			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
5154
5155		headroom = target_sz * l2arc_headroom * 2 / ARC_BUFC_NUMLISTS;
5156		if (do_headroom_boost)
5157			headroom = (headroom * l2arc_headroom_boost) / 100;
5158
5159		for (; hdr; hdr = hdr_prev) {
5160			l2arc_buf_hdr_t *l2hdr;
5161			kmutex_t *hash_lock;
5162			uint64_t buf_sz;
5163
5164			if (arc_warm == B_FALSE)
5165				hdr_prev = list_next(list, hdr);
5166			else
5167				hdr_prev = list_prev(list, hdr);
5168			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size);
5169
5170			hash_lock = HDR_LOCK(hdr);
5171			if (!mutex_tryenter(hash_lock)) {
5172				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
5173				/*
5174				 * Skip this buffer rather than waiting.
5175				 */
5176				continue;
5177			}
5178
5179			passed_sz += hdr->b_size;
5180			if (passed_sz > headroom) {
5181				/*
5182				 * Searched too far.
5183				 */
5184				mutex_exit(hash_lock);
5185				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
5186				break;
5187			}
5188
5189			if (!l2arc_write_eligible(guid, hdr)) {
5190				mutex_exit(hash_lock);
5191				continue;
5192			}
5193
5194			if ((write_sz + hdr->b_size) > target_sz) {
5195				full = B_TRUE;
5196				mutex_exit(hash_lock);
5197				ARCSTAT_BUMP(arcstat_l2_write_full);
5198				break;
5199			}
5200
5201			if (pio == NULL) {
5202				/*
5203				 * Insert a dummy header on the buflist so
5204				 * l2arc_write_done() can find where the
5205				 * write buffers begin without searching.
5206				 */
5207				list_insert_head(dev->l2ad_buflist, head);
5208
5209				cb = kmem_alloc(
5210				    sizeof (l2arc_write_callback_t), KM_SLEEP);
5211				cb->l2wcb_dev = dev;
5212				cb->l2wcb_head = head;
5213				pio = zio_root(spa, l2arc_write_done, cb,
5214				    ZIO_FLAG_CANFAIL);
5215				ARCSTAT_BUMP(arcstat_l2_write_pios);
5216			}
5217
5218			/*
5219			 * Create and add a new L2ARC header.
5220			 */
5221			l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
5222			l2hdr->b_dev = dev;
5223			hdr->b_flags |= ARC_FLAG_L2_WRITING;
5224
5225			/*
5226			 * Temporarily stash the data buffer in b_tmp_cdata.
5227			 * The subsequent write step will pick it up from
5228			 * there. This is because we can't access hdr->b_buf
5229			 * without holding the hash_lock, which we in turn
5230			 * can't access without holding the ARC list locks
5231			 * (which we want to avoid during compression/writing).
5232			 */
5233			l2hdr->b_compress = ZIO_COMPRESS_OFF;
5234			l2hdr->b_asize = hdr->b_size;
5235			l2hdr->b_tmp_cdata = hdr->b_buf->b_data;
5236
5237			buf_sz = hdr->b_size;
5238			hdr->b_l2hdr = l2hdr;
5239
5240			list_insert_head(dev->l2ad_buflist, hdr);
5241
5242			/*
5243			 * Compute and store the buffer cksum before
5244			 * writing.  On debug builds the cksum is verified first.
5245			 */
5246			arc_cksum_verify(hdr->b_buf);
5247			arc_cksum_compute(hdr->b_buf, B_TRUE);
5248
5249			mutex_exit(hash_lock);
5250
5251			write_sz += buf_sz;
5252		}
5253
5254		mutex_exit(list_lock);
5255
5256		if (full == B_TRUE)
5257			break;
5258	}
5259
5260	/* No buffers selected for writing? */
5261	if (pio == NULL) {
5262		ASSERT0(write_sz);
5263		mutex_exit(&l2arc_buflist_mtx);
5264		kmem_cache_free(hdr_cache, head);
5265		return (0);
5266	}
5267
5268	/*
5269	 * Now start writing the buffers. We start at the write head
5270	 * and work backwards, retracing the course of the buffer selector
5271	 * loop above.
5272	 */
5273	for (hdr = list_prev(dev->l2ad_buflist, head); hdr;
5274	    hdr = list_prev(dev->l2ad_buflist, hdr)) {
5275		l2arc_buf_hdr_t *l2hdr;
5276		uint64_t buf_sz;
5277
5278		/*
5279		 * We shouldn't need to lock the buffer here, since we flagged
5280		 * it as ARC_FLAG_L2_WRITING in the previous step, but we must
5281		 * take care to only access its L2 cache parameters. In
5282		 * particular, hdr->b_buf may be invalid by now due to
5283		 * ARC eviction.
5284		 */
5285		l2hdr = hdr->b_l2hdr;
5286		l2hdr->b_daddr = dev->l2ad_hand;
5287
5288		if ((hdr->b_flags & ARC_FLAG_L2COMPRESS) &&
5289		    l2hdr->b_asize >= buf_compress_minsz) {
5290			if (l2arc_compress_buf(l2hdr)) {
5291				/*
5292				 * If compression succeeded, enable headroom
5293				 * boost on the next scan cycle.
5294				 */
5295				*headroom_boost = B_TRUE;
5296			}
5297		}
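		/*
		 * Descriptive note: the boost acts as a feedback signal.
		 * When buffers are compressing, each one occupies less of
		 * the device than write_sz accounted for, so the next scan
		 * can afford to look deeper into the ARC lists.
		 */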
5298
5299		/*
5300		 * Pick up the buffer data we had previously stashed away
5301		 * (and now potentially also compressed).
5302		 */
5303		buf_data = l2hdr->b_tmp_cdata;
5304		buf_sz = l2hdr->b_asize;
5305
5306		/*
5307		 * If the data was not compressed, clear b_tmp_cdata so that
5308		 * it is only ever left pointing at a temporary compression
5309		 * buffer.
5310		 */
5311		if (!L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress))
5312			l2hdr->b_tmp_cdata = NULL;
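		/*
		 * Invariant from here on (descriptive): b_tmp_cdata is
		 * non-NULL only when it points at a temporary compression
		 * buffer allocated by l2arc_compress_buf(), which
		 * l2arc_release_cdata_buf() frees once the write completes.
		 */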
5313
5314		/* Compression may have squashed the buffer to zero length. */
5315		if (buf_sz != 0) {
5316			uint64_t buf_p_sz;
5317
5318			wzio = zio_write_phys(pio, dev->l2ad_vdev,
5319			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5320			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5321			    ZIO_FLAG_CANFAIL, B_FALSE);
5322
5323			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5324			    zio_t *, wzio);
5325			(void) zio_nowait(wzio);
5326
5327			write_asize += buf_sz;
5328			/*
5329			 * Keep the clock hand suitably device-aligned.
5330			 */
5331			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5332			write_psize += buf_p_sz;
5333			dev->l2ad_hand += buf_p_sz;
5334		}
5335	}
5336
5337	mutex_exit(&l2arc_buflist_mtx);
5338
5339	ASSERT3U(write_asize, <=, target_sz);
5340	ARCSTAT_BUMP(arcstat_l2_writes_sent);
5341	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5342	ARCSTAT_INCR(arcstat_l2_size, write_sz);
5343	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
5344	vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
5345
5346	/*
5347	 * Bump device hand to the device start if it is approaching the end.
5348	 * l2arc_evict() will already have evicted ahead for this case.
5349	 */
5350	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
5351		dev->l2ad_hand = dev->l2ad_start;
5352		dev->l2ad_evict = dev->l2ad_start;
5353		dev->l2ad_first = B_FALSE;
5354	}
5355
5356	dev->l2ad_writing = B_TRUE;
5357	(void) zio_wait(pio);
5358	dev->l2ad_writing = B_FALSE;
5359
5360	return (write_asize);
5361}
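/*
 * Illustrative sketch (kept out of the build; the struct and helper names
 * below are hypothetical, not kernel interfaces): the L2ARC device is used
 * as a ring.  The write hand advances by device-aligned amounts and is
 * reset to the start of the usable region once it gets within one write's
 * worth of the end, mirroring the l2ad_hand handling above.
 */
#if 0
#include <stdint.h>

struct ring_hand {
	uint64_t start;		/* first usable byte on the device */
	uint64_t end;		/* one past the last usable byte */
	uint64_t hand;		/* offset of the next write */
};

/* Round a write up to the device allocation unit (1 << ashift). */
static uint64_t
align_up(uint64_t size, unsigned ashift)
{
	uint64_t unit = (uint64_t)1 << ashift;

	return ((size + unit - 1) & ~(unit - 1));
}

/* Advance the hand past one write; wrap if another full write won't fit. */
static void
advance_hand(struct ring_hand *rh, uint64_t write_size, uint64_t target_sz,
    unsigned ashift)
{
	rh->hand += align_up(write_size, ashift);
	if (rh->hand >= rh->end - target_sz)
		rh->hand = rh->start;
}
#endif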
5362
5363/*
5364 * Compresses an L2ARC buffer.
5365 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
5366 * size in l2hdr->b_asize. This routine tries to compress the data and
5367 * depending on the compression result there are three possible outcomes:
5368 * *) The buffer was incompressible. The original l2hdr contents were left
5369 *    untouched and are ready for writing to an L2 device.
5370 * *) The buffer was all-zeros, so there is no need to write it to an L2
5371 *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
5372 *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
5373 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
5374 *    data buffer which holds the compressed data to be written, and b_asize
5375 *    tells us how much data there is. b_compress is set to the appropriate
5376 *    compression algorithm. Once writing is done, invoke
5377 *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
5378 *
5379 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
5380 * buffer was incompressible).
5381 */
5382static boolean_t
5383l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
5384{
5385	void *cdata;
5386	size_t csize, len, rounded;
5387
5388	ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
5389	ASSERT(l2hdr->b_tmp_cdata != NULL);
5390
5391	len = l2hdr->b_asize;
5392	cdata = zio_data_buf_alloc(len);
5393	csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
5394	    cdata, l2hdr->b_asize);
5395
5396	if (csize == 0) {
5397		/* zero block, indicate that there's nothing to write */
5398		zio_data_buf_free(cdata, len);
5399		l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
5400		l2hdr->b_asize = 0;
5401		l2hdr->b_tmp_cdata = NULL;
5402		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
5403		return (B_TRUE);
5404	}
5405
5406	rounded = P2ROUNDUP(csize,
5407	    (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift);
5408	if (rounded < len) {
5409		/*
5410		 * Compression succeeded; we'll keep the cdata around for
5411		 * writing and release it afterwards.
5412		 */
5413		if (rounded > csize) {
5414			bzero((char *)cdata + csize, rounded - csize);
5415			csize = rounded;
5416		}
5417		l2hdr->b_compress = ZIO_COMPRESS_LZ4;
5418		l2hdr->b_asize = csize;
5419		l2hdr->b_tmp_cdata = cdata;
5420		ARCSTAT_BUMP(arcstat_l2_compress_successes);
5421		return (B_TRUE);
5422	} else {
5423		/*
5424		 * Compression did not save any space after sector rounding;
5425		 * release the compressed copy and leave l2hdr unmodified.
5426		 */
5427		zio_data_buf_free(cdata, len);
5428		ARCSTAT_BUMP(arcstat_l2_compress_failures);
5429		return (B_FALSE);
5430	}
5431}
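/*
 * Worked example for the size check above (illustrative only): on a
 * 512-byte-sector device (ashift 9), an 8192-byte buffer that LZ4 squeezes
 * to 5000 bytes is rounded up to 5120 bytes; 5120 < 8192, so the compressed
 * copy is kept and its final 120 bytes are zero-filled.  Had it only
 * compressed to 7700 bytes, the rounded size would be 8192, no sector would
 * be saved, and the uncompressed buffer would be written instead.
 */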
5432
5433/*
5434 * Decompresses a zio read back from an l2arc device. On success, the
5435 * underlying zio's io_data buffer is overwritten by the uncompressed
5436 * version. On decompression error (corrupt compressed stream), the
5437 * zio->io_error value is set to signal an I/O error.
5438 *
5439 * Note that the compressed data stream is not checksummed, so if the
5440 * underlying device is experiencing data corruption we may feed corrupt
5441 * data to the decompressor; the decompressor therefore needs to be
5442 * able to handle this situation (LZ4 does).
5443 */
5444static void
5445l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
5446{
5447	ASSERT(L2ARC_IS_VALID_COMPRESS(c));
5448
5449	if (zio->io_error != 0) {
5450		/*
5451		 * An I/O error has occurred; just restore the original I/O
5452		 * size in preparation for a main pool read.
5453		 */
5454		zio->io_orig_size = zio->io_size = hdr->b_size;
5455		return;
5456	}
5457
5458	if (c == ZIO_COMPRESS_EMPTY) {
5459		/*
5460		 * An empty buffer results in a null zio, which means we
5461		 * need to fill its io_data after we're done restoring the
5462		 * buffer's contents.
5463		 */
5464		ASSERT(hdr->b_buf != NULL);
5465		bzero(hdr->b_buf->b_data, hdr->b_size);
5466		zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
5467	} else {
5468		ASSERT(zio->io_data != NULL);
5469		/*
5470		 * We copy the compressed data from the start of the arc buffer
5471		 * (the zio_read will have pulled in only what we need, the
5472		 * rest is garbage which we will overwrite at decompression)
5473		 * and then decompress back to the ARC data buffer. This way we
5474		 * can minimize copying by simply decompressing back over the
5475		 * original compressed data (rather than decompressing to an
5476		 * aux buffer and then copying back the uncompressed buffer,
5477		 * which is likely to be much larger).
5478		 */
5479		uint64_t csize;
5480		void *cdata;
5481
5482		csize = zio->io_size;
5483		cdata = zio_data_buf_alloc(csize);
5484		bcopy(zio->io_data, cdata, csize);
5485		if (zio_decompress_data(c, cdata, zio->io_data, csize,
5486		    hdr->b_size) != 0)
5487			zio->io_error = EIO;
5488		zio_data_buf_free(cdata, csize);
5489	}
5490
5491	/* Restore the expected uncompressed IO size. */
5492	zio->io_orig_size = zio->io_size = hdr->b_size;
5493}
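/*
 * Illustrative sketch of the copy-then-decompress-in-place pattern used
 * above (kept out of the build; decompress_fn stands in for
 * zio_decompress_data() and the error handling is simplified):
 */
#if 0
#include <stdlib.h>
#include <string.h>

typedef int (*decompress_fn)(const void *src, void *dst, size_t srclen,
    size_t dstlen);

/*
 * `buf` holds `csize` bytes of compressed data at its start and is at least
 * `usize` bytes long.  Copy the compressed bytes aside, then decompress back
 * over `buf`, so no second full-size destination buffer is needed.
 */
static int
decompress_in_place(void *buf, size_t csize, size_t usize, decompress_fn fn)
{
	void *cdata = malloc(csize);
	int err;

	if (cdata == NULL)
		return (-1);
	memcpy(cdata, buf, csize);
	err = fn(cdata, buf, csize, usize);
	free(cdata);
	return (err);
}
#endif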
5494
5495/*
5496 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5497 * This buffer serves as a temporary holder of compressed data while
5498 * the buffer entry is being written to an l2arc device. Once that is
5499 * done, we can dispose of it.
5500 */
5501static void
5502l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
5503{
5504	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
5505
5506	ASSERT(L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress));
5507	if (l2hdr->b_compress != ZIO_COMPRESS_EMPTY) {
5508		/*
5509		 * If the data was compressed, then we've allocated a
5510		 * temporary buffer for it, so now we need to release it.
5511		 */
5512		ASSERT(l2hdr->b_tmp_cdata != NULL);
5513		zio_data_buf_free(l2hdr->b_tmp_cdata, hdr->b_size);
5514		l2hdr->b_tmp_cdata = NULL;
5515	} else {
5516		ASSERT(l2hdr->b_tmp_cdata == NULL);
5517	}
5518}
5519
5520/*
5521 * This thread feeds the L2ARC at regular intervals.  This is the beating
5522 * heart of the L2ARC.
5523 */
5524static void
5525l2arc_feed_thread(void *dummy __unused)
5526{
5527	callb_cpr_t cpr;
5528	l2arc_dev_t *dev;
5529	spa_t *spa;
5530	uint64_t size, wrote;
5531	clock_t begin, next = ddi_get_lbolt();
5532	boolean_t headroom_boost = B_FALSE;
5533
5534	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5535
5536	mutex_enter(&l2arc_feed_thr_lock);
5537
5538	while (l2arc_thread_exit == 0) {
5539		CALLB_CPR_SAFE_BEGIN(&cpr);
5540		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
5541		    next - ddi_get_lbolt());
5542		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
5543		next = ddi_get_lbolt() + hz;
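		/*
		 * Descriptive note: lbolt is measured in clock ticks and hz
		 * is ticks per second, so this sets a fallback pace of one
		 * second between wakeups; l2arc_write_interval() below picks
		 * the real interval whenever a feed cycle actually runs.
		 */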
5544
5545		/*
5546		 * Quick check for L2ARC devices.
5547		 */
5548		mutex_enter(&l2arc_dev_mtx);
5549		if (l2arc_ndev == 0) {
5550			mutex_exit(&l2arc_dev_mtx);
5551			continue;
5552		}
5553		mutex_exit(&l2arc_dev_mtx);
5554		begin = ddi_get_lbolt();
5555
5556		/*
5557		 * This selects the next l2arc device to write to, and in
5558		 * doing so the next spa to feed from: dev->l2ad_spa.  This
5559		 * will return NULL if there are now no l2arc devices or if
5560		 * they are all faulted.
5561		 *
5562		 * If a device is returned, its spa's config lock is also
5563		 * held to prevent device removal.  l2arc_dev_get_next()
5564		 * will grab and release l2arc_dev_mtx.
5565		 */
5566		if ((dev = l2arc_dev_get_next()) == NULL)
5567			continue;
5568
5569		spa = dev->l2ad_spa;
5570		ASSERT(spa != NULL);
5571
5572		/*
5573		 * If the pool is read-only then force the feed thread to
5574		 * sleep a little longer.
5575		 */
5576		if (!spa_writeable(spa)) {
5577			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5578			spa_config_exit(spa, SCL_L2ARC, dev);
5579			continue;
5580		}
5581
5582		/*
5583		 * Avoid contributing to memory pressure.
5584		 */
5585		if (arc_reclaim_needed()) {
5586			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5587			spa_config_exit(spa, SCL_L2ARC, dev);
5588			continue;
5589		}
5590
5591		ARCSTAT_BUMP(arcstat_l2_feeds);
5592
5593		size = l2arc_write_size();
5594
5595		/*
5596		 * Evict L2ARC buffers that will be overwritten.
5597		 */
5598		l2arc_evict(dev, size, B_FALSE);
5599
5600		/*
5601		 * Write ARC buffers.
5602		 */
5603		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5604
5605		/*
5606		 * Calculate interval between writes.
5607		 */
5608		next = l2arc_write_interval(begin, size, wrote);
5609		spa_config_exit(spa, SCL_L2ARC, dev);
5610	}
5611
5612	l2arc_thread_exit = 0;
5613	cv_broadcast(&l2arc_feed_thr_cv);
5614	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
5615	thread_exit();
5616}
5617
5618boolean_t
5619l2arc_vdev_present(vdev_t *vd)
5620{
5621	l2arc_dev_t *dev;
5622
5623	mutex_enter(&l2arc_dev_mtx);
5624	for (dev = list_head(l2arc_dev_list); dev != NULL;
5625	    dev = list_next(l2arc_dev_list, dev)) {
5626		if (dev->l2ad_vdev == vd)
5627			break;
5628	}
5629	mutex_exit(&l2arc_dev_mtx);
5630
5631	return (dev != NULL);
5632}
5633
5634/*
5635 * Add a vdev for use by the L2ARC.  By this point the spa has already
5636 * validated the vdev and opened it.
5637 */
5638void
5639l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5640{
5641	l2arc_dev_t *adddev;
5642
5643	ASSERT(!l2arc_vdev_present(vd));
5644
5645	vdev_ashift_optimize(vd);
5646
5647	/*
5648	 * Create a new l2arc device entry.
5649	 */
5650	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5651	adddev->l2ad_spa = spa;
5652	adddev->l2ad_vdev = vd;
5653	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5654	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5655	adddev->l2ad_hand = adddev->l2ad_start;
5656	adddev->l2ad_evict = adddev->l2ad_start;
5657	adddev->l2ad_first = B_TRUE;
5658	adddev->l2ad_writing = B_FALSE;
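	/*
	 * Descriptive note: the usable region deliberately skips the front
	 * of the device (VDEV_LABEL_START_SIZE covers the leading vdev
	 * labels and boot block), and l2ad_end is derived from the vdev's
	 * minimum asize so the hand never runs past space the vdev is
	 * guaranteed to have.
	 */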
5659
5660	/*
5661	 * This is a list of all ARC buffers that are still valid on the
5662	 * device.
5663	 */
5664	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5665	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5666	    offsetof(arc_buf_hdr_t, b_l2node));
5667
5668	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5669
5670	/*
5671	 * Add device to global list
5672	 */
5673	mutex_enter(&l2arc_dev_mtx);
5674	list_insert_head(l2arc_dev_list, adddev);
5675	atomic_inc_64(&l2arc_ndev);
5676	mutex_exit(&l2arc_dev_mtx);
5677}
5678
5679/*
5680 * Remove a vdev from the L2ARC.
5681 */
5682void
5683l2arc_remove_vdev(vdev_t *vd)
5684{
5685	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5686
5687	/*
5688	 * Find the device by vdev
5689	 */
5690	mutex_enter(&l2arc_dev_mtx);
5691	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5692		nextdev = list_next(l2arc_dev_list, dev);
5693		if (vd == dev->l2ad_vdev) {
5694			remdev = dev;
5695			break;
5696		}
5697	}
5698	ASSERT(remdev != NULL);
5699
5700	/*
5701	 * Remove device from global list
5702	 */
5703	list_remove(l2arc_dev_list, remdev);
5704	l2arc_dev_last = NULL;		/* may have been invalidated */
5705	atomic_dec_64(&l2arc_ndev);
5706	mutex_exit(&l2arc_dev_mtx);
5707
5708	/*
5709	 * Clear all buflists and ARC references.  L2ARC device flush.
5710	 */
5711	l2arc_evict(remdev, 0, B_TRUE);
5712	list_destroy(remdev->l2ad_buflist);
5713	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5714	kmem_free(remdev, sizeof (l2arc_dev_t));
5715}
5716
5717void
5718l2arc_init(void)
5719{
5720	l2arc_thread_exit = 0;
5721	l2arc_ndev = 0;
5722	l2arc_writes_sent = 0;
5723	l2arc_writes_done = 0;
5724
5725	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5726	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5727	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5728	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5729	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5730
5731	l2arc_dev_list = &L2ARC_dev_list;
5732	l2arc_free_on_write = &L2ARC_free_on_write;
5733	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5734	    offsetof(l2arc_dev_t, l2ad_node));
5735	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5736	    offsetof(l2arc_data_free_t, l2df_list_node));
5737}
5738
5739void
5740l2arc_fini(void)
5741{
5742	/*
5743	 * This is called from dmu_fini(), which is called from spa_fini().
5744	 * Because of this, we can assume that all l2arc devices have
5745	 * already been removed when the pools themselves were removed.
5746	 */
5747
5748	l2arc_do_free_on_write();
5749
5750	mutex_destroy(&l2arc_feed_thr_lock);
5751	cv_destroy(&l2arc_feed_thr_cv);
5752	mutex_destroy(&l2arc_dev_mtx);
5753	mutex_destroy(&l2arc_buflist_mtx);
5754	mutex_destroy(&l2arc_free_on_write_mtx);
5755
5756	list_destroy(l2arc_dev_list);
5757	list_destroy(l2arc_free_on_write);
5758}
5759
5760void
5761l2arc_start(void)
5762{
5763	if (!(spa_mode_global & FWRITE))
5764		return;
5765
5766	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5767	    TS_RUN, minclsyspri);
5768}
5769
5770void
5771l2arc_stop(void)
5772{
5773	if (!(spa_mode_global & FWRITE))
5774		return;
5775
5776	mutex_enter(&l2arc_feed_thr_lock);
5777	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
5778	l2arc_thread_exit = 1;
5779	while (l2arc_thread_exit != 0)
5780		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5781	mutex_exit(&l2arc_feed_thr_lock);
5782}
5783