arc.c revision 274625
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
24 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
25 * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
26 */
27
28/*
29 * DVA-based Adjustable Replacement Cache
30 *
31 * While much of the theory of operation used here is
32 * based on the self-tuning, low overhead replacement cache
33 * presented by Megiddo and Modha at FAST 2003, there are some
34 * significant differences:
35 *
36 * 1. The Megiddo and Modha model assumes any page is evictable.
37 * Pages in its cache cannot be "locked" into memory.  This makes
38 * the eviction algorithm simple: evict the last page in the list.
39 * This also make the performance characteristics easy to reason
40 * about.  Our cache is not so simple.  At any given moment, some
41 * subset of the blocks in the cache are un-evictable because we
42 * have handed out a reference to them.  Blocks are only evictable
43 * when there are no external references active.  This makes
44 * eviction far more problematic:  we choose to evict the evictable
45 * blocks that are the "lowest" in the list.
46 *
47 * There are times when it is not possible to evict the requested
48 * space.  In these circumstances we are unable to adjust the cache
49 * size.  To prevent the cache growing unbounded at these times we
50 * implement a "cache throttle" that slows the flow of new data
51 * into the cache until we can make space available.
52 *
53 * 2. The Megiddo and Modha model assumes a fixed cache size.
54 * Pages are evicted when the cache is full and there is a cache
55 * miss.  Our model has a variable sized cache.  It grows with
56 * high use, but also tries to react to memory pressure from the
57 * operating system: decreasing its size when system memory is
58 * tight.
59 *
60 * 3. The Megiddo and Modha model assumes a fixed page size. All
61 * elements of the cache are therefore exactly the same size.  So
62 * when adjusting the cache size following a cache miss, its simply
63 * a matter of choosing a single page to evict.  In our model, we
64 * have variable sized cache blocks (rangeing from 512 bytes to
65 * 128K bytes).  We therefore choose a set of blocks to evict to make
66 * space for a cache miss that approximates as closely as possible
67 * the space used by the new block.
68 *
69 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
70 * by N. Megiddo & D. Modha, FAST 2003
71 */
72
73/*
74 * The locking model:
75 *
76 * A new reference to a cache buffer can be obtained in two
77 * ways: 1) via a hash table lookup using the DVA as a key,
78 * or 2) via one of the ARC lists.  The arc_read() interface
79 * uses method 1, while the internal arc algorithms for
80 * adjusting the cache use method 2.  We therefore provide two
81 * types of locks: 1) the hash table lock array, and 2) the
82 * arc list locks.
83 *
84 * Buffers do not have their own mutexs, rather they rely on the
85 * hash table mutexs for the bulk of their protection (i.e. most
86 * fields in the arc_buf_hdr_t are protected by these mutexs).
87 *
88 * buf_hash_find() returns the appropriate mutex (held) when it
89 * locates the requested buffer in the hash table.  It returns
90 * NULL for the mutex if the buffer was not in the table.
91 *
92 * buf_hash_remove() expects the appropriate hash mutex to be
93 * already held before it is invoked.
94 *
95 * Each arc state also has a mutex which is used to protect the
96 * buffer list associated with the state.  When attempting to
97 * obtain a hash table lock while holding an arc list lock you
98 * must use: mutex_tryenter() to avoid deadlock.  Also note that
99 * the active state mutex must be held before the ghost state mutex.
100 *
101 * Arc buffers may have an associated eviction callback function.
102 * This function will be invoked prior to removing the buffer (e.g.
103 * in arc_do_user_evicts()).  Note however that the data associated
104 * with the buffer may be evicted prior to the callback.  The callback
105 * must be made with *no locks held* (to prevent deadlock).  Additionally,
106 * the users of callbacks must ensure that their private data is
107 * protected from simultaneous callbacks from arc_clear_callback()
108 * and arc_do_user_evicts().
109 *
110 * Note that the majority of the performance stats are manipulated
111 * with atomic operations.
112 *
113 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
114 *
115 *	- L2ARC buflist creation
116 *	- L2ARC buflist eviction
117 *	- L2ARC write completion, which walks L2ARC buflists
118 *	- ARC header destruction, as it removes from L2ARC buflists
119 *	- ARC header release, as it removes from L2ARC buflists
120 */
121
122#include <sys/spa.h>
123#include <sys/zio.h>
124#include <sys/zio_compress.h>
125#include <sys/zfs_context.h>
126#include <sys/arc.h>
127#include <sys/refcount.h>
128#include <sys/vdev.h>
129#include <sys/vdev_impl.h>
130#include <sys/dsl_pool.h>
131#ifdef _KERNEL
132#include <sys/dnlc.h>
133#endif
134#include <sys/callb.h>
135#include <sys/kstat.h>
136#include <sys/trim_map.h>
137#include <zfs_fletcher.h>
138#include <sys/sdt.h>
139
140#include <vm/vm_pageout.h>
141#include <machine/vmparam.h>
142
143#ifdef illumos
144#ifndef _KERNEL
145/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
146boolean_t arc_watch = B_FALSE;
147int arc_procfd;
148#endif
149#endif /* illumos */
150
151static kmutex_t		arc_reclaim_thr_lock;
152static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
153static uint8_t		arc_thread_exit;
154
155#define	ARC_REDUCE_DNLC_PERCENT	3
156uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
157
158typedef enum arc_reclaim_strategy {
159	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
160	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
161} arc_reclaim_strategy_t;
162
163/*
164 * The number of iterations through arc_evict_*() before we
165 * drop & reacquire the lock.
166 */
167int arc_evict_iterations = 100;
168
169/* number of seconds before growing cache again */
170static int		arc_grow_retry = 60;
171
172/* shift of arc_c for calculating both min and max arc_p */
173static int		arc_p_min_shift = 4;
174
175/* log2(fraction of arc to reclaim) */
176static int		arc_shrink_shift = 5;
177
178/*
179 * minimum lifespan of a prefetch block in clock ticks
180 * (initialized in arc_init())
181 */
182static int		arc_min_prefetch_lifespan;
183
184/*
185 * If this percent of memory is free, don't throttle.
186 */
187int arc_lotsfree_percent = 10;
188
189static int arc_dead;
190extern int zfs_prefetch_disable;
191
192/*
193 * The arc has filled available memory and has now warmed up.
194 */
195static boolean_t arc_warm;
196
197uint64_t zfs_arc_max;
198uint64_t zfs_arc_min;
199uint64_t zfs_arc_meta_limit = 0;
200int zfs_arc_grow_retry = 0;
201int zfs_arc_shrink_shift = 0;
202int zfs_arc_p_min_shift = 0;
203int zfs_disable_dup_eviction = 0;
204uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
205u_int zfs_arc_free_target = 0;
206
207static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
208
209#ifdef _KERNEL
210static void
211arc_free_target_init(void *unused __unused)
212{
213
214	zfs_arc_free_target = vm_pageout_wakeup_thresh;
215}
216SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
217    arc_free_target_init, NULL);
218
219TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
220TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
221TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
222TUNABLE_QUAD("vfs.zfs.arc_average_blocksize", &zfs_arc_average_blocksize);
223TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
224SYSCTL_DECL(_vfs_zfs);
225SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
226    "Maximum ARC size");
227SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
228    "Minimum ARC size");
229SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
230    &zfs_arc_average_blocksize, 0,
231    "ARC average blocksize");
232SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
233    &arc_shrink_shift, 0,
234    "log2(fraction of arc to reclaim)");
235
236/*
237 * We don't have a tunable for arc_free_target due to the dependency on
238 * pagedaemon initialisation.
239 */
240SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
241    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
242    sysctl_vfs_zfs_arc_free_target, "IU",
243    "Desired number of free pages below which ARC triggers reclaim");
244
245static int
246sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
247{
248	u_int val;
249	int err;
250
251	val = zfs_arc_free_target;
252	err = sysctl_handle_int(oidp, &val, 0, req);
253	if (err != 0 || req->newptr == NULL)
254		return (err);
255
256	if (val < minfree)
257		return (EINVAL);
258	if (val > cnt.v_page_count)
259		return (EINVAL);
260
261	zfs_arc_free_target = val;
262
263	return (0);
264}
265#endif
266
267/*
268 * Note that buffers can be in one of 6 states:
269 *	ARC_anon	- anonymous (discussed below)
270 *	ARC_mru		- recently used, currently cached
271 *	ARC_mru_ghost	- recentely used, no longer in cache
272 *	ARC_mfu		- frequently used, currently cached
273 *	ARC_mfu_ghost	- frequently used, no longer in cache
274 *	ARC_l2c_only	- exists in L2ARC but not other states
275 * When there are no active references to the buffer, they are
276 * are linked onto a list in one of these arc states.  These are
277 * the only buffers that can be evicted or deleted.  Within each
278 * state there are multiple lists, one for meta-data and one for
279 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
280 * etc.) is tracked separately so that it can be managed more
281 * explicitly: favored over data, limited explicitly.
282 *
283 * Anonymous buffers are buffers that are not associated with
284 * a DVA.  These are buffers that hold dirty block copies
285 * before they are written to stable storage.  By definition,
286 * they are "ref'd" and are considered part of arc_mru
287 * that cannot be freed.  Generally, they will aquire a DVA
288 * as they are written and migrate onto the arc_mru list.
289 *
290 * The ARC_l2c_only state is for buffers that are in the second
291 * level ARC but no longer in any of the ARC_m* lists.  The second
292 * level ARC itself may also contain buffers that are in any of
293 * the ARC_m* states - meaning that a buffer can exist in two
294 * places.  The reason for the ARC_l2c_only state is to keep the
295 * buffer header in the hash table, so that reads that hit the
296 * second level ARC benefit from these fast lookups.
297 */
298
299#define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
300struct arcs_lock {
301	kmutex_t	arcs_lock;
302#ifdef _KERNEL
303	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
304#endif
305};
306
307/*
308 * must be power of two for mask use to work
309 *
310 */
311#define ARC_BUFC_NUMDATALISTS		16
312#define ARC_BUFC_NUMMETADATALISTS	16
313#define ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
314
315typedef struct arc_state {
316	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
317	uint64_t arcs_size;	/* total amount of data in this state */
318	list_t	arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
319	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
320} arc_state_t;
321
322#define ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))
323
324/* The 6 states: */
325static arc_state_t ARC_anon;
326static arc_state_t ARC_mru;
327static arc_state_t ARC_mru_ghost;
328static arc_state_t ARC_mfu;
329static arc_state_t ARC_mfu_ghost;
330static arc_state_t ARC_l2c_only;
331
332typedef struct arc_stats {
333	kstat_named_t arcstat_hits;
334	kstat_named_t arcstat_misses;
335	kstat_named_t arcstat_demand_data_hits;
336	kstat_named_t arcstat_demand_data_misses;
337	kstat_named_t arcstat_demand_metadata_hits;
338	kstat_named_t arcstat_demand_metadata_misses;
339	kstat_named_t arcstat_prefetch_data_hits;
340	kstat_named_t arcstat_prefetch_data_misses;
341	kstat_named_t arcstat_prefetch_metadata_hits;
342	kstat_named_t arcstat_prefetch_metadata_misses;
343	kstat_named_t arcstat_mru_hits;
344	kstat_named_t arcstat_mru_ghost_hits;
345	kstat_named_t arcstat_mfu_hits;
346	kstat_named_t arcstat_mfu_ghost_hits;
347	kstat_named_t arcstat_allocated;
348	kstat_named_t arcstat_deleted;
349	kstat_named_t arcstat_stolen;
350	kstat_named_t arcstat_recycle_miss;
351	/*
352	 * Number of buffers that could not be evicted because the hash lock
353	 * was held by another thread.  The lock may not necessarily be held
354	 * by something using the same buffer, since hash locks are shared
355	 * by multiple buffers.
356	 */
357	kstat_named_t arcstat_mutex_miss;
358	/*
359	 * Number of buffers skipped because they have I/O in progress, are
360	 * indrect prefetch buffers that have not lived long enough, or are
361	 * not from the spa we're trying to evict from.
362	 */
363	kstat_named_t arcstat_evict_skip;
364	kstat_named_t arcstat_evict_l2_cached;
365	kstat_named_t arcstat_evict_l2_eligible;
366	kstat_named_t arcstat_evict_l2_ineligible;
367	kstat_named_t arcstat_hash_elements;
368	kstat_named_t arcstat_hash_elements_max;
369	kstat_named_t arcstat_hash_collisions;
370	kstat_named_t arcstat_hash_chains;
371	kstat_named_t arcstat_hash_chain_max;
372	kstat_named_t arcstat_p;
373	kstat_named_t arcstat_c;
374	kstat_named_t arcstat_c_min;
375	kstat_named_t arcstat_c_max;
376	kstat_named_t arcstat_size;
377	kstat_named_t arcstat_hdr_size;
378	kstat_named_t arcstat_data_size;
379	kstat_named_t arcstat_other_size;
380	kstat_named_t arcstat_l2_hits;
381	kstat_named_t arcstat_l2_misses;
382	kstat_named_t arcstat_l2_feeds;
383	kstat_named_t arcstat_l2_rw_clash;
384	kstat_named_t arcstat_l2_read_bytes;
385	kstat_named_t arcstat_l2_write_bytes;
386	kstat_named_t arcstat_l2_writes_sent;
387	kstat_named_t arcstat_l2_writes_done;
388	kstat_named_t arcstat_l2_writes_error;
389	kstat_named_t arcstat_l2_writes_hdr_miss;
390	kstat_named_t arcstat_l2_evict_lock_retry;
391	kstat_named_t arcstat_l2_evict_reading;
392	kstat_named_t arcstat_l2_free_on_write;
393	kstat_named_t arcstat_l2_abort_lowmem;
394	kstat_named_t arcstat_l2_cksum_bad;
395	kstat_named_t arcstat_l2_io_error;
396	kstat_named_t arcstat_l2_size;
397	kstat_named_t arcstat_l2_asize;
398	kstat_named_t arcstat_l2_hdr_size;
399	kstat_named_t arcstat_l2_compress_successes;
400	kstat_named_t arcstat_l2_compress_zeros;
401	kstat_named_t arcstat_l2_compress_failures;
402	kstat_named_t arcstat_l2_write_trylock_fail;
403	kstat_named_t arcstat_l2_write_passed_headroom;
404	kstat_named_t arcstat_l2_write_spa_mismatch;
405	kstat_named_t arcstat_l2_write_in_l2;
406	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
407	kstat_named_t arcstat_l2_write_not_cacheable;
408	kstat_named_t arcstat_l2_write_full;
409	kstat_named_t arcstat_l2_write_buffer_iter;
410	kstat_named_t arcstat_l2_write_pios;
411	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
412	kstat_named_t arcstat_l2_write_buffer_list_iter;
413	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
414	kstat_named_t arcstat_memory_throttle_count;
415	kstat_named_t arcstat_duplicate_buffers;
416	kstat_named_t arcstat_duplicate_buffers_size;
417	kstat_named_t arcstat_duplicate_reads;
418} arc_stats_t;
419
420static arc_stats_t arc_stats = {
421	{ "hits",			KSTAT_DATA_UINT64 },
422	{ "misses",			KSTAT_DATA_UINT64 },
423	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
424	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
425	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
426	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
427	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
428	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
429	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
430	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
431	{ "mru_hits",			KSTAT_DATA_UINT64 },
432	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
433	{ "mfu_hits",			KSTAT_DATA_UINT64 },
434	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
435	{ "allocated",			KSTAT_DATA_UINT64 },
436	{ "deleted",			KSTAT_DATA_UINT64 },
437	{ "stolen",			KSTAT_DATA_UINT64 },
438	{ "recycle_miss",		KSTAT_DATA_UINT64 },
439	{ "mutex_miss",			KSTAT_DATA_UINT64 },
440	{ "evict_skip",			KSTAT_DATA_UINT64 },
441	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
442	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
443	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
444	{ "hash_elements",		KSTAT_DATA_UINT64 },
445	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
446	{ "hash_collisions",		KSTAT_DATA_UINT64 },
447	{ "hash_chains",		KSTAT_DATA_UINT64 },
448	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
449	{ "p",				KSTAT_DATA_UINT64 },
450	{ "c",				KSTAT_DATA_UINT64 },
451	{ "c_min",			KSTAT_DATA_UINT64 },
452	{ "c_max",			KSTAT_DATA_UINT64 },
453	{ "size",			KSTAT_DATA_UINT64 },
454	{ "hdr_size",			KSTAT_DATA_UINT64 },
455	{ "data_size",			KSTAT_DATA_UINT64 },
456	{ "other_size",			KSTAT_DATA_UINT64 },
457	{ "l2_hits",			KSTAT_DATA_UINT64 },
458	{ "l2_misses",			KSTAT_DATA_UINT64 },
459	{ "l2_feeds",			KSTAT_DATA_UINT64 },
460	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
461	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
462	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
463	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
464	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
465	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
466	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
467	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
468	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
469	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
470	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
471	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
472	{ "l2_io_error",		KSTAT_DATA_UINT64 },
473	{ "l2_size",			KSTAT_DATA_UINT64 },
474	{ "l2_asize",			KSTAT_DATA_UINT64 },
475	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
476	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
477	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
478	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
479	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
480	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
481	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
482	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
483	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
484	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
485	{ "l2_write_full",		KSTAT_DATA_UINT64 },
486	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
487	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
488	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
489	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
490	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
491	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
492	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
493	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
494	{ "duplicate_reads",		KSTAT_DATA_UINT64 }
495};
496
497#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
498
499#define	ARCSTAT_INCR(stat, val) \
500	atomic_add_64(&arc_stats.stat.value.ui64, (val))
501
502#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
503#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
504
505#define	ARCSTAT_MAX(stat, val) {					\
506	uint64_t m;							\
507	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
508	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
509		continue;						\
510}
511
512#define	ARCSTAT_MAXSTAT(stat) \
513	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
514
515/*
516 * We define a macro to allow ARC hits/misses to be easily broken down by
517 * two separate conditions, giving a total of four different subtypes for
518 * each of hits and misses (so eight statistics total).
519 */
520#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
521	if (cond1) {							\
522		if (cond2) {						\
523			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
524		} else {						\
525			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
526		}							\
527	} else {							\
528		if (cond2) {						\
529			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
530		} else {						\
531			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
532		}							\
533	}
534
535kstat_t			*arc_ksp;
536static arc_state_t	*arc_anon;
537static arc_state_t	*arc_mru;
538static arc_state_t	*arc_mru_ghost;
539static arc_state_t	*arc_mfu;
540static arc_state_t	*arc_mfu_ghost;
541static arc_state_t	*arc_l2c_only;
542
543/*
544 * There are several ARC variables that are critical to export as kstats --
545 * but we don't want to have to grovel around in the kstat whenever we wish to
546 * manipulate them.  For these variables, we therefore define them to be in
547 * terms of the statistic variable.  This assures that we are not introducing
548 * the possibility of inconsistency by having shadow copies of the variables,
549 * while still allowing the code to be readable.
550 */
551#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
552#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
553#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
554#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
555#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
556
557#define	L2ARC_IS_VALID_COMPRESS(_c_) \
558	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
559
560static int		arc_no_grow;	/* Don't try to grow cache size */
561static uint64_t		arc_tempreserve;
562static uint64_t		arc_loaned_bytes;
563static uint64_t		arc_meta_used;
564static uint64_t		arc_meta_limit;
565static uint64_t		arc_meta_max = 0;
566SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RD, &arc_meta_used, 0,
567    "ARC metadata used");
568SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RW, &arc_meta_limit, 0,
569    "ARC metadata limit");
570
571typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
572
573typedef struct arc_callback arc_callback_t;
574
575struct arc_callback {
576	void			*acb_private;
577	arc_done_func_t		*acb_done;
578	arc_buf_t		*acb_buf;
579	zio_t			*acb_zio_dummy;
580	arc_callback_t		*acb_next;
581};
582
583typedef struct arc_write_callback arc_write_callback_t;
584
585struct arc_write_callback {
586	void		*awcb_private;
587	arc_done_func_t	*awcb_ready;
588	arc_done_func_t	*awcb_physdone;
589	arc_done_func_t	*awcb_done;
590	arc_buf_t	*awcb_buf;
591};
592
593struct arc_buf_hdr {
594	/* protected by hash lock */
595	dva_t			b_dva;
596	uint64_t		b_birth;
597	uint64_t		b_cksum0;
598
599	kmutex_t		b_freeze_lock;
600	zio_cksum_t		*b_freeze_cksum;
601	void			*b_thawed;
602
603	arc_buf_hdr_t		*b_hash_next;
604	arc_buf_t		*b_buf;
605	uint32_t		b_flags;
606	uint32_t		b_datacnt;
607
608	arc_callback_t		*b_acb;
609	kcondvar_t		b_cv;
610
611	/* immutable */
612	arc_buf_contents_t	b_type;
613	uint64_t		b_size;
614	uint64_t		b_spa;
615
616	/* protected by arc state mutex */
617	arc_state_t		*b_state;
618	list_node_t		b_arc_node;
619
620	/* updated atomically */
621	clock_t			b_arc_access;
622
623	/* self protecting */
624	refcount_t		b_refcnt;
625
626	l2arc_buf_hdr_t		*b_l2hdr;
627	list_node_t		b_l2node;
628};
629
630static arc_buf_t *arc_eviction_list;
631static kmutex_t arc_eviction_mtx;
632static arc_buf_hdr_t arc_eviction_hdr;
633static void arc_get_data_buf(arc_buf_t *buf);
634static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
635static int arc_evict_needed(arc_buf_contents_t type);
636static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
637#ifdef illumos
638static void arc_buf_watch(arc_buf_t *buf);
639#endif /* illumos */
640
641static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
642
643#define	GHOST_STATE(state)	\
644	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
645	(state) == arc_l2c_only)
646
647/*
648 * Private ARC flags.  These flags are private ARC only flags that will show up
649 * in b_flags in the arc_hdr_buf_t.  Some flags are publicly declared, and can
650 * be passed in as arc_flags in things like arc_read.  However, these flags
651 * should never be passed and should only be set by ARC code.  When adding new
652 * public flags, make sure not to smash the private ones.
653 */
654
655#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
656#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
657#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
658#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
659#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
660#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
661#define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
662#define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
663#define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
664#define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */
665
666#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
667#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
668#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
669#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_PREFETCH)
670#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
671#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
672#define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
673#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_L2CACHE)
674#define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
675				    (hdr)->b_l2hdr != NULL)
676#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
677#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
678#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)
679
680/*
681 * Other sizes
682 */
683
684#define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
685#define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
686
687/*
688 * Hash table routines
689 */
690
691#define	HT_LOCK_PAD	CACHE_LINE_SIZE
692
693struct ht_lock {
694	kmutex_t	ht_lock;
695#ifdef _KERNEL
696	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
697#endif
698};
699
700#define	BUF_LOCKS 256
701typedef struct buf_hash_table {
702	uint64_t ht_mask;
703	arc_buf_hdr_t **ht_table;
704	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
705} buf_hash_table_t;
706
707static buf_hash_table_t buf_hash_table;
708
709#define	BUF_HASH_INDEX(spa, dva, birth) \
710	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
711#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
712#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
713#define	HDR_LOCK(hdr) \
714	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
715
716uint64_t zfs_crc64_table[256];
717
718/*
719 * Level 2 ARC
720 */
721
722#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
723#define	L2ARC_HEADROOM		2			/* num of writes */
724/*
725 * If we discover during ARC scan any buffers to be compressed, we boost
726 * our headroom for the next scanning cycle by this percentage multiple.
727 */
728#define	L2ARC_HEADROOM_BOOST	200
729#define	L2ARC_FEED_SECS		1		/* caching interval secs */
730#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
731
732#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
733#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
734
735/* L2ARC Performance Tunables */
736uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
737uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
738uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
739uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
740uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
741uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
742boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
743boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
744boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
745
746SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
747    &l2arc_write_max, 0, "max write size");
748SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
749    &l2arc_write_boost, 0, "extra write during warmup");
750SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
751    &l2arc_headroom, 0, "number of dev writes");
752SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
753    &l2arc_feed_secs, 0, "interval seconds");
754SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
755    &l2arc_feed_min_ms, 0, "min interval milliseconds");
756
757SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
758    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
759SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
760    &l2arc_feed_again, 0, "turbo warmup");
761SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
762    &l2arc_norw, 0, "no reads during writes");
763
764SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
765    &ARC_anon.arcs_size, 0, "size of anonymous state");
766SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
767    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
768SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
769    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");
770
771SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
772    &ARC_mru.arcs_size, 0, "size of mru state");
773SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
774    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
775SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
776    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
777
778SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
779    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
780SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
781    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
782    "size of metadata in mru ghost state");
783SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
784    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
785    "size of data in mru ghost state");
786
787SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
788    &ARC_mfu.arcs_size, 0, "size of mfu state");
789SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
790    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
791SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
792    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
793
794SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
795    &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
796SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
797    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
798    "size of metadata in mfu ghost state");
799SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
800    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
801    "size of data in mfu ghost state");
802
803SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
804    &ARC_l2c_only.arcs_size, 0, "size of mru state");
805
806/*
807 * L2ARC Internals
808 */
809typedef struct l2arc_dev {
810	vdev_t			*l2ad_vdev;	/* vdev */
811	spa_t			*l2ad_spa;	/* spa */
812	uint64_t		l2ad_hand;	/* next write location */
813	uint64_t		l2ad_start;	/* first addr on device */
814	uint64_t		l2ad_end;	/* last addr on device */
815	uint64_t		l2ad_evict;	/* last addr eviction reached */
816	boolean_t		l2ad_first;	/* first sweep through */
817	boolean_t		l2ad_writing;	/* currently writing */
818	list_t			*l2ad_buflist;	/* buffer list */
819	list_node_t		l2ad_node;	/* device list node */
820} l2arc_dev_t;
821
822static list_t L2ARC_dev_list;			/* device list */
823static list_t *l2arc_dev_list;			/* device list pointer */
824static kmutex_t l2arc_dev_mtx;			/* device list mutex */
825static l2arc_dev_t *l2arc_dev_last;		/* last device used */
826static kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
827static list_t L2ARC_free_on_write;		/* free after write buf list */
828static list_t *l2arc_free_on_write;		/* free after write list ptr */
829static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
830static uint64_t l2arc_ndev;			/* number of devices */
831
832typedef struct l2arc_read_callback {
833	arc_buf_t		*l2rcb_buf;		/* read buffer */
834	spa_t			*l2rcb_spa;		/* spa */
835	blkptr_t		l2rcb_bp;		/* original blkptr */
836	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
837	int			l2rcb_flags;		/* original flags */
838	enum zio_compress	l2rcb_compress;		/* applied compress */
839} l2arc_read_callback_t;
840
841typedef struct l2arc_write_callback {
842	l2arc_dev_t	*l2wcb_dev;		/* device info */
843	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
844} l2arc_write_callback_t;
845
846struct l2arc_buf_hdr {
847	/* protected by arc_buf_hdr  mutex */
848	l2arc_dev_t		*b_dev;		/* L2ARC device */
849	uint64_t		b_daddr;	/* disk address, offset byte */
850	/* compression applied to buffer data */
851	enum zio_compress	b_compress;
852	/* real alloc'd buffer size depending on b_compress applied */
853	int			b_asize;
854	/* temporary buffer holder for in-flight compressed data */
855	void			*b_tmp_cdata;
856};
857
858typedef struct l2arc_data_free {
859	/* protected by l2arc_free_on_write_mtx */
860	void		*l2df_data;
861	size_t		l2df_size;
862	void		(*l2df_func)(void *, size_t);
863	list_node_t	l2df_list_node;
864} l2arc_data_free_t;
865
866static kmutex_t l2arc_feed_thr_lock;
867static kcondvar_t l2arc_feed_thr_cv;
868static uint8_t l2arc_thread_exit;
869
870static void l2arc_read_done(zio_t *zio);
871static void l2arc_hdr_stat_add(void);
872static void l2arc_hdr_stat_remove(void);
873
874static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
875static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
876    enum zio_compress c);
877static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
878
879static uint64_t
880buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
881{
882	uint8_t *vdva = (uint8_t *)dva;
883	uint64_t crc = -1ULL;
884	int i;
885
886	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
887
888	for (i = 0; i < sizeof (dva_t); i++)
889		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
890
891	crc ^= (spa>>8) ^ birth;
892
893	return (crc);
894}
895
896#define	BUF_EMPTY(buf)						\
897	((buf)->b_dva.dva_word[0] == 0 &&			\
898	(buf)->b_dva.dva_word[1] == 0 &&			\
899	(buf)->b_cksum0 == 0)
900
901#define	BUF_EQUAL(spa, dva, birth, buf)				\
902	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
903	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
904	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
905
906static void
907buf_discard_identity(arc_buf_hdr_t *hdr)
908{
909	hdr->b_dva.dva_word[0] = 0;
910	hdr->b_dva.dva_word[1] = 0;
911	hdr->b_birth = 0;
912	hdr->b_cksum0 = 0;
913}
914
915static arc_buf_hdr_t *
916buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
917{
918	const dva_t *dva = BP_IDENTITY(bp);
919	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
920	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
921	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
922	arc_buf_hdr_t *buf;
923
924	mutex_enter(hash_lock);
925	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
926	    buf = buf->b_hash_next) {
927		if (BUF_EQUAL(spa, dva, birth, buf)) {
928			*lockp = hash_lock;
929			return (buf);
930		}
931	}
932	mutex_exit(hash_lock);
933	*lockp = NULL;
934	return (NULL);
935}
936
937/*
938 * Insert an entry into the hash table.  If there is already an element
939 * equal to elem in the hash table, then the already existing element
940 * will be returned and the new element will not be inserted.
941 * Otherwise returns NULL.
942 */
943static arc_buf_hdr_t *
944buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
945{
946	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
947	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
948	arc_buf_hdr_t *fbuf;
949	uint32_t i;
950
951	ASSERT(!DVA_IS_EMPTY(&buf->b_dva));
952	ASSERT(buf->b_birth != 0);
953	ASSERT(!HDR_IN_HASH_TABLE(buf));
954	*lockp = hash_lock;
955	mutex_enter(hash_lock);
956	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
957	    fbuf = fbuf->b_hash_next, i++) {
958		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
959			return (fbuf);
960	}
961
962	buf->b_hash_next = buf_hash_table.ht_table[idx];
963	buf_hash_table.ht_table[idx] = buf;
964	buf->b_flags |= ARC_IN_HASH_TABLE;
965
966	/* collect some hash table performance data */
967	if (i > 0) {
968		ARCSTAT_BUMP(arcstat_hash_collisions);
969		if (i == 1)
970			ARCSTAT_BUMP(arcstat_hash_chains);
971
972		ARCSTAT_MAX(arcstat_hash_chain_max, i);
973	}
974
975	ARCSTAT_BUMP(arcstat_hash_elements);
976	ARCSTAT_MAXSTAT(arcstat_hash_elements);
977
978	return (NULL);
979}
980
981static void
982buf_hash_remove(arc_buf_hdr_t *buf)
983{
984	arc_buf_hdr_t *fbuf, **bufp;
985	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
986
987	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
988	ASSERT(HDR_IN_HASH_TABLE(buf));
989
990	bufp = &buf_hash_table.ht_table[idx];
991	while ((fbuf = *bufp) != buf) {
992		ASSERT(fbuf != NULL);
993		bufp = &fbuf->b_hash_next;
994	}
995	*bufp = buf->b_hash_next;
996	buf->b_hash_next = NULL;
997	buf->b_flags &= ~ARC_IN_HASH_TABLE;
998
999	/* collect some hash table performance data */
1000	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1001
1002	if (buf_hash_table.ht_table[idx] &&
1003	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1004		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1005}
1006
1007/*
1008 * Global data structures and functions for the buf kmem cache.
1009 */
1010static kmem_cache_t *hdr_cache;
1011static kmem_cache_t *buf_cache;
1012
1013static void
1014buf_fini(void)
1015{
1016	int i;
1017
1018	kmem_free(buf_hash_table.ht_table,
1019	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
1020	for (i = 0; i < BUF_LOCKS; i++)
1021		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1022	kmem_cache_destroy(hdr_cache);
1023	kmem_cache_destroy(buf_cache);
1024}
1025
1026/*
1027 * Constructor callback - called when the cache is empty
1028 * and a new buf is requested.
1029 */
1030/* ARGSUSED */
1031static int
1032hdr_cons(void *vbuf, void *unused, int kmflag)
1033{
1034	arc_buf_hdr_t *buf = vbuf;
1035
1036	bzero(buf, sizeof (arc_buf_hdr_t));
1037	refcount_create(&buf->b_refcnt);
1038	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
1039	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1040	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1041
1042	return (0);
1043}
1044
1045/* ARGSUSED */
1046static int
1047buf_cons(void *vbuf, void *unused, int kmflag)
1048{
1049	arc_buf_t *buf = vbuf;
1050
1051	bzero(buf, sizeof (arc_buf_t));
1052	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1053	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1054
1055	return (0);
1056}
1057
1058/*
1059 * Destructor callback - called when a cached buf is
1060 * no longer required.
1061 */
1062/* ARGSUSED */
1063static void
1064hdr_dest(void *vbuf, void *unused)
1065{
1066	arc_buf_hdr_t *buf = vbuf;
1067
1068	ASSERT(BUF_EMPTY(buf));
1069	refcount_destroy(&buf->b_refcnt);
1070	cv_destroy(&buf->b_cv);
1071	mutex_destroy(&buf->b_freeze_lock);
1072	arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1073}
1074
1075/* ARGSUSED */
1076static void
1077buf_dest(void *vbuf, void *unused)
1078{
1079	arc_buf_t *buf = vbuf;
1080
1081	mutex_destroy(&buf->b_evict_lock);
1082	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1083}
1084
1085/*
1086 * Reclaim callback -- invoked when memory is low.
1087 */
1088/* ARGSUSED */
1089static void
1090hdr_recl(void *unused)
1091{
1092	dprintf("hdr_recl called\n");
1093	/*
1094	 * umem calls the reclaim func when we destroy the buf cache,
1095	 * which is after we do arc_fini().
1096	 */
1097	if (!arc_dead)
1098		cv_signal(&arc_reclaim_thr_cv);
1099}
1100
1101static void
1102buf_init(void)
1103{
1104	uint64_t *ct;
1105	uint64_t hsize = 1ULL << 12;
1106	int i, j;
1107
1108	/*
1109	 * The hash table is big enough to fill all of physical memory
1110	 * with an average block size of zfs_arc_average_blocksize (default 8K).
1111	 * By default, the table will take up
1112	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1113	 */
1114	while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
1115		hsize <<= 1;
1116retry:
1117	buf_hash_table.ht_mask = hsize - 1;
1118	buf_hash_table.ht_table =
1119	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1120	if (buf_hash_table.ht_table == NULL) {
1121		ASSERT(hsize > (1ULL << 8));
1122		hsize >>= 1;
1123		goto retry;
1124	}
1125
1126	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
1127	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
1128	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1129	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1130
1131	for (i = 0; i < 256; i++)
1132		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1133			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1134
1135	for (i = 0; i < BUF_LOCKS; i++) {
1136		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1137		    NULL, MUTEX_DEFAULT, NULL);
1138	}
1139}
1140
1141#define	ARC_MINTIME	(hz>>4) /* 62 ms */
1142
1143static void
1144arc_cksum_verify(arc_buf_t *buf)
1145{
1146	zio_cksum_t zc;
1147
1148	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1149		return;
1150
1151	mutex_enter(&buf->b_hdr->b_freeze_lock);
1152	if (buf->b_hdr->b_freeze_cksum == NULL ||
1153	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
1154		mutex_exit(&buf->b_hdr->b_freeze_lock);
1155		return;
1156	}
1157	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1158	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1159		panic("buffer modified while frozen!");
1160	mutex_exit(&buf->b_hdr->b_freeze_lock);
1161}
1162
1163static int
1164arc_cksum_equal(arc_buf_t *buf)
1165{
1166	zio_cksum_t zc;
1167	int equal;
1168
1169	mutex_enter(&buf->b_hdr->b_freeze_lock);
1170	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1171	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1172	mutex_exit(&buf->b_hdr->b_freeze_lock);
1173
1174	return (equal);
1175}
1176
1177static void
1178arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1179{
1180	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1181		return;
1182
1183	mutex_enter(&buf->b_hdr->b_freeze_lock);
1184	if (buf->b_hdr->b_freeze_cksum != NULL) {
1185		mutex_exit(&buf->b_hdr->b_freeze_lock);
1186		return;
1187	}
1188	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1189	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1190	    buf->b_hdr->b_freeze_cksum);
1191	mutex_exit(&buf->b_hdr->b_freeze_lock);
1192#ifdef illumos
1193	arc_buf_watch(buf);
1194#endif /* illumos */
1195}
1196
1197#ifdef illumos
1198#ifndef _KERNEL
1199typedef struct procctl {
1200	long cmd;
1201	prwatch_t prwatch;
1202} procctl_t;
1203#endif
1204
1205/* ARGSUSED */
1206static void
1207arc_buf_unwatch(arc_buf_t *buf)
1208{
1209#ifndef _KERNEL
1210	if (arc_watch) {
1211		int result;
1212		procctl_t ctl;
1213		ctl.cmd = PCWATCH;
1214		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1215		ctl.prwatch.pr_size = 0;
1216		ctl.prwatch.pr_wflags = 0;
1217		result = write(arc_procfd, &ctl, sizeof (ctl));
1218		ASSERT3U(result, ==, sizeof (ctl));
1219	}
1220#endif
1221}
1222
1223/* ARGSUSED */
1224static void
1225arc_buf_watch(arc_buf_t *buf)
1226{
1227#ifndef _KERNEL
1228	if (arc_watch) {
1229		int result;
1230		procctl_t ctl;
1231		ctl.cmd = PCWATCH;
1232		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1233		ctl.prwatch.pr_size = buf->b_hdr->b_size;
1234		ctl.prwatch.pr_wflags = WA_WRITE;
1235		result = write(arc_procfd, &ctl, sizeof (ctl));
1236		ASSERT3U(result, ==, sizeof (ctl));
1237	}
1238#endif
1239}
1240#endif /* illumos */
1241
1242void
1243arc_buf_thaw(arc_buf_t *buf)
1244{
1245	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1246		if (buf->b_hdr->b_state != arc_anon)
1247			panic("modifying non-anon buffer!");
1248		if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1249			panic("modifying buffer while i/o in progress!");
1250		arc_cksum_verify(buf);
1251	}
1252
1253	mutex_enter(&buf->b_hdr->b_freeze_lock);
1254	if (buf->b_hdr->b_freeze_cksum != NULL) {
1255		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1256		buf->b_hdr->b_freeze_cksum = NULL;
1257	}
1258
1259	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1260		if (buf->b_hdr->b_thawed)
1261			kmem_free(buf->b_hdr->b_thawed, 1);
1262		buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1263	}
1264
1265	mutex_exit(&buf->b_hdr->b_freeze_lock);
1266
1267#ifdef illumos
1268	arc_buf_unwatch(buf);
1269#endif /* illumos */
1270}
1271
1272void
1273arc_buf_freeze(arc_buf_t *buf)
1274{
1275	kmutex_t *hash_lock;
1276
1277	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1278		return;
1279
1280	hash_lock = HDR_LOCK(buf->b_hdr);
1281	mutex_enter(hash_lock);
1282
1283	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1284	    buf->b_hdr->b_state == arc_anon);
1285	arc_cksum_compute(buf, B_FALSE);
1286	mutex_exit(hash_lock);
1287
1288}
1289
1290static void
1291get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock)
1292{
1293	uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth);
1294
1295	if (ab->b_type == ARC_BUFC_METADATA)
1296		buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
1297	else {
1298		buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
1299		buf_hashid += ARC_BUFC_NUMMETADATALISTS;
1300	}
1301
1302	*list = &state->arcs_lists[buf_hashid];
1303	*lock = ARCS_LOCK(state, buf_hashid);
1304}
1305
1306
1307static void
1308add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1309{
1310	ASSERT(MUTEX_HELD(hash_lock));
1311
1312	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1313	    (ab->b_state != arc_anon)) {
1314		uint64_t delta = ab->b_size * ab->b_datacnt;
1315		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1316		list_t *list;
1317		kmutex_t *lock;
1318
1319		get_buf_info(ab, ab->b_state, &list, &lock);
1320		ASSERT(!MUTEX_HELD(lock));
1321		mutex_enter(lock);
1322		ASSERT(list_link_active(&ab->b_arc_node));
1323		list_remove(list, ab);
1324		if (GHOST_STATE(ab->b_state)) {
1325			ASSERT0(ab->b_datacnt);
1326			ASSERT3P(ab->b_buf, ==, NULL);
1327			delta = ab->b_size;
1328		}
1329		ASSERT(delta > 0);
1330		ASSERT3U(*size, >=, delta);
1331		atomic_add_64(size, -delta);
1332		mutex_exit(lock);
1333		/* remove the prefetch flag if we get a reference */
1334		if (ab->b_flags & ARC_PREFETCH)
1335			ab->b_flags &= ~ARC_PREFETCH;
1336	}
1337}
1338
1339static int
1340remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1341{
1342	int cnt;
1343	arc_state_t *state = ab->b_state;
1344
1345	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1346	ASSERT(!GHOST_STATE(state));
1347
1348	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1349	    (state != arc_anon)) {
1350		uint64_t *size = &state->arcs_lsize[ab->b_type];
1351		list_t *list;
1352		kmutex_t *lock;
1353
1354		get_buf_info(ab, state, &list, &lock);
1355		ASSERT(!MUTEX_HELD(lock));
1356		mutex_enter(lock);
1357		ASSERT(!list_link_active(&ab->b_arc_node));
1358		list_insert_head(list, ab);
1359		ASSERT(ab->b_datacnt > 0);
1360		atomic_add_64(size, ab->b_size * ab->b_datacnt);
1361		mutex_exit(lock);
1362	}
1363	return (cnt);
1364}
1365
1366/*
1367 * Move the supplied buffer to the indicated state.  The mutex
1368 * for the buffer must be held by the caller.
1369 */
1370static void
1371arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1372{
1373	arc_state_t *old_state = ab->b_state;
1374	int64_t refcnt = refcount_count(&ab->b_refcnt);
1375	uint64_t from_delta, to_delta;
1376	list_t *list;
1377	kmutex_t *lock;
1378
1379	ASSERT(MUTEX_HELD(hash_lock));
1380	ASSERT3P(new_state, !=, old_state);
1381	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1382	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1383	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1384
1385	from_delta = to_delta = ab->b_datacnt * ab->b_size;
1386
1387	/*
1388	 * If this buffer is evictable, transfer it from the
1389	 * old state list to the new state list.
1390	 */
1391	if (refcnt == 0) {
1392		if (old_state != arc_anon) {
1393			int use_mutex;
1394			uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1395
1396			get_buf_info(ab, old_state, &list, &lock);
1397			use_mutex = !MUTEX_HELD(lock);
1398			if (use_mutex)
1399				mutex_enter(lock);
1400
1401			ASSERT(list_link_active(&ab->b_arc_node));
1402			list_remove(list, ab);
1403
1404			/*
1405			 * If prefetching out of the ghost cache,
1406			 * we will have a non-zero datacnt.
1407			 */
1408			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1409				/* ghost elements have a ghost size */
1410				ASSERT(ab->b_buf == NULL);
1411				from_delta = ab->b_size;
1412			}
1413			ASSERT3U(*size, >=, from_delta);
1414			atomic_add_64(size, -from_delta);
1415
1416			if (use_mutex)
1417				mutex_exit(lock);
1418		}
1419		if (new_state != arc_anon) {
1420			int use_mutex;
1421			uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1422
1423			get_buf_info(ab, new_state, &list, &lock);
1424			use_mutex = !MUTEX_HELD(lock);
1425			if (use_mutex)
1426				mutex_enter(lock);
1427
1428			list_insert_head(list, ab);
1429
1430			/* ghost elements have a ghost size */
1431			if (GHOST_STATE(new_state)) {
1432				ASSERT(ab->b_datacnt == 0);
1433				ASSERT(ab->b_buf == NULL);
1434				to_delta = ab->b_size;
1435			}
1436			atomic_add_64(size, to_delta);
1437
1438			if (use_mutex)
1439				mutex_exit(lock);
1440		}
1441	}
1442
1443	ASSERT(!BUF_EMPTY(ab));
1444	if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1445		buf_hash_remove(ab);
1446
1447	/* adjust state sizes */
1448	if (to_delta)
1449		atomic_add_64(&new_state->arcs_size, to_delta);
1450	if (from_delta) {
1451		ASSERT3U(old_state->arcs_size, >=, from_delta);
1452		atomic_add_64(&old_state->arcs_size, -from_delta);
1453	}
1454	ab->b_state = new_state;
1455
1456	/* adjust l2arc hdr stats */
1457	if (new_state == arc_l2c_only)
1458		l2arc_hdr_stat_add();
1459	else if (old_state == arc_l2c_only)
1460		l2arc_hdr_stat_remove();
1461}
1462
1463void
1464arc_space_consume(uint64_t space, arc_space_type_t type)
1465{
1466	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1467
1468	switch (type) {
1469	case ARC_SPACE_DATA:
1470		ARCSTAT_INCR(arcstat_data_size, space);
1471		break;
1472	case ARC_SPACE_OTHER:
1473		ARCSTAT_INCR(arcstat_other_size, space);
1474		break;
1475	case ARC_SPACE_HDRS:
1476		ARCSTAT_INCR(arcstat_hdr_size, space);
1477		break;
1478	case ARC_SPACE_L2HDRS:
1479		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1480		break;
1481	}
1482
1483	atomic_add_64(&arc_meta_used, space);
1484	atomic_add_64(&arc_size, space);
1485}
1486
1487void
1488arc_space_return(uint64_t space, arc_space_type_t type)
1489{
1490	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1491
1492	switch (type) {
1493	case ARC_SPACE_DATA:
1494		ARCSTAT_INCR(arcstat_data_size, -space);
1495		break;
1496	case ARC_SPACE_OTHER:
1497		ARCSTAT_INCR(arcstat_other_size, -space);
1498		break;
1499	case ARC_SPACE_HDRS:
1500		ARCSTAT_INCR(arcstat_hdr_size, -space);
1501		break;
1502	case ARC_SPACE_L2HDRS:
1503		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1504		break;
1505	}
1506
1507	ASSERT(arc_meta_used >= space);
1508	if (arc_meta_max < arc_meta_used)
1509		arc_meta_max = arc_meta_used;
1510	atomic_add_64(&arc_meta_used, -space);
1511	ASSERT(arc_size >= space);
1512	atomic_add_64(&arc_size, -space);
1513}
1514
1515arc_buf_t *
1516arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1517{
1518	arc_buf_hdr_t *hdr;
1519	arc_buf_t *buf;
1520
1521	ASSERT3U(size, >, 0);
1522	hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1523	ASSERT(BUF_EMPTY(hdr));
1524	hdr->b_size = size;
1525	hdr->b_type = type;
1526	hdr->b_spa = spa_load_guid(spa);
1527	hdr->b_state = arc_anon;
1528	hdr->b_arc_access = 0;
1529	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1530	buf->b_hdr = hdr;
1531	buf->b_data = NULL;
1532	buf->b_efunc = NULL;
1533	buf->b_private = NULL;
1534	buf->b_next = NULL;
1535	hdr->b_buf = buf;
1536	arc_get_data_buf(buf);
1537	hdr->b_datacnt = 1;
1538	hdr->b_flags = 0;
1539	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1540	(void) refcount_add(&hdr->b_refcnt, tag);
1541
1542	return (buf);
1543}
1544
1545static char *arc_onloan_tag = "onloan";
1546
1547/*
1548 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1549 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1550 * buffers must be returned to the arc before they can be used by the DMU or
1551 * freed.
1552 */
1553arc_buf_t *
1554arc_loan_buf(spa_t *spa, int size)
1555{
1556	arc_buf_t *buf;
1557
1558	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1559
1560	atomic_add_64(&arc_loaned_bytes, size);
1561	return (buf);
1562}
1563
1564/*
1565 * Return a loaned arc buffer to the arc.
1566 */
1567void
1568arc_return_buf(arc_buf_t *buf, void *tag)
1569{
1570	arc_buf_hdr_t *hdr = buf->b_hdr;
1571
1572	ASSERT(buf->b_data != NULL);
1573	(void) refcount_add(&hdr->b_refcnt, tag);
1574	(void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1575
1576	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1577}
1578
1579/* Detach an arc_buf from a dbuf (tag) */
1580void
1581arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1582{
1583	arc_buf_hdr_t *hdr;
1584
1585	ASSERT(buf->b_data != NULL);
1586	hdr = buf->b_hdr;
1587	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1588	(void) refcount_remove(&hdr->b_refcnt, tag);
1589	buf->b_efunc = NULL;
1590	buf->b_private = NULL;
1591
1592	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1593}
1594
1595static arc_buf_t *
1596arc_buf_clone(arc_buf_t *from)
1597{
1598	arc_buf_t *buf;
1599	arc_buf_hdr_t *hdr = from->b_hdr;
1600	uint64_t size = hdr->b_size;
1601
1602	ASSERT(hdr->b_state != arc_anon);
1603
1604	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1605	buf->b_hdr = hdr;
1606	buf->b_data = NULL;
1607	buf->b_efunc = NULL;
1608	buf->b_private = NULL;
1609	buf->b_next = hdr->b_buf;
1610	hdr->b_buf = buf;
1611	arc_get_data_buf(buf);
1612	bcopy(from->b_data, buf->b_data, size);
1613
1614	/*
1615	 * This buffer already exists in the arc so create a duplicate
1616	 * copy for the caller.  If the buffer is associated with user data
1617	 * then track the size and number of duplicates.  These stats will be
1618	 * updated as duplicate buffers are created and destroyed.
1619	 */
1620	if (hdr->b_type == ARC_BUFC_DATA) {
1621		ARCSTAT_BUMP(arcstat_duplicate_buffers);
1622		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1623	}
1624	hdr->b_datacnt += 1;
1625	return (buf);
1626}
1627
1628void
1629arc_buf_add_ref(arc_buf_t *buf, void* tag)
1630{
1631	arc_buf_hdr_t *hdr;
1632	kmutex_t *hash_lock;
1633
1634	/*
1635	 * Check to see if this buffer is evicted.  Callers
1636	 * must verify b_data != NULL to know if the add_ref
1637	 * was successful.
1638	 */
1639	mutex_enter(&buf->b_evict_lock);
1640	if (buf->b_data == NULL) {
1641		mutex_exit(&buf->b_evict_lock);
1642		return;
1643	}
1644	hash_lock = HDR_LOCK(buf->b_hdr);
1645	mutex_enter(hash_lock);
1646	hdr = buf->b_hdr;
1647	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1648	mutex_exit(&buf->b_evict_lock);
1649
1650	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1651	add_reference(hdr, hash_lock, tag);
1652	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1653	arc_access(hdr, hash_lock);
1654	mutex_exit(hash_lock);
1655	ARCSTAT_BUMP(arcstat_hits);
1656	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1657	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1658	    data, metadata, hits);
1659}
1660
1661/*
1662 * Free the arc data buffer.  If it is an l2arc write in progress,
1663 * the buffer is placed on l2arc_free_on_write to be freed later.
1664 */
1665static void
1666arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1667{
1668	arc_buf_hdr_t *hdr = buf->b_hdr;
1669
1670	if (HDR_L2_WRITING(hdr)) {
1671		l2arc_data_free_t *df;
1672		df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1673		df->l2df_data = buf->b_data;
1674		df->l2df_size = hdr->b_size;
1675		df->l2df_func = free_func;
1676		mutex_enter(&l2arc_free_on_write_mtx);
1677		list_insert_head(l2arc_free_on_write, df);
1678		mutex_exit(&l2arc_free_on_write_mtx);
1679		ARCSTAT_BUMP(arcstat_l2_free_on_write);
1680	} else {
1681		free_func(buf->b_data, hdr->b_size);
1682	}
1683}
1684
1685/*
1686 * Free up buf->b_data and if 'remove' is set, then pull the
1687 * arc_buf_t off of the the arc_buf_hdr_t's list and free it.
1688 */
1689static void
1690arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
1691{
1692	arc_buf_t **bufp;
1693
1694	/* free up data associated with the buf */
1695	if (buf->b_data) {
1696		arc_state_t *state = buf->b_hdr->b_state;
1697		uint64_t size = buf->b_hdr->b_size;
1698		arc_buf_contents_t type = buf->b_hdr->b_type;
1699
1700		arc_cksum_verify(buf);
1701#ifdef illumos
1702		arc_buf_unwatch(buf);
1703#endif /* illumos */
1704
1705		if (!recycle) {
1706			if (type == ARC_BUFC_METADATA) {
1707				arc_buf_data_free(buf, zio_buf_free);
1708				arc_space_return(size, ARC_SPACE_DATA);
1709			} else {
1710				ASSERT(type == ARC_BUFC_DATA);
1711				arc_buf_data_free(buf, zio_data_buf_free);
1712				ARCSTAT_INCR(arcstat_data_size, -size);
1713				atomic_add_64(&arc_size, -size);
1714			}
1715		}
1716		if (list_link_active(&buf->b_hdr->b_arc_node)) {
1717			uint64_t *cnt = &state->arcs_lsize[type];
1718
1719			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1720			ASSERT(state != arc_anon);
1721
1722			ASSERT3U(*cnt, >=, size);
1723			atomic_add_64(cnt, -size);
1724		}
1725		ASSERT3U(state->arcs_size, >=, size);
1726		atomic_add_64(&state->arcs_size, -size);
1727		buf->b_data = NULL;
1728
1729		/*
1730		 * If we're destroying a duplicate buffer make sure
1731		 * that the appropriate statistics are updated.
1732		 */
1733		if (buf->b_hdr->b_datacnt > 1 &&
1734		    buf->b_hdr->b_type == ARC_BUFC_DATA) {
1735			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1736			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1737		}
1738		ASSERT(buf->b_hdr->b_datacnt > 0);
1739		buf->b_hdr->b_datacnt -= 1;
1740	}
1741
1742	/* only remove the buf if requested */
1743	if (!remove)
1744		return;
1745
1746	/* remove the buf from the hdr list */
1747	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1748		continue;
1749	*bufp = buf->b_next;
1750	buf->b_next = NULL;
1751
1752	ASSERT(buf->b_efunc == NULL);
1753
1754	/* clean up the buf */
1755	buf->b_hdr = NULL;
1756	kmem_cache_free(buf_cache, buf);
1757}
1758
1759static void
1760arc_hdr_destroy(arc_buf_hdr_t *hdr)
1761{
1762	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1763	ASSERT3P(hdr->b_state, ==, arc_anon);
1764	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1765	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1766
1767	if (l2hdr != NULL) {
1768		boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1769		/*
1770		 * To prevent arc_free() and l2arc_evict() from
1771		 * attempting to free the same buffer at the same time,
1772		 * a FREE_IN_PROGRESS flag is given to arc_free() to
1773		 * give it priority.  l2arc_evict() can't destroy this
1774		 * header while we are waiting on l2arc_buflist_mtx.
1775		 *
1776		 * The hdr may be removed from l2ad_buflist before we
1777		 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1778		 */
1779		if (!buflist_held) {
1780			mutex_enter(&l2arc_buflist_mtx);
1781			l2hdr = hdr->b_l2hdr;
1782		}
1783
1784		if (l2hdr != NULL) {
1785			trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
1786			    hdr->b_size, 0);
1787			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1788			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1789			ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1790			vdev_space_update(l2hdr->b_dev->l2ad_vdev,
1791			    -l2hdr->b_asize, 0, 0);
1792			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1793			if (hdr->b_state == arc_l2c_only)
1794				l2arc_hdr_stat_remove();
1795			hdr->b_l2hdr = NULL;
1796		}
1797
1798		if (!buflist_held)
1799			mutex_exit(&l2arc_buflist_mtx);
1800	}
1801
1802	if (!BUF_EMPTY(hdr)) {
1803		ASSERT(!HDR_IN_HASH_TABLE(hdr));
1804		buf_discard_identity(hdr);
1805	}
1806	while (hdr->b_buf) {
1807		arc_buf_t *buf = hdr->b_buf;
1808
1809		if (buf->b_efunc) {
1810			mutex_enter(&arc_eviction_mtx);
1811			mutex_enter(&buf->b_evict_lock);
1812			ASSERT(buf->b_hdr != NULL);
1813			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1814			hdr->b_buf = buf->b_next;
1815			buf->b_hdr = &arc_eviction_hdr;
1816			buf->b_next = arc_eviction_list;
1817			arc_eviction_list = buf;
1818			mutex_exit(&buf->b_evict_lock);
1819			mutex_exit(&arc_eviction_mtx);
1820		} else {
1821			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1822		}
1823	}
1824	if (hdr->b_freeze_cksum != NULL) {
1825		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1826		hdr->b_freeze_cksum = NULL;
1827	}
1828	if (hdr->b_thawed) {
1829		kmem_free(hdr->b_thawed, 1);
1830		hdr->b_thawed = NULL;
1831	}
1832
1833	ASSERT(!list_link_active(&hdr->b_arc_node));
1834	ASSERT3P(hdr->b_hash_next, ==, NULL);
1835	ASSERT3P(hdr->b_acb, ==, NULL);
1836	kmem_cache_free(hdr_cache, hdr);
1837}
1838
1839void
1840arc_buf_free(arc_buf_t *buf, void *tag)
1841{
1842	arc_buf_hdr_t *hdr = buf->b_hdr;
1843	int hashed = hdr->b_state != arc_anon;
1844
1845	ASSERT(buf->b_efunc == NULL);
1846	ASSERT(buf->b_data != NULL);
1847
1848	if (hashed) {
1849		kmutex_t *hash_lock = HDR_LOCK(hdr);
1850
1851		mutex_enter(hash_lock);
1852		hdr = buf->b_hdr;
1853		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1854
1855		(void) remove_reference(hdr, hash_lock, tag);
1856		if (hdr->b_datacnt > 1) {
1857			arc_buf_destroy(buf, FALSE, TRUE);
1858		} else {
1859			ASSERT(buf == hdr->b_buf);
1860			ASSERT(buf->b_efunc == NULL);
1861			hdr->b_flags |= ARC_BUF_AVAILABLE;
1862		}
1863		mutex_exit(hash_lock);
1864	} else if (HDR_IO_IN_PROGRESS(hdr)) {
1865		int destroy_hdr;
1866		/*
1867		 * We are in the middle of an async write.  Don't destroy
1868		 * this buffer unless the write completes before we finish
1869		 * decrementing the reference count.
1870		 */
1871		mutex_enter(&arc_eviction_mtx);
1872		(void) remove_reference(hdr, NULL, tag);
1873		ASSERT(refcount_is_zero(&hdr->b_refcnt));
1874		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1875		mutex_exit(&arc_eviction_mtx);
1876		if (destroy_hdr)
1877			arc_hdr_destroy(hdr);
1878	} else {
1879		if (remove_reference(hdr, NULL, tag) > 0)
1880			arc_buf_destroy(buf, FALSE, TRUE);
1881		else
1882			arc_hdr_destroy(hdr);
1883	}
1884}
1885
1886boolean_t
1887arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1888{
1889	arc_buf_hdr_t *hdr = buf->b_hdr;
1890	kmutex_t *hash_lock = HDR_LOCK(hdr);
1891	boolean_t no_callback = (buf->b_efunc == NULL);
1892
1893	if (hdr->b_state == arc_anon) {
1894		ASSERT(hdr->b_datacnt == 1);
1895		arc_buf_free(buf, tag);
1896		return (no_callback);
1897	}
1898
1899	mutex_enter(hash_lock);
1900	hdr = buf->b_hdr;
1901	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1902	ASSERT(hdr->b_state != arc_anon);
1903	ASSERT(buf->b_data != NULL);
1904
1905	(void) remove_reference(hdr, hash_lock, tag);
1906	if (hdr->b_datacnt > 1) {
1907		if (no_callback)
1908			arc_buf_destroy(buf, FALSE, TRUE);
1909	} else if (no_callback) {
1910		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1911		ASSERT(buf->b_efunc == NULL);
1912		hdr->b_flags |= ARC_BUF_AVAILABLE;
1913	}
1914	ASSERT(no_callback || hdr->b_datacnt > 1 ||
1915	    refcount_is_zero(&hdr->b_refcnt));
1916	mutex_exit(hash_lock);
1917	return (no_callback);
1918}
1919
1920int
1921arc_buf_size(arc_buf_t *buf)
1922{
1923	return (buf->b_hdr->b_size);
1924}
1925
1926/*
1927 * Called from the DMU to determine if the current buffer should be
1928 * evicted. In order to ensure proper locking, the eviction must be initiated
1929 * from the DMU. Return true if the buffer is associated with user data and
1930 * duplicate buffers still exist.
1931 */
1932boolean_t
1933arc_buf_eviction_needed(arc_buf_t *buf)
1934{
1935	arc_buf_hdr_t *hdr;
1936	boolean_t evict_needed = B_FALSE;
1937
1938	if (zfs_disable_dup_eviction)
1939		return (B_FALSE);
1940
1941	mutex_enter(&buf->b_evict_lock);
1942	hdr = buf->b_hdr;
1943	if (hdr == NULL) {
1944		/*
1945		 * We are in arc_do_user_evicts(); let that function
1946		 * perform the eviction.
1947		 */
1948		ASSERT(buf->b_data == NULL);
1949		mutex_exit(&buf->b_evict_lock);
1950		return (B_FALSE);
1951	} else if (buf->b_data == NULL) {
1952		/*
1953		 * We have already been added to the arc eviction list;
1954		 * recommend eviction.
1955		 */
1956		ASSERT3P(hdr, ==, &arc_eviction_hdr);
1957		mutex_exit(&buf->b_evict_lock);
1958		return (B_TRUE);
1959	}
1960
1961	if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1962		evict_needed = B_TRUE;
1963
1964	mutex_exit(&buf->b_evict_lock);
1965	return (evict_needed);
1966}
1967
1968/*
1969 * Evict buffers from list until we've removed the specified number of
1970 * bytes.  Move the removed buffers to the appropriate evict state.
1971 * If the recycle flag is set, then attempt to "recycle" a buffer:
1972 * - look for a buffer to evict that is `bytes' long.
1973 * - return the data block from this buffer rather than freeing it.
1974 * This flag is used by callers that are trying to make space for a
1975 * new buffer in a full arc cache.
1976 *
1977 * This function makes a "best effort".  It skips over any buffers
1978 * it can't get a hash_lock on, and so may not catch all candidates.
1979 * It may also return without evicting as much space as requested.
1980 */
1981static void *
1982arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1983    arc_buf_contents_t type)
1984{
1985	arc_state_t *evicted_state;
1986	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1987	int64_t bytes_remaining;
1988	arc_buf_hdr_t *ab, *ab_prev = NULL;
1989	list_t *evicted_list, *list, *evicted_list_start, *list_start;
1990	kmutex_t *lock, *evicted_lock;
1991	kmutex_t *hash_lock;
1992	boolean_t have_lock;
1993	void *stolen = NULL;
1994	arc_buf_hdr_t marker = { 0 };
1995	int count = 0;
1996	static int evict_metadata_offset, evict_data_offset;
1997	int i, idx, offset, list_count, lists;
1998
1999	ASSERT(state == arc_mru || state == arc_mfu);
2000
2001	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2002
2003	if (type == ARC_BUFC_METADATA) {
2004		offset = 0;
2005		list_count = ARC_BUFC_NUMMETADATALISTS;
2006		list_start = &state->arcs_lists[0];
2007		evicted_list_start = &evicted_state->arcs_lists[0];
2008		idx = evict_metadata_offset;
2009	} else {
2010		offset = ARC_BUFC_NUMMETADATALISTS;
2011		list_start = &state->arcs_lists[offset];
2012		evicted_list_start = &evicted_state->arcs_lists[offset];
2013		list_count = ARC_BUFC_NUMDATALISTS;
2014		idx = evict_data_offset;
2015	}
2016	bytes_remaining = evicted_state->arcs_lsize[type];
2017	lists = 0;
2018
2019evict_start:
2020	list = &list_start[idx];
2021	evicted_list = &evicted_list_start[idx];
2022	lock = ARCS_LOCK(state, (offset + idx));
2023	evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
2024
2025	mutex_enter(lock);
2026	mutex_enter(evicted_lock);
2027
2028	for (ab = list_tail(list); ab; ab = ab_prev) {
2029		ab_prev = list_prev(list, ab);
2030		bytes_remaining -= (ab->b_size * ab->b_datacnt);
2031		/* prefetch buffers have a minimum lifespan */
2032		if (HDR_IO_IN_PROGRESS(ab) ||
2033		    (spa && ab->b_spa != spa) ||
2034		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
2035		    ddi_get_lbolt() - ab->b_arc_access <
2036		    arc_min_prefetch_lifespan)) {
2037			skipped++;
2038			continue;
2039		}
2040		/* "lookahead" for better eviction candidate */
2041		if (recycle && ab->b_size != bytes &&
2042		    ab_prev && ab_prev->b_size == bytes)
2043			continue;
2044
2045		/* ignore markers */
2046		if (ab->b_spa == 0)
2047			continue;
2048
2049		/*
2050		 * It may take a long time to evict all the bufs requested.
2051		 * To avoid blocking all arc activity, periodically drop
2052		 * the arcs_mtx and give other threads a chance to run
2053		 * before reacquiring the lock.
2054		 *
2055		 * If we are looking for a buffer to recycle, we are in
2056		 * the hot code path, so don't sleep.
2057		 */
2058		if (!recycle && count++ > arc_evict_iterations) {
2059			list_insert_after(list, ab, &marker);
2060			mutex_exit(evicted_lock);
2061			mutex_exit(lock);
2062			kpreempt(KPREEMPT_SYNC);
2063			mutex_enter(lock);
2064			mutex_enter(evicted_lock);
2065			ab_prev = list_prev(list, &marker);
2066			list_remove(list, &marker);
2067			count = 0;
2068			continue;
2069		}
2070
2071		hash_lock = HDR_LOCK(ab);
2072		have_lock = MUTEX_HELD(hash_lock);
2073		if (have_lock || mutex_tryenter(hash_lock)) {
2074			ASSERT0(refcount_count(&ab->b_refcnt));
2075			ASSERT(ab->b_datacnt > 0);
2076			while (ab->b_buf) {
2077				arc_buf_t *buf = ab->b_buf;
2078				if (!mutex_tryenter(&buf->b_evict_lock)) {
2079					missed += 1;
2080					break;
2081				}
2082				if (buf->b_data) {
2083					bytes_evicted += ab->b_size;
2084					if (recycle && ab->b_type == type &&
2085					    ab->b_size == bytes &&
2086					    !HDR_L2_WRITING(ab)) {
2087						stolen = buf->b_data;
2088						recycle = FALSE;
2089					}
2090				}
2091				if (buf->b_efunc) {
2092					mutex_enter(&arc_eviction_mtx);
2093					arc_buf_destroy(buf,
2094					    buf->b_data == stolen, FALSE);
2095					ab->b_buf = buf->b_next;
2096					buf->b_hdr = &arc_eviction_hdr;
2097					buf->b_next = arc_eviction_list;
2098					arc_eviction_list = buf;
2099					mutex_exit(&arc_eviction_mtx);
2100					mutex_exit(&buf->b_evict_lock);
2101				} else {
2102					mutex_exit(&buf->b_evict_lock);
2103					arc_buf_destroy(buf,
2104					    buf->b_data == stolen, TRUE);
2105				}
2106			}
2107
2108			if (ab->b_l2hdr) {
2109				ARCSTAT_INCR(arcstat_evict_l2_cached,
2110				    ab->b_size);
2111			} else {
2112				if (l2arc_write_eligible(ab->b_spa, ab)) {
2113					ARCSTAT_INCR(arcstat_evict_l2_eligible,
2114					    ab->b_size);
2115				} else {
2116					ARCSTAT_INCR(
2117					    arcstat_evict_l2_ineligible,
2118					    ab->b_size);
2119				}
2120			}
2121
2122			if (ab->b_datacnt == 0) {
2123				arc_change_state(evicted_state, ab, hash_lock);
2124				ASSERT(HDR_IN_HASH_TABLE(ab));
2125				ab->b_flags |= ARC_IN_HASH_TABLE;
2126				ab->b_flags &= ~ARC_BUF_AVAILABLE;
2127				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
2128			}
2129			if (!have_lock)
2130				mutex_exit(hash_lock);
2131			if (bytes >= 0 && bytes_evicted >= bytes)
2132				break;
2133			if (bytes_remaining > 0) {
2134				mutex_exit(evicted_lock);
2135				mutex_exit(lock);
2136				idx  = ((idx + 1) & (list_count - 1));
2137				lists++;
2138				goto evict_start;
2139			}
2140		} else {
2141			missed += 1;
2142		}
2143	}
2144
2145	mutex_exit(evicted_lock);
2146	mutex_exit(lock);
2147
2148	idx  = ((idx + 1) & (list_count - 1));
2149	lists++;
2150
2151	if (bytes_evicted < bytes) {
2152		if (lists < list_count)
2153			goto evict_start;
2154		else
2155			dprintf("only evicted %lld bytes from %x",
2156			    (longlong_t)bytes_evicted, state);
2157	}
2158	if (type == ARC_BUFC_METADATA)
2159		evict_metadata_offset = idx;
2160	else
2161		evict_data_offset = idx;
2162
2163	if (skipped)
2164		ARCSTAT_INCR(arcstat_evict_skip, skipped);
2165
2166	if (missed)
2167		ARCSTAT_INCR(arcstat_mutex_miss, missed);
2168
2169	/*
2170	 * Note: we have just evicted some data into the ghost state,
2171	 * potentially putting the ghost size over the desired size.  Rather
2172	 * that evicting from the ghost list in this hot code path, leave
2173	 * this chore to the arc_reclaim_thread().
2174	 */
2175
2176	if (stolen)
2177		ARCSTAT_BUMP(arcstat_stolen);
2178	return (stolen);
2179}
2180
2181/*
2182 * Remove buffers from list until we've removed the specified number of
2183 * bytes.  Destroy the buffers that are removed.
2184 */
2185static void
2186arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
2187{
2188	arc_buf_hdr_t *ab, *ab_prev;
2189	arc_buf_hdr_t marker = { 0 };
2190	list_t *list, *list_start;
2191	kmutex_t *hash_lock, *lock;
2192	uint64_t bytes_deleted = 0;
2193	uint64_t bufs_skipped = 0;
2194	int count = 0;
2195	static int evict_offset;
2196	int list_count, idx = evict_offset;
2197	int offset, lists = 0;
2198
2199	ASSERT(GHOST_STATE(state));
2200
2201	/*
2202	 * data lists come after metadata lists
2203	 */
2204	list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
2205	list_count = ARC_BUFC_NUMDATALISTS;
2206	offset = ARC_BUFC_NUMMETADATALISTS;
2207
2208evict_start:
2209	list = &list_start[idx];
2210	lock = ARCS_LOCK(state, idx + offset);
2211
2212	mutex_enter(lock);
2213	for (ab = list_tail(list); ab; ab = ab_prev) {
2214		ab_prev = list_prev(list, ab);
2215		if (ab->b_type > ARC_BUFC_NUMTYPES)
2216			panic("invalid ab=%p", (void *)ab);
2217		if (spa && ab->b_spa != spa)
2218			continue;
2219
2220		/* ignore markers */
2221		if (ab->b_spa == 0)
2222			continue;
2223
2224		hash_lock = HDR_LOCK(ab);
2225		/* caller may be trying to modify this buffer, skip it */
2226		if (MUTEX_HELD(hash_lock))
2227			continue;
2228
2229		/*
2230		 * It may take a long time to evict all the bufs requested.
2231		 * To avoid blocking all arc activity, periodically drop
2232		 * the arcs_mtx and give other threads a chance to run
2233		 * before reacquiring the lock.
2234		 */
2235		if (count++ > arc_evict_iterations) {
2236			list_insert_after(list, ab, &marker);
2237			mutex_exit(lock);
2238			kpreempt(KPREEMPT_SYNC);
2239			mutex_enter(lock);
2240			ab_prev = list_prev(list, &marker);
2241			list_remove(list, &marker);
2242			count = 0;
2243			continue;
2244		}
2245		if (mutex_tryenter(hash_lock)) {
2246			ASSERT(!HDR_IO_IN_PROGRESS(ab));
2247			ASSERT(ab->b_buf == NULL);
2248			ARCSTAT_BUMP(arcstat_deleted);
2249			bytes_deleted += ab->b_size;
2250
2251			if (ab->b_l2hdr != NULL) {
2252				/*
2253				 * This buffer is cached on the 2nd Level ARC;
2254				 * don't destroy the header.
2255				 */
2256				arc_change_state(arc_l2c_only, ab, hash_lock);
2257				mutex_exit(hash_lock);
2258			} else {
2259				arc_change_state(arc_anon, ab, hash_lock);
2260				mutex_exit(hash_lock);
2261				arc_hdr_destroy(ab);
2262			}
2263
2264			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2265			if (bytes >= 0 && bytes_deleted >= bytes)
2266				break;
2267		} else if (bytes < 0) {
2268			/*
2269			 * Insert a list marker and then wait for the
2270			 * hash lock to become available. Once its
2271			 * available, restart from where we left off.
2272			 */
2273			list_insert_after(list, ab, &marker);
2274			mutex_exit(lock);
2275			mutex_enter(hash_lock);
2276			mutex_exit(hash_lock);
2277			mutex_enter(lock);
2278			ab_prev = list_prev(list, &marker);
2279			list_remove(list, &marker);
2280		} else {
2281			bufs_skipped += 1;
2282		}
2283
2284	}
2285	mutex_exit(lock);
2286	idx  = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
2287	lists++;
2288
2289	if (lists < list_count)
2290		goto evict_start;
2291
2292	evict_offset = idx;
2293	if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
2294	    (bytes < 0 || bytes_deleted < bytes)) {
2295		list_start = &state->arcs_lists[0];
2296		list_count = ARC_BUFC_NUMMETADATALISTS;
2297		offset = lists = 0;
2298		goto evict_start;
2299	}
2300
2301	if (bufs_skipped) {
2302		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2303		ASSERT(bytes >= 0);
2304	}
2305
2306	if (bytes_deleted < bytes)
2307		dprintf("only deleted %lld bytes from %p",
2308		    (longlong_t)bytes_deleted, state);
2309}
2310
2311static void
2312arc_adjust(void)
2313{
2314	int64_t adjustment, delta;
2315
2316	/*
2317	 * Adjust MRU size
2318	 */
2319
2320	adjustment = MIN((int64_t)(arc_size - arc_c),
2321	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2322	    arc_p));
2323
2324	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2325		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2326		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
2327		adjustment -= delta;
2328	}
2329
2330	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2331		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2332		(void) arc_evict(arc_mru, 0, delta, FALSE,
2333		    ARC_BUFC_METADATA);
2334	}
2335
2336	/*
2337	 * Adjust MFU size
2338	 */
2339
2340	adjustment = arc_size - arc_c;
2341
2342	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2343		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2344		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2345		adjustment -= delta;
2346	}
2347
2348	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2349		int64_t delta = MIN(adjustment,
2350		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2351		(void) arc_evict(arc_mfu, 0, delta, FALSE,
2352		    ARC_BUFC_METADATA);
2353	}
2354
2355	/*
2356	 * Adjust ghost lists
2357	 */
2358
2359	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2360
2361	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2362		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2363		arc_evict_ghost(arc_mru_ghost, 0, delta);
2364	}
2365
2366	adjustment =
2367	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2368
2369	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2370		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2371		arc_evict_ghost(arc_mfu_ghost, 0, delta);
2372	}
2373}
2374
2375static void
2376arc_do_user_evicts(void)
2377{
2378	static arc_buf_t *tmp_arc_eviction_list;
2379
2380	/*
2381	 * Move list over to avoid LOR
2382	 */
2383restart:
2384	mutex_enter(&arc_eviction_mtx);
2385	tmp_arc_eviction_list = arc_eviction_list;
2386	arc_eviction_list = NULL;
2387	mutex_exit(&arc_eviction_mtx);
2388
2389	while (tmp_arc_eviction_list != NULL) {
2390		arc_buf_t *buf = tmp_arc_eviction_list;
2391		tmp_arc_eviction_list = buf->b_next;
2392		mutex_enter(&buf->b_evict_lock);
2393		buf->b_hdr = NULL;
2394		mutex_exit(&buf->b_evict_lock);
2395
2396		if (buf->b_efunc != NULL)
2397			VERIFY0(buf->b_efunc(buf->b_private));
2398
2399		buf->b_efunc = NULL;
2400		buf->b_private = NULL;
2401		kmem_cache_free(buf_cache, buf);
2402	}
2403
2404	if (arc_eviction_list != NULL)
2405		goto restart;
2406}
2407
2408/*
2409 * Flush all *evictable* data from the cache for the given spa.
2410 * NOTE: this will not touch "active" (i.e. referenced) data.
2411 */
2412void
2413arc_flush(spa_t *spa)
2414{
2415	uint64_t guid = 0;
2416
2417	if (spa)
2418		guid = spa_load_guid(spa);
2419
2420	while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
2421		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2422		if (spa)
2423			break;
2424	}
2425	while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
2426		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2427		if (spa)
2428			break;
2429	}
2430	while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
2431		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2432		if (spa)
2433			break;
2434	}
2435	while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
2436		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2437		if (spa)
2438			break;
2439	}
2440
2441	arc_evict_ghost(arc_mru_ghost, guid, -1);
2442	arc_evict_ghost(arc_mfu_ghost, guid, -1);
2443
2444	mutex_enter(&arc_reclaim_thr_lock);
2445	arc_do_user_evicts();
2446	mutex_exit(&arc_reclaim_thr_lock);
2447	ASSERT(spa || arc_eviction_list == NULL);
2448}
2449
2450void
2451arc_shrink(void)
2452{
2453
2454	if (arc_c > arc_c_min) {
2455		uint64_t to_free;
2456
2457		DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
2458			arc_c_min, uint64_t, arc_p, uint64_t, to_free);
2459#ifdef _KERNEL
2460		to_free = arc_c >> arc_shrink_shift;
2461#else
2462		to_free = arc_c >> arc_shrink_shift;
2463#endif
2464		if (arc_c > arc_c_min + to_free)
2465			atomic_add_64(&arc_c, -to_free);
2466		else
2467			arc_c = arc_c_min;
2468
2469		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2470		if (arc_c > arc_size)
2471			arc_c = MAX(arc_size, arc_c_min);
2472		if (arc_p > arc_c)
2473			arc_p = (arc_c >> 1);
2474
2475		DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
2476			arc_p);
2477
2478		ASSERT(arc_c >= arc_c_min);
2479		ASSERT((int64_t)arc_p >= 0);
2480	}
2481
2482	if (arc_size > arc_c) {
2483		DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
2484			uint64_t, arc_c);
2485		arc_adjust();
2486	}
2487}
2488
2489static int needfree = 0;
2490
2491static int
2492arc_reclaim_needed(void)
2493{
2494
2495#ifdef _KERNEL
2496
2497	if (needfree) {
2498		DTRACE_PROBE(arc__reclaim_needfree);
2499		return (1);
2500	}
2501
2502	/*
2503	 * Cooperate with pagedaemon when it's time for it to scan
2504	 * and reclaim some pages.
2505	 */
2506	if (freemem < zfs_arc_free_target) {
2507		DTRACE_PROBE2(arc__reclaim_freemem, uint64_t,
2508		    freemem, uint64_t, zfs_arc_free_target);
2509		return (1);
2510	}
2511
2512#ifdef sun
2513	/*
2514	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2515	 */
2516	extra = desfree;
2517
2518	/*
2519	 * check that we're out of range of the pageout scanner.  It starts to
2520	 * schedule paging if freemem is less than lotsfree and needfree.
2521	 * lotsfree is the high-water mark for pageout, and needfree is the
2522	 * number of needed free pages.  We add extra pages here to make sure
2523	 * the scanner doesn't start up while we're freeing memory.
2524	 */
2525	if (freemem < lotsfree + needfree + extra)
2526		return (1);
2527
2528	/*
2529	 * check to make sure that swapfs has enough space so that anon
2530	 * reservations can still succeed. anon_resvmem() checks that the
2531	 * availrmem is greater than swapfs_minfree, and the number of reserved
2532	 * swap pages.  We also add a bit of extra here just to prevent
2533	 * circumstances from getting really dire.
2534	 */
2535	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2536		return (1);
2537
2538	/*
2539	 * Check that we have enough availrmem that memory locking (e.g., via
2540	 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
2541	 * stores the number of pages that cannot be locked; when availrmem
2542	 * drops below pages_pp_maximum, page locking mechanisms such as
2543	 * page_pp_lock() will fail.)
2544	 */
2545	if (availrmem <= pages_pp_maximum)
2546		return (1);
2547
2548#endif	/* sun */
2549#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
2550	/*
2551	 * If we're on an i386 platform, it's possible that we'll exhaust the
2552	 * kernel heap space before we ever run out of available physical
2553	 * memory.  Most checks of the size of the heap_area compare against
2554	 * tune.t_minarmem, which is the minimum available real memory that we
2555	 * can have in the system.  However, this is generally fixed at 25 pages
2556	 * which is so low that it's useless.  In this comparison, we seek to
2557	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2558	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
2559	 * free)
2560	 */
2561	if (vmem_size(heap_arena, VMEM_FREE) <
2562	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) {
2563		DTRACE_PROBE2(arc__reclaim_used, uint64_t,
2564		    vmem_size(heap_arena, VMEM_FREE), uint64_t,
2565		    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2);
2566		return (1);
2567	}
2568#endif
2569#ifdef sun
2570	/*
2571	 * If zio data pages are being allocated out of a separate heap segment,
2572	 * then enforce that the size of available vmem for this arena remains
2573	 * above about 1/16th free.
2574	 *
2575	 * Note: The 1/16th arena free requirement was put in place
2576	 * to aggressively evict memory from the arc in order to avoid
2577	 * memory fragmentation issues.
2578	 */
2579	if (zio_arena != NULL &&
2580	    vmem_size(zio_arena, VMEM_FREE) <
2581	    (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2582		return (1);
2583#endif	/* sun */
2584#else	/* _KERNEL */
2585	if (spa_get_random(100) == 0)
2586		return (1);
2587#endif	/* _KERNEL */
2588	DTRACE_PROBE(arc__reclaim_no);
2589
2590	return (0);
2591}
2592
2593extern kmem_cache_t	*zio_buf_cache[];
2594extern kmem_cache_t	*zio_data_buf_cache[];
2595extern kmem_cache_t	*range_seg_cache;
2596
2597static void __noinline
2598arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2599{
2600	size_t			i;
2601	kmem_cache_t		*prev_cache = NULL;
2602	kmem_cache_t		*prev_data_cache = NULL;
2603
2604	DTRACE_PROBE(arc__kmem_reap_start);
2605#ifdef _KERNEL
2606	if (arc_meta_used >= arc_meta_limit) {
2607		/*
2608		 * We are exceeding our meta-data cache limit.
2609		 * Purge some DNLC entries to release holds on meta-data.
2610		 */
2611		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2612	}
2613#if defined(__i386)
2614	/*
2615	 * Reclaim unused memory from all kmem caches.
2616	 */
2617	kmem_reap();
2618#endif
2619#endif
2620
2621	/*
2622	 * An aggressive reclamation will shrink the cache size as well as
2623	 * reap free buffers from the arc kmem caches.
2624	 */
2625	if (strat == ARC_RECLAIM_AGGR)
2626		arc_shrink();
2627
2628	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2629		if (zio_buf_cache[i] != prev_cache) {
2630			prev_cache = zio_buf_cache[i];
2631			kmem_cache_reap_now(zio_buf_cache[i]);
2632		}
2633		if (zio_data_buf_cache[i] != prev_data_cache) {
2634			prev_data_cache = zio_data_buf_cache[i];
2635			kmem_cache_reap_now(zio_data_buf_cache[i]);
2636		}
2637	}
2638	kmem_cache_reap_now(buf_cache);
2639	kmem_cache_reap_now(hdr_cache);
2640	kmem_cache_reap_now(range_seg_cache);
2641
2642#ifdef sun
2643	/*
2644	 * Ask the vmem arena to reclaim unused memory from its
2645	 * quantum caches.
2646	 */
2647	if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2648		vmem_qcache_reap(zio_arena);
2649#endif
2650	DTRACE_PROBE(arc__kmem_reap_end);
2651}
2652
2653static void
2654arc_reclaim_thread(void *dummy __unused)
2655{
2656	clock_t			growtime = 0;
2657	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
2658	callb_cpr_t		cpr;
2659
2660	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2661
2662	mutex_enter(&arc_reclaim_thr_lock);
2663	while (arc_thread_exit == 0) {
2664		if (arc_reclaim_needed()) {
2665
2666			if (arc_no_grow) {
2667				if (last_reclaim == ARC_RECLAIM_CONS) {
2668					DTRACE_PROBE(arc__reclaim_aggr_no_grow);
2669					last_reclaim = ARC_RECLAIM_AGGR;
2670				} else {
2671					last_reclaim = ARC_RECLAIM_CONS;
2672				}
2673			} else {
2674				arc_no_grow = TRUE;
2675				last_reclaim = ARC_RECLAIM_AGGR;
2676				DTRACE_PROBE(arc__reclaim_aggr);
2677				membar_producer();
2678			}
2679
2680			/* reset the growth delay for every reclaim */
2681			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2682
2683			if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
2684				/*
2685				 * If needfree is TRUE our vm_lowmem hook
2686				 * was called and in that case we must free some
2687				 * memory, so switch to aggressive mode.
2688				 */
2689				arc_no_grow = TRUE;
2690				last_reclaim = ARC_RECLAIM_AGGR;
2691			}
2692			arc_kmem_reap_now(last_reclaim);
2693			arc_warm = B_TRUE;
2694
2695		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2696			arc_no_grow = FALSE;
2697		}
2698
2699		arc_adjust();
2700
2701		if (arc_eviction_list != NULL)
2702			arc_do_user_evicts();
2703
2704#ifdef _KERNEL
2705		if (needfree) {
2706			needfree = 0;
2707			wakeup(&needfree);
2708		}
2709#endif
2710
2711		/* block until needed, or one second, whichever is shorter */
2712		CALLB_CPR_SAFE_BEGIN(&cpr);
2713		(void) cv_timedwait(&arc_reclaim_thr_cv,
2714		    &arc_reclaim_thr_lock, hz);
2715		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2716	}
2717
2718	arc_thread_exit = 0;
2719	cv_broadcast(&arc_reclaim_thr_cv);
2720	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
2721	thread_exit();
2722}
2723
2724/*
2725 * Adapt arc info given the number of bytes we are trying to add and
2726 * the state that we are comming from.  This function is only called
2727 * when we are adding new content to the cache.
2728 */
2729static void
2730arc_adapt(int bytes, arc_state_t *state)
2731{
2732	int mult;
2733	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2734
2735	if (state == arc_l2c_only)
2736		return;
2737
2738	ASSERT(bytes > 0);
2739	/*
2740	 * Adapt the target size of the MRU list:
2741	 *	- if we just hit in the MRU ghost list, then increase
2742	 *	  the target size of the MRU list.
2743	 *	- if we just hit in the MFU ghost list, then increase
2744	 *	  the target size of the MFU list by decreasing the
2745	 *	  target size of the MRU list.
2746	 */
2747	if (state == arc_mru_ghost) {
2748		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2749		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2750		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2751
2752		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2753	} else if (state == arc_mfu_ghost) {
2754		uint64_t delta;
2755
2756		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2757		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2758		mult = MIN(mult, 10);
2759
2760		delta = MIN(bytes * mult, arc_p);
2761		arc_p = MAX(arc_p_min, arc_p - delta);
2762	}
2763	ASSERT((int64_t)arc_p >= 0);
2764
2765	if (arc_reclaim_needed()) {
2766		cv_signal(&arc_reclaim_thr_cv);
2767		return;
2768	}
2769
2770	if (arc_no_grow)
2771		return;
2772
2773	if (arc_c >= arc_c_max)
2774		return;
2775
2776	/*
2777	 * If we're within (2 * maxblocksize) bytes of the target
2778	 * cache size, increment the target cache size
2779	 */
2780	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2781		DTRACE_PROBE1(arc__inc_adapt, int, bytes);
2782		atomic_add_64(&arc_c, (int64_t)bytes);
2783		if (arc_c > arc_c_max)
2784			arc_c = arc_c_max;
2785		else if (state == arc_anon)
2786			atomic_add_64(&arc_p, (int64_t)bytes);
2787		if (arc_p > arc_c)
2788			arc_p = arc_c;
2789	}
2790	ASSERT((int64_t)arc_p >= 0);
2791}
2792
2793/*
2794 * Check if the cache has reached its limits and eviction is required
2795 * prior to insert.
2796 */
2797static int
2798arc_evict_needed(arc_buf_contents_t type)
2799{
2800	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2801		return (1);
2802
2803	if (arc_reclaim_needed())
2804		return (1);
2805
2806	return (arc_size > arc_c);
2807}
2808
2809/*
2810 * The buffer, supplied as the first argument, needs a data block.
2811 * So, if we are at cache max, determine which cache should be victimized.
2812 * We have the following cases:
2813 *
2814 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2815 * In this situation if we're out of space, but the resident size of the MFU is
2816 * under the limit, victimize the MFU cache to satisfy this insertion request.
2817 *
2818 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2819 * Here, we've used up all of the available space for the MRU, so we need to
2820 * evict from our own cache instead.  Evict from the set of resident MRU
2821 * entries.
2822 *
2823 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2824 * c minus p represents the MFU space in the cache, since p is the size of the
2825 * cache that is dedicated to the MRU.  In this situation there's still space on
2826 * the MFU side, so the MRU side needs to be victimized.
2827 *
2828 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2829 * MFU's resident set is consuming more space than it has been allotted.  In
2830 * this situation, we must victimize our own cache, the MFU, for this insertion.
2831 */
2832static void
2833arc_get_data_buf(arc_buf_t *buf)
2834{
2835	arc_state_t		*state = buf->b_hdr->b_state;
2836	uint64_t		size = buf->b_hdr->b_size;
2837	arc_buf_contents_t	type = buf->b_hdr->b_type;
2838
2839	arc_adapt(size, state);
2840
2841	/*
2842	 * We have not yet reached cache maximum size,
2843	 * just allocate a new buffer.
2844	 */
2845	if (!arc_evict_needed(type)) {
2846		if (type == ARC_BUFC_METADATA) {
2847			buf->b_data = zio_buf_alloc(size);
2848			arc_space_consume(size, ARC_SPACE_DATA);
2849		} else {
2850			ASSERT(type == ARC_BUFC_DATA);
2851			buf->b_data = zio_data_buf_alloc(size);
2852			ARCSTAT_INCR(arcstat_data_size, size);
2853			atomic_add_64(&arc_size, size);
2854		}
2855		goto out;
2856	}
2857
2858	/*
2859	 * If we are prefetching from the mfu ghost list, this buffer
2860	 * will end up on the mru list; so steal space from there.
2861	 */
2862	if (state == arc_mfu_ghost)
2863		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2864	else if (state == arc_mru_ghost)
2865		state = arc_mru;
2866
2867	if (state == arc_mru || state == arc_anon) {
2868		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2869		state = (arc_mfu->arcs_lsize[type] >= size &&
2870		    arc_p > mru_used) ? arc_mfu : arc_mru;
2871	} else {
2872		/* MFU cases */
2873		uint64_t mfu_space = arc_c - arc_p;
2874		state =  (arc_mru->arcs_lsize[type] >= size &&
2875		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2876	}
2877	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
2878		if (type == ARC_BUFC_METADATA) {
2879			buf->b_data = zio_buf_alloc(size);
2880			arc_space_consume(size, ARC_SPACE_DATA);
2881		} else {
2882			ASSERT(type == ARC_BUFC_DATA);
2883			buf->b_data = zio_data_buf_alloc(size);
2884			ARCSTAT_INCR(arcstat_data_size, size);
2885			atomic_add_64(&arc_size, size);
2886		}
2887		ARCSTAT_BUMP(arcstat_recycle_miss);
2888	}
2889	ASSERT(buf->b_data != NULL);
2890out:
2891	/*
2892	 * Update the state size.  Note that ghost states have a
2893	 * "ghost size" and so don't need to be updated.
2894	 */
2895	if (!GHOST_STATE(buf->b_hdr->b_state)) {
2896		arc_buf_hdr_t *hdr = buf->b_hdr;
2897
2898		atomic_add_64(&hdr->b_state->arcs_size, size);
2899		if (list_link_active(&hdr->b_arc_node)) {
2900			ASSERT(refcount_is_zero(&hdr->b_refcnt));
2901			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2902		}
2903		/*
2904		 * If we are growing the cache, and we are adding anonymous
2905		 * data, and we have outgrown arc_p, update arc_p
2906		 */
2907		if (arc_size < arc_c && hdr->b_state == arc_anon &&
2908		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2909			arc_p = MIN(arc_c, arc_p + size);
2910	}
2911	ARCSTAT_BUMP(arcstat_allocated);
2912}
2913
2914/*
2915 * This routine is called whenever a buffer is accessed.
2916 * NOTE: the hash lock is dropped in this function.
2917 */
2918static void
2919arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2920{
2921	clock_t now;
2922
2923	ASSERT(MUTEX_HELD(hash_lock));
2924
2925	if (buf->b_state == arc_anon) {
2926		/*
2927		 * This buffer is not in the cache, and does not
2928		 * appear in our "ghost" list.  Add the new buffer
2929		 * to the MRU state.
2930		 */
2931
2932		ASSERT(buf->b_arc_access == 0);
2933		buf->b_arc_access = ddi_get_lbolt();
2934		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2935		arc_change_state(arc_mru, buf, hash_lock);
2936
2937	} else if (buf->b_state == arc_mru) {
2938		now = ddi_get_lbolt();
2939
2940		/*
2941		 * If this buffer is here because of a prefetch, then either:
2942		 * - clear the flag if this is a "referencing" read
2943		 *   (any subsequent access will bump this into the MFU state).
2944		 * or
2945		 * - move the buffer to the head of the list if this is
2946		 *   another prefetch (to make it less likely to be evicted).
2947		 */
2948		if ((buf->b_flags & ARC_PREFETCH) != 0) {
2949			if (refcount_count(&buf->b_refcnt) == 0) {
2950				ASSERT(list_link_active(&buf->b_arc_node));
2951			} else {
2952				buf->b_flags &= ~ARC_PREFETCH;
2953				ARCSTAT_BUMP(arcstat_mru_hits);
2954			}
2955			buf->b_arc_access = now;
2956			return;
2957		}
2958
2959		/*
2960		 * This buffer has been "accessed" only once so far,
2961		 * but it is still in the cache. Move it to the MFU
2962		 * state.
2963		 */
2964		if (now > buf->b_arc_access + ARC_MINTIME) {
2965			/*
2966			 * More than 125ms have passed since we
2967			 * instantiated this buffer.  Move it to the
2968			 * most frequently used state.
2969			 */
2970			buf->b_arc_access = now;
2971			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2972			arc_change_state(arc_mfu, buf, hash_lock);
2973		}
2974		ARCSTAT_BUMP(arcstat_mru_hits);
2975	} else if (buf->b_state == arc_mru_ghost) {
2976		arc_state_t	*new_state;
2977		/*
2978		 * This buffer has been "accessed" recently, but
2979		 * was evicted from the cache.  Move it to the
2980		 * MFU state.
2981		 */
2982
2983		if (buf->b_flags & ARC_PREFETCH) {
2984			new_state = arc_mru;
2985			if (refcount_count(&buf->b_refcnt) > 0)
2986				buf->b_flags &= ~ARC_PREFETCH;
2987			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2988		} else {
2989			new_state = arc_mfu;
2990			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2991		}
2992
2993		buf->b_arc_access = ddi_get_lbolt();
2994		arc_change_state(new_state, buf, hash_lock);
2995
2996		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2997	} else if (buf->b_state == arc_mfu) {
2998		/*
2999		 * This buffer has been accessed more than once and is
3000		 * still in the cache.  Keep it in the MFU state.
3001		 *
3002		 * NOTE: an add_reference() that occurred when we did
3003		 * the arc_read() will have kicked this off the list.
3004		 * If it was a prefetch, we will explicitly move it to
3005		 * the head of the list now.
3006		 */
3007		if ((buf->b_flags & ARC_PREFETCH) != 0) {
3008			ASSERT(refcount_count(&buf->b_refcnt) == 0);
3009			ASSERT(list_link_active(&buf->b_arc_node));
3010		}
3011		ARCSTAT_BUMP(arcstat_mfu_hits);
3012		buf->b_arc_access = ddi_get_lbolt();
3013	} else if (buf->b_state == arc_mfu_ghost) {
3014		arc_state_t	*new_state = arc_mfu;
3015		/*
3016		 * This buffer has been accessed more than once but has
3017		 * been evicted from the cache.  Move it back to the
3018		 * MFU state.
3019		 */
3020
3021		if (buf->b_flags & ARC_PREFETCH) {
3022			/*
3023			 * This is a prefetch access...
3024			 * move this block back to the MRU state.
3025			 */
3026			ASSERT0(refcount_count(&buf->b_refcnt));
3027			new_state = arc_mru;
3028		}
3029
3030		buf->b_arc_access = ddi_get_lbolt();
3031		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
3032		arc_change_state(new_state, buf, hash_lock);
3033
3034		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
3035	} else if (buf->b_state == arc_l2c_only) {
3036		/*
3037		 * This buffer is on the 2nd Level ARC.
3038		 */
3039
3040		buf->b_arc_access = ddi_get_lbolt();
3041		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
3042		arc_change_state(arc_mfu, buf, hash_lock);
3043	} else {
3044		ASSERT(!"invalid arc state");
3045	}
3046}
3047
3048/* a generic arc_done_func_t which you can use */
3049/* ARGSUSED */
3050void
3051arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
3052{
3053	if (zio == NULL || zio->io_error == 0)
3054		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
3055	VERIFY(arc_buf_remove_ref(buf, arg));
3056}
3057
3058/* a generic arc_done_func_t */
3059void
3060arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
3061{
3062	arc_buf_t **bufp = arg;
3063	if (zio && zio->io_error) {
3064		VERIFY(arc_buf_remove_ref(buf, arg));
3065		*bufp = NULL;
3066	} else {
3067		*bufp = buf;
3068		ASSERT(buf->b_data);
3069	}
3070}
3071
3072static void
3073arc_read_done(zio_t *zio)
3074{
3075	arc_buf_hdr_t	*hdr;
3076	arc_buf_t	*buf;
3077	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
3078	kmutex_t	*hash_lock = NULL;
3079	arc_callback_t	*callback_list, *acb;
3080	int		freeable = FALSE;
3081
3082	buf = zio->io_private;
3083	hdr = buf->b_hdr;
3084
3085	/*
3086	 * The hdr was inserted into hash-table and removed from lists
3087	 * prior to starting I/O.  We should find this header, since
3088	 * it's in the hash table, and it should be legit since it's
3089	 * not possible to evict it during the I/O.  The only possible
3090	 * reason for it not to be found is if we were freed during the
3091	 * read.
3092	 */
3093	if (HDR_IN_HASH_TABLE(hdr)) {
3094		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
3095		ASSERT3U(hdr->b_dva.dva_word[0], ==,
3096		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
3097		ASSERT3U(hdr->b_dva.dva_word[1], ==,
3098		    BP_IDENTITY(zio->io_bp)->dva_word[1]);
3099
3100		arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
3101		    &hash_lock);
3102
3103		ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
3104		    hash_lock == NULL) ||
3105		    (found == hdr &&
3106		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
3107		    (found == hdr && HDR_L2_READING(hdr)));
3108	}
3109
3110	hdr->b_flags &= ~ARC_L2_EVICTED;
3111	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
3112		hdr->b_flags &= ~ARC_L2CACHE;
3113
3114	/* byteswap if necessary */
3115	callback_list = hdr->b_acb;
3116	ASSERT(callback_list != NULL);
3117	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
3118		dmu_object_byteswap_t bswap =
3119		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
3120		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
3121		    byteswap_uint64_array :
3122		    dmu_ot_byteswap[bswap].ob_func;
3123		func(buf->b_data, hdr->b_size);
3124	}
3125
3126	arc_cksum_compute(buf, B_FALSE);
3127#ifdef illumos
3128	arc_buf_watch(buf);
3129#endif /* illumos */
3130
3131	if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
3132		/*
3133		 * Only call arc_access on anonymous buffers.  This is because
3134		 * if we've issued an I/O for an evicted buffer, we've already
3135		 * called arc_access (to prevent any simultaneous readers from
3136		 * getting confused).
3137		 */
3138		arc_access(hdr, hash_lock);
3139	}
3140
3141	/* create copies of the data buffer for the callers */
3142	abuf = buf;
3143	for (acb = callback_list; acb; acb = acb->acb_next) {
3144		if (acb->acb_done) {
3145			if (abuf == NULL) {
3146				ARCSTAT_BUMP(arcstat_duplicate_reads);
3147				abuf = arc_buf_clone(buf);
3148			}
3149			acb->acb_buf = abuf;
3150			abuf = NULL;
3151		}
3152	}
3153	hdr->b_acb = NULL;
3154	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3155	ASSERT(!HDR_BUF_AVAILABLE(hdr));
3156	if (abuf == buf) {
3157		ASSERT(buf->b_efunc == NULL);
3158		ASSERT(hdr->b_datacnt == 1);
3159		hdr->b_flags |= ARC_BUF_AVAILABLE;
3160	}
3161
3162	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
3163
3164	if (zio->io_error != 0) {
3165		hdr->b_flags |= ARC_IO_ERROR;
3166		if (hdr->b_state != arc_anon)
3167			arc_change_state(arc_anon, hdr, hash_lock);
3168		if (HDR_IN_HASH_TABLE(hdr))
3169			buf_hash_remove(hdr);
3170		freeable = refcount_is_zero(&hdr->b_refcnt);
3171	}
3172
3173	/*
3174	 * Broadcast before we drop the hash_lock to avoid the possibility
3175	 * that the hdr (and hence the cv) might be freed before we get to
3176	 * the cv_broadcast().
3177	 */
3178	cv_broadcast(&hdr->b_cv);
3179
3180	if (hash_lock) {
3181		mutex_exit(hash_lock);
3182	} else {
3183		/*
3184		 * This block was freed while we waited for the read to
3185		 * complete.  It has been removed from the hash table and
3186		 * moved to the anonymous state (so that it won't show up
3187		 * in the cache).
3188		 */
3189		ASSERT3P(hdr->b_state, ==, arc_anon);
3190		freeable = refcount_is_zero(&hdr->b_refcnt);
3191	}
3192
3193	/* execute each callback and free its structure */
3194	while ((acb = callback_list) != NULL) {
3195		if (acb->acb_done)
3196			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3197
3198		if (acb->acb_zio_dummy != NULL) {
3199			acb->acb_zio_dummy->io_error = zio->io_error;
3200			zio_nowait(acb->acb_zio_dummy);
3201		}
3202
3203		callback_list = acb->acb_next;
3204		kmem_free(acb, sizeof (arc_callback_t));
3205	}
3206
3207	if (freeable)
3208		arc_hdr_destroy(hdr);
3209}
3210
3211/*
3212 * "Read" the block block at the specified DVA (in bp) via the
3213 * cache.  If the block is found in the cache, invoke the provided
3214 * callback immediately and return.  Note that the `zio' parameter
3215 * in the callback will be NULL in this case, since no IO was
3216 * required.  If the block is not in the cache pass the read request
3217 * on to the spa with a substitute callback function, so that the
3218 * requested block will be added to the cache.
3219 *
3220 * If a read request arrives for a block that has a read in-progress,
3221 * either wait for the in-progress read to complete (and return the
3222 * results); or, if this is a read with a "done" func, add a record
3223 * to the read to invoke the "done" func when the read completes,
3224 * and return; or just return.
3225 *
3226 * arc_read_done() will invoke all the requested "done" functions
3227 * for readers of this block.
3228 */
3229int
3230arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3231    void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
3232    const zbookmark_phys_t *zb)
3233{
3234	arc_buf_hdr_t *hdr = NULL;
3235	arc_buf_t *buf = NULL;
3236	kmutex_t *hash_lock = NULL;
3237	zio_t *rzio;
3238	uint64_t guid = spa_load_guid(spa);
3239
3240	ASSERT(!BP_IS_EMBEDDED(bp) ||
3241	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
3242
3243top:
3244	if (!BP_IS_EMBEDDED(bp)) {
3245		/*
3246		 * Embedded BP's have no DVA and require no I/O to "read".
3247		 * Create an anonymous arc buf to back it.
3248		 */
3249		hdr = buf_hash_find(guid, bp, &hash_lock);
3250	}
3251
3252	if (hdr != NULL && hdr->b_datacnt > 0) {
3253
3254		*arc_flags |= ARC_CACHED;
3255
3256		if (HDR_IO_IN_PROGRESS(hdr)) {
3257
3258			if (*arc_flags & ARC_WAIT) {
3259				cv_wait(&hdr->b_cv, hash_lock);
3260				mutex_exit(hash_lock);
3261				goto top;
3262			}
3263			ASSERT(*arc_flags & ARC_NOWAIT);
3264
3265			if (done) {
3266				arc_callback_t	*acb = NULL;
3267
3268				acb = kmem_zalloc(sizeof (arc_callback_t),
3269				    KM_SLEEP);
3270				acb->acb_done = done;
3271				acb->acb_private = private;
3272				if (pio != NULL)
3273					acb->acb_zio_dummy = zio_null(pio,
3274					    spa, NULL, NULL, NULL, zio_flags);
3275
3276				ASSERT(acb->acb_done != NULL);
3277				acb->acb_next = hdr->b_acb;
3278				hdr->b_acb = acb;
3279				add_reference(hdr, hash_lock, private);
3280				mutex_exit(hash_lock);
3281				return (0);
3282			}
3283			mutex_exit(hash_lock);
3284			return (0);
3285		}
3286
3287		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3288
3289		if (done) {
3290			add_reference(hdr, hash_lock, private);
3291			/*
3292			 * If this block is already in use, create a new
3293			 * copy of the data so that we will be guaranteed
3294			 * that arc_release() will always succeed.
3295			 */
3296			buf = hdr->b_buf;
3297			ASSERT(buf);
3298			ASSERT(buf->b_data);
3299			if (HDR_BUF_AVAILABLE(hdr)) {
3300				ASSERT(buf->b_efunc == NULL);
3301				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3302			} else {
3303				buf = arc_buf_clone(buf);
3304			}
3305
3306		} else if (*arc_flags & ARC_PREFETCH &&
3307		    refcount_count(&hdr->b_refcnt) == 0) {
3308			hdr->b_flags |= ARC_PREFETCH;
3309		}
3310		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3311		arc_access(hdr, hash_lock);
3312		if (*arc_flags & ARC_L2CACHE)
3313			hdr->b_flags |= ARC_L2CACHE;
3314		if (*arc_flags & ARC_L2COMPRESS)
3315			hdr->b_flags |= ARC_L2COMPRESS;
3316		mutex_exit(hash_lock);
3317		ARCSTAT_BUMP(arcstat_hits);
3318		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3319		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3320		    data, metadata, hits);
3321
3322		if (done)
3323			done(NULL, buf, private);
3324	} else {
3325		uint64_t size = BP_GET_LSIZE(bp);
3326		arc_callback_t *acb;
3327		vdev_t *vd = NULL;
3328		uint64_t addr = 0;
3329		boolean_t devw = B_FALSE;
3330		enum zio_compress b_compress = ZIO_COMPRESS_OFF;
3331		uint64_t b_asize = 0;
3332
3333		if (hdr == NULL) {
3334			/* this block is not in the cache */
3335			arc_buf_hdr_t *exists = NULL;
3336			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3337			buf = arc_buf_alloc(spa, size, private, type);
3338			hdr = buf->b_hdr;
3339			if (!BP_IS_EMBEDDED(bp)) {
3340				hdr->b_dva = *BP_IDENTITY(bp);
3341				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3342				hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3343				exists = buf_hash_insert(hdr, &hash_lock);
3344			}
3345			if (exists != NULL) {
3346				/* somebody beat us to the hash insert */
3347				mutex_exit(hash_lock);
3348				buf_discard_identity(hdr);
3349				(void) arc_buf_remove_ref(buf, private);
3350				goto top; /* restart the IO request */
3351			}
3352			/* if this is a prefetch, we don't have a reference */
3353			if (*arc_flags & ARC_PREFETCH) {
3354				(void) remove_reference(hdr, hash_lock,
3355				    private);
3356				hdr->b_flags |= ARC_PREFETCH;
3357			}
3358			if (*arc_flags & ARC_L2CACHE)
3359				hdr->b_flags |= ARC_L2CACHE;
3360			if (*arc_flags & ARC_L2COMPRESS)
3361				hdr->b_flags |= ARC_L2COMPRESS;
3362			if (BP_GET_LEVEL(bp) > 0)
3363				hdr->b_flags |= ARC_INDIRECT;
3364		} else {
3365			/* this block is in the ghost cache */
3366			ASSERT(GHOST_STATE(hdr->b_state));
3367			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3368			ASSERT0(refcount_count(&hdr->b_refcnt));
3369			ASSERT(hdr->b_buf == NULL);
3370
3371			/* if this is a prefetch, we don't have a reference */
3372			if (*arc_flags & ARC_PREFETCH)
3373				hdr->b_flags |= ARC_PREFETCH;
3374			else
3375				add_reference(hdr, hash_lock, private);
3376			if (*arc_flags & ARC_L2CACHE)
3377				hdr->b_flags |= ARC_L2CACHE;
3378			if (*arc_flags & ARC_L2COMPRESS)
3379				hdr->b_flags |= ARC_L2COMPRESS;
3380			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3381			buf->b_hdr = hdr;
3382			buf->b_data = NULL;
3383			buf->b_efunc = NULL;
3384			buf->b_private = NULL;
3385			buf->b_next = NULL;
3386			hdr->b_buf = buf;
3387			ASSERT(hdr->b_datacnt == 0);
3388			hdr->b_datacnt = 1;
3389			arc_get_data_buf(buf);
3390			arc_access(hdr, hash_lock);
3391		}
3392
3393		ASSERT(!GHOST_STATE(hdr->b_state));
3394
3395		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3396		acb->acb_done = done;
3397		acb->acb_private = private;
3398
3399		ASSERT(hdr->b_acb == NULL);
3400		hdr->b_acb = acb;
3401		hdr->b_flags |= ARC_IO_IN_PROGRESS;
3402
3403		if (hdr->b_l2hdr != NULL &&
3404		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3405			devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3406			addr = hdr->b_l2hdr->b_daddr;
3407			b_compress = hdr->b_l2hdr->b_compress;
3408			b_asize = hdr->b_l2hdr->b_asize;
3409			/*
3410			 * Lock out device removal.
3411			 */
3412			if (vdev_is_dead(vd) ||
3413			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3414				vd = NULL;
3415		}
3416
3417		if (hash_lock != NULL)
3418			mutex_exit(hash_lock);
3419
3420		/*
3421		 * At this point, we have a level 1 cache miss.  Try again in
3422		 * L2ARC if possible.
3423		 */
3424		ASSERT3U(hdr->b_size, ==, size);
3425		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3426		    uint64_t, size, zbookmark_phys_t *, zb);
3427		ARCSTAT_BUMP(arcstat_misses);
3428		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3429		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3430		    data, metadata, misses);
3431#ifdef _KERNEL
3432		curthread->td_ru.ru_inblock++;
3433#endif
3434
3435		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3436			/*
3437			 * Read from the L2ARC if the following are true:
3438			 * 1. The L2ARC vdev was previously cached.
3439			 * 2. This buffer still has L2ARC metadata.
3440			 * 3. This buffer isn't currently writing to the L2ARC.
3441			 * 4. The L2ARC entry wasn't evicted, which may
3442			 *    also have invalidated the vdev.
3443			 * 5. This isn't prefetch and l2arc_noprefetch is set.
3444			 */
3445			if (hdr->b_l2hdr != NULL &&
3446			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3447			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3448				l2arc_read_callback_t *cb;
3449
3450				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3451				ARCSTAT_BUMP(arcstat_l2_hits);
3452
3453				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3454				    KM_SLEEP);
3455				cb->l2rcb_buf = buf;
3456				cb->l2rcb_spa = spa;
3457				cb->l2rcb_bp = *bp;
3458				cb->l2rcb_zb = *zb;
3459				cb->l2rcb_flags = zio_flags;
3460				cb->l2rcb_compress = b_compress;
3461
3462				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3463				    addr + size < vd->vdev_psize -
3464				    VDEV_LABEL_END_SIZE);
3465
3466				/*
3467				 * l2arc read.  The SCL_L2ARC lock will be
3468				 * released by l2arc_read_done().
3469				 * Issue a null zio if the underlying buffer
3470				 * was squashed to zero size by compression.
3471				 */
3472				if (b_compress == ZIO_COMPRESS_EMPTY) {
3473					rzio = zio_null(pio, spa, vd,
3474					    l2arc_read_done, cb,
3475					    zio_flags | ZIO_FLAG_DONT_CACHE |
3476					    ZIO_FLAG_CANFAIL |
3477					    ZIO_FLAG_DONT_PROPAGATE |
3478					    ZIO_FLAG_DONT_RETRY);
3479				} else {
3480					rzio = zio_read_phys(pio, vd, addr,
3481					    b_asize, buf->b_data,
3482					    ZIO_CHECKSUM_OFF,
3483					    l2arc_read_done, cb, priority,
3484					    zio_flags | ZIO_FLAG_DONT_CACHE |
3485					    ZIO_FLAG_CANFAIL |
3486					    ZIO_FLAG_DONT_PROPAGATE |
3487					    ZIO_FLAG_DONT_RETRY, B_FALSE);
3488				}
3489				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3490				    zio_t *, rzio);
3491				ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
3492
3493				if (*arc_flags & ARC_NOWAIT) {
3494					zio_nowait(rzio);
3495					return (0);
3496				}
3497
3498				ASSERT(*arc_flags & ARC_WAIT);
3499				if (zio_wait(rzio) == 0)
3500					return (0);
3501
3502				/* l2arc read error; goto zio_read() */
3503			} else {
3504				DTRACE_PROBE1(l2arc__miss,
3505				    arc_buf_hdr_t *, hdr);
3506				ARCSTAT_BUMP(arcstat_l2_misses);
3507				if (HDR_L2_WRITING(hdr))
3508					ARCSTAT_BUMP(arcstat_l2_rw_clash);
3509				spa_config_exit(spa, SCL_L2ARC, vd);
3510			}
3511		} else {
3512			if (vd != NULL)
3513				spa_config_exit(spa, SCL_L2ARC, vd);
3514			if (l2arc_ndev != 0) {
3515				DTRACE_PROBE1(l2arc__miss,
3516				    arc_buf_hdr_t *, hdr);
3517				ARCSTAT_BUMP(arcstat_l2_misses);
3518			}
3519		}
3520
3521		rzio = zio_read(pio, spa, bp, buf->b_data, size,
3522		    arc_read_done, buf, priority, zio_flags, zb);
3523
3524		if (*arc_flags & ARC_WAIT)
3525			return (zio_wait(rzio));
3526
3527		ASSERT(*arc_flags & ARC_NOWAIT);
3528		zio_nowait(rzio);
3529	}
3530	return (0);
3531}
3532
3533void
3534arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3535{
3536	ASSERT(buf->b_hdr != NULL);
3537	ASSERT(buf->b_hdr->b_state != arc_anon);
3538	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3539	ASSERT(buf->b_efunc == NULL);
3540	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3541
3542	buf->b_efunc = func;
3543	buf->b_private = private;
3544}
3545
3546/*
3547 * Notify the arc that a block was freed, and thus will never be used again.
3548 */
3549void
3550arc_freed(spa_t *spa, const blkptr_t *bp)
3551{
3552	arc_buf_hdr_t *hdr;
3553	kmutex_t *hash_lock;
3554	uint64_t guid = spa_load_guid(spa);
3555
3556	ASSERT(!BP_IS_EMBEDDED(bp));
3557
3558	hdr = buf_hash_find(guid, bp, &hash_lock);
3559	if (hdr == NULL)
3560		return;
3561	if (HDR_BUF_AVAILABLE(hdr)) {
3562		arc_buf_t *buf = hdr->b_buf;
3563		add_reference(hdr, hash_lock, FTAG);
3564		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3565		mutex_exit(hash_lock);
3566
3567		arc_release(buf, FTAG);
3568		(void) arc_buf_remove_ref(buf, FTAG);
3569	} else {
3570		mutex_exit(hash_lock);
3571	}
3572
3573}
3574
3575/*
3576 * Clear the user eviction callback set by arc_set_callback(), first calling
3577 * it if it exists.  Because the presence of a callback keeps an arc_buf cached
3578 * clearing the callback may result in the arc_buf being destroyed.  However,
3579 * it will not result in the *last* arc_buf being destroyed, hence the data
3580 * will remain cached in the ARC. We make a copy of the arc buffer here so
3581 * that we can process the callback without holding any locks.
3582 *
3583 * It's possible that the callback is already in the process of being cleared
3584 * by another thread.  In this case we can not clear the callback.
3585 *
3586 * Returns B_TRUE if the callback was successfully called and cleared.
3587 */
3588boolean_t
3589arc_clear_callback(arc_buf_t *buf)
3590{
3591	arc_buf_hdr_t *hdr;
3592	kmutex_t *hash_lock;
3593	arc_evict_func_t *efunc = buf->b_efunc;
3594	void *private = buf->b_private;
3595	list_t *list, *evicted_list;
3596	kmutex_t *lock, *evicted_lock;
3597
3598	mutex_enter(&buf->b_evict_lock);
3599	hdr = buf->b_hdr;
3600	if (hdr == NULL) {
3601		/*
3602		 * We are in arc_do_user_evicts().
3603		 */
3604		ASSERT(buf->b_data == NULL);
3605		mutex_exit(&buf->b_evict_lock);
3606		return (B_FALSE);
3607	} else if (buf->b_data == NULL) {
3608		/*
3609		 * We are on the eviction list; process this buffer now
3610		 * but let arc_do_user_evicts() do the reaping.
3611		 */
3612		buf->b_efunc = NULL;
3613		mutex_exit(&buf->b_evict_lock);
3614		VERIFY0(efunc(private));
3615		return (B_TRUE);
3616	}
3617	hash_lock = HDR_LOCK(hdr);
3618	mutex_enter(hash_lock);
3619	hdr = buf->b_hdr;
3620	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3621
3622	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3623	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3624
3625	buf->b_efunc = NULL;
3626	buf->b_private = NULL;
3627
3628	if (hdr->b_datacnt > 1) {
3629		mutex_exit(&buf->b_evict_lock);
3630		arc_buf_destroy(buf, FALSE, TRUE);
3631	} else {
3632		ASSERT(buf == hdr->b_buf);
3633		hdr->b_flags |= ARC_BUF_AVAILABLE;
3634		mutex_exit(&buf->b_evict_lock);
3635	}
3636
3637	mutex_exit(hash_lock);
3638	VERIFY0(efunc(private));
3639	return (B_TRUE);
3640}
3641
3642/*
3643 * Release this buffer from the cache, making it an anonymous buffer.  This
3644 * must be done after a read and prior to modifying the buffer contents.
3645 * If the buffer has more than one reference, we must make
3646 * a new hdr for the buffer.
3647 */
3648void
3649arc_release(arc_buf_t *buf, void *tag)
3650{
3651	arc_buf_hdr_t *hdr;
3652	kmutex_t *hash_lock = NULL;
3653	l2arc_buf_hdr_t *l2hdr;
3654	uint64_t buf_size;
3655
3656	/*
3657	 * It would be nice to assert that if it's DMU metadata (level >
3658	 * 0 || it's the dnode file), then it must be syncing context.
3659	 * But we don't know that information at this level.
3660	 */
3661
3662	mutex_enter(&buf->b_evict_lock);
3663	hdr = buf->b_hdr;
3664
3665	/* this buffer is not on any list */
3666	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3667
3668	if (hdr->b_state == arc_anon) {
3669		/* this buffer is already released */
3670		ASSERT(buf->b_efunc == NULL);
3671	} else {
3672		hash_lock = HDR_LOCK(hdr);
3673		mutex_enter(hash_lock);
3674		hdr = buf->b_hdr;
3675		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3676	}
3677
3678	l2hdr = hdr->b_l2hdr;
3679	if (l2hdr) {
3680		mutex_enter(&l2arc_buflist_mtx);
3681		hdr->b_l2hdr = NULL;
3682		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3683	}
3684	buf_size = hdr->b_size;
3685
3686	/*
3687	 * Do we have more than one buf?
3688	 */
3689	if (hdr->b_datacnt > 1) {
3690		arc_buf_hdr_t *nhdr;
3691		arc_buf_t **bufp;
3692		uint64_t blksz = hdr->b_size;
3693		uint64_t spa = hdr->b_spa;
3694		arc_buf_contents_t type = hdr->b_type;
3695		uint32_t flags = hdr->b_flags;
3696
3697		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3698		/*
3699		 * Pull the data off of this hdr and attach it to
3700		 * a new anonymous hdr.
3701		 */
3702		(void) remove_reference(hdr, hash_lock, tag);
3703		bufp = &hdr->b_buf;
3704		while (*bufp != buf)
3705			bufp = &(*bufp)->b_next;
3706		*bufp = buf->b_next;
3707		buf->b_next = NULL;
3708
3709		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3710		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3711		if (refcount_is_zero(&hdr->b_refcnt)) {
3712			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3713			ASSERT3U(*size, >=, hdr->b_size);
3714			atomic_add_64(size, -hdr->b_size);
3715		}
3716
3717		/*
3718		 * We're releasing a duplicate user data buffer, update
3719		 * our statistics accordingly.
3720		 */
3721		if (hdr->b_type == ARC_BUFC_DATA) {
3722			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3723			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3724			    -hdr->b_size);
3725		}
3726		hdr->b_datacnt -= 1;
3727		arc_cksum_verify(buf);
3728#ifdef illumos
3729		arc_buf_unwatch(buf);
3730#endif /* illumos */
3731
3732		mutex_exit(hash_lock);
3733
3734		nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3735		nhdr->b_size = blksz;
3736		nhdr->b_spa = spa;
3737		nhdr->b_type = type;
3738		nhdr->b_buf = buf;
3739		nhdr->b_state = arc_anon;
3740		nhdr->b_arc_access = 0;
3741		nhdr->b_flags = flags & ARC_L2_WRITING;
3742		nhdr->b_l2hdr = NULL;
3743		nhdr->b_datacnt = 1;
3744		nhdr->b_freeze_cksum = NULL;
3745		(void) refcount_add(&nhdr->b_refcnt, tag);
3746		buf->b_hdr = nhdr;
3747		mutex_exit(&buf->b_evict_lock);
3748		atomic_add_64(&arc_anon->arcs_size, blksz);
3749	} else {
3750		mutex_exit(&buf->b_evict_lock);
3751		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3752		ASSERT(!list_link_active(&hdr->b_arc_node));
3753		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3754		if (hdr->b_state != arc_anon)
3755			arc_change_state(arc_anon, hdr, hash_lock);
3756		hdr->b_arc_access = 0;
3757		if (hash_lock)
3758			mutex_exit(hash_lock);
3759
3760		buf_discard_identity(hdr);
3761		arc_buf_thaw(buf);
3762	}
3763	buf->b_efunc = NULL;
3764	buf->b_private = NULL;
3765
3766	if (l2hdr) {
3767		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3768		vdev_space_update(l2hdr->b_dev->l2ad_vdev,
3769		    -l2hdr->b_asize, 0, 0);
3770		trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
3771		    hdr->b_size, 0);
3772		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3773		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3774		mutex_exit(&l2arc_buflist_mtx);
3775	}
3776}
3777
3778int
3779arc_released(arc_buf_t *buf)
3780{
3781	int released;
3782
3783	mutex_enter(&buf->b_evict_lock);
3784	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3785	mutex_exit(&buf->b_evict_lock);
3786	return (released);
3787}
3788
3789#ifdef ZFS_DEBUG
3790int
3791arc_referenced(arc_buf_t *buf)
3792{
3793	int referenced;
3794
3795	mutex_enter(&buf->b_evict_lock);
3796	referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3797	mutex_exit(&buf->b_evict_lock);
3798	return (referenced);
3799}
3800#endif
3801
3802static void
3803arc_write_ready(zio_t *zio)
3804{
3805	arc_write_callback_t *callback = zio->io_private;
3806	arc_buf_t *buf = callback->awcb_buf;
3807	arc_buf_hdr_t *hdr = buf->b_hdr;
3808
3809	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3810	callback->awcb_ready(zio, buf, callback->awcb_private);
3811
3812	/*
3813	 * If the IO is already in progress, then this is a re-write
3814	 * attempt, so we need to thaw and re-compute the cksum.
3815	 * It is the responsibility of the callback to handle the
3816	 * accounting for any re-write attempt.
3817	 */
3818	if (HDR_IO_IN_PROGRESS(hdr)) {
3819		mutex_enter(&hdr->b_freeze_lock);
3820		if (hdr->b_freeze_cksum != NULL) {
3821			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3822			hdr->b_freeze_cksum = NULL;
3823		}
3824		mutex_exit(&hdr->b_freeze_lock);
3825	}
3826	arc_cksum_compute(buf, B_FALSE);
3827	hdr->b_flags |= ARC_IO_IN_PROGRESS;
3828}
3829
3830/*
3831 * The SPA calls this callback for each physical write that happens on behalf
3832 * of a logical write.  See the comment in dbuf_write_physdone() for details.
3833 */
3834static void
3835arc_write_physdone(zio_t *zio)
3836{
3837	arc_write_callback_t *cb = zio->io_private;
3838	if (cb->awcb_physdone != NULL)
3839		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
3840}
3841
3842static void
3843arc_write_done(zio_t *zio)
3844{
3845	arc_write_callback_t *callback = zio->io_private;
3846	arc_buf_t *buf = callback->awcb_buf;
3847	arc_buf_hdr_t *hdr = buf->b_hdr;
3848
3849	ASSERT(hdr->b_acb == NULL);
3850
3851	if (zio->io_error == 0) {
3852		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
3853			buf_discard_identity(hdr);
3854		} else {
3855			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3856			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3857			hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3858		}
3859	} else {
3860		ASSERT(BUF_EMPTY(hdr));
3861	}
3862
3863	/*
3864	 * If the block to be written was all-zero or compressed enough to be
3865	 * embedded in the BP, no write was performed so there will be no
3866	 * dva/birth/checksum.  The buffer must therefore remain anonymous
3867	 * (and uncached).
3868	 */
3869	if (!BUF_EMPTY(hdr)) {
3870		arc_buf_hdr_t *exists;
3871		kmutex_t *hash_lock;
3872
3873		ASSERT(zio->io_error == 0);
3874
3875		arc_cksum_verify(buf);
3876
3877		exists = buf_hash_insert(hdr, &hash_lock);
3878		if (exists) {
3879			/*
3880			 * This can only happen if we overwrite for
3881			 * sync-to-convergence, because we remove
3882			 * buffers from the hash table when we arc_free().
3883			 */
3884			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3885				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3886					panic("bad overwrite, hdr=%p exists=%p",
3887					    (void *)hdr, (void *)exists);
3888				ASSERT(refcount_is_zero(&exists->b_refcnt));
3889				arc_change_state(arc_anon, exists, hash_lock);
3890				mutex_exit(hash_lock);
3891				arc_hdr_destroy(exists);
3892				exists = buf_hash_insert(hdr, &hash_lock);
3893				ASSERT3P(exists, ==, NULL);
3894			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3895				/* nopwrite */
3896				ASSERT(zio->io_prop.zp_nopwrite);
3897				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3898					panic("bad nopwrite, hdr=%p exists=%p",
3899					    (void *)hdr, (void *)exists);
3900			} else {
3901				/* Dedup */
3902				ASSERT(hdr->b_datacnt == 1);
3903				ASSERT(hdr->b_state == arc_anon);
3904				ASSERT(BP_GET_DEDUP(zio->io_bp));
3905				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3906			}
3907		}
3908		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3909		/* if it's not anon, we are doing a scrub */
3910		if (!exists && hdr->b_state == arc_anon)
3911			arc_access(hdr, hash_lock);
3912		mutex_exit(hash_lock);
3913	} else {
3914		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3915	}
3916
3917	ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3918	callback->awcb_done(zio, buf, callback->awcb_private);
3919
3920	kmem_free(callback, sizeof (arc_write_callback_t));
3921}
3922
3923zio_t *
3924arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3925    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3926    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
3927    arc_done_func_t *done, void *private, zio_priority_t priority,
3928    int zio_flags, const zbookmark_phys_t *zb)
3929{
3930	arc_buf_hdr_t *hdr = buf->b_hdr;
3931	arc_write_callback_t *callback;
3932	zio_t *zio;
3933
3934	ASSERT(ready != NULL);
3935	ASSERT(done != NULL);
3936	ASSERT(!HDR_IO_ERROR(hdr));
3937	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3938	ASSERT(hdr->b_acb == NULL);
3939	if (l2arc)
3940		hdr->b_flags |= ARC_L2CACHE;
3941	if (l2arc_compress)
3942		hdr->b_flags |= ARC_L2COMPRESS;
3943	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3944	callback->awcb_ready = ready;
3945	callback->awcb_physdone = physdone;
3946	callback->awcb_done = done;
3947	callback->awcb_private = private;
3948	callback->awcb_buf = buf;
3949
3950	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3951	    arc_write_ready, arc_write_physdone, arc_write_done, callback,
3952	    priority, zio_flags, zb);
3953
3954	return (zio);
3955}
3956
3957static int
3958arc_memory_throttle(uint64_t reserve, uint64_t txg)
3959{
3960#ifdef _KERNEL
3961	uint64_t available_memory = ptob(freemem);
3962	static uint64_t page_load = 0;
3963	static uint64_t last_txg = 0;
3964
3965#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
3966	available_memory =
3967	    MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE)));
3968#endif
3969
3970	if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
3971		return (0);
3972
3973	if (txg > last_txg) {
3974		last_txg = txg;
3975		page_load = 0;
3976	}
3977	/*
3978	 * If we are in pageout, we know that memory is already tight,
3979	 * the arc is already going to be evicting, so we just want to
3980	 * continue to let page writes occur as quickly as possible.
3981	 */
3982	if (curproc == pageproc) {
3983		if (page_load > MAX(ptob(minfree), available_memory) / 4)
3984			return (SET_ERROR(ERESTART));
3985		/* Note: reserve is inflated, so we deflate */
3986		page_load += reserve / 8;
3987		return (0);
3988	} else if (page_load > 0 && arc_reclaim_needed()) {
3989		/* memory is low, delay before restarting */
3990		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3991		return (SET_ERROR(EAGAIN));
3992	}
3993	page_load = 0;
3994#endif
3995	return (0);
3996}
3997
3998void
3999arc_tempreserve_clear(uint64_t reserve)
4000{
4001	atomic_add_64(&arc_tempreserve, -reserve);
4002	ASSERT((int64_t)arc_tempreserve >= 0);
4003}
4004
4005int
4006arc_tempreserve_space(uint64_t reserve, uint64_t txg)
4007{
4008	int error;
4009	uint64_t anon_size;
4010
4011	if (reserve > arc_c/4 && !arc_no_grow) {
4012		arc_c = MIN(arc_c_max, reserve * 4);
4013		DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
4014	}
4015	if (reserve > arc_c)
4016		return (SET_ERROR(ENOMEM));
4017
4018	/*
4019	 * Don't count loaned bufs as in flight dirty data to prevent long
4020	 * network delays from blocking transactions that are ready to be
4021	 * assigned to a txg.
4022	 */
4023	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
4024
4025	/*
4026	 * Writes will, almost always, require additional memory allocations
4027	 * in order to compress/encrypt/etc the data.  We therefore need to
4028	 * make sure that there is sufficient available memory for this.
4029	 */
4030	error = arc_memory_throttle(reserve, txg);
4031	if (error != 0)
4032		return (error);
4033
4034	/*
4035	 * Throttle writes when the amount of dirty data in the cache
4036	 * gets too large.  We try to keep the cache less than half full
4037	 * of dirty blocks so that our sync times don't grow too large.
4038	 * Note: if two requests come in concurrently, we might let them
4039	 * both succeed, when one of them should fail.  Not a huge deal.
4040	 */
4041
4042	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
4043	    anon_size > arc_c / 4) {
4044		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
4045		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
4046		    arc_tempreserve>>10,
4047		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
4048		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
4049		    reserve>>10, arc_c>>10);
4050		return (SET_ERROR(ERESTART));
4051	}
4052	atomic_add_64(&arc_tempreserve, reserve);
4053	return (0);
4054}
4055
4056static kmutex_t arc_lowmem_lock;
4057#ifdef _KERNEL
4058static eventhandler_tag arc_event_lowmem = NULL;
4059
4060static void
4061arc_lowmem(void *arg __unused, int howto __unused)
4062{
4063
4064	/* Serialize access via arc_lowmem_lock. */
4065	mutex_enter(&arc_lowmem_lock);
4066	mutex_enter(&arc_reclaim_thr_lock);
4067	needfree = 1;
4068	DTRACE_PROBE(arc__needfree);
4069	cv_signal(&arc_reclaim_thr_cv);
4070
4071	/*
4072	 * It is unsafe to block here in arbitrary threads, because we can come
4073	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
4074	 * with ARC reclaim thread.
4075	 */
4076	if (curproc == pageproc) {
4077		while (needfree)
4078			msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
4079	}
4080	mutex_exit(&arc_reclaim_thr_lock);
4081	mutex_exit(&arc_lowmem_lock);
4082}
4083#endif
4084
4085void
4086arc_init(void)
4087{
4088	int i, prefetch_tunable_set = 0;
4089
4090	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4091	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
4092	mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
4093
4094	/* Convert seconds to clock ticks */
4095	arc_min_prefetch_lifespan = 1 * hz;
4096
4097	/* Start out with 1/8 of all memory */
4098	arc_c = kmem_size() / 8;
4099
4100#ifdef sun
4101#ifdef _KERNEL
4102	/*
4103	 * On architectures where the physical memory can be larger
4104	 * than the addressable space (intel in 32-bit mode), we may
4105	 * need to limit the cache to 1/8 of VM size.
4106	 */
4107	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
4108#endif
4109#endif	/* sun */
4110	/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
4111	arc_c_min = MAX(arc_c / 4, 64<<18);
4112	/* set max to 1/2 of all memory, or all but 1GB, whichever is more */
4113	if (arc_c * 8 >= 1<<30)
4114		arc_c_max = (arc_c * 8) - (1<<30);
4115	else
4116		arc_c_max = arc_c_min;
4117	arc_c_max = MAX(arc_c * 5, arc_c_max);
4118
4119#ifdef _KERNEL
4120	/*
4121	 * Allow the tunables to override our calculations if they are
4122	 * reasonable (ie. over 16MB)
4123	 */
4124	if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size())
4125		arc_c_max = zfs_arc_max;
4126	if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max)
4127		arc_c_min = zfs_arc_min;
4128#endif
4129
4130	arc_c = arc_c_max;
4131	arc_p = (arc_c >> 1);
4132
4133	/* limit meta-data to 1/4 of the arc capacity */
4134	arc_meta_limit = arc_c_max / 4;
4135
4136	/* Allow the tunable to override if it is reasonable */
4137	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4138		arc_meta_limit = zfs_arc_meta_limit;
4139
4140	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
4141		arc_c_min = arc_meta_limit / 2;
4142
4143	if (zfs_arc_grow_retry > 0)
4144		arc_grow_retry = zfs_arc_grow_retry;
4145
4146	if (zfs_arc_shrink_shift > 0)
4147		arc_shrink_shift = zfs_arc_shrink_shift;
4148
4149	if (zfs_arc_p_min_shift > 0)
4150		arc_p_min_shift = zfs_arc_p_min_shift;
4151
4152	/* if kmem_flags are set, lets try to use less memory */
4153	if (kmem_debugging())
4154		arc_c = arc_c / 2;
4155	if (arc_c < arc_c_min)
4156		arc_c = arc_c_min;
4157
4158	zfs_arc_min = arc_c_min;
4159	zfs_arc_max = arc_c_max;
4160
4161	arc_anon = &ARC_anon;
4162	arc_mru = &ARC_mru;
4163	arc_mru_ghost = &ARC_mru_ghost;
4164	arc_mfu = &ARC_mfu;
4165	arc_mfu_ghost = &ARC_mfu_ghost;
4166	arc_l2c_only = &ARC_l2c_only;
4167	arc_size = 0;
4168
4169	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4170		mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
4171		    NULL, MUTEX_DEFAULT, NULL);
4172		mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
4173		    NULL, MUTEX_DEFAULT, NULL);
4174		mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
4175		    NULL, MUTEX_DEFAULT, NULL);
4176		mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
4177		    NULL, MUTEX_DEFAULT, NULL);
4178		mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
4179		    NULL, MUTEX_DEFAULT, NULL);
4180		mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
4181		    NULL, MUTEX_DEFAULT, NULL);
4182
4183		list_create(&arc_mru->arcs_lists[i],
4184		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4185		list_create(&arc_mru_ghost->arcs_lists[i],
4186		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4187		list_create(&arc_mfu->arcs_lists[i],
4188		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4189		list_create(&arc_mfu_ghost->arcs_lists[i],
4190		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4191		list_create(&arc_mfu_ghost->arcs_lists[i],
4192		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4193		list_create(&arc_l2c_only->arcs_lists[i],
4194		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4195	}
4196
4197	buf_init();
4198
4199	arc_thread_exit = 0;
4200	arc_eviction_list = NULL;
4201	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4202	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4203
4204	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4205	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4206
4207	if (arc_ksp != NULL) {
4208		arc_ksp->ks_data = &arc_stats;
4209		kstat_install(arc_ksp);
4210	}
4211
4212	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4213	    TS_RUN, minclsyspri);
4214
4215#ifdef _KERNEL
4216	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
4217	    EVENTHANDLER_PRI_FIRST);
4218#endif
4219
4220	arc_dead = FALSE;
4221	arc_warm = B_FALSE;
4222
4223	/*
4224	 * Calculate maximum amount of dirty data per pool.
4225	 *
4226	 * If it has been set by /etc/system, take that.
4227	 * Otherwise, use a percentage of physical memory defined by
4228	 * zfs_dirty_data_max_percent (default 10%) with a cap at
4229	 * zfs_dirty_data_max_max (default 4GB).
4230	 */
4231	if (zfs_dirty_data_max == 0) {
4232		zfs_dirty_data_max = ptob(physmem) *
4233		    zfs_dirty_data_max_percent / 100;
4234		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
4235		    zfs_dirty_data_max_max);
4236	}
4237
4238#ifdef _KERNEL
4239	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
4240		prefetch_tunable_set = 1;
4241
4242#ifdef __i386__
4243	if (prefetch_tunable_set == 0) {
4244		printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
4245		    "-- to enable,\n");
4246		printf("            add \"vfs.zfs.prefetch_disable=0\" "
4247		    "to /boot/loader.conf.\n");
4248		zfs_prefetch_disable = 1;
4249	}
4250#else
4251	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
4252	    prefetch_tunable_set == 0) {
4253		printf("ZFS NOTICE: Prefetch is disabled by default if less "
4254		    "than 4GB of RAM is present;\n"
4255		    "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
4256		    "to /boot/loader.conf.\n");
4257		zfs_prefetch_disable = 1;
4258	}
4259#endif
4260	/* Warn about ZFS memory and address space requirements. */
4261	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
4262		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
4263		    "expect unstable behavior.\n");
4264	}
4265	if (kmem_size() < 512 * (1 << 20)) {
4266		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
4267		    "expect unstable behavior.\n");
4268		printf("             Consider tuning vm.kmem_size and "
4269		    "vm.kmem_size_max\n");
4270		printf("             in /boot/loader.conf.\n");
4271	}
4272#endif
4273}
4274
4275void
4276arc_fini(void)
4277{
4278	int i;
4279
4280	mutex_enter(&arc_reclaim_thr_lock);
4281	arc_thread_exit = 1;
4282	cv_signal(&arc_reclaim_thr_cv);
4283	while (arc_thread_exit != 0)
4284		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4285	mutex_exit(&arc_reclaim_thr_lock);
4286
4287	arc_flush(NULL);
4288
4289	arc_dead = TRUE;
4290
4291	if (arc_ksp != NULL) {
4292		kstat_delete(arc_ksp);
4293		arc_ksp = NULL;
4294	}
4295
4296	mutex_destroy(&arc_eviction_mtx);
4297	mutex_destroy(&arc_reclaim_thr_lock);
4298	cv_destroy(&arc_reclaim_thr_cv);
4299
4300	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4301		list_destroy(&arc_mru->arcs_lists[i]);
4302		list_destroy(&arc_mru_ghost->arcs_lists[i]);
4303		list_destroy(&arc_mfu->arcs_lists[i]);
4304		list_destroy(&arc_mfu_ghost->arcs_lists[i]);
4305		list_destroy(&arc_l2c_only->arcs_lists[i]);
4306
4307		mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
4308		mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
4309		mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
4310		mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
4311		mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
4312		mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
4313	}
4314
4315	buf_fini();
4316
4317	ASSERT(arc_loaned_bytes == 0);
4318
4319	mutex_destroy(&arc_lowmem_lock);
4320#ifdef _KERNEL
4321	if (arc_event_lowmem != NULL)
4322		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
4323#endif
4324}
4325
4326/*
4327 * Level 2 ARC
4328 *
4329 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
4330 * It uses dedicated storage devices to hold cached data, which are populated
4331 * using large infrequent writes.  The main role of this cache is to boost
4332 * the performance of random read workloads.  The intended L2ARC devices
4333 * include short-stroked disks, solid state disks, and other media with
4334 * substantially faster read latency than disk.
4335 *
4336 *                 +-----------------------+
4337 *                 |         ARC           |
4338 *                 +-----------------------+
4339 *                    |         ^     ^
4340 *                    |         |     |
4341 *      l2arc_feed_thread()    arc_read()
4342 *                    |         |     |
4343 *                    |  l2arc read   |
4344 *                    V         |     |
4345 *               +---------------+    |
4346 *               |     L2ARC     |    |
4347 *               +---------------+    |
4348 *                   |    ^           |
4349 *          l2arc_write() |           |
4350 *                   |    |           |
4351 *                   V    |           |
4352 *                 +-------+      +-------+
4353 *                 | vdev  |      | vdev  |
4354 *                 | cache |      | cache |
4355 *                 +-------+      +-------+
4356 *                 +=========+     .-----.
4357 *                 :  L2ARC  :    |-_____-|
4358 *                 : devices :    | Disks |
4359 *                 +=========+    `-_____-'
4360 *
4361 * Read requests are satisfied from the following sources, in order:
4362 *
4363 *	1) ARC
4364 *	2) vdev cache of L2ARC devices
4365 *	3) L2ARC devices
4366 *	4) vdev cache of disks
4367 *	5) disks
4368 *
4369 * Some L2ARC device types exhibit extremely slow write performance.
4370 * To accommodate for this there are some significant differences between
4371 * the L2ARC and traditional cache design:
4372 *
4373 * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
4374 * the ARC behave as usual, freeing buffers and placing headers on ghost
4375 * lists.  The ARC does not send buffers to the L2ARC during eviction as
4376 * this would add inflated write latencies for all ARC memory pressure.
4377 *
4378 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4379 * It does this by periodically scanning buffers from the eviction-end of
4380 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4381 * not already there. It scans until a headroom of buffers is satisfied,
4382 * which itself is a buffer for ARC eviction. If a compressible buffer is
4383 * found during scanning and selected for writing to an L2ARC device, we
4384 * temporarily boost scanning headroom during the next scan cycle to make
4385 * sure we adapt to compression effects (which might significantly reduce
4386 * the data volume we write to L2ARC). The thread that does this is
4387 * l2arc_feed_thread(), illustrated below; example sizes are included to
4388 * provide a better sense of ratio than this diagram:
4389 *
4390 *	       head -->                        tail
4391 *	        +---------------------+----------+
4392 *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
4393 *	        +---------------------+----------+   |   o L2ARC eligible
4394 *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
4395 *	        +---------------------+----------+   |
4396 *	             15.9 Gbytes      ^ 32 Mbytes    |
4397 *	                           headroom          |
4398 *	                                      l2arc_feed_thread()
4399 *	                                             |
4400 *	                 l2arc write hand <--[oooo]--'
4401 *	                         |           8 Mbyte
4402 *	                         |          write max
4403 *	                         V
4404 *		  +==============================+
4405 *	L2ARC dev |####|#|###|###|    |####| ... |
4406 *	          +==============================+
4407 *	                     32 Gbytes
4408 *
4409 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4410 * evicted, then the L2ARC has cached a buffer much sooner than it probably
4411 * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
4412 * safe to say that this is an uncommon case, since buffers at the end of
4413 * the ARC lists have moved there due to inactivity.
4414 *
4415 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4416 * then the L2ARC simply misses copying some buffers.  This serves as a
4417 * pressure valve to prevent heavy read workloads from both stalling the ARC
4418 * with waits and clogging the L2ARC with writes.  This also helps prevent
4419 * the potential for the L2ARC to churn if it attempts to cache content too
4420 * quickly, such as during backups of the entire pool.
4421 *
4422 * 5. After system boot and before the ARC has filled main memory, there are
4423 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4424 * lists can remain mostly static.  Instead of searching from tail of these
4425 * lists as pictured, the l2arc_feed_thread() will search from the list heads
4426 * for eligible buffers, greatly increasing its chance of finding them.
4427 *
4428 * The L2ARC device write speed is also boosted during this time so that
4429 * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
4430 * there are no L2ARC reads, and no fear of degrading read performance
4431 * through increased writes.
4432 *
4433 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4434 * the vdev queue can aggregate them into larger and fewer writes.  Each
4435 * device is written to in a rotor fashion, sweeping writes through
4436 * available space then repeating.
4437 *
4438 * 7. The L2ARC does not store dirty content.  It never needs to flush
4439 * write buffers back to disk based storage.
4440 *
4441 * 8. If an ARC buffer is written (and dirtied) which also exists in the
4442 * L2ARC, the now stale L2ARC buffer is immediately dropped.
4443 *
4444 * The performance of the L2ARC can be tweaked by a number of tunables, which
4445 * may be necessary for different workloads:
4446 *
4447 *	l2arc_write_max		max write bytes per interval
4448 *	l2arc_write_boost	extra write bytes during device warmup
4449 *	l2arc_noprefetch	skip caching prefetched buffers
4450 *	l2arc_headroom		number of max device writes to precache
4451 *	l2arc_headroom_boost	when we find compressed buffers during ARC
4452 *				scanning, we multiply headroom by this
4453 *				percentage factor for the next scan cycle,
4454 *				since more compressed buffers are likely to
4455 *				be present
4456 *	l2arc_feed_secs		seconds between L2ARC writing
4457 *
4458 * Tunables may be removed or added as future performance improvements are
4459 * integrated, and also may become zpool properties.
4460 *
4461 * There are three key functions that control how the L2ARC warms up:
4462 *
4463 *	l2arc_write_eligible()	check if a buffer is eligible to cache
4464 *	l2arc_write_size()	calculate how much to write
4465 *	l2arc_write_interval()	calculate sleep delay between writes
4466 *
4467 * These three functions determine what to write, how much, and how quickly
4468 * to send writes.
4469 */
4470
4471static boolean_t
4472l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4473{
4474	/*
4475	 * A buffer is *not* eligible for the L2ARC if it:
4476	 * 1. belongs to a different spa.
4477	 * 2. is already cached on the L2ARC.
4478	 * 3. has an I/O in progress (it may be an incomplete read).
4479	 * 4. is flagged not eligible (zfs property).
4480	 */
4481	if (ab->b_spa != spa_guid) {
4482		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
4483		return (B_FALSE);
4484	}
4485	if (ab->b_l2hdr != NULL) {
4486		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
4487		return (B_FALSE);
4488	}
4489	if (HDR_IO_IN_PROGRESS(ab)) {
4490		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
4491		return (B_FALSE);
4492	}
4493	if (!HDR_L2CACHE(ab)) {
4494		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
4495		return (B_FALSE);
4496	}
4497
4498	return (B_TRUE);
4499}
4500
4501static uint64_t
4502l2arc_write_size(void)
4503{
4504	uint64_t size;
4505
4506	/*
4507	 * Make sure our globals have meaningful values in case the user
4508	 * altered them.
4509	 */
4510	size = l2arc_write_max;
4511	if (size == 0) {
4512		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4513		    "be greater than zero, resetting it to the default (%d)",
4514		    L2ARC_WRITE_SIZE);
4515		size = l2arc_write_max = L2ARC_WRITE_SIZE;
4516	}
4517
4518	if (arc_warm == B_FALSE)
4519		size += l2arc_write_boost;
4520
4521	return (size);
4522
4523}
4524
4525static clock_t
4526l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4527{
4528	clock_t interval, next, now;
4529
4530	/*
4531	 * If the ARC lists are busy, increase our write rate; if the
4532	 * lists are stale, idle back.  This is achieved by checking
4533	 * how much we previously wrote - if it was more than half of
4534	 * what we wanted, schedule the next write much sooner.
4535	 */
4536	if (l2arc_feed_again && wrote > (wanted / 2))
4537		interval = (hz * l2arc_feed_min_ms) / 1000;
4538	else
4539		interval = hz * l2arc_feed_secs;
4540
4541	now = ddi_get_lbolt();
4542	next = MAX(now, MIN(now + interval, began + interval));
4543
4544	return (next);
4545}
4546
4547static void
4548l2arc_hdr_stat_add(void)
4549{
4550	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4551	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4552}
4553
4554static void
4555l2arc_hdr_stat_remove(void)
4556{
4557	ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4558	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4559}
4560
4561/*
4562 * Cycle through L2ARC devices.  This is how L2ARC load balances.
4563 * If a device is returned, this also returns holding the spa config lock.
4564 */
4565static l2arc_dev_t *
4566l2arc_dev_get_next(void)
4567{
4568	l2arc_dev_t *first, *next = NULL;
4569
4570	/*
4571	 * Lock out the removal of spas (spa_namespace_lock), then removal
4572	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4573	 * both locks will be dropped and a spa config lock held instead.
4574	 */
4575	mutex_enter(&spa_namespace_lock);
4576	mutex_enter(&l2arc_dev_mtx);
4577
4578	/* if there are no vdevs, there is nothing to do */
4579	if (l2arc_ndev == 0)
4580		goto out;
4581
4582	first = NULL;
4583	next = l2arc_dev_last;
4584	do {
4585		/* loop around the list looking for a non-faulted vdev */
4586		if (next == NULL) {
4587			next = list_head(l2arc_dev_list);
4588		} else {
4589			next = list_next(l2arc_dev_list, next);
4590			if (next == NULL)
4591				next = list_head(l2arc_dev_list);
4592		}
4593
4594		/* if we have come back to the start, bail out */
4595		if (first == NULL)
4596			first = next;
4597		else if (next == first)
4598			break;
4599
4600	} while (vdev_is_dead(next->l2ad_vdev));
4601
4602	/* if we were unable to find any usable vdevs, return NULL */
4603	if (vdev_is_dead(next->l2ad_vdev))
4604		next = NULL;
4605
4606	l2arc_dev_last = next;
4607
4608out:
4609	mutex_exit(&l2arc_dev_mtx);
4610
4611	/*
4612	 * Grab the config lock to prevent the 'next' device from being
4613	 * removed while we are writing to it.
4614	 */
4615	if (next != NULL)
4616		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4617	mutex_exit(&spa_namespace_lock);
4618
4619	return (next);
4620}
4621
4622/*
4623 * Free buffers that were tagged for destruction.
4624 */
4625static void
4626l2arc_do_free_on_write()
4627{
4628	list_t *buflist;
4629	l2arc_data_free_t *df, *df_prev;
4630
4631	mutex_enter(&l2arc_free_on_write_mtx);
4632	buflist = l2arc_free_on_write;
4633
4634	for (df = list_tail(buflist); df; df = df_prev) {
4635		df_prev = list_prev(buflist, df);
4636		ASSERT(df->l2df_data != NULL);
4637		ASSERT(df->l2df_func != NULL);
4638		df->l2df_func(df->l2df_data, df->l2df_size);
4639		list_remove(buflist, df);
4640		kmem_free(df, sizeof (l2arc_data_free_t));
4641	}
4642
4643	mutex_exit(&l2arc_free_on_write_mtx);
4644}
4645
4646/*
4647 * A write to a cache device has completed.  Update all headers to allow
4648 * reads from these buffers to begin.
4649 */
4650static void
4651l2arc_write_done(zio_t *zio)
4652{
4653	l2arc_write_callback_t *cb;
4654	l2arc_dev_t *dev;
4655	list_t *buflist;
4656	arc_buf_hdr_t *head, *ab, *ab_prev;
4657	l2arc_buf_hdr_t *abl2;
4658	kmutex_t *hash_lock;
4659	int64_t bytes_dropped = 0;
4660
4661	cb = zio->io_private;
4662	ASSERT(cb != NULL);
4663	dev = cb->l2wcb_dev;
4664	ASSERT(dev != NULL);
4665	head = cb->l2wcb_head;
4666	ASSERT(head != NULL);
4667	buflist = dev->l2ad_buflist;
4668	ASSERT(buflist != NULL);
4669	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4670	    l2arc_write_callback_t *, cb);
4671
4672	if (zio->io_error != 0)
4673		ARCSTAT_BUMP(arcstat_l2_writes_error);
4674
4675	mutex_enter(&l2arc_buflist_mtx);
4676
4677	/*
4678	 * All writes completed, or an error was hit.
4679	 */
4680	for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4681		ab_prev = list_prev(buflist, ab);
4682		abl2 = ab->b_l2hdr;
4683
4684		/*
4685		 * Release the temporary compressed buffer as soon as possible.
4686		 */
4687		if (abl2->b_compress != ZIO_COMPRESS_OFF)
4688			l2arc_release_cdata_buf(ab);
4689
4690		hash_lock = HDR_LOCK(ab);
4691		if (!mutex_tryenter(hash_lock)) {
4692			/*
4693			 * This buffer misses out.  It may be in a stage
4694			 * of eviction.  Its ARC_L2_WRITING flag will be
4695			 * left set, denying reads to this buffer.
4696			 */
4697			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4698			continue;
4699		}
4700
4701		if (zio->io_error != 0) {
4702			/*
4703			 * Error - drop L2ARC entry.
4704			 */
4705			list_remove(buflist, ab);
4706			ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4707			bytes_dropped += abl2->b_asize;
4708			ab->b_l2hdr = NULL;
4709			trim_map_free(abl2->b_dev->l2ad_vdev, abl2->b_daddr,
4710			    ab->b_size, 0);
4711			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4712			ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4713		}
4714
4715		/*
4716		 * Allow ARC to begin reads to this L2ARC entry.
4717		 */
4718		ab->b_flags &= ~ARC_L2_WRITING;
4719
4720		mutex_exit(hash_lock);
4721	}
4722
4723	atomic_inc_64(&l2arc_writes_done);
4724	list_remove(buflist, head);
4725	kmem_cache_free(hdr_cache, head);
4726	mutex_exit(&l2arc_buflist_mtx);
4727
4728	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
4729
4730	l2arc_do_free_on_write();
4731
4732	kmem_free(cb, sizeof (l2arc_write_callback_t));
4733}
4734
4735/*
4736 * A read to a cache device completed.  Validate buffer contents before
4737 * handing over to the regular ARC routines.
4738 */
4739static void
4740l2arc_read_done(zio_t *zio)
4741{
4742	l2arc_read_callback_t *cb;
4743	arc_buf_hdr_t *hdr;
4744	arc_buf_t *buf;
4745	kmutex_t *hash_lock;
4746	int equal;
4747
4748	ASSERT(zio->io_vd != NULL);
4749	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4750
4751	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4752
4753	cb = zio->io_private;
4754	ASSERT(cb != NULL);
4755	buf = cb->l2rcb_buf;
4756	ASSERT(buf != NULL);
4757
4758	hash_lock = HDR_LOCK(buf->b_hdr);
4759	mutex_enter(hash_lock);
4760	hdr = buf->b_hdr;
4761	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4762
4763	/*
4764	 * If the buffer was compressed, decompress it first.
4765	 */
4766	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4767		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4768	ASSERT(zio->io_data != NULL);
4769
4770	/*
4771	 * Check this survived the L2ARC journey.
4772	 */
4773	equal = arc_cksum_equal(buf);
4774	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4775		mutex_exit(hash_lock);
4776		zio->io_private = buf;
4777		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
4778		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
4779		arc_read_done(zio);
4780	} else {
4781		mutex_exit(hash_lock);
4782		/*
4783		 * Buffer didn't survive caching.  Increment stats and
4784		 * reissue to the original storage device.
4785		 */
4786		if (zio->io_error != 0) {
4787			ARCSTAT_BUMP(arcstat_l2_io_error);
4788		} else {
4789			zio->io_error = SET_ERROR(EIO);
4790		}
4791		if (!equal)
4792			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4793
4794		/*
4795		 * If there's no waiter, issue an async i/o to the primary
4796		 * storage now.  If there *is* a waiter, the caller must
4797		 * issue the i/o in a context where it's OK to block.
4798		 */
4799		if (zio->io_waiter == NULL) {
4800			zio_t *pio = zio_unique_parent(zio);
4801
4802			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4803
4804			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4805			    buf->b_data, zio->io_size, arc_read_done, buf,
4806			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4807		}
4808	}
4809
4810	kmem_free(cb, sizeof (l2arc_read_callback_t));
4811}
4812
4813/*
4814 * This is the list priority from which the L2ARC will search for pages to
4815 * cache.  This is used within loops (0..3) to cycle through lists in the
4816 * desired order.  This order can have a significant effect on cache
4817 * performance.
4818 *
4819 * Currently the metadata lists are hit first, MFU then MRU, followed by
4820 * the data lists.  This function returns a locked list, and also returns
4821 * the lock pointer.
4822 */
4823static list_t *
4824l2arc_list_locked(int list_num, kmutex_t **lock)
4825{
4826	list_t *list = NULL;
4827	int idx;
4828
4829	ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
4830
4831	if (list_num < ARC_BUFC_NUMMETADATALISTS) {
4832		idx = list_num;
4833		list = &arc_mfu->arcs_lists[idx];
4834		*lock = ARCS_LOCK(arc_mfu, idx);
4835	} else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
4836		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4837		list = &arc_mru->arcs_lists[idx];
4838		*lock = ARCS_LOCK(arc_mru, idx);
4839	} else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
4840		ARC_BUFC_NUMDATALISTS)) {
4841		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4842		list = &arc_mfu->arcs_lists[idx];
4843		*lock = ARCS_LOCK(arc_mfu, idx);
4844	} else {
4845		idx = list_num - ARC_BUFC_NUMLISTS;
4846		list = &arc_mru->arcs_lists[idx];
4847		*lock = ARCS_LOCK(arc_mru, idx);
4848	}
4849
4850	ASSERT(!(MUTEX_HELD(*lock)));
4851	mutex_enter(*lock);
4852	return (list);
4853}
4854
4855/*
4856 * Evict buffers from the device write hand to the distance specified in
4857 * bytes.  This distance may span populated buffers, it may span nothing.
4858 * This is clearing a region on the L2ARC device ready for writing.
4859 * If the 'all' boolean is set, every buffer is evicted.
4860 */
4861static void
4862l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4863{
4864	list_t *buflist;
4865	l2arc_buf_hdr_t *abl2;
4866	arc_buf_hdr_t *ab, *ab_prev;
4867	kmutex_t *hash_lock;
4868	uint64_t taddr;
4869	int64_t bytes_evicted = 0;
4870
4871	buflist = dev->l2ad_buflist;
4872
4873	if (buflist == NULL)
4874		return;
4875
4876	if (!all && dev->l2ad_first) {
4877		/*
4878		 * This is the first sweep through the device.  There is
4879		 * nothing to evict.
4880		 */
4881		return;
4882	}
4883
4884	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4885		/*
4886		 * When nearing the end of the device, evict to the end
4887		 * before the device write hand jumps to the start.
4888		 */
4889		taddr = dev->l2ad_end;
4890	} else {
4891		taddr = dev->l2ad_hand + distance;
4892	}
4893	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4894	    uint64_t, taddr, boolean_t, all);
4895
4896top:
4897	mutex_enter(&l2arc_buflist_mtx);
4898	for (ab = list_tail(buflist); ab; ab = ab_prev) {
4899		ab_prev = list_prev(buflist, ab);
4900
4901		hash_lock = HDR_LOCK(ab);
4902		if (!mutex_tryenter(hash_lock)) {
4903			/*
4904			 * Missed the hash lock.  Retry.
4905			 */
4906			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4907			mutex_exit(&l2arc_buflist_mtx);
4908			mutex_enter(hash_lock);
4909			mutex_exit(hash_lock);
4910			goto top;
4911		}
4912
4913		if (HDR_L2_WRITE_HEAD(ab)) {
4914			/*
4915			 * We hit a write head node.  Leave it for
4916			 * l2arc_write_done().
4917			 */
4918			list_remove(buflist, ab);
4919			mutex_exit(hash_lock);
4920			continue;
4921		}
4922
4923		if (!all && ab->b_l2hdr != NULL &&
4924		    (ab->b_l2hdr->b_daddr > taddr ||
4925		    ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4926			/*
4927			 * We've evicted to the target address,
4928			 * or the end of the device.
4929			 */
4930			mutex_exit(hash_lock);
4931			break;
4932		}
4933
4934		if (HDR_FREE_IN_PROGRESS(ab)) {
4935			/*
4936			 * Already on the path to destruction.
4937			 */
4938			mutex_exit(hash_lock);
4939			continue;
4940		}
4941
4942		if (ab->b_state == arc_l2c_only) {
4943			ASSERT(!HDR_L2_READING(ab));
4944			/*
4945			 * This doesn't exist in the ARC.  Destroy.
4946			 * arc_hdr_destroy() will call list_remove()
4947			 * and decrement arcstat_l2_size.
4948			 */
4949			arc_change_state(arc_anon, ab, hash_lock);
4950			arc_hdr_destroy(ab);
4951		} else {
4952			/*
4953			 * Invalidate issued or about to be issued
4954			 * reads, since we may be about to write
4955			 * over this location.
4956			 */
4957			if (HDR_L2_READING(ab)) {
4958				ARCSTAT_BUMP(arcstat_l2_evict_reading);
4959				ab->b_flags |= ARC_L2_EVICTED;
4960			}
4961
4962			/*
4963			 * Tell ARC this no longer exists in L2ARC.
4964			 */
4965			if (ab->b_l2hdr != NULL) {
4966				abl2 = ab->b_l2hdr;
4967				ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4968				bytes_evicted += abl2->b_asize;
4969				ab->b_l2hdr = NULL;
4970				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4971				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4972			}
4973			list_remove(buflist, ab);
4974
4975			/*
4976			 * This may have been leftover after a
4977			 * failed write.
4978			 */
4979			ab->b_flags &= ~ARC_L2_WRITING;
4980		}
4981		mutex_exit(hash_lock);
4982	}
4983	mutex_exit(&l2arc_buflist_mtx);
4984
4985	vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0);
4986	dev->l2ad_evict = taddr;
4987}
4988
4989/*
4990 * Find and write ARC buffers to the L2ARC device.
4991 *
4992 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4993 * for reading until they have completed writing.
4994 * The headroom_boost is an in-out parameter used to maintain headroom boost
4995 * state between calls to this function.
4996 *
4997 * Returns the number of bytes actually written (which may be smaller than
4998 * the delta by which the device hand has changed due to alignment).
4999 */
5000static uint64_t
5001l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
5002    boolean_t *headroom_boost)
5003{
5004	arc_buf_hdr_t *ab, *ab_prev, *head;
5005	list_t *list;
5006	uint64_t write_asize, write_psize, write_sz, headroom,
5007	    buf_compress_minsz;
5008	void *buf_data;
5009	kmutex_t *list_lock;
5010	boolean_t full;
5011	l2arc_write_callback_t *cb;
5012	zio_t *pio, *wzio;
5013	uint64_t guid = spa_load_guid(spa);
5014	const boolean_t do_headroom_boost = *headroom_boost;
5015	int try;
5016
5017	ASSERT(dev->l2ad_vdev != NULL);
5018
5019	/* Lower the flag now, we might want to raise it again later. */
5020	*headroom_boost = B_FALSE;
5021
5022	pio = NULL;
5023	write_sz = write_asize = write_psize = 0;
5024	full = B_FALSE;
5025	head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
5026	head->b_flags |= ARC_L2_WRITE_HEAD;
5027
5028	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
5029	/*
5030	 * We will want to try to compress buffers that are at least 2x the
5031	 * device sector size.
5032	 */
5033	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
5034
5035	/*
5036	 * Copy buffers for L2ARC writing.
5037	 */
5038	mutex_enter(&l2arc_buflist_mtx);
5039	for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
5040		uint64_t passed_sz = 0;
5041
5042		list = l2arc_list_locked(try, &list_lock);
5043		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
5044
5045		/*
5046		 * L2ARC fast warmup.
5047		 *
5048		 * Until the ARC is warm and starts to evict, read from the
5049		 * head of the ARC lists rather than the tail.
5050		 */
5051		if (arc_warm == B_FALSE)
5052			ab = list_head(list);
5053		else
5054			ab = list_tail(list);
5055		if (ab == NULL)
5056			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
5057
5058		headroom = target_sz * l2arc_headroom * 2 / ARC_BUFC_NUMLISTS;
5059		if (do_headroom_boost)
5060			headroom = (headroom * l2arc_headroom_boost) / 100;
5061
5062		for (; ab; ab = ab_prev) {
5063			l2arc_buf_hdr_t *l2hdr;
5064			kmutex_t *hash_lock;
5065			uint64_t buf_sz;
5066
5067			if (arc_warm == B_FALSE)
5068				ab_prev = list_next(list, ab);
5069			else
5070				ab_prev = list_prev(list, ab);
5071			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size);
5072
5073			hash_lock = HDR_LOCK(ab);
5074			if (!mutex_tryenter(hash_lock)) {
5075				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
5076				/*
5077				 * Skip this buffer rather than waiting.
5078				 */
5079				continue;
5080			}
5081
5082			passed_sz += ab->b_size;
5083			if (passed_sz > headroom) {
5084				/*
5085				 * Searched too far.
5086				 */
5087				mutex_exit(hash_lock);
5088				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
5089				break;
5090			}
5091
5092			if (!l2arc_write_eligible(guid, ab)) {
5093				mutex_exit(hash_lock);
5094				continue;
5095			}
5096
5097			if ((write_sz + ab->b_size) > target_sz) {
5098				full = B_TRUE;
5099				mutex_exit(hash_lock);
5100				ARCSTAT_BUMP(arcstat_l2_write_full);
5101				break;
5102			}
5103
5104			if (pio == NULL) {
5105				/*
5106				 * Insert a dummy header on the buflist so
5107				 * l2arc_write_done() can find where the
5108				 * write buffers begin without searching.
5109				 */
5110				list_insert_head(dev->l2ad_buflist, head);
5111
5112				cb = kmem_alloc(
5113				    sizeof (l2arc_write_callback_t), KM_SLEEP);
5114				cb->l2wcb_dev = dev;
5115				cb->l2wcb_head = head;
5116				pio = zio_root(spa, l2arc_write_done, cb,
5117				    ZIO_FLAG_CANFAIL);
5118				ARCSTAT_BUMP(arcstat_l2_write_pios);
5119			}
5120
5121			/*
5122			 * Create and add a new L2ARC header.
5123			 */
5124			l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
5125			l2hdr->b_dev = dev;
5126			ab->b_flags |= ARC_L2_WRITING;
5127
5128			/*
5129			 * Temporarily stash the data buffer in b_tmp_cdata.
5130			 * The subsequent write step will pick it up from
5131			 * there. This is because can't access ab->b_buf
5132			 * without holding the hash_lock, which we in turn
5133			 * can't access without holding the ARC list locks
5134			 * (which we want to avoid during compression/writing).
5135			 */
5136			l2hdr->b_compress = ZIO_COMPRESS_OFF;
5137			l2hdr->b_asize = ab->b_size;
5138			l2hdr->b_tmp_cdata = ab->b_buf->b_data;
5139
5140			buf_sz = ab->b_size;
5141			ab->b_l2hdr = l2hdr;
5142
5143			list_insert_head(dev->l2ad_buflist, ab);
5144
5145			/*
5146			 * Compute and store the buffer cksum before
5147			 * writing.  On debug the cksum is verified first.
5148			 */
5149			arc_cksum_verify(ab->b_buf);
5150			arc_cksum_compute(ab->b_buf, B_TRUE);
5151
5152			mutex_exit(hash_lock);
5153
5154			write_sz += buf_sz;
5155		}
5156
5157		mutex_exit(list_lock);
5158
5159		if (full == B_TRUE)
5160			break;
5161	}
5162
5163	/* No buffers selected for writing? */
5164	if (pio == NULL) {
5165		ASSERT0(write_sz);
5166		mutex_exit(&l2arc_buflist_mtx);
5167		kmem_cache_free(hdr_cache, head);
5168		return (0);
5169	}
5170
5171	/*
5172	 * Now start writing the buffers. We're starting at the write head
5173	 * and work backwards, retracing the course of the buffer selector
5174	 * loop above.
5175	 */
5176	for (ab = list_prev(dev->l2ad_buflist, head); ab;
5177	    ab = list_prev(dev->l2ad_buflist, ab)) {
5178		l2arc_buf_hdr_t *l2hdr;
5179		uint64_t buf_sz;
5180
5181		/*
5182		 * We shouldn't need to lock the buffer here, since we flagged
5183		 * it as ARC_L2_WRITING in the previous step, but we must take
5184		 * care to only access its L2 cache parameters. In particular,
5185		 * ab->b_buf may be invalid by now due to ARC eviction.
5186		 */
5187		l2hdr = ab->b_l2hdr;
5188		l2hdr->b_daddr = dev->l2ad_hand;
5189
5190		if ((ab->b_flags & ARC_L2COMPRESS) &&
5191		    l2hdr->b_asize >= buf_compress_minsz) {
5192			if (l2arc_compress_buf(l2hdr)) {
5193				/*
5194				 * If compression succeeded, enable headroom
5195				 * boost on the next scan cycle.
5196				 */
5197				*headroom_boost = B_TRUE;
5198			}
5199		}
5200
5201		/*
5202		 * Pick up the buffer data we had previously stashed away
5203		 * (and now potentially also compressed).
5204		 */
5205		buf_data = l2hdr->b_tmp_cdata;
5206		buf_sz = l2hdr->b_asize;
5207
5208		/* Compression may have squashed the buffer to zero length. */
5209		if (buf_sz != 0) {
5210			uint64_t buf_p_sz;
5211
5212			wzio = zio_write_phys(pio, dev->l2ad_vdev,
5213			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5214			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5215			    ZIO_FLAG_CANFAIL, B_FALSE);
5216
5217			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5218			    zio_t *, wzio);
5219			(void) zio_nowait(wzio);
5220
5221			write_asize += buf_sz;
5222			/*
5223			 * Keep the clock hand suitably device-aligned.
5224			 */
5225			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5226			write_psize += buf_p_sz;
5227			dev->l2ad_hand += buf_p_sz;
5228		}
5229	}
5230
5231	mutex_exit(&l2arc_buflist_mtx);
5232
5233	ASSERT3U(write_asize, <=, target_sz);
5234	ARCSTAT_BUMP(arcstat_l2_writes_sent);
5235	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5236	ARCSTAT_INCR(arcstat_l2_size, write_sz);
5237	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
5238	vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
5239
5240	/*
5241	 * Bump device hand to the device start if it is approaching the end.
5242	 * l2arc_evict() will already have evicted ahead for this case.
5243	 */
5244	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
5245		dev->l2ad_hand = dev->l2ad_start;
5246		dev->l2ad_evict = dev->l2ad_start;
5247		dev->l2ad_first = B_FALSE;
5248	}
5249
5250	dev->l2ad_writing = B_TRUE;
5251	(void) zio_wait(pio);
5252	dev->l2ad_writing = B_FALSE;
5253
5254	return (write_asize);
5255}
5256
5257/*
5258 * Compresses an L2ARC buffer.
5259 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
5260 * size in l2hdr->b_asize. This routine tries to compress the data and
5261 * depending on the compression result there are three possible outcomes:
5262 * *) The buffer was incompressible. The original l2hdr contents were left
5263 *    untouched and are ready for writing to an L2 device.
5264 * *) The buffer was all-zeros, so there is no need to write it to an L2
5265 *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
5266 *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
5267 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
5268 *    data buffer which holds the compressed data to be written, and b_asize
5269 *    tells us how much data there is. b_compress is set to the appropriate
5270 *    compression algorithm. Once writing is done, invoke
5271 *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
5272 *
5273 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
5274 * buffer was incompressible).
5275 */
5276static boolean_t
5277l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
5278{
5279	void *cdata;
5280	size_t csize, len, rounded;
5281
5282	ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
5283	ASSERT(l2hdr->b_tmp_cdata != NULL);
5284
5285	len = l2hdr->b_asize;
5286	cdata = zio_data_buf_alloc(len);
5287	csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
5288	    cdata, l2hdr->b_asize);
5289
5290	rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
5291	if (rounded > csize) {
5292		bzero((char *)cdata + csize, rounded - csize);
5293		csize = rounded;
5294	}
5295
5296	if (csize == 0) {
5297		/* zero block, indicate that there's nothing to write */
5298		zio_data_buf_free(cdata, len);
5299		l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
5300		l2hdr->b_asize = 0;
5301		l2hdr->b_tmp_cdata = NULL;
5302		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
5303		return (B_TRUE);
5304	} else if (csize > 0 && csize < len) {
5305		/*
5306		 * Compression succeeded, we'll keep the cdata around for
5307		 * writing and release it afterwards.
5308		 */
5309		l2hdr->b_compress = ZIO_COMPRESS_LZ4;
5310		l2hdr->b_asize = csize;
5311		l2hdr->b_tmp_cdata = cdata;
5312		ARCSTAT_BUMP(arcstat_l2_compress_successes);
5313		return (B_TRUE);
5314	} else {
5315		/*
5316		 * Compression failed, release the compressed buffer.
5317		 * l2hdr will be left unmodified.
5318		 */
5319		zio_data_buf_free(cdata, len);
5320		ARCSTAT_BUMP(arcstat_l2_compress_failures);
5321		return (B_FALSE);
5322	}
5323}
5324
5325/*
5326 * Decompresses a zio read back from an l2arc device. On success, the
5327 * underlying zio's io_data buffer is overwritten by the uncompressed
5328 * version. On decompression error (corrupt compressed stream), the
5329 * zio->io_error value is set to signal an I/O error.
5330 *
5331 * Please note that the compressed data stream is not checksummed, so
5332 * if the underlying device is experiencing data corruption, we may feed
5333 * corrupt data to the decompressor, so the decompressor needs to be
5334 * able to handle this situation (LZ4 does).
5335 */
5336static void
5337l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
5338{
5339	ASSERT(L2ARC_IS_VALID_COMPRESS(c));
5340
5341	if (zio->io_error != 0) {
5342		/*
5343		 * An io error has occured, just restore the original io
5344		 * size in preparation for a main pool read.
5345		 */
5346		zio->io_orig_size = zio->io_size = hdr->b_size;
5347		return;
5348	}
5349
5350	if (c == ZIO_COMPRESS_EMPTY) {
5351		/*
5352		 * An empty buffer results in a null zio, which means we
5353		 * need to fill its io_data after we're done restoring the
5354		 * buffer's contents.
5355		 */
5356		ASSERT(hdr->b_buf != NULL);
5357		bzero(hdr->b_buf->b_data, hdr->b_size);
5358		zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
5359	} else {
5360		ASSERT(zio->io_data != NULL);
5361		/*
5362		 * We copy the compressed data from the start of the arc buffer
5363		 * (the zio_read will have pulled in only what we need, the
5364		 * rest is garbage which we will overwrite at decompression)
5365		 * and then decompress back to the ARC data buffer. This way we
5366		 * can minimize copying by simply decompressing back over the
5367		 * original compressed data (rather than decompressing to an
5368		 * aux buffer and then copying back the uncompressed buffer,
5369		 * which is likely to be much larger).
5370		 */
5371		uint64_t csize;
5372		void *cdata;
5373
5374		csize = zio->io_size;
5375		cdata = zio_data_buf_alloc(csize);
5376		bcopy(zio->io_data, cdata, csize);
5377		if (zio_decompress_data(c, cdata, zio->io_data, csize,
5378		    hdr->b_size) != 0)
5379			zio->io_error = EIO;
5380		zio_data_buf_free(cdata, csize);
5381	}
5382
5383	/* Restore the expected uncompressed IO size. */
5384	zio->io_orig_size = zio->io_size = hdr->b_size;
5385}
5386
5387/*
5388 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5389 * This buffer serves as a temporary holder of compressed data while
5390 * the buffer entry is being written to an l2arc device. Once that is
5391 * done, we can dispose of it.
5392 */
5393static void
5394l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
5395{
5396	l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
5397
5398	if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
5399		/*
5400		 * If the data was compressed, then we've allocated a
5401		 * temporary buffer for it, so now we need to release it.
5402		 */
5403		ASSERT(l2hdr->b_tmp_cdata != NULL);
5404		zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
5405	}
5406	l2hdr->b_tmp_cdata = NULL;
5407}
5408
5409/*
5410 * This thread feeds the L2ARC at regular intervals.  This is the beating
5411 * heart of the L2ARC.
5412 */
5413static void
5414l2arc_feed_thread(void *dummy __unused)
5415{
5416	callb_cpr_t cpr;
5417	l2arc_dev_t *dev;
5418	spa_t *spa;
5419	uint64_t size, wrote;
5420	clock_t begin, next = ddi_get_lbolt();
5421	boolean_t headroom_boost = B_FALSE;
5422
5423	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5424
5425	mutex_enter(&l2arc_feed_thr_lock);
5426
5427	while (l2arc_thread_exit == 0) {
5428		CALLB_CPR_SAFE_BEGIN(&cpr);
5429		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
5430		    next - ddi_get_lbolt());
5431		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
5432		next = ddi_get_lbolt() + hz;
5433
5434		/*
5435		 * Quick check for L2ARC devices.
5436		 */
5437		mutex_enter(&l2arc_dev_mtx);
5438		if (l2arc_ndev == 0) {
5439			mutex_exit(&l2arc_dev_mtx);
5440			continue;
5441		}
5442		mutex_exit(&l2arc_dev_mtx);
5443		begin = ddi_get_lbolt();
5444
5445		/*
5446		 * This selects the next l2arc device to write to, and in
5447		 * doing so the next spa to feed from: dev->l2ad_spa.   This
5448		 * will return NULL if there are now no l2arc devices or if
5449		 * they are all faulted.
5450		 *
5451		 * If a device is returned, its spa's config lock is also
5452		 * held to prevent device removal.  l2arc_dev_get_next()
5453		 * will grab and release l2arc_dev_mtx.
5454		 */
5455		if ((dev = l2arc_dev_get_next()) == NULL)
5456			continue;
5457
5458		spa = dev->l2ad_spa;
5459		ASSERT(spa != NULL);
5460
5461		/*
5462		 * If the pool is read-only then force the feed thread to
5463		 * sleep a little longer.
5464		 */
5465		if (!spa_writeable(spa)) {
5466			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5467			spa_config_exit(spa, SCL_L2ARC, dev);
5468			continue;
5469		}
5470
5471		/*
5472		 * Avoid contributing to memory pressure.
5473		 */
5474		if (arc_reclaim_needed()) {
5475			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5476			spa_config_exit(spa, SCL_L2ARC, dev);
5477			continue;
5478		}
5479
5480		ARCSTAT_BUMP(arcstat_l2_feeds);
5481
5482		size = l2arc_write_size();
5483
5484		/*
5485		 * Evict L2ARC buffers that will be overwritten.
5486		 */
5487		l2arc_evict(dev, size, B_FALSE);
5488
5489		/*
5490		 * Write ARC buffers.
5491		 */
5492		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5493
5494		/*
5495		 * Calculate interval between writes.
5496		 */
5497		next = l2arc_write_interval(begin, size, wrote);
5498		spa_config_exit(spa, SCL_L2ARC, dev);
5499	}
5500
5501	l2arc_thread_exit = 0;
5502	cv_broadcast(&l2arc_feed_thr_cv);
5503	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
5504	thread_exit();
5505}
5506
5507boolean_t
5508l2arc_vdev_present(vdev_t *vd)
5509{
5510	l2arc_dev_t *dev;
5511
5512	mutex_enter(&l2arc_dev_mtx);
5513	for (dev = list_head(l2arc_dev_list); dev != NULL;
5514	    dev = list_next(l2arc_dev_list, dev)) {
5515		if (dev->l2ad_vdev == vd)
5516			break;
5517	}
5518	mutex_exit(&l2arc_dev_mtx);
5519
5520	return (dev != NULL);
5521}
5522
5523/*
5524 * Add a vdev for use by the L2ARC.  By this point the spa has already
5525 * validated the vdev and opened it.
5526 */
5527void
5528l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5529{
5530	l2arc_dev_t *adddev;
5531
5532	ASSERT(!l2arc_vdev_present(vd));
5533
5534	vdev_ashift_optimize(vd);
5535
5536	/*
5537	 * Create a new l2arc device entry.
5538	 */
5539	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5540	adddev->l2ad_spa = spa;
5541	adddev->l2ad_vdev = vd;
5542	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5543	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5544	adddev->l2ad_hand = adddev->l2ad_start;
5545	adddev->l2ad_evict = adddev->l2ad_start;
5546	adddev->l2ad_first = B_TRUE;
5547	adddev->l2ad_writing = B_FALSE;
5548
5549	/*
5550	 * This is a list of all ARC buffers that are still valid on the
5551	 * device.
5552	 */
5553	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5554	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5555	    offsetof(arc_buf_hdr_t, b_l2node));
5556
5557	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5558
5559	/*
5560	 * Add device to global list
5561	 */
5562	mutex_enter(&l2arc_dev_mtx);
5563	list_insert_head(l2arc_dev_list, adddev);
5564	atomic_inc_64(&l2arc_ndev);
5565	mutex_exit(&l2arc_dev_mtx);
5566}
5567
5568/*
5569 * Remove a vdev from the L2ARC.
5570 */
5571void
5572l2arc_remove_vdev(vdev_t *vd)
5573{
5574	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5575
5576	/*
5577	 * Find the device by vdev
5578	 */
5579	mutex_enter(&l2arc_dev_mtx);
5580	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5581		nextdev = list_next(l2arc_dev_list, dev);
5582		if (vd == dev->l2ad_vdev) {
5583			remdev = dev;
5584			break;
5585		}
5586	}
5587	ASSERT(remdev != NULL);
5588
5589	/*
5590	 * Remove device from global list
5591	 */
5592	list_remove(l2arc_dev_list, remdev);
5593	l2arc_dev_last = NULL;		/* may have been invalidated */
5594	atomic_dec_64(&l2arc_ndev);
5595	mutex_exit(&l2arc_dev_mtx);
5596
5597	/*
5598	 * Clear all buflists and ARC references.  L2ARC device flush.
5599	 */
5600	l2arc_evict(remdev, 0, B_TRUE);
5601	list_destroy(remdev->l2ad_buflist);
5602	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5603	kmem_free(remdev, sizeof (l2arc_dev_t));
5604}
5605
5606void
5607l2arc_init(void)
5608{
5609	l2arc_thread_exit = 0;
5610	l2arc_ndev = 0;
5611	l2arc_writes_sent = 0;
5612	l2arc_writes_done = 0;
5613
5614	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5615	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5616	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5617	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5618	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5619
5620	l2arc_dev_list = &L2ARC_dev_list;
5621	l2arc_free_on_write = &L2ARC_free_on_write;
5622	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5623	    offsetof(l2arc_dev_t, l2ad_node));
5624	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5625	    offsetof(l2arc_data_free_t, l2df_list_node));
5626}
5627
5628void
5629l2arc_fini(void)
5630{
5631	/*
5632	 * This is called from dmu_fini(), which is called from spa_fini();
5633	 * Because of this, we can assume that all l2arc devices have
5634	 * already been removed when the pools themselves were removed.
5635	 */
5636
5637	l2arc_do_free_on_write();
5638
5639	mutex_destroy(&l2arc_feed_thr_lock);
5640	cv_destroy(&l2arc_feed_thr_cv);
5641	mutex_destroy(&l2arc_dev_mtx);
5642	mutex_destroy(&l2arc_buflist_mtx);
5643	mutex_destroy(&l2arc_free_on_write_mtx);
5644
5645	list_destroy(l2arc_dev_list);
5646	list_destroy(l2arc_free_on_write);
5647}
5648
5649void
5650l2arc_start(void)
5651{
5652	if (!(spa_mode_global & FWRITE))
5653		return;
5654
5655	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5656	    TS_RUN, minclsyspri);
5657}
5658
5659void
5660l2arc_stop(void)
5661{
5662	if (!(spa_mode_global & FWRITE))
5663		return;
5664
5665	mutex_enter(&l2arc_feed_thr_lock);
5666	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
5667	l2arc_thread_exit = 1;
5668	while (l2arc_thread_exit != 0)
5669		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5670	mutex_exit(&l2arc_feed_thr_lock);
5671}
5672