arc.c revision 275748
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
24 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
25 * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
26 */
27
28/*
29 * DVA-based Adjustable Replacement Cache
30 *
31 * While much of the theory of operation used here is
32 * based on the self-tuning, low overhead replacement cache
33 * presented by Megiddo and Modha at FAST 2003, there are some
34 * significant differences:
35 *
36 * 1. The Megiddo and Modha model assumes any page is evictable.
37 * Pages in its cache cannot be "locked" into memory.  This makes
38 * the eviction algorithm simple: evict the last page in the list.
39 * This also makes the performance characteristics easy to reason
40 * about.  Our cache is not so simple.  At any given moment, some
41 * subset of the blocks in the cache are un-evictable because we
42 * have handed out a reference to them.  Blocks are only evictable
43 * when there are no external references active.  This makes
44 * eviction far more problematic:  we choose to evict the evictable
45 * blocks that are the "lowest" in the list.
46 *
47 * There are times when it is not possible to evict the requested
48 * space.  In these circumstances we are unable to adjust the cache
49 * size.  To prevent the cache growing unbounded at these times we
50 * implement a "cache throttle" that slows the flow of new data
51 * into the cache until we can make space available.
52 *
53 * 2. The Megiddo and Modha model assumes a fixed cache size.
54 * Pages are evicted when the cache is full and there is a cache
55 * miss.  Our model has a variable sized cache.  It grows with
56 * high use, but also tries to react to memory pressure from the
57 * operating system: decreasing its size when system memory is
58 * tight.
59 *
60 * 3. The Megiddo and Modha model assumes a fixed page size. All
61 * elements of the cache are therefore exactly the same size.  So
62 * when adjusting the cache size following a cache miss, it's simply
63 * a matter of choosing a single page to evict.  In our model, we
64 * have variable sized cache blocks (ranging from 512 bytes to
65 * 128K bytes).  We therefore choose a set of blocks to evict to make
66 * space for a cache miss that approximates as closely as possible
67 * the space used by the new block.
68 *
69 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
70 * by N. Megiddo & D. Modha, FAST 2003
71 */
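/*
 * Illustrative sketch (not part of the original source): point 3 above in
 * practice.  Because cached blocks vary in size, a miss is satisfied by
 * walking evictable blocks from the "lowest" end of a list and accumulating
 * freed space until the new block fits.  The structure and function below
 * are hypothetical and greatly simplified; the real logic lives in
 * arc_evict() further down in this file.
 */
#if 0
typedef struct example_blk {
	struct example_blk	*eb_next;	/* next-coldest block */
	uint64_t		eb_size;	/* 512 bytes .. 128K */
	boolean_t		eb_evictable;	/* no external references */
} example_blk_t;

static uint64_t
example_evict_bytes(example_blk_t *coldest, uint64_t needed)
{
	example_blk_t *eb;
	uint64_t freed = 0;

	/* Walk coldest-first, skipping referenced (un-evictable) blocks. */
	for (eb = coldest; eb != NULL && freed < needed; eb = eb->eb_next) {
		if (!eb->eb_evictable)
			continue;
		freed += eb->eb_size;	/* pretend this block was evicted */
	}
	return (freed);
}
#endif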
72
73/*
74 * The locking model:
75 *
76 * A new reference to a cache buffer can be obtained in two
77 * ways: 1) via a hash table lookup using the DVA as a key,
78 * or 2) via one of the ARC lists.  The arc_read() interface
79 * uses method 1, while the internal arc algorithms for
80 * adjusting the cache use method 2.  We therefore provide two
81 * types of locks: 1) the hash table lock array, and 2) the
82 * arc list locks.
83 *
84 * Buffers do not have their own mutexes; rather they rely on the
85 * hash table mutexes for the bulk of their protection (i.e. most
86 * fields in the arc_buf_hdr_t are protected by these mutexes).
87 *
88 * buf_hash_find() returns the appropriate mutex (held) when it
89 * locates the requested buffer in the hash table.  It returns
90 * NULL for the mutex if the buffer was not in the table.
91 *
92 * buf_hash_remove() expects the appropriate hash mutex to be
93 * already held before it is invoked.
94 *
95 * Each arc state also has a mutex which is used to protect the
96 * buffer list associated with the state.  When attempting to
97 * obtain a hash table lock while holding an arc list lock, you
98 * must use mutex_tryenter() to avoid deadlock.  Also note that
99 * the active state mutex must be held before the ghost state mutex.
100 *
101 * Arc buffers may have an associated eviction callback function.
102 * This function will be invoked prior to removing the buffer (e.g.
103 * in arc_do_user_evicts()).  Note however that the data associated
104 * with the buffer may be evicted prior to the callback.  The callback
105 * must be made with *no locks held* (to prevent deadlock).  Additionally,
106 * the users of callbacks must ensure that their private data is
107 * protected from simultaneous callbacks from arc_clear_callback()
108 * and arc_do_user_evicts().
109 *
110 * Note that the majority of the performance stats are manipulated
111 * with atomic operations.
112 *
113 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
114 *
115 *	- L2ARC buflist creation
116 *	- L2ARC buflist eviction
117 *	- L2ARC write completion, which walks L2ARC buflists
118 *	- ARC header destruction, as it removes from L2ARC buflists
119 *	- ARC header release, as it removes from L2ARC buflists
120 */
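/*
 * Illustrative sketch (not part of the original source): the lock-ordering
 * rule above in code.  While an arc list lock is held, a hash table lock is
 * only attempted with mutex_tryenter(); on failure the buffer is skipped
 * (the real eviction code also bumps arcstat_mutex_miss) rather than
 * blocking, which could deadlock against a thread holding the hash lock and
 * waiting for the list lock.  The helper name is hypothetical; HDR_LOCK()
 * is defined later in this file.
 */
#if 0
static void
example_try_buffer(kmutex_t *list_lock, arc_buf_hdr_t *ab)
{
	kmutex_t *hash_lock;

	ASSERT(MUTEX_HELD(list_lock));
	hash_lock = HDR_LOCK(ab);
	if (!mutex_tryenter(hash_lock))
		return;		/* contended: skip this buffer, never block */
	/* ... safe to examine or evict *ab here ... */
	mutex_exit(hash_lock);
}
#endif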
121
122#include <sys/spa.h>
123#include <sys/zio.h>
124#include <sys/zio_compress.h>
125#include <sys/zfs_context.h>
126#include <sys/arc.h>
127#include <sys/refcount.h>
128#include <sys/vdev.h>
129#include <sys/vdev_impl.h>
130#include <sys/dsl_pool.h>
131#ifdef _KERNEL
132#include <sys/dnlc.h>
133#endif
134#include <sys/callb.h>
135#include <sys/kstat.h>
136#include <sys/trim_map.h>
137#include <zfs_fletcher.h>
138#include <sys/sdt.h>
139
140#include <vm/vm_pageout.h>
141#include <machine/vmparam.h>
142
143#ifdef illumos
144#ifndef _KERNEL
145/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
146boolean_t arc_watch = B_FALSE;
147int arc_procfd;
148#endif
149#endif /* illumos */
150
151static kmutex_t		arc_reclaim_thr_lock;
152static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
153static uint8_t		arc_thread_exit;
154
155#define	ARC_REDUCE_DNLC_PERCENT	3
156uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
157
158typedef enum arc_reclaim_strategy {
159	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
160	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
161} arc_reclaim_strategy_t;
162
163/*
164 * The number of iterations through arc_evict_*() before we
165 * drop & reacquire the lock.
166 */
167int arc_evict_iterations = 100;
168
169/* number of seconds before growing cache again */
170static int		arc_grow_retry = 60;
171
172/* shift of arc_c for calculating both min and max arc_p */
173static int		arc_p_min_shift = 4;
174
175/* log2(fraction of arc to reclaim) */
176static int		arc_shrink_shift = 5;
177
178/*
179 * minimum lifespan of a prefetch block in clock ticks
180 * (initialized in arc_init())
181 */
182static int		arc_min_prefetch_lifespan;
183
184/*
185 * If this percent of memory is free, don't throttle.
186 */
187int arc_lotsfree_percent = 10;
188
189static int arc_dead;
190extern int zfs_prefetch_disable;
191
192/*
193 * The arc has filled available memory and has now warmed up.
194 */
195static boolean_t arc_warm;
196
197uint64_t zfs_arc_max;
198uint64_t zfs_arc_min;
199uint64_t zfs_arc_meta_limit = 0;
200int zfs_arc_grow_retry = 0;
201int zfs_arc_shrink_shift = 0;
202int zfs_arc_p_min_shift = 0;
203int zfs_disable_dup_eviction = 0;
204uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
205u_int zfs_arc_free_target = 0;
206
207static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
208static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
209
210#ifdef _KERNEL
211static void
212arc_free_target_init(void *unused __unused)
213{
214
215	zfs_arc_free_target = vm_pageout_wakeup_thresh;
216}
217SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
218    arc_free_target_init, NULL);
219
220TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
221TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
222SYSCTL_DECL(_vfs_zfs);
223SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
224    "Maximum ARC size");
225SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
226    "Minimum ARC size");
227SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
228    &zfs_arc_average_blocksize, 0,
229    "ARC average blocksize");
230SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
231    &arc_shrink_shift, 0,
232    "log2(fraction of arc to reclaim)");
233
234/*
235 * We don't have a tunable for arc_free_target due to the dependency on
236 * pagedaemon initialisation.
237 */
238SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
239    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
240    sysctl_vfs_zfs_arc_free_target, "IU",
241    "Desired number of free pages below which ARC triggers reclaim");
242
243static int
244sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
245{
246	u_int val;
247	int err;
248
249	val = zfs_arc_free_target;
250	err = sysctl_handle_int(oidp, &val, 0, req);
251	if (err != 0 || req->newptr == NULL)
252		return (err);
253
254	if (val < minfree)
255		return (EINVAL);
256	if (val > vm_cnt.v_page_count)
257		return (EINVAL);
258
259	zfs_arc_free_target = val;
260
261	return (0);
262}
263
264/*
265 * Must be declared here, before the definition of the corresponding
266 * kstat macro, which uses the same names and would confuse the compiler.
267 */
268SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
269    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
270    sysctl_vfs_zfs_arc_meta_limit, "QU",
271    "ARC metadata limit");
272#endif
273
274/*
275 * Note that buffers can be in one of 6 states:
276 *	ARC_anon	- anonymous (discussed below)
277 *	ARC_mru		- recently used, currently cached
278 *	ARC_mru_ghost	- recently used, no longer in cache
279 *	ARC_mfu		- frequently used, currently cached
280 *	ARC_mfu_ghost	- frequently used, no longer in cache
281 *	ARC_l2c_only	- exists in L2ARC but not other states
282 * When there are no active references to the buffer, they are
283 * linked onto a list in one of these arc states.  These are
284 * the only buffers that can be evicted or deleted.  Within each
285 * state there are multiple lists, one for meta-data and one for
286 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
287 * etc.) is tracked separately so that it can be managed more
288 * explicitly: favored over data, limited explicitly.
289 *
290 * Anonymous buffers are buffers that are not associated with
291 * a DVA.  These are buffers that hold dirty block copies
292 * before they are written to stable storage.  By definition,
293 * they are "ref'd" and are considered part of arc_mru
294 * that cannot be freed.  Generally, they will acquire a DVA
295 * as they are written and migrate onto the arc_mru list.
296 *
297 * The ARC_l2c_only state is for buffers that are in the second
298 * level ARC but no longer in any of the ARC_m* lists.  The second
299 * level ARC itself may also contain buffers that are in any of
300 * the ARC_m* states - meaning that a buffer can exist in two
301 * places.  The reason for the ARC_l2c_only state is to keep the
302 * buffer header in the hash table, so that reads that hit the
303 * second level ARC benefit from these fast lookups.
304 */
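/*
 * Illustrative sketch (not part of the original source): a typical walk
 * through the states above.  A buffer is read and lands in MRU; once its
 * data is evicted only the header remains, on the MRU ghost list; a later
 * hit on that ghost entry re-reads the data and promotes it to MFU.  The
 * calls below are a hypothetical condensed flow -- the real transitions are
 * driven by arc_access() and arc_change_state() later in this file.
 */
#if 0
	arc_change_state(arc_mru, hdr, hash_lock);	 /* first access */
	arc_change_state(arc_mru_ghost, hdr, hash_lock); /* data evicted */
	arc_change_state(arc_mfu, hdr, hash_lock);	 /* ghost hit, re-read */
#endif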
305
306#define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
307struct arcs_lock {
308	kmutex_t	arcs_lock;
309#ifdef _KERNEL
310	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
311#endif
312};
313
314/*
315 * must be a power of two for the mask use to work
316 *
317 */
318#define ARC_BUFC_NUMDATALISTS		16
319#define ARC_BUFC_NUMMETADATALISTS	16
320#define ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
321
322typedef struct arc_state {
323	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
324	uint64_t arcs_size;	/* total amount of data in this state */
325	list_t	arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
326	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
327} arc_state_t;
328
329#define ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))
330
331/* The 6 states: */
332static arc_state_t ARC_anon;
333static arc_state_t ARC_mru;
334static arc_state_t ARC_mru_ghost;
335static arc_state_t ARC_mfu;
336static arc_state_t ARC_mfu_ghost;
337static arc_state_t ARC_l2c_only;
338
339typedef struct arc_stats {
340	kstat_named_t arcstat_hits;
341	kstat_named_t arcstat_misses;
342	kstat_named_t arcstat_demand_data_hits;
343	kstat_named_t arcstat_demand_data_misses;
344	kstat_named_t arcstat_demand_metadata_hits;
345	kstat_named_t arcstat_demand_metadata_misses;
346	kstat_named_t arcstat_prefetch_data_hits;
347	kstat_named_t arcstat_prefetch_data_misses;
348	kstat_named_t arcstat_prefetch_metadata_hits;
349	kstat_named_t arcstat_prefetch_metadata_misses;
350	kstat_named_t arcstat_mru_hits;
351	kstat_named_t arcstat_mru_ghost_hits;
352	kstat_named_t arcstat_mfu_hits;
353	kstat_named_t arcstat_mfu_ghost_hits;
354	kstat_named_t arcstat_allocated;
355	kstat_named_t arcstat_deleted;
356	kstat_named_t arcstat_stolen;
357	kstat_named_t arcstat_recycle_miss;
358	/*
359	 * Number of buffers that could not be evicted because the hash lock
360	 * was held by another thread.  The lock may not necessarily be held
361	 * by something using the same buffer, since hash locks are shared
362	 * by multiple buffers.
363	 */
364	kstat_named_t arcstat_mutex_miss;
365	/*
366	 * Number of buffers skipped because they have I/O in progress, are
367	 * indirect prefetch buffers that have not lived long enough, or are
368	 * not from the spa we're trying to evict from.
369	 */
370	kstat_named_t arcstat_evict_skip;
371	kstat_named_t arcstat_evict_l2_cached;
372	kstat_named_t arcstat_evict_l2_eligible;
373	kstat_named_t arcstat_evict_l2_ineligible;
374	kstat_named_t arcstat_hash_elements;
375	kstat_named_t arcstat_hash_elements_max;
376	kstat_named_t arcstat_hash_collisions;
377	kstat_named_t arcstat_hash_chains;
378	kstat_named_t arcstat_hash_chain_max;
379	kstat_named_t arcstat_p;
380	kstat_named_t arcstat_c;
381	kstat_named_t arcstat_c_min;
382	kstat_named_t arcstat_c_max;
383	kstat_named_t arcstat_size;
384	kstat_named_t arcstat_hdr_size;
385	kstat_named_t arcstat_data_size;
386	kstat_named_t arcstat_other_size;
387	kstat_named_t arcstat_l2_hits;
388	kstat_named_t arcstat_l2_misses;
389	kstat_named_t arcstat_l2_feeds;
390	kstat_named_t arcstat_l2_rw_clash;
391	kstat_named_t arcstat_l2_read_bytes;
392	kstat_named_t arcstat_l2_write_bytes;
393	kstat_named_t arcstat_l2_writes_sent;
394	kstat_named_t arcstat_l2_writes_done;
395	kstat_named_t arcstat_l2_writes_error;
396	kstat_named_t arcstat_l2_writes_hdr_miss;
397	kstat_named_t arcstat_l2_evict_lock_retry;
398	kstat_named_t arcstat_l2_evict_reading;
399	kstat_named_t arcstat_l2_free_on_write;
400	kstat_named_t arcstat_l2_cdata_free_on_write;
401	kstat_named_t arcstat_l2_abort_lowmem;
402	kstat_named_t arcstat_l2_cksum_bad;
403	kstat_named_t arcstat_l2_io_error;
404	kstat_named_t arcstat_l2_size;
405	kstat_named_t arcstat_l2_asize;
406	kstat_named_t arcstat_l2_hdr_size;
407	kstat_named_t arcstat_l2_compress_successes;
408	kstat_named_t arcstat_l2_compress_zeros;
409	kstat_named_t arcstat_l2_compress_failures;
410	kstat_named_t arcstat_l2_write_trylock_fail;
411	kstat_named_t arcstat_l2_write_passed_headroom;
412	kstat_named_t arcstat_l2_write_spa_mismatch;
413	kstat_named_t arcstat_l2_write_in_l2;
414	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
415	kstat_named_t arcstat_l2_write_not_cacheable;
416	kstat_named_t arcstat_l2_write_full;
417	kstat_named_t arcstat_l2_write_buffer_iter;
418	kstat_named_t arcstat_l2_write_pios;
419	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
420	kstat_named_t arcstat_l2_write_buffer_list_iter;
421	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
422	kstat_named_t arcstat_memory_throttle_count;
423	kstat_named_t arcstat_duplicate_buffers;
424	kstat_named_t arcstat_duplicate_buffers_size;
425	kstat_named_t arcstat_duplicate_reads;
426	kstat_named_t arcstat_meta_used;
427	kstat_named_t arcstat_meta_limit;
428	kstat_named_t arcstat_meta_max;
429} arc_stats_t;
430
431static arc_stats_t arc_stats = {
432	{ "hits",			KSTAT_DATA_UINT64 },
433	{ "misses",			KSTAT_DATA_UINT64 },
434	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
435	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
436	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
437	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
438	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
439	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
440	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
441	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
442	{ "mru_hits",			KSTAT_DATA_UINT64 },
443	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
444	{ "mfu_hits",			KSTAT_DATA_UINT64 },
445	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
446	{ "allocated",			KSTAT_DATA_UINT64 },
447	{ "deleted",			KSTAT_DATA_UINT64 },
448	{ "stolen",			KSTAT_DATA_UINT64 },
449	{ "recycle_miss",		KSTAT_DATA_UINT64 },
450	{ "mutex_miss",			KSTAT_DATA_UINT64 },
451	{ "evict_skip",			KSTAT_DATA_UINT64 },
452	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
453	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
454	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
455	{ "hash_elements",		KSTAT_DATA_UINT64 },
456	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
457	{ "hash_collisions",		KSTAT_DATA_UINT64 },
458	{ "hash_chains",		KSTAT_DATA_UINT64 },
459	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
460	{ "p",				KSTAT_DATA_UINT64 },
461	{ "c",				KSTAT_DATA_UINT64 },
462	{ "c_min",			KSTAT_DATA_UINT64 },
463	{ "c_max",			KSTAT_DATA_UINT64 },
464	{ "size",			KSTAT_DATA_UINT64 },
465	{ "hdr_size",			KSTAT_DATA_UINT64 },
466	{ "data_size",			KSTAT_DATA_UINT64 },
467	{ "other_size",			KSTAT_DATA_UINT64 },
468	{ "l2_hits",			KSTAT_DATA_UINT64 },
469	{ "l2_misses",			KSTAT_DATA_UINT64 },
470	{ "l2_feeds",			KSTAT_DATA_UINT64 },
471	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
472	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
473	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
474	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
475	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
476	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
477	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
478	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
479	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
480	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
481	{ "l2_cdata_free_on_write",	KSTAT_DATA_UINT64 },
482	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
483	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
484	{ "l2_io_error",		KSTAT_DATA_UINT64 },
485	{ "l2_size",			KSTAT_DATA_UINT64 },
486	{ "l2_asize",			KSTAT_DATA_UINT64 },
487	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
488	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
489	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
490	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
491	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
492	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
493	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
494	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
495	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
496	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
497	{ "l2_write_full",		KSTAT_DATA_UINT64 },
498	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
499	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
500	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
501	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
502	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
503	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
504	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
505	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
506	{ "duplicate_reads",		KSTAT_DATA_UINT64 },
507	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
508	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
509	{ "arc_meta_max",		KSTAT_DATA_UINT64 }
510};
511
512#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
513
514#define	ARCSTAT_INCR(stat, val) \
515	atomic_add_64(&arc_stats.stat.value.ui64, (val))
516
517#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
518#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
519
520#define	ARCSTAT_MAX(stat, val) {					\
521	uint64_t m;							\
522	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
523	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
524		continue;						\
525}
526
527#define	ARCSTAT_MAXSTAT(stat) \
528	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
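/*
 * Illustrative sketch (not part of the original source): ARCSTAT_MAX
 * expanded as a helper for a plain uint64_t, to show the lock-free maximum
 * update.  The compare-and-swap loop retries only while the candidate is
 * still larger than the observed maximum and another thread raced in
 * between the load and the swap.
 */
#if 0
static void
example_atomic_max(volatile uint64_t *maxp, uint64_t val)
{
	uint64_t m;

	while (val > (m = *maxp) &&
	    m != atomic_cas_64(maxp, m, val))
		continue;	/* lost a race; re-check against the new max */
}
#endif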
529
530/*
531 * We define a macro to allow ARC hits/misses to be easily broken down by
532 * two separate conditions, giving a total of four different subtypes for
533 * each of hits and misses (so eight statistics total).
534 */
535#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
536	if (cond1) {							\
537		if (cond2) {						\
538			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
539		} else {						\
540			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
541		}							\
542	} else {							\
543		if (cond2) {						\
544			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
545		} else {						\
546			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
547		}							\
548	}
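/*
 * Illustrative sketch (not part of the original source): a use of
 * ARCSTAT_CONDSTAT, mirroring how arc_buf_add_ref() records a hit later in
 * this file.  Depending on whether the access was demand or prefetch, and
 * whether the buffer holds data or metadata, exactly one of
 * arcstat_demand_data_hits, arcstat_demand_metadata_hits,
 * arcstat_prefetch_data_hits or arcstat_prefetch_metadata_hits is bumped.
 */
#if 0
	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
	    data, metadata, hits);
#endif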
549
550kstat_t			*arc_ksp;
551static arc_state_t	*arc_anon;
552static arc_state_t	*arc_mru;
553static arc_state_t	*arc_mru_ghost;
554static arc_state_t	*arc_mfu;
555static arc_state_t	*arc_mfu_ghost;
556static arc_state_t	*arc_l2c_only;
557
558/*
559 * There are several ARC variables that are critical to export as kstats --
560 * but we don't want to have to grovel around in the kstat whenever we wish to
561 * manipulate them.  For these variables, we therefore define them to be in
562 * terms of the statistic variable.  This assures that we are not introducing
563 * the possibility of inconsistency by having shadow copies of the variables,
564 * while still allowing the code to be readable.
565 */
566#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
567#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
568#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
569#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
570#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
571#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
572#define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
573#define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */
574
575#define	L2ARC_IS_VALID_COMPRESS(_c_) \
576	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
577
578static int		arc_no_grow;	/* Don't try to grow cache size */
579static uint64_t		arc_tempreserve;
580static uint64_t		arc_loaned_bytes;
581
582typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
583
584typedef struct arc_callback arc_callback_t;
585
586struct arc_callback {
587	void			*acb_private;
588	arc_done_func_t		*acb_done;
589	arc_buf_t		*acb_buf;
590	zio_t			*acb_zio_dummy;
591	arc_callback_t		*acb_next;
592};
593
594typedef struct arc_write_callback arc_write_callback_t;
595
596struct arc_write_callback {
597	void		*awcb_private;
598	arc_done_func_t	*awcb_ready;
599	arc_done_func_t	*awcb_physdone;
600	arc_done_func_t	*awcb_done;
601	arc_buf_t	*awcb_buf;
602};
603
604struct arc_buf_hdr {
605	/* protected by hash lock */
606	dva_t			b_dva;
607	uint64_t		b_birth;
608	uint64_t		b_cksum0;
609
610	kmutex_t		b_freeze_lock;
611	zio_cksum_t		*b_freeze_cksum;
612	void			*b_thawed;
613
614	arc_buf_hdr_t		*b_hash_next;
615	arc_buf_t		*b_buf;
616	uint32_t		b_flags;
617	uint32_t		b_datacnt;
618
619	arc_callback_t		*b_acb;
620	kcondvar_t		b_cv;
621
622	/* immutable */
623	arc_buf_contents_t	b_type;
624	uint64_t		b_size;
625	uint64_t		b_spa;
626
627	/* protected by arc state mutex */
628	arc_state_t		*b_state;
629	list_node_t		b_arc_node;
630
631	/* updated atomically */
632	clock_t			b_arc_access;
633
634	/* self protecting */
635	refcount_t		b_refcnt;
636
637	l2arc_buf_hdr_t		*b_l2hdr;
638	list_node_t		b_l2node;
639};
640
641#ifdef _KERNEL
642static int
643sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
644{
645	uint64_t val;
646	int err;
647
648	val = arc_meta_limit;
649	err = sysctl_handle_64(oidp, &val, 0, req);
650	if (err != 0 || req->newptr == NULL)
651		return (err);
652
653	if (val <= 0 || val > arc_c_max)
654		return (EINVAL);
655
656	arc_meta_limit = val;
657	return (0);
658}
659#endif
660
661static arc_buf_t *arc_eviction_list;
662static kmutex_t arc_eviction_mtx;
663static arc_buf_hdr_t arc_eviction_hdr;
664static void arc_get_data_buf(arc_buf_t *buf);
665static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
666static int arc_evict_needed(arc_buf_contents_t type);
667static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
668#ifdef illumos
669static void arc_buf_watch(arc_buf_t *buf);
670#endif /* illumos */
671
672static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
673
674#define	GHOST_STATE(state)	\
675	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
676	(state) == arc_l2c_only)
677
678/*
679 * Private ARC flags.  These flags are private ARC only flags that will show up
680 * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
681 * be passed in as arc_flags in things like arc_read.  However, these flags
682 * should never be passed and should only be set by ARC code.  When adding new
683 * public flags, make sure not to smash the private ones.
684 */
685
686#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
687#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
688#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
689#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
690#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
691#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
692#define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
693#define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
694#define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
695#define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */
696
697#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
698#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
699#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
700#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_PREFETCH)
701#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
702#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
703#define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
704#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_L2CACHE)
705#define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
706				    (hdr)->b_l2hdr != NULL)
707#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
708#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
709#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)
710
711/*
712 * Other sizes
713 */
714
715#define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
716#define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
717
718/*
719 * Hash table routines
720 */
721
722#define	HT_LOCK_PAD	CACHE_LINE_SIZE
723
724struct ht_lock {
725	kmutex_t	ht_lock;
726#ifdef _KERNEL
727	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
728#endif
729};
730
731#define	BUF_LOCKS 256
732typedef struct buf_hash_table {
733	uint64_t ht_mask;
734	arc_buf_hdr_t **ht_table;
735	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
736} buf_hash_table_t;
737
738static buf_hash_table_t buf_hash_table;
739
740#define	BUF_HASH_INDEX(spa, dva, birth) \
741	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
742#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
743#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
744#define	HDR_LOCK(hdr) \
745	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
746
747uint64_t zfs_crc64_table[256];
748
749/*
750 * Level 2 ARC
751 */
752
753#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
754#define	L2ARC_HEADROOM		2			/* num of writes */
755/*
756 * If we discover during ARC scan any buffers to be compressed, we boost
757 * our headroom for the next scanning cycle by this percentage multiple.
758 */
759#define	L2ARC_HEADROOM_BOOST	200
760#define	L2ARC_FEED_SECS		1		/* caching interval secs */
761#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
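/*
 * Illustrative sketch (not part of the original source): the headroom boost
 * arithmetic.  A scan normally looks L2ARC_HEADROOM (2) device-write sizes
 * ahead of the write hand; when the previous cycle found compressible
 * buffers, the feed thread scales that headroom by L2ARC_HEADROOM_BOOST as
 * a percentage, i.e. 2 * 200 / 100 = 4 write sizes.  The two lines below
 * are a simplified rendering of that calculation, not the exact code.
 */
#if 0
	uint64_t headroom = l2arc_headroom * l2arc_write_max;

	headroom = (headroom * l2arc_headroom_boost) / 100;	/* boosted */
#endif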
762
763#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
764#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
765
766/* L2ARC Performance Tunables */
767uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
768uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
769uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
770uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
771uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
772uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
773boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
774boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
775boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
776
777SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
778    &l2arc_write_max, 0, "max write size");
779SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
780    &l2arc_write_boost, 0, "extra write during warmup");
781SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
782    &l2arc_headroom, 0, "number of dev writes");
783SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
784    &l2arc_feed_secs, 0, "interval seconds");
785SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
786    &l2arc_feed_min_ms, 0, "min interval milliseconds");
787
788SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
789    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
790SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
791    &l2arc_feed_again, 0, "turbo warmup");
792SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
793    &l2arc_norw, 0, "no reads during writes");
794
795SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
796    &ARC_anon.arcs_size, 0, "size of anonymous state");
797SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
798    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous metadata");
799SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
800    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous data");
801
802SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
803    &ARC_mru.arcs_size, 0, "size of mru state");
804SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
805    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
806SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
807    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
808
809SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
810    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
811SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
812    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
813    "size of metadata in mru ghost state");
814SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
815    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
816    "size of data in mru ghost state");
817
818SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
819    &ARC_mfu.arcs_size, 0, "size of mfu state");
820SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
821    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
822SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
823    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
824
825SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
826    &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
827SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
828    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
829    "size of metadata in mfu ghost state");
830SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
831    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
832    "size of data in mfu ghost state");
833
834SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
835    &ARC_l2c_only.arcs_size, 0, "size of l2c_only state");
836
837/*
838 * L2ARC Internals
839 */
840typedef struct l2arc_dev {
841	vdev_t			*l2ad_vdev;	/* vdev */
842	spa_t			*l2ad_spa;	/* spa */
843	uint64_t		l2ad_hand;	/* next write location */
844	uint64_t		l2ad_start;	/* first addr on device */
845	uint64_t		l2ad_end;	/* last addr on device */
846	uint64_t		l2ad_evict;	/* last addr eviction reached */
847	boolean_t		l2ad_first;	/* first sweep through */
848	boolean_t		l2ad_writing;	/* currently writing */
849	list_t			*l2ad_buflist;	/* buffer list */
850	list_node_t		l2ad_node;	/* device list node */
851} l2arc_dev_t;
852
853static list_t L2ARC_dev_list;			/* device list */
854static list_t *l2arc_dev_list;			/* device list pointer */
855static kmutex_t l2arc_dev_mtx;			/* device list mutex */
856static l2arc_dev_t *l2arc_dev_last;		/* last device used */
857static kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
858static list_t L2ARC_free_on_write;		/* free after write buf list */
859static list_t *l2arc_free_on_write;		/* free after write list ptr */
860static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
861static uint64_t l2arc_ndev;			/* number of devices */
862
863typedef struct l2arc_read_callback {
864	arc_buf_t		*l2rcb_buf;		/* read buffer */
865	spa_t			*l2rcb_spa;		/* spa */
866	blkptr_t		l2rcb_bp;		/* original blkptr */
867	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
868	int			l2rcb_flags;		/* original flags */
869	enum zio_compress	l2rcb_compress;		/* applied compress */
870} l2arc_read_callback_t;
871
872typedef struct l2arc_write_callback {
873	l2arc_dev_t	*l2wcb_dev;		/* device info */
874	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
875} l2arc_write_callback_t;
876
877struct l2arc_buf_hdr {
878	/* protected by arc_buf_hdr  mutex */
879	l2arc_dev_t		*b_dev;		/* L2ARC device */
880	uint64_t		b_daddr;	/* disk address, offset byte */
881	/* compression applied to buffer data */
882	enum zio_compress	b_compress;
883	/* real alloc'd buffer size depending on b_compress applied */
884	int			b_asize;
885	/* temporary buffer holder for in-flight compressed data */
886	void			*b_tmp_cdata;
887};
888
889typedef struct l2arc_data_free {
890	/* protected by l2arc_free_on_write_mtx */
891	void		*l2df_data;
892	size_t		l2df_size;
893	void		(*l2df_func)(void *, size_t);
894	list_node_t	l2df_list_node;
895} l2arc_data_free_t;
896
897static kmutex_t l2arc_feed_thr_lock;
898static kcondvar_t l2arc_feed_thr_cv;
899static uint8_t l2arc_thread_exit;
900
901static void l2arc_read_done(zio_t *zio);
902static void l2arc_hdr_stat_add(void);
903static void l2arc_hdr_stat_remove(void);
904
905static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
906static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
907    enum zio_compress c);
908static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
909
910static uint64_t
911buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
912{
913	uint8_t *vdva = (uint8_t *)dva;
914	uint64_t crc = -1ULL;
915	int i;
916
917	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
918
919	for (i = 0; i < sizeof (dva_t); i++)
920		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
921
922	crc ^= (spa>>8) ^ birth;
923
924	return (crc);
925}
926
927#define	BUF_EMPTY(buf)						\
928	((buf)->b_dva.dva_word[0] == 0 &&			\
929	(buf)->b_dva.dva_word[1] == 0 &&			\
930	(buf)->b_cksum0 == 0)
931
932#define	BUF_EQUAL(spa, dva, birth, buf)				\
933	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
934	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
935	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
936
937static void
938buf_discard_identity(arc_buf_hdr_t *hdr)
939{
940	hdr->b_dva.dva_word[0] = 0;
941	hdr->b_dva.dva_word[1] = 0;
942	hdr->b_birth = 0;
943	hdr->b_cksum0 = 0;
944}
945
946static arc_buf_hdr_t *
947buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
948{
949	const dva_t *dva = BP_IDENTITY(bp);
950	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
951	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
952	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
953	arc_buf_hdr_t *buf;
954
955	mutex_enter(hash_lock);
956	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
957	    buf = buf->b_hash_next) {
958		if (BUF_EQUAL(spa, dva, birth, buf)) {
959			*lockp = hash_lock;
960			return (buf);
961		}
962	}
963	mutex_exit(hash_lock);
964	*lockp = NULL;
965	return (NULL);
966}
967
968/*
969 * Insert an entry into the hash table.  If there is already an element
970 * equal to elem in the hash table, then the already existing element
971 * will be returned and the new element will not be inserted.
972 * Otherwise returns NULL.
973 */
974static arc_buf_hdr_t *
975buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
976{
977	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
978	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
979	arc_buf_hdr_t *fbuf;
980	uint32_t i;
981
982	ASSERT(!DVA_IS_EMPTY(&buf->b_dva));
983	ASSERT(buf->b_birth != 0);
984	ASSERT(!HDR_IN_HASH_TABLE(buf));
985	*lockp = hash_lock;
986	mutex_enter(hash_lock);
987	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
988	    fbuf = fbuf->b_hash_next, i++) {
989		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
990			return (fbuf);
991	}
992
993	buf->b_hash_next = buf_hash_table.ht_table[idx];
994	buf_hash_table.ht_table[idx] = buf;
995	buf->b_flags |= ARC_IN_HASH_TABLE;
996
997	/* collect some hash table performance data */
998	if (i > 0) {
999		ARCSTAT_BUMP(arcstat_hash_collisions);
1000		if (i == 1)
1001			ARCSTAT_BUMP(arcstat_hash_chains);
1002
1003		ARCSTAT_MAX(arcstat_hash_chain_max, i);
1004	}
1005
1006	ARCSTAT_BUMP(arcstat_hash_elements);
1007	ARCSTAT_MAXSTAT(arcstat_hash_elements);
1008
1009	return (NULL);
1010}
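/*
 * Illustrative sketch (not part of the original source): the calling
 * convention for buf_hash_insert().  A non-NULL return means an equal
 * header was already hashed (e.g. a concurrent read inserted it first) and
 * the caller should use that header instead of its own; a NULL return means
 * the insert succeeded.  Either way *lockp comes back as the held hash
 * lock.  'hdr' below stands in for a fully initialized header.
 */
#if 0
	arc_buf_hdr_t *exists;
	kmutex_t *hash_lock;

	exists = buf_hash_insert(hdr, &hash_lock);
	if (exists != NULL) {
		/* somebody beat us to it; keep 'exists', discard 'hdr' */
	}
	mutex_exit(hash_lock);
#endif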
1011
1012static void
1013buf_hash_remove(arc_buf_hdr_t *buf)
1014{
1015	arc_buf_hdr_t *fbuf, **bufp;
1016	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
1017
1018	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
1019	ASSERT(HDR_IN_HASH_TABLE(buf));
1020
1021	bufp = &buf_hash_table.ht_table[idx];
1022	while ((fbuf = *bufp) != buf) {
1023		ASSERT(fbuf != NULL);
1024		bufp = &fbuf->b_hash_next;
1025	}
1026	*bufp = buf->b_hash_next;
1027	buf->b_hash_next = NULL;
1028	buf->b_flags &= ~ARC_IN_HASH_TABLE;
1029
1030	/* collect some hash table performance data */
1031	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1032
1033	if (buf_hash_table.ht_table[idx] &&
1034	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1035		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1036}
1037
1038/*
1039 * Global data structures and functions for the buf kmem cache.
1040 */
1041static kmem_cache_t *hdr_cache;
1042static kmem_cache_t *buf_cache;
1043
1044static void
1045buf_fini(void)
1046{
1047	int i;
1048
1049	kmem_free(buf_hash_table.ht_table,
1050	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
1051	for (i = 0; i < BUF_LOCKS; i++)
1052		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1053	kmem_cache_destroy(hdr_cache);
1054	kmem_cache_destroy(buf_cache);
1055}
1056
1057/*
1058 * Constructor callback - called when the cache is empty
1059 * and a new buf is requested.
1060 */
1061/* ARGSUSED */
1062static int
1063hdr_cons(void *vbuf, void *unused, int kmflag)
1064{
1065	arc_buf_hdr_t *buf = vbuf;
1066
1067	bzero(buf, sizeof (arc_buf_hdr_t));
1068	refcount_create(&buf->b_refcnt);
1069	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
1070	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1071	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1072
1073	return (0);
1074}
1075
1076/* ARGSUSED */
1077static int
1078buf_cons(void *vbuf, void *unused, int kmflag)
1079{
1080	arc_buf_t *buf = vbuf;
1081
1082	bzero(buf, sizeof (arc_buf_t));
1083	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1084	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1085
1086	return (0);
1087}
1088
1089/*
1090 * Destructor callback - called when a cached buf is
1091 * no longer required.
1092 */
1093/* ARGSUSED */
1094static void
1095hdr_dest(void *vbuf, void *unused)
1096{
1097	arc_buf_hdr_t *buf = vbuf;
1098
1099	ASSERT(BUF_EMPTY(buf));
1100	refcount_destroy(&buf->b_refcnt);
1101	cv_destroy(&buf->b_cv);
1102	mutex_destroy(&buf->b_freeze_lock);
1103	arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1104}
1105
1106/* ARGSUSED */
1107static void
1108buf_dest(void *vbuf, void *unused)
1109{
1110	arc_buf_t *buf = vbuf;
1111
1112	mutex_destroy(&buf->b_evict_lock);
1113	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1114}
1115
1116/*
1117 * Reclaim callback -- invoked when memory is low.
1118 */
1119/* ARGSUSED */
1120static void
1121hdr_recl(void *unused)
1122{
1123	dprintf("hdr_recl called\n");
1124	/*
1125	 * umem calls the reclaim func when we destroy the buf cache,
1126	 * which is after we do arc_fini().
1127	 */
1128	if (!arc_dead)
1129		cv_signal(&arc_reclaim_thr_cv);
1130}
1131
1132static void
1133buf_init(void)
1134{
1135	uint64_t *ct;
1136	uint64_t hsize = 1ULL << 12;
1137	int i, j;
1138
1139	/*
1140	 * The hash table is big enough to fill all of physical memory
1141	 * with an average block size of zfs_arc_average_blocksize (default 8K).
1142	 * By default, the table will take up
1143	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1144	 */
1145	while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
1146		hsize <<= 1;
1147retry:
1148	buf_hash_table.ht_mask = hsize - 1;
1149	buf_hash_table.ht_table =
1150	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1151	if (buf_hash_table.ht_table == NULL) {
1152		ASSERT(hsize > (1ULL << 8));
1153		hsize >>= 1;
1154		goto retry;
1155	}
1156
1157	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
1158	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
1159	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1160	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1161
1162	for (i = 0; i < 256; i++)
1163		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1164			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1165
1166	for (i = 0; i < BUF_LOCKS; i++) {
1167		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1168		    NULL, MUTEX_DEFAULT, NULL);
1169	}
1170}
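/*
 * Illustrative sketch (not part of the original source): the table-sizing
 * math from the comment in buf_init(), worked for a hypothetical 16 GB
 * machine with the default 8K zfs_arc_average_blocksize.  hsize doubles
 * from 2^12 until 2^21 * 8K >= 16 GB, so the table has 2^21 buckets and
 * costs 2^21 * sizeof (void *) = 16 MB with 8-byte pointers -- the "1MB per
 * GB" mentioned in that comment.
 */
#if 0
	uint64_t physbytes = 16ULL << 30;	/* 16 GB, hypothetical */
	uint64_t hsize = 1ULL << 12;

	while (hsize * zfs_arc_average_blocksize < physbytes)
		hsize <<= 1;
	/* hsize == 1ULL << 21; table size == hsize * 8 == 16 MB */
#endif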
1171
1172#define	ARC_MINTIME	(hz>>4) /* 62 ms */
1173
1174static void
1175arc_cksum_verify(arc_buf_t *buf)
1176{
1177	zio_cksum_t zc;
1178
1179	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1180		return;
1181
1182	mutex_enter(&buf->b_hdr->b_freeze_lock);
1183	if (buf->b_hdr->b_freeze_cksum == NULL ||
1184	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
1185		mutex_exit(&buf->b_hdr->b_freeze_lock);
1186		return;
1187	}
1188	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1189	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1190		panic("buffer modified while frozen!");
1191	mutex_exit(&buf->b_hdr->b_freeze_lock);
1192}
1193
1194static int
1195arc_cksum_equal(arc_buf_t *buf)
1196{
1197	zio_cksum_t zc;
1198	int equal;
1199
1200	mutex_enter(&buf->b_hdr->b_freeze_lock);
1201	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1202	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1203	mutex_exit(&buf->b_hdr->b_freeze_lock);
1204
1205	return (equal);
1206}
1207
1208static void
1209arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1210{
1211	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1212		return;
1213
1214	mutex_enter(&buf->b_hdr->b_freeze_lock);
1215	if (buf->b_hdr->b_freeze_cksum != NULL) {
1216		mutex_exit(&buf->b_hdr->b_freeze_lock);
1217		return;
1218	}
1219	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1220	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1221	    buf->b_hdr->b_freeze_cksum);
1222	mutex_exit(&buf->b_hdr->b_freeze_lock);
1223#ifdef illumos
1224	arc_buf_watch(buf);
1225#endif /* illumos */
1226}
1227
1228#ifdef illumos
1229#ifndef _KERNEL
1230typedef struct procctl {
1231	long cmd;
1232	prwatch_t prwatch;
1233} procctl_t;
1234#endif
1235
1236/* ARGSUSED */
1237static void
1238arc_buf_unwatch(arc_buf_t *buf)
1239{
1240#ifndef _KERNEL
1241	if (arc_watch) {
1242		int result;
1243		procctl_t ctl;
1244		ctl.cmd = PCWATCH;
1245		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1246		ctl.prwatch.pr_size = 0;
1247		ctl.prwatch.pr_wflags = 0;
1248		result = write(arc_procfd, &ctl, sizeof (ctl));
1249		ASSERT3U(result, ==, sizeof (ctl));
1250	}
1251#endif
1252}
1253
1254/* ARGSUSED */
1255static void
1256arc_buf_watch(arc_buf_t *buf)
1257{
1258#ifndef _KERNEL
1259	if (arc_watch) {
1260		int result;
1261		procctl_t ctl;
1262		ctl.cmd = PCWATCH;
1263		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1264		ctl.prwatch.pr_size = buf->b_hdr->b_size;
1265		ctl.prwatch.pr_wflags = WA_WRITE;
1266		result = write(arc_procfd, &ctl, sizeof (ctl));
1267		ASSERT3U(result, ==, sizeof (ctl));
1268	}
1269#endif
1270}
1271#endif /* illumos */
1272
1273void
1274arc_buf_thaw(arc_buf_t *buf)
1275{
1276	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1277		if (buf->b_hdr->b_state != arc_anon)
1278			panic("modifying non-anon buffer!");
1279		if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1280			panic("modifying buffer while i/o in progress!");
1281		arc_cksum_verify(buf);
1282	}
1283
1284	mutex_enter(&buf->b_hdr->b_freeze_lock);
1285	if (buf->b_hdr->b_freeze_cksum != NULL) {
1286		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1287		buf->b_hdr->b_freeze_cksum = NULL;
1288	}
1289
1290	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1291		if (buf->b_hdr->b_thawed)
1292			kmem_free(buf->b_hdr->b_thawed, 1);
1293		buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1294	}
1295
1296	mutex_exit(&buf->b_hdr->b_freeze_lock);
1297
1298#ifdef illumos
1299	arc_buf_unwatch(buf);
1300#endif /* illumos */
1301}
1302
1303void
1304arc_buf_freeze(arc_buf_t *buf)
1305{
1306	kmutex_t *hash_lock;
1307
1308	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1309		return;
1310
1311	hash_lock = HDR_LOCK(buf->b_hdr);
1312	mutex_enter(hash_lock);
1313
1314	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1315	    buf->b_hdr->b_state == arc_anon);
1316	arc_cksum_compute(buf, B_FALSE);
1317	mutex_exit(hash_lock);
1318
1319}
1320
1321static void
1322get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock)
1323{
1324	uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth);
1325
1326	if (ab->b_type == ARC_BUFC_METADATA)
1327		buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
1328	else {
1329		buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
1330		buf_hashid += ARC_BUFC_NUMMETADATALISTS;
1331	}
1332
1333	*list = &state->arcs_lists[buf_hashid];
1334	*lock = ARCS_LOCK(state, buf_hashid);
1335}
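/*
 * Illustrative sketch (not part of the original source): what get_buf_info()
 * buys.  Hashing the buffer identity and masking with the (power of two)
 * list count spreads headers across 16 metadata and 16 data sublists per
 * state, so callers contend only on the ARCS_LOCK() of their own sublist
 * rather than on a single per-state lock.  This mirrors the pattern used by
 * add_reference() just below.
 */
#if 0
	list_t *list;
	kmutex_t *lock;

	get_buf_info(ab, ab->b_state, &list, &lock);
	mutex_enter(lock);		/* locks only this sublist */
	list_remove(list, ab);
	mutex_exit(lock);
#endif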
1336
1337
1338static void
1339add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1340{
1341	ASSERT(MUTEX_HELD(hash_lock));
1342
1343	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1344	    (ab->b_state != arc_anon)) {
1345		uint64_t delta = ab->b_size * ab->b_datacnt;
1346		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1347		list_t *list;
1348		kmutex_t *lock;
1349
1350		get_buf_info(ab, ab->b_state, &list, &lock);
1351		ASSERT(!MUTEX_HELD(lock));
1352		mutex_enter(lock);
1353		ASSERT(list_link_active(&ab->b_arc_node));
1354		list_remove(list, ab);
1355		if (GHOST_STATE(ab->b_state)) {
1356			ASSERT0(ab->b_datacnt);
1357			ASSERT3P(ab->b_buf, ==, NULL);
1358			delta = ab->b_size;
1359		}
1360		ASSERT(delta > 0);
1361		ASSERT3U(*size, >=, delta);
1362		atomic_add_64(size, -delta);
1363		mutex_exit(lock);
1364		/* remove the prefetch flag if we get a reference */
1365		if (ab->b_flags & ARC_PREFETCH)
1366			ab->b_flags &= ~ARC_PREFETCH;
1367	}
1368}
1369
1370static int
1371remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1372{
1373	int cnt;
1374	arc_state_t *state = ab->b_state;
1375
1376	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1377	ASSERT(!GHOST_STATE(state));
1378
1379	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1380	    (state != arc_anon)) {
1381		uint64_t *size = &state->arcs_lsize[ab->b_type];
1382		list_t *list;
1383		kmutex_t *lock;
1384
1385		get_buf_info(ab, state, &list, &lock);
1386		ASSERT(!MUTEX_HELD(lock));
1387		mutex_enter(lock);
1388		ASSERT(!list_link_active(&ab->b_arc_node));
1389		list_insert_head(list, ab);
1390		ASSERT(ab->b_datacnt > 0);
1391		atomic_add_64(size, ab->b_size * ab->b_datacnt);
1392		mutex_exit(lock);
1393	}
1394	return (cnt);
1395}
1396
1397/*
1398 * Move the supplied buffer to the indicated state.  The mutex
1399 * for the buffer must be held by the caller.
1400 */
1401static void
1402arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1403{
1404	arc_state_t *old_state = ab->b_state;
1405	int64_t refcnt = refcount_count(&ab->b_refcnt);
1406	uint64_t from_delta, to_delta;
1407	list_t *list;
1408	kmutex_t *lock;
1409
1410	ASSERT(MUTEX_HELD(hash_lock));
1411	ASSERT3P(new_state, !=, old_state);
1412	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1413	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1414	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1415
1416	from_delta = to_delta = ab->b_datacnt * ab->b_size;
1417
1418	/*
1419	 * If this buffer is evictable, transfer it from the
1420	 * old state list to the new state list.
1421	 */
1422	if (refcnt == 0) {
1423		if (old_state != arc_anon) {
1424			int use_mutex;
1425			uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1426
1427			get_buf_info(ab, old_state, &list, &lock);
1428			use_mutex = !MUTEX_HELD(lock);
1429			if (use_mutex)
1430				mutex_enter(lock);
1431
1432			ASSERT(list_link_active(&ab->b_arc_node));
1433			list_remove(list, ab);
1434
1435			/*
1436			 * If prefetching out of the ghost cache,
1437			 * we will have a non-zero datacnt.
1438			 */
1439			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1440				/* ghost elements have a ghost size */
1441				ASSERT(ab->b_buf == NULL);
1442				from_delta = ab->b_size;
1443			}
1444			ASSERT3U(*size, >=, from_delta);
1445			atomic_add_64(size, -from_delta);
1446
1447			if (use_mutex)
1448				mutex_exit(lock);
1449		}
1450		if (new_state != arc_anon) {
1451			int use_mutex;
1452			uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1453
1454			get_buf_info(ab, new_state, &list, &lock);
1455			use_mutex = !MUTEX_HELD(lock);
1456			if (use_mutex)
1457				mutex_enter(lock);
1458
1459			list_insert_head(list, ab);
1460
1461			/* ghost elements have a ghost size */
1462			if (GHOST_STATE(new_state)) {
1463				ASSERT(ab->b_datacnt == 0);
1464				ASSERT(ab->b_buf == NULL);
1465				to_delta = ab->b_size;
1466			}
1467			atomic_add_64(size, to_delta);
1468
1469			if (use_mutex)
1470				mutex_exit(lock);
1471		}
1472	}
1473
1474	ASSERT(!BUF_EMPTY(ab));
1475	if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1476		buf_hash_remove(ab);
1477
1478	/* adjust state sizes */
1479	if (to_delta)
1480		atomic_add_64(&new_state->arcs_size, to_delta);
1481	if (from_delta) {
1482		ASSERT3U(old_state->arcs_size, >=, from_delta);
1483		atomic_add_64(&old_state->arcs_size, -from_delta);
1484	}
1485	ab->b_state = new_state;
1486
1487	/* adjust l2arc hdr stats */
1488	if (new_state == arc_l2c_only)
1489		l2arc_hdr_stat_add();
1490	else if (old_state == arc_l2c_only)
1491		l2arc_hdr_stat_remove();
1492}
1493
1494void
1495arc_space_consume(uint64_t space, arc_space_type_t type)
1496{
1497	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1498
1499	switch (type) {
1500	case ARC_SPACE_DATA:
1501		ARCSTAT_INCR(arcstat_data_size, space);
1502		break;
1503	case ARC_SPACE_OTHER:
1504		ARCSTAT_INCR(arcstat_other_size, space);
1505		break;
1506	case ARC_SPACE_HDRS:
1507		ARCSTAT_INCR(arcstat_hdr_size, space);
1508		break;
1509	case ARC_SPACE_L2HDRS:
1510		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1511		break;
1512	}
1513
1514	ARCSTAT_INCR(arcstat_meta_used, space);
1515	atomic_add_64(&arc_size, space);
1516}
1517
1518void
1519arc_space_return(uint64_t space, arc_space_type_t type)
1520{
1521	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1522
1523	switch (type) {
1524	case ARC_SPACE_DATA:
1525		ARCSTAT_INCR(arcstat_data_size, -space);
1526		break;
1527	case ARC_SPACE_OTHER:
1528		ARCSTAT_INCR(arcstat_other_size, -space);
1529		break;
1530	case ARC_SPACE_HDRS:
1531		ARCSTAT_INCR(arcstat_hdr_size, -space);
1532		break;
1533	case ARC_SPACE_L2HDRS:
1534		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1535		break;
1536	}
1537
1538	ASSERT(arc_meta_used >= space);
1539	if (arc_meta_max < arc_meta_used)
1540		arc_meta_max = arc_meta_used;
1541	ARCSTAT_INCR(arcstat_meta_used, -space);
1542	ASSERT(arc_size >= space);
1543	atomic_add_64(&arc_size, -space);
1544}
1545
1546arc_buf_t *
1547arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1548{
1549	arc_buf_hdr_t *hdr;
1550	arc_buf_t *buf;
1551
1552	ASSERT3U(size, >, 0);
1553	hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1554	ASSERT(BUF_EMPTY(hdr));
1555	hdr->b_size = size;
1556	hdr->b_type = type;
1557	hdr->b_spa = spa_load_guid(spa);
1558	hdr->b_state = arc_anon;
1559	hdr->b_arc_access = 0;
1560	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1561	buf->b_hdr = hdr;
1562	buf->b_data = NULL;
1563	buf->b_efunc = NULL;
1564	buf->b_private = NULL;
1565	buf->b_next = NULL;
1566	hdr->b_buf = buf;
1567	arc_get_data_buf(buf);
1568	hdr->b_datacnt = 1;
1569	hdr->b_flags = 0;
1570	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1571	(void) refcount_add(&hdr->b_refcnt, tag);
1572
1573	return (buf);
1574}
1575
1576static char *arc_onloan_tag = "onloan";
1577
1578/*
1579 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1580 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1581 * buffers must be returned to the arc before they can be used by the DMU or
1582 * freed.
1583 */
1584arc_buf_t *
1585arc_loan_buf(spa_t *spa, int size)
1586{
1587	arc_buf_t *buf;
1588
1589	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1590
1591	atomic_add_64(&arc_loaned_bytes, size);
1592	return (buf);
1593}
1594
1595/*
1596 * Return a loaned arc buffer to the arc.
1597 */
1598void
1599arc_return_buf(arc_buf_t *buf, void *tag)
1600{
1601	arc_buf_hdr_t *hdr = buf->b_hdr;
1602
1603	ASSERT(buf->b_data != NULL);
1604	(void) refcount_add(&hdr->b_refcnt, tag);
1605	(void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1606
1607	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1608}
1609
1610/* Detach an arc_buf from a dbuf (tag) */
1611void
1612arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1613{
1614	arc_buf_hdr_t *hdr;
1615
1616	ASSERT(buf->b_data != NULL);
1617	hdr = buf->b_hdr;
1618	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1619	(void) refcount_remove(&hdr->b_refcnt, tag);
1620	buf->b_efunc = NULL;
1621	buf->b_private = NULL;
1622
1623	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1624}
1625
1626static arc_buf_t *
1627arc_buf_clone(arc_buf_t *from)
1628{
1629	arc_buf_t *buf;
1630	arc_buf_hdr_t *hdr = from->b_hdr;
1631	uint64_t size = hdr->b_size;
1632
1633	ASSERT(hdr->b_state != arc_anon);
1634
1635	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1636	buf->b_hdr = hdr;
1637	buf->b_data = NULL;
1638	buf->b_efunc = NULL;
1639	buf->b_private = NULL;
1640	buf->b_next = hdr->b_buf;
1641	hdr->b_buf = buf;
1642	arc_get_data_buf(buf);
1643	bcopy(from->b_data, buf->b_data, size);
1644
1645	/*
1646	 * This buffer already exists in the arc so create a duplicate
1647	 * copy for the caller.  If the buffer is associated with user data
1648	 * then track the size and number of duplicates.  These stats will be
1649	 * updated as duplicate buffers are created and destroyed.
1650	 */
1651	if (hdr->b_type == ARC_BUFC_DATA) {
1652		ARCSTAT_BUMP(arcstat_duplicate_buffers);
1653		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1654	}
1655	hdr->b_datacnt += 1;
1656	return (buf);
1657}
1658
1659void
1660arc_buf_add_ref(arc_buf_t *buf, void* tag)
1661{
1662	arc_buf_hdr_t *hdr;
1663	kmutex_t *hash_lock;
1664
1665	/*
1666	 * Check to see if this buffer is evicted.  Callers
1667	 * must verify b_data != NULL to know if the add_ref
1668	 * was successful.
1669	 */
1670	mutex_enter(&buf->b_evict_lock);
1671	if (buf->b_data == NULL) {
1672		mutex_exit(&buf->b_evict_lock);
1673		return;
1674	}
1675	hash_lock = HDR_LOCK(buf->b_hdr);
1676	mutex_enter(hash_lock);
1677	hdr = buf->b_hdr;
1678	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1679	mutex_exit(&buf->b_evict_lock);
1680
1681	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1682	add_reference(hdr, hash_lock, tag);
1683	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1684	arc_access(hdr, hash_lock);
1685	mutex_exit(hash_lock);
1686	ARCSTAT_BUMP(arcstat_hits);
1687	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1688	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1689	    data, metadata, hits);
1690}
1691
1692static void
1693arc_buf_free_on_write(void *data, size_t size,
1694    void (*free_func)(void *, size_t))
1695{
1696	l2arc_data_free_t *df;
1697
1698	df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1699	df->l2df_data = data;
1700	df->l2df_size = size;
1701	df->l2df_func = free_func;
1702	mutex_enter(&l2arc_free_on_write_mtx);
1703	list_insert_head(l2arc_free_on_write, df);
1704	mutex_exit(&l2arc_free_on_write_mtx);
1705}
1706
1707/*
1708 * Free the arc data buffer.  If it is an l2arc write in progress,
1709 * the buffer is placed on l2arc_free_on_write to be freed later.
1710 */
1711static void
1712arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1713{
1714	arc_buf_hdr_t *hdr = buf->b_hdr;
1715
1716	if (HDR_L2_WRITING(hdr)) {
1717		arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
1718		ARCSTAT_BUMP(arcstat_l2_free_on_write);
1719	} else {
1720		free_func(buf->b_data, hdr->b_size);
1721	}
1722}
1723
1724/*
1725 * Free the temporary buffer (b_tmp_cdata) used for this header's
1726 * in-flight L2ARC write, deferring the free to l2arc_free_on_write.
1727 */
1728static void
1729arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
1730{
1731	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1732
1733	ASSERT(MUTEX_HELD(&l2arc_buflist_mtx));
1734
1735	if (l2hdr->b_tmp_cdata == NULL)
1736		return;
1737
1738	ASSERT(HDR_L2_WRITING(hdr));
1739	arc_buf_free_on_write(l2hdr->b_tmp_cdata, hdr->b_size,
1740	    zio_data_buf_free);
1741	ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
1742	l2hdr->b_tmp_cdata = NULL;
1743}
1744
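/*
 * Free up buf->b_data and, if 'remove' is set, pull the arc_buf_t off
 * of the arc_buf_hdr_t's list and free it.
 */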
1745static void
1746arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
1747{
1748	arc_buf_t **bufp;
1749
1750	/* free up data associated with the buf */
1751	if (buf->b_data) {
1752		arc_state_t *state = buf->b_hdr->b_state;
1753		uint64_t size = buf->b_hdr->b_size;
1754		arc_buf_contents_t type = buf->b_hdr->b_type;
1755
1756		arc_cksum_verify(buf);
1757#ifdef illumos
1758		arc_buf_unwatch(buf);
1759#endif /* illumos */
1760
1761		if (!recycle) {
1762			if (type == ARC_BUFC_METADATA) {
1763				arc_buf_data_free(buf, zio_buf_free);
1764				arc_space_return(size, ARC_SPACE_DATA);
1765			} else {
1766				ASSERT(type == ARC_BUFC_DATA);
1767				arc_buf_data_free(buf, zio_data_buf_free);
1768				ARCSTAT_INCR(arcstat_data_size, -size);
1769				atomic_add_64(&arc_size, -size);
1770			}
1771		}
1772		if (list_link_active(&buf->b_hdr->b_arc_node)) {
1773			uint64_t *cnt = &state->arcs_lsize[type];
1774
1775			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1776			ASSERT(state != arc_anon);
1777
1778			ASSERT3U(*cnt, >=, size);
1779			atomic_add_64(cnt, -size);
1780		}
1781		ASSERT3U(state->arcs_size, >=, size);
1782		atomic_add_64(&state->arcs_size, -size);
1783		buf->b_data = NULL;
1784
1785		/*
1786		 * If we're destroying a duplicate buffer make sure
1787		 * that the appropriate statistics are updated.
1788		 */
1789		if (buf->b_hdr->b_datacnt > 1 &&
1790		    buf->b_hdr->b_type == ARC_BUFC_DATA) {
1791			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1792			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1793		}
1794		ASSERT(buf->b_hdr->b_datacnt > 0);
1795		buf->b_hdr->b_datacnt -= 1;
1796	}
1797
1798	/* only remove the buf if requested */
1799	if (!remove)
1800		return;
1801
1802	/* remove the buf from the hdr list */
1803	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1804		continue;
1805	*bufp = buf->b_next;
1806	buf->b_next = NULL;
1807
1808	ASSERT(buf->b_efunc == NULL);
1809
1810	/* clean up the buf */
1811	buf->b_hdr = NULL;
1812	kmem_cache_free(buf_cache, buf);
1813}
1814
1815static void
1816arc_hdr_destroy(arc_buf_hdr_t *hdr)
1817{
1818	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1819	ASSERT3P(hdr->b_state, ==, arc_anon);
1820	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1821	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1822
1823	if (l2hdr != NULL) {
1824		boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1825		/*
1826		 * To prevent arc_free() and l2arc_evict() from
1827		 * attempting to free the same buffer at the same time,
1828		 * a FREE_IN_PROGRESS flag is given to arc_free() to
1829		 * give it priority.  l2arc_evict() can't destroy this
1830		 * header while we are waiting on l2arc_buflist_mtx.
1831		 *
1832		 * The hdr may be removed from l2ad_buflist before we
1833		 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1834		 */
1835		if (!buflist_held) {
1836			mutex_enter(&l2arc_buflist_mtx);
1837			l2hdr = hdr->b_l2hdr;
1838		}
1839
1840		if (l2hdr != NULL) {
1841			trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
1842			    hdr->b_size, 0);
1843			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1844			arc_buf_l2_cdata_free(hdr);
1845			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1846			ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1847			vdev_space_update(l2hdr->b_dev->l2ad_vdev,
1848			    -l2hdr->b_asize, 0, 0);
1849			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1850			if (hdr->b_state == arc_l2c_only)
1851				l2arc_hdr_stat_remove();
1852			hdr->b_l2hdr = NULL;
1853		}
1854
1855		if (!buflist_held)
1856			mutex_exit(&l2arc_buflist_mtx);
1857	}
1858
1859	if (!BUF_EMPTY(hdr)) {
1860		ASSERT(!HDR_IN_HASH_TABLE(hdr));
1861		buf_discard_identity(hdr);
1862	}
1863	while (hdr->b_buf) {
1864		arc_buf_t *buf = hdr->b_buf;
1865
1866		if (buf->b_efunc) {
1867			mutex_enter(&arc_eviction_mtx);
1868			mutex_enter(&buf->b_evict_lock);
1869			ASSERT(buf->b_hdr != NULL);
1870			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1871			hdr->b_buf = buf->b_next;
1872			buf->b_hdr = &arc_eviction_hdr;
1873			buf->b_next = arc_eviction_list;
1874			arc_eviction_list = buf;
1875			mutex_exit(&buf->b_evict_lock);
1876			mutex_exit(&arc_eviction_mtx);
1877		} else {
1878			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1879		}
1880	}
1881	if (hdr->b_freeze_cksum != NULL) {
1882		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1883		hdr->b_freeze_cksum = NULL;
1884	}
1885	if (hdr->b_thawed) {
1886		kmem_free(hdr->b_thawed, 1);
1887		hdr->b_thawed = NULL;
1888	}
1889
1890	ASSERT(!list_link_active(&hdr->b_arc_node));
1891	ASSERT3P(hdr->b_hash_next, ==, NULL);
1892	ASSERT3P(hdr->b_acb, ==, NULL);
1893	kmem_cache_free(hdr_cache, hdr);
1894}
1895
1896void
1897arc_buf_free(arc_buf_t *buf, void *tag)
1898{
1899	arc_buf_hdr_t *hdr = buf->b_hdr;
1900	int hashed = hdr->b_state != arc_anon;
1901
1902	ASSERT(buf->b_efunc == NULL);
1903	ASSERT(buf->b_data != NULL);
1904
1905	if (hashed) {
1906		kmutex_t *hash_lock = HDR_LOCK(hdr);
1907
1908		mutex_enter(hash_lock);
1909		hdr = buf->b_hdr;
1910		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1911
1912		(void) remove_reference(hdr, hash_lock, tag);
1913		if (hdr->b_datacnt > 1) {
1914			arc_buf_destroy(buf, FALSE, TRUE);
1915		} else {
1916			ASSERT(buf == hdr->b_buf);
1917			ASSERT(buf->b_efunc == NULL);
1918			hdr->b_flags |= ARC_BUF_AVAILABLE;
1919		}
1920		mutex_exit(hash_lock);
1921	} else if (HDR_IO_IN_PROGRESS(hdr)) {
1922		int destroy_hdr;
1923		/*
1924		 * We are in the middle of an async write.  Don't destroy
1925		 * this buffer unless the write completes before we finish
1926		 * decrementing the reference count.
1927		 */
1928		mutex_enter(&arc_eviction_mtx);
1929		(void) remove_reference(hdr, NULL, tag);
1930		ASSERT(refcount_is_zero(&hdr->b_refcnt));
1931		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1932		mutex_exit(&arc_eviction_mtx);
1933		if (destroy_hdr)
1934			arc_hdr_destroy(hdr);
1935	} else {
1936		if (remove_reference(hdr, NULL, tag) > 0)
1937			arc_buf_destroy(buf, FALSE, TRUE);
1938		else
1939			arc_hdr_destroy(hdr);
1940	}
1941}
1942
1943boolean_t
1944arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1945{
1946	arc_buf_hdr_t *hdr = buf->b_hdr;
1947	kmutex_t *hash_lock = HDR_LOCK(hdr);
1948	boolean_t no_callback = (buf->b_efunc == NULL);
1949
1950	if (hdr->b_state == arc_anon) {
1951		ASSERT(hdr->b_datacnt == 1);
1952		arc_buf_free(buf, tag);
1953		return (no_callback);
1954	}
1955
1956	mutex_enter(hash_lock);
1957	hdr = buf->b_hdr;
1958	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1959	ASSERT(hdr->b_state != arc_anon);
1960	ASSERT(buf->b_data != NULL);
1961
1962	(void) remove_reference(hdr, hash_lock, tag);
1963	if (hdr->b_datacnt > 1) {
1964		if (no_callback)
1965			arc_buf_destroy(buf, FALSE, TRUE);
1966	} else if (no_callback) {
1967		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1968		ASSERT(buf->b_efunc == NULL);
1969		hdr->b_flags |= ARC_BUF_AVAILABLE;
1970	}
1971	ASSERT(no_callback || hdr->b_datacnt > 1 ||
1972	    refcount_is_zero(&hdr->b_refcnt));
1973	mutex_exit(hash_lock);
1974	return (no_callback);
1975}
1976
1977int
1978arc_buf_size(arc_buf_t *buf)
1979{
1980	return (buf->b_hdr->b_size);
1981}
1982
1983/*
1984 * Called from the DMU to determine if the current buffer should be
1985 * evicted. In order to ensure proper locking, the eviction must be initiated
1986 * from the DMU. Return true if the buffer is associated with user data and
1987 * duplicate buffers still exist.
1988 */
1989boolean_t
1990arc_buf_eviction_needed(arc_buf_t *buf)
1991{
1992	arc_buf_hdr_t *hdr;
1993	boolean_t evict_needed = B_FALSE;
1994
1995	if (zfs_disable_dup_eviction)
1996		return (B_FALSE);
1997
1998	mutex_enter(&buf->b_evict_lock);
1999	hdr = buf->b_hdr;
2000	if (hdr == NULL) {
2001		/*
2002		 * We are in arc_do_user_evicts(); let that function
2003		 * perform the eviction.
2004		 */
2005		ASSERT(buf->b_data == NULL);
2006		mutex_exit(&buf->b_evict_lock);
2007		return (B_FALSE);
2008	} else if (buf->b_data == NULL) {
2009		/*
2010		 * We have already been added to the arc eviction list;
2011		 * recommend eviction.
2012		 */
2013		ASSERT3P(hdr, ==, &arc_eviction_hdr);
2014		mutex_exit(&buf->b_evict_lock);
2015		return (B_TRUE);
2016	}
2017
2018	if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
2019		evict_needed = B_TRUE;
2020
2021	mutex_exit(&buf->b_evict_lock);
2022	return (evict_needed);
2023}
2024
2025/*
2026 * Evict buffers from list until we've removed the specified number of
2027 * bytes.  Move the removed buffers to the appropriate evict state.
2028 * If the recycle flag is set, then attempt to "recycle" a buffer:
2029 * - look for a buffer to evict that is `bytes' long.
2030 * - return the data block from this buffer rather than freeing it.
2031 * This flag is used by callers that are trying to make space for a
2032 * new buffer in a full arc cache.
2033 *
2034 * This function makes a "best effort".  It skips over any buffers
2035 * it can't get a hash_lock on, and so may not catch all candidates.
2036 * It may also return without evicting as much space as requested.
2037 */
2038static void *
2039arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
2040    arc_buf_contents_t type)
2041{
2042	arc_state_t *evicted_state;
2043	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
2044	int64_t bytes_remaining;
2045	arc_buf_hdr_t *ab, *ab_prev = NULL;
2046	list_t *evicted_list, *list, *evicted_list_start, *list_start;
2047	kmutex_t *lock, *evicted_lock;
2048	kmutex_t *hash_lock;
2049	boolean_t have_lock;
2050	void *stolen = NULL;
2051	arc_buf_hdr_t marker = { 0 };
2052	int count = 0;
2053	static int evict_metadata_offset, evict_data_offset;
2054	int i, idx, offset, list_count, lists;
2055
2056	ASSERT(state == arc_mru || state == arc_mfu);
2057
2058	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2059
2060	if (type == ARC_BUFC_METADATA) {
2061		offset = 0;
2062		list_count = ARC_BUFC_NUMMETADATALISTS;
2063		list_start = &state->arcs_lists[0];
2064		evicted_list_start = &evicted_state->arcs_lists[0];
2065		idx = evict_metadata_offset;
2066	} else {
2067		offset = ARC_BUFC_NUMMETADATALISTS;
2068		list_start = &state->arcs_lists[offset];
2069		evicted_list_start = &evicted_state->arcs_lists[offset];
2070		list_count = ARC_BUFC_NUMDATALISTS;
2071		idx = evict_data_offset;
2072	}
2073	bytes_remaining = evicted_state->arcs_lsize[type];
2074	lists = 0;
2075
2076evict_start:
2077	list = &list_start[idx];
2078	evicted_list = &evicted_list_start[idx];
2079	lock = ARCS_LOCK(state, (offset + idx));
2080	evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
2081
2082	mutex_enter(lock);
2083	mutex_enter(evicted_lock);
2084
2085	for (ab = list_tail(list); ab; ab = ab_prev) {
2086		ab_prev = list_prev(list, ab);
2087		bytes_remaining -= (ab->b_size * ab->b_datacnt);
2088		/* prefetch buffers have a minimum lifespan */
2089		if (HDR_IO_IN_PROGRESS(ab) ||
2090		    (spa && ab->b_spa != spa) ||
2091		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
2092		    ddi_get_lbolt() - ab->b_arc_access <
2093		    arc_min_prefetch_lifespan)) {
2094			skipped++;
2095			continue;
2096		}
2097		/* "lookahead" for better eviction candidate */
2098		if (recycle && ab->b_size != bytes &&
2099		    ab_prev && ab_prev->b_size == bytes)
2100			continue;
2101
2102		/* ignore markers */
2103		if (ab->b_spa == 0)
2104			continue;
2105
2106		/*
2107		 * It may take a long time to evict all the bufs requested.
2108		 * To avoid blocking all arc activity, periodically drop
2109		 * the arcs_mtx and give other threads a chance to run
2110		 * before reacquiring the lock.
2111		 *
2112		 * If we are looking for a buffer to recycle, we are in
2113		 * the hot code path, so don't sleep.
2114		 */
2115		if (!recycle && count++ > arc_evict_iterations) {
2116			list_insert_after(list, ab, &marker);
2117			mutex_exit(evicted_lock);
2118			mutex_exit(lock);
2119			kpreempt(KPREEMPT_SYNC);
2120			mutex_enter(lock);
2121			mutex_enter(evicted_lock);
2122			ab_prev = list_prev(list, &marker);
2123			list_remove(list, &marker);
2124			count = 0;
2125			continue;
2126		}
2127
2128		hash_lock = HDR_LOCK(ab);
2129		have_lock = MUTEX_HELD(hash_lock);
2130		if (have_lock || mutex_tryenter(hash_lock)) {
2131			ASSERT0(refcount_count(&ab->b_refcnt));
2132			ASSERT(ab->b_datacnt > 0);
2133			while (ab->b_buf) {
2134				arc_buf_t *buf = ab->b_buf;
2135				if (!mutex_tryenter(&buf->b_evict_lock)) {
2136					missed += 1;
2137					break;
2138				}
2139				if (buf->b_data) {
2140					bytes_evicted += ab->b_size;
2141					if (recycle && ab->b_type == type &&
2142					    ab->b_size == bytes &&
2143					    !HDR_L2_WRITING(ab)) {
2144						stolen = buf->b_data;
2145						recycle = FALSE;
2146					}
2147				}
2148				if (buf->b_efunc) {
2149					mutex_enter(&arc_eviction_mtx);
2150					arc_buf_destroy(buf,
2151					    buf->b_data == stolen, FALSE);
2152					ab->b_buf = buf->b_next;
2153					buf->b_hdr = &arc_eviction_hdr;
2154					buf->b_next = arc_eviction_list;
2155					arc_eviction_list = buf;
2156					mutex_exit(&arc_eviction_mtx);
2157					mutex_exit(&buf->b_evict_lock);
2158				} else {
2159					mutex_exit(&buf->b_evict_lock);
2160					arc_buf_destroy(buf,
2161					    buf->b_data == stolen, TRUE);
2162				}
2163			}
2164
2165			if (ab->b_l2hdr) {
2166				ARCSTAT_INCR(arcstat_evict_l2_cached,
2167				    ab->b_size);
2168			} else {
2169				if (l2arc_write_eligible(ab->b_spa, ab)) {
2170					ARCSTAT_INCR(arcstat_evict_l2_eligible,
2171					    ab->b_size);
2172				} else {
2173					ARCSTAT_INCR(
2174					    arcstat_evict_l2_ineligible,
2175					    ab->b_size);
2176				}
2177			}
2178
2179			if (ab->b_datacnt == 0) {
2180				arc_change_state(evicted_state, ab, hash_lock);
2181				ASSERT(HDR_IN_HASH_TABLE(ab));
2182				ab->b_flags |= ARC_IN_HASH_TABLE;
2183				ab->b_flags &= ~ARC_BUF_AVAILABLE;
2184				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
2185			}
2186			if (!have_lock)
2187				mutex_exit(hash_lock);
2188			if (bytes >= 0 && bytes_evicted >= bytes)
2189				break;
2190			if (bytes_remaining > 0) {
2191				mutex_exit(evicted_lock);
2192				mutex_exit(lock);
2193				idx  = ((idx + 1) & (list_count - 1));
2194				lists++;
2195				goto evict_start;
2196			}
2197		} else {
2198			missed += 1;
2199		}
2200	}
2201
2202	mutex_exit(evicted_lock);
2203	mutex_exit(lock);
2204
2205	idx  = ((idx + 1) & (list_count - 1));
2206	lists++;
2207
2208	if (bytes_evicted < bytes) {
2209		if (lists < list_count)
2210			goto evict_start;
2211		else
2212			dprintf("only evicted %lld bytes from %p",
2213			    (longlong_t)bytes_evicted, state);
2214	}
2215	if (type == ARC_BUFC_METADATA)
2216		evict_metadata_offset = idx;
2217	else
2218		evict_data_offset = idx;
2219
2220	if (skipped)
2221		ARCSTAT_INCR(arcstat_evict_skip, skipped);
2222
2223	if (missed)
2224		ARCSTAT_INCR(arcstat_mutex_miss, missed);
2225
2226	/*
2227	 * Note: we have just evicted some data into the ghost state,
2228	 * potentially putting the ghost size over the desired size.  Rather
2229	 * than evicting from the ghost list in this hot code path, leave
2230	 * this chore to the arc_reclaim_thread().
2231	 */
2232
2233	if (stolen)
2234		ARCSTAT_BUMP(arcstat_stolen);
2235	return (stolen);
2236}
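
/*
 * Illustrative sketch (editor's example, not part of this revision):
 * how the 'recycle' mode above is used by arc_get_data_buf() further
 * down.  'state', 'size', 'type' and 'buf' stand for the victim state,
 * block size, buffer type and destination buffer chosen by the caller.
 */
#if 0	/* example only; simplified from arc_get_data_buf() */
	void *stolen;

	if ((stolen = arc_evict(state, 0, size, TRUE, type)) != NULL)
		buf->b_data = stolen;	/* reuse the evicted block's memory */
	else
		buf->b_data = zio_data_buf_alloc(size);	/* no match found */
#endif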
2237
2238/*
2239 * Remove buffers from list until we've removed the specified number of
2240 * bytes.  Destroy the buffers that are removed.
2241 */
2242static void
2243arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
2244{
2245	arc_buf_hdr_t *ab, *ab_prev;
2246	arc_buf_hdr_t marker = { 0 };
2247	list_t *list, *list_start;
2248	kmutex_t *hash_lock, *lock;
2249	uint64_t bytes_deleted = 0;
2250	uint64_t bufs_skipped = 0;
2251	int count = 0;
2252	static int evict_offset;
2253	int list_count, idx = evict_offset;
2254	int offset, lists = 0;
2255
2256	ASSERT(GHOST_STATE(state));
2257
2258	/*
2259	 * data lists come after metadata lists
2260	 */
2261	list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
2262	list_count = ARC_BUFC_NUMDATALISTS;
2263	offset = ARC_BUFC_NUMMETADATALISTS;
2264
2265evict_start:
2266	list = &list_start[idx];
2267	lock = ARCS_LOCK(state, idx + offset);
2268
2269	mutex_enter(lock);
2270	for (ab = list_tail(list); ab; ab = ab_prev) {
2271		ab_prev = list_prev(list, ab);
2272		if (ab->b_type > ARC_BUFC_NUMTYPES)
2273			panic("invalid ab=%p", (void *)ab);
2274		if (spa && ab->b_spa != spa)
2275			continue;
2276
2277		/* ignore markers */
2278		if (ab->b_spa == 0)
2279			continue;
2280
2281		hash_lock = HDR_LOCK(ab);
2282		/* caller may be trying to modify this buffer, skip it */
2283		if (MUTEX_HELD(hash_lock))
2284			continue;
2285
2286		/*
2287		 * It may take a long time to evict all the bufs requested.
2288		 * To avoid blocking all arc activity, periodically drop
2289		 * the arcs_mtx and give other threads a chance to run
2290		 * before reacquiring the lock.
2291		 */
2292		if (count++ > arc_evict_iterations) {
2293			list_insert_after(list, ab, &marker);
2294			mutex_exit(lock);
2295			kpreempt(KPREEMPT_SYNC);
2296			mutex_enter(lock);
2297			ab_prev = list_prev(list, &marker);
2298			list_remove(list, &marker);
2299			count = 0;
2300			continue;
2301		}
2302		if (mutex_tryenter(hash_lock)) {
2303			ASSERT(!HDR_IO_IN_PROGRESS(ab));
2304			ASSERT(ab->b_buf == NULL);
2305			ARCSTAT_BUMP(arcstat_deleted);
2306			bytes_deleted += ab->b_size;
2307
2308			if (ab->b_l2hdr != NULL) {
2309				/*
2310				 * This buffer is cached on the 2nd Level ARC;
2311				 * don't destroy the header.
2312				 */
2313				arc_change_state(arc_l2c_only, ab, hash_lock);
2314				mutex_exit(hash_lock);
2315			} else {
2316				arc_change_state(arc_anon, ab, hash_lock);
2317				mutex_exit(hash_lock);
2318				arc_hdr_destroy(ab);
2319			}
2320
2321			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2322			if (bytes >= 0 && bytes_deleted >= bytes)
2323				break;
2324		} else if (bytes < 0) {
2325			/*
2326			 * Insert a list marker and then wait for the
2327			 * hash lock to become available. Once it's
2328			 * available, restart from where we left off.
2329			 */
2330			list_insert_after(list, ab, &marker);
2331			mutex_exit(lock);
2332			mutex_enter(hash_lock);
2333			mutex_exit(hash_lock);
2334			mutex_enter(lock);
2335			ab_prev = list_prev(list, &marker);
2336			list_remove(list, &marker);
2337		} else {
2338			bufs_skipped += 1;
2339		}
2340
2341	}
2342	mutex_exit(lock);
2343	idx  = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
2344	lists++;
2345
2346	if (lists < list_count)
2347		goto evict_start;
2348
2349	evict_offset = idx;
2350	if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
2351	    (bytes < 0 || bytes_deleted < bytes)) {
2352		list_start = &state->arcs_lists[0];
2353		list_count = ARC_BUFC_NUMMETADATALISTS;
2354		offset = lists = 0;
2355		goto evict_start;
2356	}
2357
2358	if (bufs_skipped) {
2359		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2360		ASSERT(bytes >= 0);
2361	}
2362
2363	if (bytes_deleted < bytes)
2364		dprintf("only deleted %lld bytes from %p",
2365		    (longlong_t)bytes_deleted, state);
2366}
2367
2368static void
2369arc_adjust(void)
2370{
2371	int64_t adjustment, delta;
2372
2373	/*
2374	 * Adjust MRU size
2375	 */
2376
2377	adjustment = MIN((int64_t)(arc_size - arc_c),
2378	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2379	    arc_p));
2380
2381	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2382		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2383		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
2384		adjustment -= delta;
2385	}
2386
2387	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2388		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2389		(void) arc_evict(arc_mru, 0, delta, FALSE,
2390		    ARC_BUFC_METADATA);
2391	}
2392
2393	/*
2394	 * Adjust MFU size
2395	 */
2396
2397	adjustment = arc_size - arc_c;
2398
2399	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2400		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2401		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2402		adjustment -= delta;
2403	}
2404
2405	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2406		int64_t delta = MIN(adjustment,
2407		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2408		(void) arc_evict(arc_mfu, 0, delta, FALSE,
2409		    ARC_BUFC_METADATA);
2410	}
2411
2412	/*
2413	 * Adjust ghost lists
2414	 */
2415
2416	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2417
2418	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2419		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2420		arc_evict_ghost(arc_mru_ghost, 0, delta);
2421	}
2422
2423	adjustment =
2424	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2425
2426	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2427		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2428		arc_evict_ghost(arc_mfu_ghost, 0, delta);
2429	}
2430}
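
/*
 * Worked example (editor's note, hypothetical sizes): with arc_size at
 * 10 GB, arc_c at 9 GB, arc_p at 4 GB, and anon + MRU + metadata
 * totalling 5 GB, the MRU pass above computes
 *
 *	adjustment = MIN(10 GB - 9 GB, 5 GB - 4 GB) = 1 GB
 *
 * so up to 1 GB is evicted from the MRU data list first and any
 * remainder from MRU metadata before the MFU and ghost passes run.
 */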
2431
2432static void
2433arc_do_user_evicts(void)
2434{
2435	static arc_buf_t *tmp_arc_eviction_list;
2436
2437	/*
2438	 * Move list over to avoid LOR
2439	 */
2440restart:
2441	mutex_enter(&arc_eviction_mtx);
2442	tmp_arc_eviction_list = arc_eviction_list;
2443	arc_eviction_list = NULL;
2444	mutex_exit(&arc_eviction_mtx);
2445
2446	while (tmp_arc_eviction_list != NULL) {
2447		arc_buf_t *buf = tmp_arc_eviction_list;
2448		tmp_arc_eviction_list = buf->b_next;
2449		mutex_enter(&buf->b_evict_lock);
2450		buf->b_hdr = NULL;
2451		mutex_exit(&buf->b_evict_lock);
2452
2453		if (buf->b_efunc != NULL)
2454			VERIFY0(buf->b_efunc(buf->b_private));
2455
2456		buf->b_efunc = NULL;
2457		buf->b_private = NULL;
2458		kmem_cache_free(buf_cache, buf);
2459	}
2460
2461	if (arc_eviction_list != NULL)
2462		goto restart;
2463}
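
/*
 * Illustrative sketch (editor's example, not part of this revision) of
 * the hand-off pattern used above: steal the shared list head while
 * holding its mutex, then run the callbacks on the private copy with
 * the mutex dropped, so callback code cannot trigger a lock order
 * reversal against arc_eviction_mtx.  All names below are hypothetical.
 */
#if 0	/* example only */
static void
drain_list_example(void)
{
	item_t *work;

	mutex_enter(&shared_mtx);
	work = shared_list;		/* detach the whole list */
	shared_list = NULL;
	mutex_exit(&shared_mtx);

	while (work != NULL) {		/* process without the lock held */
		item_t *next = work->next;
		process_item(work);
		work = next;
	}
}
#endif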
2464
2465/*
2466 * Flush all *evictable* data from the cache for the given spa.
2467 * NOTE: this will not touch "active" (i.e. referenced) data.
2468 */
2469void
2470arc_flush(spa_t *spa)
2471{
2472	uint64_t guid = 0;
2473
2474	if (spa)
2475		guid = spa_load_guid(spa);
2476
2477	while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
2478		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2479		if (spa)
2480			break;
2481	}
2482	while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
2483		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2484		if (spa)
2485			break;
2486	}
2487	while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
2488		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2489		if (spa)
2490			break;
2491	}
2492	while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
2493		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2494		if (spa)
2495			break;
2496	}
2497
2498	arc_evict_ghost(arc_mru_ghost, guid, -1);
2499	arc_evict_ghost(arc_mfu_ghost, guid, -1);
2500
2501	mutex_enter(&arc_reclaim_thr_lock);
2502	arc_do_user_evicts();
2503	mutex_exit(&arc_reclaim_thr_lock);
2504	ASSERT(spa || arc_eviction_list == NULL);
2505}
2506
2507void
2508arc_shrink(void)
2509{
2510
2511	if (arc_c > arc_c_min) {
2512		uint64_t to_free;
2513
2514#ifdef _KERNEL
2515		to_free = arc_c >> arc_shrink_shift;
2516#else
2517		to_free = arc_c >> arc_shrink_shift;
2518#endif
2519		DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
2520			arc_c_min, uint64_t, arc_p, uint64_t, to_free);
2521		if (arc_c > arc_c_min + to_free)
2522			atomic_add_64(&arc_c, -to_free);
2523		else
2524			arc_c = arc_c_min;
2525
2526		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2527		if (arc_c > arc_size)
2528			arc_c = MAX(arc_size, arc_c_min);
2529		if (arc_p > arc_c)
2530			arc_p = (arc_c >> 1);
2531
2532		DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
2533			arc_p);
2534
2535		ASSERT(arc_c >= arc_c_min);
2536		ASSERT((int64_t)arc_p >= 0);
2537	}
2538
2539	if (arc_size > arc_c) {
2540		DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
2541			uint64_t, arc_c);
2542		arc_adjust();
2543	}
2544}
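
/*
 * Worked example (editor's note, hypothetical values): with arc_c at
 * 8 GB and arc_shrink_shift at 7, one call above removes
 *
 *	to_free = arc_c >> 7 = 64 MB
 *
 * from the target size (never dropping below arc_c_min), and arc_p is
 * reduced by the same 1/128th fraction of its own value.
 */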
2545
2546static int needfree = 0;
2547
2548static int
2549arc_reclaim_needed(void)
2550{
2551
2552#ifdef _KERNEL
2553
2554	if (needfree) {
2555		DTRACE_PROBE(arc__reclaim_needfree);
2556		return (1);
2557	}
2558
2559	/*
2560	 * Cooperate with pagedaemon when it's time for it to scan
2561	 * and reclaim some pages.
2562	 */
2563	if (freemem < zfs_arc_free_target) {
2564		DTRACE_PROBE2(arc__reclaim_freemem, uint64_t,
2565		    freemem, uint64_t, zfs_arc_free_target);
2566		return (1);
2567	}
2568
2569#ifdef sun
2570	/*
2571	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2572	 */
2573	extra = desfree;
2574
2575	/*
2576	 * check that we're out of range of the pageout scanner.  It starts to
2577	 * schedule paging if freemem is less than lotsfree and needfree.
2578	 * lotsfree is the high-water mark for pageout, and needfree is the
2579	 * number of needed free pages.  We add extra pages here to make sure
2580	 * the scanner doesn't start up while we're freeing memory.
2581	 */
2582	if (freemem < lotsfree + needfree + extra)
2583		return (1);
2584
2585	/*
2586	 * check to make sure that swapfs has enough space so that anon
2587	 * reservations can still succeed. anon_resvmem() checks that the
2588	 * availrmem is greater than swapfs_minfree, and the number of reserved
2589	 * swap pages.  We also add a bit of extra here just to prevent
2590	 * circumstances from getting really dire.
2591	 */
2592	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2593		return (1);
2594
2595	/*
2596	 * Check that we have enough availrmem that memory locking (e.g., via
2597	 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
2598	 * stores the number of pages that cannot be locked; when availrmem
2599	 * drops below pages_pp_maximum, page locking mechanisms such as
2600	 * page_pp_lock() will fail.)
2601	 */
2602	if (availrmem <= pages_pp_maximum)
2603		return (1);
2604
2605#endif	/* sun */
2606#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
2607	/*
2608	 * If we're on an i386 platform, it's possible that we'll exhaust the
2609	 * kernel heap space before we ever run out of available physical
2610	 * memory.  Most checks of the size of the heap_area compare against
2611	 * tune.t_minarmem, which is the minimum available real memory that we
2612	 * can have in the system.  However, this is generally fixed at 25 pages
2613	 * which is so low that it's useless.  In this comparison, we seek to
2614	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2615	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
2616	 * free)
2617	 */
2618	if (vmem_size(heap_arena, VMEM_FREE) <
2619	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) {
2620		DTRACE_PROBE2(arc__reclaim_used, uint64_t,
2621		    vmem_size(heap_arena, VMEM_FREE), uint64_t,
2622		    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2);
2623		return (1);
2624	}
2625#endif
2626#ifdef sun
2627	/*
2628	 * If zio data pages are being allocated out of a separate heap segment,
2629	 * then enforce that the size of available vmem for this arena remains
2630	 * above about 1/16th free.
2631	 *
2632	 * Note: The 1/16th arena free requirement was put in place
2633	 * to aggressively evict memory from the arc in order to avoid
2634	 * memory fragmentation issues.
2635	 */
2636	if (zio_arena != NULL &&
2637	    vmem_size(zio_arena, VMEM_FREE) <
2638	    (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2639		return (1);
2640#endif	/* sun */
2641#else	/* _KERNEL */
2642	if (spa_get_random(100) == 0)
2643		return (1);
2644#endif	/* _KERNEL */
2645	DTRACE_PROBE(arc__reclaim_no);
2646
2647	return (0);
2648}
2649
2650extern kmem_cache_t	*zio_buf_cache[];
2651extern kmem_cache_t	*zio_data_buf_cache[];
2652extern kmem_cache_t	*range_seg_cache;
2653
2654static void __noinline
2655arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2656{
2657	size_t			i;
2658	kmem_cache_t		*prev_cache = NULL;
2659	kmem_cache_t		*prev_data_cache = NULL;
2660
2661	DTRACE_PROBE(arc__kmem_reap_start);
2662#ifdef _KERNEL
2663	if (arc_meta_used >= arc_meta_limit) {
2664		/*
2665		 * We are exceeding our meta-data cache limit.
2666		 * Purge some DNLC entries to release holds on meta-data.
2667		 */
2668		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2669	}
2670#if defined(__i386)
2671	/*
2672	 * Reclaim unused memory from all kmem caches.
2673	 */
2674	kmem_reap();
2675#endif
2676#endif
2677
2678	/*
2679	 * An aggressive reclamation will shrink the cache size as well as
2680	 * reap free buffers from the arc kmem caches.
2681	 */
2682	if (strat == ARC_RECLAIM_AGGR)
2683		arc_shrink();
2684
2685	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2686		if (zio_buf_cache[i] != prev_cache) {
2687			prev_cache = zio_buf_cache[i];
2688			kmem_cache_reap_now(zio_buf_cache[i]);
2689		}
2690		if (zio_data_buf_cache[i] != prev_data_cache) {
2691			prev_data_cache = zio_data_buf_cache[i];
2692			kmem_cache_reap_now(zio_data_buf_cache[i]);
2693		}
2694	}
2695	kmem_cache_reap_now(buf_cache);
2696	kmem_cache_reap_now(hdr_cache);
2697	kmem_cache_reap_now(range_seg_cache);
2698
2699#ifdef sun
2700	/*
2701	 * Ask the vmem arena to reclaim unused memory from its
2702	 * quantum caches.
2703	 */
2704	if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2705		vmem_qcache_reap(zio_arena);
2706#endif
2707	DTRACE_PROBE(arc__kmem_reap_end);
2708}
2709
2710static void
2711arc_reclaim_thread(void *dummy __unused)
2712{
2713	clock_t			growtime = 0;
2714	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
2715	callb_cpr_t		cpr;
2716
2717	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2718
2719	mutex_enter(&arc_reclaim_thr_lock);
2720	while (arc_thread_exit == 0) {
2721		if (arc_reclaim_needed()) {
2722
2723			if (arc_no_grow) {
2724				if (last_reclaim == ARC_RECLAIM_CONS) {
2725					DTRACE_PROBE(arc__reclaim_aggr_no_grow);
2726					last_reclaim = ARC_RECLAIM_AGGR;
2727				} else {
2728					last_reclaim = ARC_RECLAIM_CONS;
2729				}
2730			} else {
2731				arc_no_grow = TRUE;
2732				last_reclaim = ARC_RECLAIM_AGGR;
2733				DTRACE_PROBE(arc__reclaim_aggr);
2734				membar_producer();
2735			}
2736
2737			/* reset the growth delay for every reclaim */
2738			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2739
2740			if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
2741				/*
2742				 * If needfree is TRUE our vm_lowmem hook
2743				 * was called and in that case we must free some
2744				 * memory, so switch to aggressive mode.
2745				 */
2746				arc_no_grow = TRUE;
2747				last_reclaim = ARC_RECLAIM_AGGR;
2748			}
2749			arc_kmem_reap_now(last_reclaim);
2750			arc_warm = B_TRUE;
2751
2752		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2753			arc_no_grow = FALSE;
2754		}
2755
2756		arc_adjust();
2757
2758		if (arc_eviction_list != NULL)
2759			arc_do_user_evicts();
2760
2761#ifdef _KERNEL
2762		if (needfree) {
2763			needfree = 0;
2764			wakeup(&needfree);
2765		}
2766#endif
2767
2768		/* block until needed, or one second, whichever is shorter */
2769		CALLB_CPR_SAFE_BEGIN(&cpr);
2770		(void) cv_timedwait(&arc_reclaim_thr_cv,
2771		    &arc_reclaim_thr_lock, hz);
2772		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2773	}
2774
2775	arc_thread_exit = 0;
2776	cv_broadcast(&arc_reclaim_thr_cv);
2777	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
2778	thread_exit();
2779}
2780
2781/*
2782 * Adapt arc info given the number of bytes we are trying to add and
2783 * the state that we are coming from.  This function is only called
2784 * when we are adding new content to the cache.
2785 */
2786static void
2787arc_adapt(int bytes, arc_state_t *state)
2788{
2789	int mult;
2790	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2791
2792	if (state == arc_l2c_only)
2793		return;
2794
2795	ASSERT(bytes > 0);
2796	/*
2797	 * Adapt the target size of the MRU list:
2798	 *	- if we just hit in the MRU ghost list, then increase
2799	 *	  the target size of the MRU list.
2800	 *	- if we just hit in the MFU ghost list, then increase
2801	 *	  the target size of the MFU list by decreasing the
2802	 *	  target size of the MRU list.
2803	 */
2804	if (state == arc_mru_ghost) {
2805		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2806		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2807		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2808
2809		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2810	} else if (state == arc_mfu_ghost) {
2811		uint64_t delta;
2812
2813		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2814		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2815		mult = MIN(mult, 10);
2816
2817		delta = MIN(bytes * mult, arc_p);
2818		arc_p = MAX(arc_p_min, arc_p - delta);
2819	}
2820	ASSERT((int64_t)arc_p >= 0);
2821
2822	if (arc_reclaim_needed()) {
2823		cv_signal(&arc_reclaim_thr_cv);
2824		return;
2825	}
2826
2827	if (arc_no_grow)
2828		return;
2829
2830	if (arc_c >= arc_c_max)
2831		return;
2832
2833	/*
2834	 * If we're within (2 * maxblocksize) bytes of the target
2835	 * cache size, increment the target cache size
2836	 */
2837	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2838		DTRACE_PROBE1(arc__inc_adapt, int, bytes);
2839		atomic_add_64(&arc_c, (int64_t)bytes);
2840		if (arc_c > arc_c_max)
2841			arc_c = arc_c_max;
2842		else if (state == arc_anon)
2843			atomic_add_64(&arc_p, (int64_t)bytes);
2844		if (arc_p > arc_c)
2845			arc_p = arc_c;
2846	}
2847	ASSERT((int64_t)arc_p >= 0);
2848}
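
/*
 * Worked example (editor's note, hypothetical sizes): on a hit in the
 * MRU ghost list with a 128K block, an MRU ghost of 1 GB and an MFU
 * ghost of 3 GB, the code above computes
 *
 *	mult  = 3 GB / 1 GB = 3		(capped at 10)
 *	arc_p = MIN(arc_c - arc_p_min, arc_p + 128K * 3)
 *
 * so the MRU target grows by 384K; a hit in the MFU ghost list shrinks
 * arc_p by a symmetric amount instead.
 */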
2849
2850/*
2851 * Check if the cache has reached its limits and eviction is required
2852 * prior to insert.
2853 */
2854static int
2855arc_evict_needed(arc_buf_contents_t type)
2856{
2857	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2858		return (1);
2859
2860	if (arc_reclaim_needed())
2861		return (1);
2862
2863	return (arc_size > arc_c);
2864}
2865
2866/*
2867 * The buffer, supplied as the first argument, needs a data block.
2868 * So, if we are at cache max, determine which cache should be victimized.
2869 * We have the following cases:
2870 *
2871 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2872 * In this situation if we're out of space, but the resident size of the MFU is
2873 * under the limit, victimize the MFU cache to satisfy this insertion request.
2874 *
2875 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2876 * Here, we've used up all of the available space for the MRU, so we need to
2877 * evict from our own cache instead.  Evict from the set of resident MRU
2878 * entries.
2879 *
2880 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2881 * c minus p represents the MFU space in the cache, since p is the size of the
2882 * cache that is dedicated to the MRU.  In this situation there's still space on
2883 * the MFU side, so the MRU side needs to be victimized.
2884 *
2885 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2886 * MFU's resident set is consuming more space than it has been allotted.  In
2887 * this situation, we must victimize our own cache, the MFU, for this insertion.
2888 */
2889static void
2890arc_get_data_buf(arc_buf_t *buf)
2891{
2892	arc_state_t		*state = buf->b_hdr->b_state;
2893	uint64_t		size = buf->b_hdr->b_size;
2894	arc_buf_contents_t	type = buf->b_hdr->b_type;
2895
2896	arc_adapt(size, state);
2897
2898	/*
2899	 * We have not yet reached the maximum cache size;
2900	 * just allocate a new buffer.
2901	 */
2902	if (!arc_evict_needed(type)) {
2903		if (type == ARC_BUFC_METADATA) {
2904			buf->b_data = zio_buf_alloc(size);
2905			arc_space_consume(size, ARC_SPACE_DATA);
2906		} else {
2907			ASSERT(type == ARC_BUFC_DATA);
2908			buf->b_data = zio_data_buf_alloc(size);
2909			ARCSTAT_INCR(arcstat_data_size, size);
2910			atomic_add_64(&arc_size, size);
2911		}
2912		goto out;
2913	}
2914
2915	/*
2916	 * If we are prefetching from the mfu ghost list, this buffer
2917	 * will end up on the mru list; so steal space from there.
2918	 */
2919	if (state == arc_mfu_ghost)
2920		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2921	else if (state == arc_mru_ghost)
2922		state = arc_mru;
2923
2924	if (state == arc_mru || state == arc_anon) {
2925		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2926		state = (arc_mfu->arcs_lsize[type] >= size &&
2927		    arc_p > mru_used) ? arc_mfu : arc_mru;
2928	} else {
2929		/* MFU cases */
2930		uint64_t mfu_space = arc_c - arc_p;
2931		state =  (arc_mru->arcs_lsize[type] >= size &&
2932		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2933	}
2934	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
2935		if (type == ARC_BUFC_METADATA) {
2936			buf->b_data = zio_buf_alloc(size);
2937			arc_space_consume(size, ARC_SPACE_DATA);
2938		} else {
2939			ASSERT(type == ARC_BUFC_DATA);
2940			buf->b_data = zio_data_buf_alloc(size);
2941			ARCSTAT_INCR(arcstat_data_size, size);
2942			atomic_add_64(&arc_size, size);
2943		}
2944		ARCSTAT_BUMP(arcstat_recycle_miss);
2945	}
2946	ASSERT(buf->b_data != NULL);
2947out:
2948	/*
2949	 * Update the state size.  Note that ghost states have a
2950	 * "ghost size" and so don't need to be updated.
2951	 */
2952	if (!GHOST_STATE(buf->b_hdr->b_state)) {
2953		arc_buf_hdr_t *hdr = buf->b_hdr;
2954
2955		atomic_add_64(&hdr->b_state->arcs_size, size);
2956		if (list_link_active(&hdr->b_arc_node)) {
2957			ASSERT(refcount_is_zero(&hdr->b_refcnt));
2958			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2959		}
2960		/*
2961		 * If we are growing the cache, and we are adding anonymous
2962		 * data, and we have outgrown arc_p, update arc_p
2963		 */
2964		if (arc_size < arc_c && hdr->b_state == arc_anon &&
2965		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2966			arc_p = MIN(arc_c, arc_p + size);
2967	}
2968	ARCSTAT_BUMP(arcstat_allocated);
2969}
2970
2971/*
2972 * This routine is called whenever a buffer is accessed.
2973 * NOTE: the hash lock is dropped in this function.
2974 */
2975static void
2976arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2977{
2978	clock_t now;
2979
2980	ASSERT(MUTEX_HELD(hash_lock));
2981
2982	if (buf->b_state == arc_anon) {
2983		/*
2984		 * This buffer is not in the cache, and does not
2985		 * appear in our "ghost" list.  Add the new buffer
2986		 * to the MRU state.
2987		 */
2988
2989		ASSERT(buf->b_arc_access == 0);
2990		buf->b_arc_access = ddi_get_lbolt();
2991		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2992		arc_change_state(arc_mru, buf, hash_lock);
2993
2994	} else if (buf->b_state == arc_mru) {
2995		now = ddi_get_lbolt();
2996
2997		/*
2998		 * If this buffer is here because of a prefetch, then either:
2999		 * - clear the flag if this is a "referencing" read
3000		 *   (any subsequent access will bump this into the MFU state).
3001		 * or
3002		 * - move the buffer to the head of the list if this is
3003		 *   another prefetch (to make it less likely to be evicted).
3004		 */
3005		if ((buf->b_flags & ARC_PREFETCH) != 0) {
3006			if (refcount_count(&buf->b_refcnt) == 0) {
3007				ASSERT(list_link_active(&buf->b_arc_node));
3008			} else {
3009				buf->b_flags &= ~ARC_PREFETCH;
3010				ARCSTAT_BUMP(arcstat_mru_hits);
3011			}
3012			buf->b_arc_access = now;
3013			return;
3014		}
3015
3016		/*
3017		 * This buffer has been "accessed" only once so far,
3018		 * but it is still in the cache. Move it to the MFU
3019		 * state.
3020		 */
3021		if (now > buf->b_arc_access + ARC_MINTIME) {
3022			/*
3023			 * More than 125ms have passed since we
3024			 * instantiated this buffer.  Move it to the
3025			 * most frequently used state.
3026			 */
3027			buf->b_arc_access = now;
3028			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
3029			arc_change_state(arc_mfu, buf, hash_lock);
3030		}
3031		ARCSTAT_BUMP(arcstat_mru_hits);
3032	} else if (buf->b_state == arc_mru_ghost) {
3033		arc_state_t	*new_state;
3034		/*
3035		 * This buffer has been "accessed" recently, but
3036		 * was evicted from the cache.  Move it to the
3037		 * MFU state.
3038		 */
3039
3040		if (buf->b_flags & ARC_PREFETCH) {
3041			new_state = arc_mru;
3042			if (refcount_count(&buf->b_refcnt) > 0)
3043				buf->b_flags &= ~ARC_PREFETCH;
3044			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
3045		} else {
3046			new_state = arc_mfu;
3047			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
3048		}
3049
3050		buf->b_arc_access = ddi_get_lbolt();
3051		arc_change_state(new_state, buf, hash_lock);
3052
3053		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
3054	} else if (buf->b_state == arc_mfu) {
3055		/*
3056		 * This buffer has been accessed more than once and is
3057		 * still in the cache.  Keep it in the MFU state.
3058		 *
3059		 * NOTE: an add_reference() that occurred when we did
3060		 * the arc_read() will have kicked this off the list.
3061		 * If it was a prefetch, we will explicitly move it to
3062		 * the head of the list now.
3063		 */
3064		if ((buf->b_flags & ARC_PREFETCH) != 0) {
3065			ASSERT(refcount_count(&buf->b_refcnt) == 0);
3066			ASSERT(list_link_active(&buf->b_arc_node));
3067		}
3068		ARCSTAT_BUMP(arcstat_mfu_hits);
3069		buf->b_arc_access = ddi_get_lbolt();
3070	} else if (buf->b_state == arc_mfu_ghost) {
3071		arc_state_t	*new_state = arc_mfu;
3072		/*
3073		 * This buffer has been accessed more than once but has
3074		 * been evicted from the cache.  Move it back to the
3075		 * MFU state.
3076		 */
3077
3078		if (buf->b_flags & ARC_PREFETCH) {
3079			/*
3080			 * This is a prefetch access...
3081			 * move this block back to the MRU state.
3082			 */
3083			ASSERT0(refcount_count(&buf->b_refcnt));
3084			new_state = arc_mru;
3085		}
3086
3087		buf->b_arc_access = ddi_get_lbolt();
3088		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
3089		arc_change_state(new_state, buf, hash_lock);
3090
3091		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
3092	} else if (buf->b_state == arc_l2c_only) {
3093		/*
3094		 * This buffer is on the 2nd Level ARC.
3095		 */
3096
3097		buf->b_arc_access = ddi_get_lbolt();
3098		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
3099		arc_change_state(arc_mfu, buf, hash_lock);
3100	} else {
3101		ASSERT(!"invalid arc state");
3102	}
3103}
3104
3105/* a generic arc_done_func_t which you can use */
3106/* ARGSUSED */
3107void
3108arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
3109{
3110	if (zio == NULL || zio->io_error == 0)
3111		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
3112	VERIFY(arc_buf_remove_ref(buf, arg));
3113}
3114
3115/* a generic arc_done_func_t */
3116void
3117arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
3118{
3119	arc_buf_t **bufp = arg;
3120	if (zio && zio->io_error) {
3121		VERIFY(arc_buf_remove_ref(buf, arg));
3122		*bufp = NULL;
3123	} else {
3124		*bufp = buf;
3125		ASSERT(buf->b_data);
3126	}
3127}
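
/*
 * Illustrative sketch (editor's example, not part of this revision): a
 * synchronous read through the ARC using the generic arc_getbuf_func
 * callback above.  The spa, block pointer and bookmark are assumed to
 * be supplied by the caller.
 */
#if 0	/* example only */
static int
example_sync_read(spa_t *spa, const blkptr_t *bp, const zbookmark_phys_t *zb)
{
	arc_buf_t *abuf = NULL;
	uint32_t aflags = ARC_WAIT;
	int error;

	error = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
	if (error == 0 && abuf != NULL) {
		/* abuf->b_data now holds the block contents */
		VERIFY(arc_buf_remove_ref(abuf, &abuf));
	}
	return (error);
}
#endif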
3128
3129static void
3130arc_read_done(zio_t *zio)
3131{
3132	arc_buf_hdr_t	*hdr;
3133	arc_buf_t	*buf;
3134	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
3135	kmutex_t	*hash_lock = NULL;
3136	arc_callback_t	*callback_list, *acb;
3137	int		freeable = FALSE;
3138
3139	buf = zio->io_private;
3140	hdr = buf->b_hdr;
3141
3142	/*
3143	 * The hdr was inserted into hash-table and removed from lists
3144	 * prior to starting I/O.  We should find this header, since
3145	 * it's in the hash table, and it should be legit since it's
3146	 * not possible to evict it during the I/O.  The only possible
3147	 * reason for it not to be found is if we were freed during the
3148	 * read.
3149	 */
3150	if (HDR_IN_HASH_TABLE(hdr)) {
3151		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
3152		ASSERT3U(hdr->b_dva.dva_word[0], ==,
3153		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
3154		ASSERT3U(hdr->b_dva.dva_word[1], ==,
3155		    BP_IDENTITY(zio->io_bp)->dva_word[1]);
3156
3157		arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
3158		    &hash_lock);
3159
3160		ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
3161		    hash_lock == NULL) ||
3162		    (found == hdr &&
3163		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
3164		    (found == hdr && HDR_L2_READING(hdr)));
3165	}
3166
3167	hdr->b_flags &= ~ARC_L2_EVICTED;
3168	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
3169		hdr->b_flags &= ~ARC_L2CACHE;
3170
3171	/* byteswap if necessary */
3172	callback_list = hdr->b_acb;
3173	ASSERT(callback_list != NULL);
3174	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
3175		dmu_object_byteswap_t bswap =
3176		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
3177		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
3178		    byteswap_uint64_array :
3179		    dmu_ot_byteswap[bswap].ob_func;
3180		func(buf->b_data, hdr->b_size);
3181	}
3182
3183	arc_cksum_compute(buf, B_FALSE);
3184#ifdef illumos
3185	arc_buf_watch(buf);
3186#endif /* illumos */
3187
3188	if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
3189		/*
3190		 * Only call arc_access on anonymous buffers.  This is because
3191		 * if we've issued an I/O for an evicted buffer, we've already
3192		 * called arc_access (to prevent any simultaneous readers from
3193		 * getting confused).
3194		 */
3195		arc_access(hdr, hash_lock);
3196	}
3197
3198	/* create copies of the data buffer for the callers */
3199	abuf = buf;
3200	for (acb = callback_list; acb; acb = acb->acb_next) {
3201		if (acb->acb_done) {
3202			if (abuf == NULL) {
3203				ARCSTAT_BUMP(arcstat_duplicate_reads);
3204				abuf = arc_buf_clone(buf);
3205			}
3206			acb->acb_buf = abuf;
3207			abuf = NULL;
3208		}
3209	}
3210	hdr->b_acb = NULL;
3211	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3212	ASSERT(!HDR_BUF_AVAILABLE(hdr));
3213	if (abuf == buf) {
3214		ASSERT(buf->b_efunc == NULL);
3215		ASSERT(hdr->b_datacnt == 1);
3216		hdr->b_flags |= ARC_BUF_AVAILABLE;
3217	}
3218
3219	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
3220
3221	if (zio->io_error != 0) {
3222		hdr->b_flags |= ARC_IO_ERROR;
3223		if (hdr->b_state != arc_anon)
3224			arc_change_state(arc_anon, hdr, hash_lock);
3225		if (HDR_IN_HASH_TABLE(hdr))
3226			buf_hash_remove(hdr);
3227		freeable = refcount_is_zero(&hdr->b_refcnt);
3228	}
3229
3230	/*
3231	 * Broadcast before we drop the hash_lock to avoid the possibility
3232	 * that the hdr (and hence the cv) might be freed before we get to
3233	 * the cv_broadcast().
3234	 */
3235	cv_broadcast(&hdr->b_cv);
3236
3237	if (hash_lock) {
3238		mutex_exit(hash_lock);
3239	} else {
3240		/*
3241		 * This block was freed while we waited for the read to
3242		 * complete.  It has been removed from the hash table and
3243		 * moved to the anonymous state (so that it won't show up
3244		 * in the cache).
3245		 */
3246		ASSERT3P(hdr->b_state, ==, arc_anon);
3247		freeable = refcount_is_zero(&hdr->b_refcnt);
3248	}
3249
3250	/* execute each callback and free its structure */
3251	while ((acb = callback_list) != NULL) {
3252		if (acb->acb_done)
3253			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3254
3255		if (acb->acb_zio_dummy != NULL) {
3256			acb->acb_zio_dummy->io_error = zio->io_error;
3257			zio_nowait(acb->acb_zio_dummy);
3258		}
3259
3260		callback_list = acb->acb_next;
3261		kmem_free(acb, sizeof (arc_callback_t));
3262	}
3263
3264	if (freeable)
3265		arc_hdr_destroy(hdr);
3266}
3267
3268/*
3269 * "Read" the block block at the specified DVA (in bp) via the
3270 * cache.  If the block is found in the cache, invoke the provided
3271 * callback immediately and return.  Note that the `zio' parameter
3272 * in the callback will be NULL in this case, since no IO was
3273 * required.  If the block is not in the cache pass the read request
3274 * on to the spa with a substitute callback function, so that the
3275 * requested block will be added to the cache.
3276 *
3277 * If a read request arrives for a block that has a read in-progress,
3278 * either wait for the in-progress read to complete (and return the
3279 * results); or, if this is a read with a "done" func, add a record
3280 * to the read to invoke the "done" func when the read completes,
3281 * and return; or just return.
3282 *
3283 * arc_read_done() will invoke all the requested "done" functions
3284 * for readers of this block.
3285 */
3286int
3287arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3288    void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
3289    const zbookmark_phys_t *zb)
3290{
3291	arc_buf_hdr_t *hdr = NULL;
3292	arc_buf_t *buf = NULL;
3293	kmutex_t *hash_lock = NULL;
3294	zio_t *rzio;
3295	uint64_t guid = spa_load_guid(spa);
3296
3297	ASSERT(!BP_IS_EMBEDDED(bp) ||
3298	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
3299
3300top:
3301	if (!BP_IS_EMBEDDED(bp)) {
3302		/*
3303		 * Embedded BP's have no DVA and require no I/O to "read".
3304		 * Create an anonymous arc buf to back it.
3305		 */
3306		hdr = buf_hash_find(guid, bp, &hash_lock);
3307	}
3308
3309	if (hdr != NULL && hdr->b_datacnt > 0) {
3310
3311		*arc_flags |= ARC_CACHED;
3312
3313		if (HDR_IO_IN_PROGRESS(hdr)) {
3314
3315			if (*arc_flags & ARC_WAIT) {
3316				cv_wait(&hdr->b_cv, hash_lock);
3317				mutex_exit(hash_lock);
3318				goto top;
3319			}
3320			ASSERT(*arc_flags & ARC_NOWAIT);
3321
3322			if (done) {
3323				arc_callback_t	*acb = NULL;
3324
3325				acb = kmem_zalloc(sizeof (arc_callback_t),
3326				    KM_SLEEP);
3327				acb->acb_done = done;
3328				acb->acb_private = private;
3329				if (pio != NULL)
3330					acb->acb_zio_dummy = zio_null(pio,
3331					    spa, NULL, NULL, NULL, zio_flags);
3332
3333				ASSERT(acb->acb_done != NULL);
3334				acb->acb_next = hdr->b_acb;
3335				hdr->b_acb = acb;
3336				add_reference(hdr, hash_lock, private);
3337				mutex_exit(hash_lock);
3338				return (0);
3339			}
3340			mutex_exit(hash_lock);
3341			return (0);
3342		}
3343
3344		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3345
3346		if (done) {
3347			add_reference(hdr, hash_lock, private);
3348			/*
3349			 * If this block is already in use, create a new
3350			 * copy of the data so that we will be guaranteed
3351			 * that arc_release() will always succeed.
3352			 */
3353			buf = hdr->b_buf;
3354			ASSERT(buf);
3355			ASSERT(buf->b_data);
3356			if (HDR_BUF_AVAILABLE(hdr)) {
3357				ASSERT(buf->b_efunc == NULL);
3358				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3359			} else {
3360				buf = arc_buf_clone(buf);
3361			}
3362
3363		} else if (*arc_flags & ARC_PREFETCH &&
3364		    refcount_count(&hdr->b_refcnt) == 0) {
3365			hdr->b_flags |= ARC_PREFETCH;
3366		}
3367		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3368		arc_access(hdr, hash_lock);
3369		if (*arc_flags & ARC_L2CACHE)
3370			hdr->b_flags |= ARC_L2CACHE;
3371		if (*arc_flags & ARC_L2COMPRESS)
3372			hdr->b_flags |= ARC_L2COMPRESS;
3373		mutex_exit(hash_lock);
3374		ARCSTAT_BUMP(arcstat_hits);
3375		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3376		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3377		    data, metadata, hits);
3378
3379		if (done)
3380			done(NULL, buf, private);
3381	} else {
3382		uint64_t size = BP_GET_LSIZE(bp);
3383		arc_callback_t *acb;
3384		vdev_t *vd = NULL;
3385		uint64_t addr = 0;
3386		boolean_t devw = B_FALSE;
3387		enum zio_compress b_compress = ZIO_COMPRESS_OFF;
3388		uint64_t b_asize = 0;
3389
3390		if (hdr == NULL) {
3391			/* this block is not in the cache */
3392			arc_buf_hdr_t *exists = NULL;
3393			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3394			buf = arc_buf_alloc(spa, size, private, type);
3395			hdr = buf->b_hdr;
3396			if (!BP_IS_EMBEDDED(bp)) {
3397				hdr->b_dva = *BP_IDENTITY(bp);
3398				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3399				hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3400				exists = buf_hash_insert(hdr, &hash_lock);
3401			}
3402			if (exists != NULL) {
3403				/* somebody beat us to the hash insert */
3404				mutex_exit(hash_lock);
3405				buf_discard_identity(hdr);
3406				(void) arc_buf_remove_ref(buf, private);
3407				goto top; /* restart the IO request */
3408			}
3409			/* if this is a prefetch, we don't have a reference */
3410			if (*arc_flags & ARC_PREFETCH) {
3411				(void) remove_reference(hdr, hash_lock,
3412				    private);
3413				hdr->b_flags |= ARC_PREFETCH;
3414			}
3415			if (*arc_flags & ARC_L2CACHE)
3416				hdr->b_flags |= ARC_L2CACHE;
3417			if (*arc_flags & ARC_L2COMPRESS)
3418				hdr->b_flags |= ARC_L2COMPRESS;
3419			if (BP_GET_LEVEL(bp) > 0)
3420				hdr->b_flags |= ARC_INDIRECT;
3421		} else {
3422			/* this block is in the ghost cache */
3423			ASSERT(GHOST_STATE(hdr->b_state));
3424			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3425			ASSERT0(refcount_count(&hdr->b_refcnt));
3426			ASSERT(hdr->b_buf == NULL);
3427
3428			/* if this is a prefetch, we don't have a reference */
3429			if (*arc_flags & ARC_PREFETCH)
3430				hdr->b_flags |= ARC_PREFETCH;
3431			else
3432				add_reference(hdr, hash_lock, private);
3433			if (*arc_flags & ARC_L2CACHE)
3434				hdr->b_flags |= ARC_L2CACHE;
3435			if (*arc_flags & ARC_L2COMPRESS)
3436				hdr->b_flags |= ARC_L2COMPRESS;
3437			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3438			buf->b_hdr = hdr;
3439			buf->b_data = NULL;
3440			buf->b_efunc = NULL;
3441			buf->b_private = NULL;
3442			buf->b_next = NULL;
3443			hdr->b_buf = buf;
3444			ASSERT(hdr->b_datacnt == 0);
3445			hdr->b_datacnt = 1;
3446			arc_get_data_buf(buf);
3447			arc_access(hdr, hash_lock);
3448		}
3449
3450		ASSERT(!GHOST_STATE(hdr->b_state));
3451
3452		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3453		acb->acb_done = done;
3454		acb->acb_private = private;
3455
3456		ASSERT(hdr->b_acb == NULL);
3457		hdr->b_acb = acb;
3458		hdr->b_flags |= ARC_IO_IN_PROGRESS;
3459
3460		if (hdr->b_l2hdr != NULL &&
3461		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3462			devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3463			addr = hdr->b_l2hdr->b_daddr;
3464			b_compress = hdr->b_l2hdr->b_compress;
3465			b_asize = hdr->b_l2hdr->b_asize;
3466			/*
3467			 * Lock out device removal.
3468			 */
3469			if (vdev_is_dead(vd) ||
3470			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3471				vd = NULL;
3472		}
3473
3474		if (hash_lock != NULL)
3475			mutex_exit(hash_lock);
3476
3477		/*
3478		 * At this point, we have a level 1 cache miss.  Try again in
3479		 * L2ARC if possible.
3480		 */
3481		ASSERT3U(hdr->b_size, ==, size);
3482		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3483		    uint64_t, size, zbookmark_phys_t *, zb);
3484		ARCSTAT_BUMP(arcstat_misses);
3485		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3486		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3487		    data, metadata, misses);
3488#ifdef _KERNEL
3489		curthread->td_ru.ru_inblock++;
3490#endif
3491
3492		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3493			/*
3494			 * Read from the L2ARC if the following are true:
3495			 * 1. The L2ARC vdev was previously cached.
3496			 * 2. This buffer still has L2ARC metadata.
3497			 * 3. This buffer isn't currently writing to the L2ARC.
3498			 * 4. The L2ARC entry wasn't evicted, which may
3499			 *    also have invalidated the vdev.
3500			 * 5. This isn't prefetch, or l2arc_noprefetch is not set.
3501			 */
3502			if (hdr->b_l2hdr != NULL &&
3503			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3504			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3505				l2arc_read_callback_t *cb;
3506
3507				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3508				ARCSTAT_BUMP(arcstat_l2_hits);
3509
3510				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3511				    KM_SLEEP);
3512				cb->l2rcb_buf = buf;
3513				cb->l2rcb_spa = spa;
3514				cb->l2rcb_bp = *bp;
3515				cb->l2rcb_zb = *zb;
3516				cb->l2rcb_flags = zio_flags;
3517				cb->l2rcb_compress = b_compress;
3518
3519				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3520				    addr + size < vd->vdev_psize -
3521				    VDEV_LABEL_END_SIZE);
3522
3523				/*
3524				 * l2arc read.  The SCL_L2ARC lock will be
3525				 * released by l2arc_read_done().
3526				 * Issue a null zio if the underlying buffer
3527				 * was squashed to zero size by compression.
3528				 */
3529				if (b_compress == ZIO_COMPRESS_EMPTY) {
3530					rzio = zio_null(pio, spa, vd,
3531					    l2arc_read_done, cb,
3532					    zio_flags | ZIO_FLAG_DONT_CACHE |
3533					    ZIO_FLAG_CANFAIL |
3534					    ZIO_FLAG_DONT_PROPAGATE |
3535					    ZIO_FLAG_DONT_RETRY);
3536				} else {
3537					rzio = zio_read_phys(pio, vd, addr,
3538					    b_asize, buf->b_data,
3539					    ZIO_CHECKSUM_OFF,
3540					    l2arc_read_done, cb, priority,
3541					    zio_flags | ZIO_FLAG_DONT_CACHE |
3542					    ZIO_FLAG_CANFAIL |
3543					    ZIO_FLAG_DONT_PROPAGATE |
3544					    ZIO_FLAG_DONT_RETRY, B_FALSE);
3545				}
3546				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3547				    zio_t *, rzio);
3548				ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
3549
3550				if (*arc_flags & ARC_NOWAIT) {
3551					zio_nowait(rzio);
3552					return (0);
3553				}
3554
3555				ASSERT(*arc_flags & ARC_WAIT);
3556				if (zio_wait(rzio) == 0)
3557					return (0);
3558
3559				/* l2arc read error; goto zio_read() */
3560			} else {
3561				DTRACE_PROBE1(l2arc__miss,
3562				    arc_buf_hdr_t *, hdr);
3563				ARCSTAT_BUMP(arcstat_l2_misses);
3564				if (HDR_L2_WRITING(hdr))
3565					ARCSTAT_BUMP(arcstat_l2_rw_clash);
3566				spa_config_exit(spa, SCL_L2ARC, vd);
3567			}
3568		} else {
3569			if (vd != NULL)
3570				spa_config_exit(spa, SCL_L2ARC, vd);
3571			if (l2arc_ndev != 0) {
3572				DTRACE_PROBE1(l2arc__miss,
3573				    arc_buf_hdr_t *, hdr);
3574				ARCSTAT_BUMP(arcstat_l2_misses);
3575			}
3576		}
3577
3578		rzio = zio_read(pio, spa, bp, buf->b_data, size,
3579		    arc_read_done, buf, priority, zio_flags, zb);
3580
3581		if (*arc_flags & ARC_WAIT)
3582			return (zio_wait(rzio));
3583
3584		ASSERT(*arc_flags & ARC_NOWAIT);
3585		zio_nowait(rzio);
3586	}
3587	return (0);
3588}
3589
3590void
3591arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3592{
3593	ASSERT(buf->b_hdr != NULL);
3594	ASSERT(buf->b_hdr->b_state != arc_anon);
3595	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3596	ASSERT(buf->b_efunc == NULL);
3597	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3598
3599	buf->b_efunc = func;
3600	buf->b_private = private;
3601}
3602
3603/*
3604 * Notify the arc that a block was freed, and thus will never be used again.
3605 */
3606void
3607arc_freed(spa_t *spa, const blkptr_t *bp)
3608{
3609	arc_buf_hdr_t *hdr;
3610	kmutex_t *hash_lock;
3611	uint64_t guid = spa_load_guid(spa);
3612
3613	ASSERT(!BP_IS_EMBEDDED(bp));
3614
3615	hdr = buf_hash_find(guid, bp, &hash_lock);
3616	if (hdr == NULL)
3617		return;
3618	if (HDR_BUF_AVAILABLE(hdr)) {
3619		arc_buf_t *buf = hdr->b_buf;
3620		add_reference(hdr, hash_lock, FTAG);
3621		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3622		mutex_exit(hash_lock);
3623
3624		arc_release(buf, FTAG);
3625		(void) arc_buf_remove_ref(buf, FTAG);
3626	} else {
3627		mutex_exit(hash_lock);
3628	}
3629
3630}
3631
3632/*
3633 * Clear the user eviction callback set by arc_set_callback(), first calling
3634 * it if it exists.  Because the presence of a callback keeps an arc_buf cached,
3635 * clearing the callback may result in the arc_buf being destroyed.  However,
3636 * it will not result in the *last* arc_buf being destroyed, hence the data
3637 * will remain cached in the ARC. We make a copy of the arc buffer here so
3638 * that we can process the callback without holding any locks.
3639 *
3640 * It's possible that the callback is already in the process of being cleared
3641 * by another thread.  In this case we can not clear the callback.
3642 *
3643 * Returns B_TRUE if the callback was successfully called and cleared.
3644 */
3645boolean_t
3646arc_clear_callback(arc_buf_t *buf)
3647{
3648	arc_buf_hdr_t *hdr;
3649	kmutex_t *hash_lock;
3650	arc_evict_func_t *efunc = buf->b_efunc;
3651	void *private = buf->b_private;
3654
3655	mutex_enter(&buf->b_evict_lock);
3656	hdr = buf->b_hdr;
3657	if (hdr == NULL) {
3658		/*
3659		 * We are in arc_do_user_evicts().
3660		 */
3661		ASSERT(buf->b_data == NULL);
3662		mutex_exit(&buf->b_evict_lock);
3663		return (B_FALSE);
3664	} else if (buf->b_data == NULL) {
3665		/*
3666		 * We are on the eviction list; process this buffer now
3667		 * but let arc_do_user_evicts() do the reaping.
3668		 */
3669		buf->b_efunc = NULL;
3670		mutex_exit(&buf->b_evict_lock);
3671		VERIFY0(efunc(private));
3672		return (B_TRUE);
3673	}
3674	hash_lock = HDR_LOCK(hdr);
3675	mutex_enter(hash_lock);
3676	hdr = buf->b_hdr;
3677	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3678
3679	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3680	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3681
3682	buf->b_efunc = NULL;
3683	buf->b_private = NULL;
3684
3685	if (hdr->b_datacnt > 1) {
3686		mutex_exit(&buf->b_evict_lock);
3687		arc_buf_destroy(buf, FALSE, TRUE);
3688	} else {
3689		ASSERT(buf == hdr->b_buf);
3690		hdr->b_flags |= ARC_BUF_AVAILABLE;
3691		mutex_exit(&buf->b_evict_lock);
3692	}
3693
3694	mutex_exit(hash_lock);
3695	VERIFY0(efunc(private));
3696	return (B_TRUE);
3697}
3698
3699/*
3700 * Release this buffer from the cache, making it an anonymous buffer.  This
3701 * must be done after a read and prior to modifying the buffer contents.
3702 * If the buffer has more than one reference, we must make
3703 * a new hdr for the buffer.
3704 */
3705void
3706arc_release(arc_buf_t *buf, void *tag)
3707{
3708	arc_buf_hdr_t *hdr;
3709	kmutex_t *hash_lock = NULL;
3710	l2arc_buf_hdr_t *l2hdr;
3711	uint64_t buf_size;
3712
3713	/*
3714	 * It would be nice to assert that if it's DMU metadata (level >
3715	 * 0 || it's the dnode file), then it must be syncing context.
3716	 * But we don't know that information at this level.
3717	 */
3718
3719	mutex_enter(&buf->b_evict_lock);
3720	hdr = buf->b_hdr;
3721
3722	/* this buffer is not on any list */
3723	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3724
3725	if (hdr->b_state == arc_anon) {
3726		/* this buffer is already released */
3727		ASSERT(buf->b_efunc == NULL);
3728	} else {
3729		hash_lock = HDR_LOCK(hdr);
3730		mutex_enter(hash_lock);
3731		hdr = buf->b_hdr;
3732		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3733	}
3734
3735	l2hdr = hdr->b_l2hdr;
3736	if (l2hdr) {
3737		mutex_enter(&l2arc_buflist_mtx);
3738		arc_buf_l2_cdata_free(hdr);
3739		hdr->b_l2hdr = NULL;
3740		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3741	}
3742	buf_size = hdr->b_size;
3743
3744	/*
3745	 * Do we have more than one buf?
3746	 */
3747	if (hdr->b_datacnt > 1) {
3748		arc_buf_hdr_t *nhdr;
3749		arc_buf_t **bufp;
3750		uint64_t blksz = hdr->b_size;
3751		uint64_t spa = hdr->b_spa;
3752		arc_buf_contents_t type = hdr->b_type;
3753		uint32_t flags = hdr->b_flags;
3754
3755		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3756		/*
3757		 * Pull the data off of this hdr and attach it to
3758		 * a new anonymous hdr.
3759		 */
3760		(void) remove_reference(hdr, hash_lock, tag);
3761		bufp = &hdr->b_buf;
3762		while (*bufp != buf)
3763			bufp = &(*bufp)->b_next;
3764		*bufp = buf->b_next;
3765		buf->b_next = NULL;
3766
3767		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3768		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3769		if (refcount_is_zero(&hdr->b_refcnt)) {
3770			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3771			ASSERT3U(*size, >=, hdr->b_size);
3772			atomic_add_64(size, -hdr->b_size);
3773		}
3774
3775		/*
3776		 * We're releasing a duplicate user data buffer, update
3777		 * our statistics accordingly.
3778		 */
3779		if (hdr->b_type == ARC_BUFC_DATA) {
3780			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3781			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3782			    -hdr->b_size);
3783		}
3784		hdr->b_datacnt -= 1;
3785		arc_cksum_verify(buf);
3786#ifdef illumos
3787		arc_buf_unwatch(buf);
3788#endif /* illumos */
3789
3790		mutex_exit(hash_lock);
3791
3792		nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3793		nhdr->b_size = blksz;
3794		nhdr->b_spa = spa;
3795		nhdr->b_type = type;
3796		nhdr->b_buf = buf;
3797		nhdr->b_state = arc_anon;
3798		nhdr->b_arc_access = 0;
3799		nhdr->b_flags = flags & ARC_L2_WRITING;
3800		nhdr->b_l2hdr = NULL;
3801		nhdr->b_datacnt = 1;
3802		nhdr->b_freeze_cksum = NULL;
3803		(void) refcount_add(&nhdr->b_refcnt, tag);
3804		buf->b_hdr = nhdr;
3805		mutex_exit(&buf->b_evict_lock);
3806		atomic_add_64(&arc_anon->arcs_size, blksz);
3807	} else {
3808		mutex_exit(&buf->b_evict_lock);
3809		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3810		ASSERT(!list_link_active(&hdr->b_arc_node));
3811		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3812		if (hdr->b_state != arc_anon)
3813			arc_change_state(arc_anon, hdr, hash_lock);
3814		hdr->b_arc_access = 0;
3815		if (hash_lock)
3816			mutex_exit(hash_lock);
3817
3818		buf_discard_identity(hdr);
3819		arc_buf_thaw(buf);
3820	}
3821	buf->b_efunc = NULL;
3822	buf->b_private = NULL;
3823
3824	if (l2hdr) {
3825		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3826		vdev_space_update(l2hdr->b_dev->l2ad_vdev,
3827		    -l2hdr->b_asize, 0, 0);
3828		trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
3829		    hdr->b_size, 0);
3830		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3831		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3832		mutex_exit(&l2arc_buflist_mtx);
3833	}
3834}
3835
3836int
3837arc_released(arc_buf_t *buf)
3838{
3839	int released;
3840
3841	mutex_enter(&buf->b_evict_lock);
3842	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3843	mutex_exit(&buf->b_evict_lock);
3844	return (released);
3845}
3846
3847#ifdef ZFS_DEBUG
3848int
3849arc_referenced(arc_buf_t *buf)
3850{
3851	int referenced;
3852
3853	mutex_enter(&buf->b_evict_lock);
3854	referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3855	mutex_exit(&buf->b_evict_lock);
3856	return (referenced);
3857}
3858#endif
3859
3860static void
3861arc_write_ready(zio_t *zio)
3862{
3863	arc_write_callback_t *callback = zio->io_private;
3864	arc_buf_t *buf = callback->awcb_buf;
3865	arc_buf_hdr_t *hdr = buf->b_hdr;
3866
3867	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3868	callback->awcb_ready(zio, buf, callback->awcb_private);
3869
3870	/*
3871	 * If the IO is already in progress, then this is a re-write
3872	 * attempt, so we need to thaw and re-compute the cksum.
3873	 * It is the responsibility of the callback to handle the
3874	 * accounting for any re-write attempt.
3875	 */
3876	if (HDR_IO_IN_PROGRESS(hdr)) {
3877		mutex_enter(&hdr->b_freeze_lock);
3878		if (hdr->b_freeze_cksum != NULL) {
3879			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3880			hdr->b_freeze_cksum = NULL;
3881		}
3882		mutex_exit(&hdr->b_freeze_lock);
3883	}
3884	arc_cksum_compute(buf, B_FALSE);
3885	hdr->b_flags |= ARC_IO_IN_PROGRESS;
3886}
3887
3888/*
3889 * The SPA calls this callback for each physical write that happens on behalf
3890 * of a logical write.  See the comment in dbuf_write_physdone() for details.
3891 */
3892static void
3893arc_write_physdone(zio_t *zio)
3894{
3895	arc_write_callback_t *cb = zio->io_private;
3896	if (cb->awcb_physdone != NULL)
3897		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
3898}
3899
3900static void
3901arc_write_done(zio_t *zio)
3902{
3903	arc_write_callback_t *callback = zio->io_private;
3904	arc_buf_t *buf = callback->awcb_buf;
3905	arc_buf_hdr_t *hdr = buf->b_hdr;
3906
3907	ASSERT(hdr->b_acb == NULL);
3908
3909	if (zio->io_error == 0) {
3910		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
3911			buf_discard_identity(hdr);
3912		} else {
3913			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3914			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3915			hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3916		}
3917	} else {
3918		ASSERT(BUF_EMPTY(hdr));
3919	}
3920
3921	/*
3922	 * If the block to be written was all-zero or compressed enough to be
3923	 * embedded in the BP, no write was performed so there will be no
3924	 * dva/birth/checksum.  The buffer must therefore remain anonymous
3925	 * (and uncached).
3926	 */
3927	if (!BUF_EMPTY(hdr)) {
3928		arc_buf_hdr_t *exists;
3929		kmutex_t *hash_lock;
3930
3931		ASSERT(zio->io_error == 0);
3932
3933		arc_cksum_verify(buf);
3934
3935		exists = buf_hash_insert(hdr, &hash_lock);
3936		if (exists) {
3937			/*
3938			 * This can only happen if we overwrite for
3939			 * sync-to-convergence, because we remove
3940			 * buffers from the hash table when we arc_free().
3941			 */
3942			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3943				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3944					panic("bad overwrite, hdr=%p exists=%p",
3945					    (void *)hdr, (void *)exists);
3946				ASSERT(refcount_is_zero(&exists->b_refcnt));
3947				arc_change_state(arc_anon, exists, hash_lock);
3948				mutex_exit(hash_lock);
3949				arc_hdr_destroy(exists);
3950				exists = buf_hash_insert(hdr, &hash_lock);
3951				ASSERT3P(exists, ==, NULL);
3952			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3953				/* nopwrite */
3954				ASSERT(zio->io_prop.zp_nopwrite);
3955				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3956					panic("bad nopwrite, hdr=%p exists=%p",
3957					    (void *)hdr, (void *)exists);
3958			} else {
3959				/* Dedup */
3960				ASSERT(hdr->b_datacnt == 1);
3961				ASSERT(hdr->b_state == arc_anon);
3962				ASSERT(BP_GET_DEDUP(zio->io_bp));
3963				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3964			}
3965		}
3966		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3967		/* if it's not anon, we are doing a scrub */
3968		if (!exists && hdr->b_state == arc_anon)
3969			arc_access(hdr, hash_lock);
3970		mutex_exit(hash_lock);
3971	} else {
3972		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3973	}
3974
3975	ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3976	callback->awcb_done(zio, buf, callback->awcb_private);
3977
3978	kmem_free(callback, sizeof (arc_write_callback_t));
3979}
3980
3981zio_t *
3982arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3983    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3984    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
3985    arc_done_func_t *done, void *private, zio_priority_t priority,
3986    int zio_flags, const zbookmark_phys_t *zb)
3987{
3988	arc_buf_hdr_t *hdr = buf->b_hdr;
3989	arc_write_callback_t *callback;
3990	zio_t *zio;
3991
3992	ASSERT(ready != NULL);
3993	ASSERT(done != NULL);
3994	ASSERT(!HDR_IO_ERROR(hdr));
3995	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3996	ASSERT(hdr->b_acb == NULL);
3997	if (l2arc)
3998		hdr->b_flags |= ARC_L2CACHE;
3999	if (l2arc_compress)
4000		hdr->b_flags |= ARC_L2COMPRESS;
4001	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
4002	callback->awcb_ready = ready;
4003	callback->awcb_physdone = physdone;
4004	callback->awcb_done = done;
4005	callback->awcb_private = private;
4006	callback->awcb_buf = buf;
4007
4008	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
4009	    arc_write_ready, arc_write_physdone, arc_write_done, callback,
4010	    priority, zio_flags, zb);
4011
4012	return (zio);
4013}
4014
4015static int
4016arc_memory_throttle(uint64_t reserve, uint64_t txg)
4017{
4018#ifdef _KERNEL
4019	uint64_t available_memory = ptob(freemem);
4020	static uint64_t page_load = 0;
4021	static uint64_t last_txg = 0;
4022
4023#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
4024	available_memory =
4025	    MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE)));
4026#endif
4027
4028	if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
4029		return (0);
4030
4031	if (txg > last_txg) {
4032		last_txg = txg;
4033		page_load = 0;
4034	}
4035	/*
4036	 * If we are in pageout, we know that memory is already tight and
4037	 * the ARC is already going to be evicting, so we just want to
4038	 * continue to let page writes occur as quickly as possible.
4039	 */
4040	if (curproc == pageproc) {
4041		if (page_load > MAX(ptob(minfree), available_memory) / 4)
4042			return (SET_ERROR(ERESTART));
4043		/* Note: reserve is inflated, so we deflate */
4044		page_load += reserve / 8;
4045		return (0);
4046	} else if (page_load > 0 && arc_reclaim_needed()) {
4047		/* memory is low, delay before restarting */
4048		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
4049		return (SET_ERROR(EAGAIN));
4050	}
4051	page_load = 0;
4052#endif
4053	return (0);
4054}
4055
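/*
 * Worked example of the throttle above (illustrative only; the real numbers
 * depend on freemem, minfree and the inflated reserve handed in by the DMU):
 * suppose available_memory is 256MB and ptob(minfree) is smaller, so the
 * pageout threshold is MAX(ptob(minfree), available_memory) / 4 = 64MB.
 * While curproc == pageproc, each call charges reserve / 8 against page_load;
 * once page_load crosses 64MB within a single txg the caller gets ERESTART
 * and must back off.  Outside of pageout, any non-zero page_load combined
 * with arc_reclaim_needed() yields EAGAIN instead.  page_load is reset when
 * a new txg is observed, and the whole check is skipped while freemem is
 * above arc_lotsfree_percent of physmem.
 */
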
4056void
4057arc_tempreserve_clear(uint64_t reserve)
4058{
4059	atomic_add_64(&arc_tempreserve, -reserve);
4060	ASSERT((int64_t)arc_tempreserve >= 0);
4061}
4062
4063int
4064arc_tempreserve_space(uint64_t reserve, uint64_t txg)
4065{
4066	int error;
4067	uint64_t anon_size;
4068
4069	if (reserve > arc_c/4 && !arc_no_grow) {
4070		arc_c = MIN(arc_c_max, reserve * 4);
4071		DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
4072	}
4073	if (reserve > arc_c)
4074		return (SET_ERROR(ENOMEM));
4075
4076	/*
4077	 * Don't count loaned bufs as in flight dirty data to prevent long
4078	 * network delays from blocking transactions that are ready to be
4079	 * assigned to a txg.
4080	 */
4081	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
4082
4083	/*
4084	 * Writes will, almost always, require additional memory allocations
4085	 * in order to compress/encrypt/etc the data.  We therefore need to
4086	 * make sure that there is sufficient available memory for this.
4087	 */
4088	error = arc_memory_throttle(reserve, txg);
4089	if (error != 0)
4090		return (error);
4091
4092	/*
4093	 * Throttle writes when the amount of dirty data in the cache
4094	 * gets too large.  We try to keep the cache less than half full
4095	 * of dirty blocks so that our sync times don't grow too large.
4096	 * Note: if two requests come in concurrently, we might let them
4097	 * both succeed, when one of them should fail.  Not a huge deal.
4098	 */
4099
4100	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
4101	    anon_size > arc_c / 4) {
4102		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
4103		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
4104		    arc_tempreserve>>10,
4105		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
4106		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
4107		    reserve>>10, arc_c>>10);
4108		return (SET_ERROR(ERESTART));
4109	}
4110	atomic_add_64(&arc_tempreserve, reserve);
4111	return (0);
4112}
4113
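/*
 * Worked example of the throttle in arc_tempreserve_space(), assuming
 * arc_c is 1GB (illustrative only):
 *
 *	arc_c / 2 = 512MB	limit on reserve + arc_tempreserve + anon_size
 *	arc_c / 4 = 256MB	limit on anon_size alone
 *
 * A reservation fails with ERESTART only when both limits would be exceeded
 * at once; a single large reservation (> arc_c / 4) first grows arc_c toward
 * arc_c_max (unless arc_no_grow is set), and anything larger than arc_c
 * outright returns ENOMEM.
 */
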
4114static kmutex_t arc_lowmem_lock;
4115#ifdef _KERNEL
4116static eventhandler_tag arc_event_lowmem = NULL;
4117
4118static void
4119arc_lowmem(void *arg __unused, int howto __unused)
4120{
4121
4122	/* Serialize access via arc_lowmem_lock. */
4123	mutex_enter(&arc_lowmem_lock);
4124	mutex_enter(&arc_reclaim_thr_lock);
4125	needfree = 1;
4126	DTRACE_PROBE(arc__needfree);
4127	cv_signal(&arc_reclaim_thr_cv);
4128
4129	/*
4130	 * It is unsafe to block here in arbitrary threads, because we can come
4131	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
4132	 * with the ARC reclaim thread.
4133	 */
4134	if (curproc == pageproc) {
4135		while (needfree)
4136			msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
4137	}
4138	mutex_exit(&arc_reclaim_thr_lock);
4139	mutex_exit(&arc_lowmem_lock);
4140}
4141#endif
4142
4143void
4144arc_init(void)
4145{
4146	int i, prefetch_tunable_set = 0;
4147
4148	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4149	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
4150	mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
4151
4152	/* Convert seconds to clock ticks */
4153	arc_min_prefetch_lifespan = 1 * hz;
4154
4155	/* Start out with 1/8 of all memory */
4156	arc_c = kmem_size() / 8;
4157
4158#ifdef sun
4159#ifdef _KERNEL
4160	/*
4161	 * On architectures where the physical memory can be larger
4162	 * than the addressable space (intel in 32-bit mode), we may
4163	 * need to limit the cache to 1/8 of VM size.
4164	 */
4165	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
4166#endif
4167#endif	/* sun */
4168	/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
4169	arc_c_min = MAX(arc_c / 4, 64<<18);
4170	/* set max to 5/8 of all memory, or all but 1GB, whichever is more */
4171	if (arc_c * 8 >= 1<<30)
4172		arc_c_max = (arc_c * 8) - (1<<30);
4173	else
4174		arc_c_max = arc_c_min;
4175	arc_c_max = MAX(arc_c * 5, arc_c_max);
4176
4177#ifdef _KERNEL
4178	/*
4179	 * Allow the tunables to override our calculations if they are
4180	 * reasonable (i.e. over 16MB)
4181	 */
4182	if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size())
4183		arc_c_max = zfs_arc_max;
4184	if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max)
4185		arc_c_min = zfs_arc_min;
4186#endif
4187
4188	arc_c = arc_c_max;
4189	arc_p = (arc_c >> 1);
4190
4191	/* limit meta-data to 1/4 of the arc capacity */
4192	arc_meta_limit = arc_c_max / 4;
4193
4194	/* Allow the tunable to override if it is reasonable */
4195	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4196		arc_meta_limit = zfs_arc_meta_limit;
4197
4198	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
4199		arc_c_min = arc_meta_limit / 2;
4200
4201	if (zfs_arc_grow_retry > 0)
4202		arc_grow_retry = zfs_arc_grow_retry;
4203
4204	if (zfs_arc_shrink_shift > 0)
4205		arc_shrink_shift = zfs_arc_shrink_shift;
4206
4207	if (zfs_arc_p_min_shift > 0)
4208		arc_p_min_shift = zfs_arc_p_min_shift;
4209
4210	/* if kmem_flags are set, let's try to use less memory */
4211	if (kmem_debugging())
4212		arc_c = arc_c / 2;
4213	if (arc_c < arc_c_min)
4214		arc_c = arc_c_min;
4215
4216	zfs_arc_min = arc_c_min;
4217	zfs_arc_max = arc_c_max;
4218
4219	arc_anon = &ARC_anon;
4220	arc_mru = &ARC_mru;
4221	arc_mru_ghost = &ARC_mru_ghost;
4222	arc_mfu = &ARC_mfu;
4223	arc_mfu_ghost = &ARC_mfu_ghost;
4224	arc_l2c_only = &ARC_l2c_only;
4225	arc_size = 0;
4226
4227	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4228		mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
4229		    NULL, MUTEX_DEFAULT, NULL);
4230		mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
4231		    NULL, MUTEX_DEFAULT, NULL);
4232		mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
4233		    NULL, MUTEX_DEFAULT, NULL);
4234		mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
4235		    NULL, MUTEX_DEFAULT, NULL);
4236		mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
4237		    NULL, MUTEX_DEFAULT, NULL);
4238		mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
4239		    NULL, MUTEX_DEFAULT, NULL);
4240
4241		list_create(&arc_mru->arcs_lists[i],
4242		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4243		list_create(&arc_mru_ghost->arcs_lists[i],
4244		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4245		list_create(&arc_mfu->arcs_lists[i],
4246		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4247		list_create(&arc_mfu_ghost->arcs_lists[i],
4248		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4251		list_create(&arc_l2c_only->arcs_lists[i],
4252		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4253	}
4254
4255	buf_init();
4256
4257	arc_thread_exit = 0;
4258	arc_eviction_list = NULL;
4259	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4260	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4261
4262	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4263	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4264
4265	if (arc_ksp != NULL) {
4266		arc_ksp->ks_data = &arc_stats;
4267		kstat_install(arc_ksp);
4268	}
4269
4270	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4271	    TS_RUN, minclsyspri);
4272
4273#ifdef _KERNEL
4274	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
4275	    EVENTHANDLER_PRI_FIRST);
4276#endif
4277
4278	arc_dead = FALSE;
4279	arc_warm = B_FALSE;
4280
4281	/*
4282	 * Calculate maximum amount of dirty data per pool.
4283	 *
4284	 * If it has been set by /etc/system, take that.
4285	 * Otherwise, use a percentage of physical memory defined by
4286	 * zfs_dirty_data_max_percent (default 10%) with a cap at
4287	 * zfs_dirty_data_max_max (default 4GB).
4288	 */
4289	if (zfs_dirty_data_max == 0) {
4290		zfs_dirty_data_max = ptob(physmem) *
4291		    zfs_dirty_data_max_percent / 100;
4292		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
4293		    zfs_dirty_data_max_max);
4294	}
4295
4296#ifdef _KERNEL
4297	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
4298		prefetch_tunable_set = 1;
4299
4300#ifdef __i386__
4301	if (prefetch_tunable_set == 0) {
4302		printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
4303		    "-- to enable,\n");
4304		printf("            add \"vfs.zfs.prefetch_disable=0\" "
4305		    "to /boot/loader.conf.\n");
4306		zfs_prefetch_disable = 1;
4307	}
4308#else
4309	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
4310	    prefetch_tunable_set == 0) {
4311		printf("ZFS NOTICE: Prefetch is disabled by default if less "
4312		    "than 4GB of RAM is present;\n"
4313		    "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
4314		    "to /boot/loader.conf.\n");
4315		zfs_prefetch_disable = 1;
4316	}
4317#endif
4318	/* Warn about ZFS memory and address space requirements. */
4319	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
4320		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
4321		    "expect unstable behavior.\n");
4322	}
4323	if (kmem_size() < 512 * (1 << 20)) {
4324		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
4325		    "expect unstable behavior.\n");
4326		printf("             Consider tuning vm.kmem_size and "
4327		    "vm.kmem_size_max\n");
4328		printf("             in /boot/loader.conf.\n");
4329	}
4330#endif
4331}
4332
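/*
 * Worked example of the sizing above, assuming kmem_size() reports 16GB,
 * no loader tunables are set and kmem debugging is off (illustrative only;
 * on FreeBSD vfs.zfs.arc_max and vfs.zfs.arc_min override these results
 * when they are reasonable):
 *
 *	arc_c           = 16GB / 8                   =  2GB
 *	arc_c_min       = MAX(arc_c / 4, 16MB)       =  512MB
 *	arc_c_max       = (arc_c * 8) - 1GB          =  15GB
 *	                  MAX(arc_c * 5, the above)  =  15GB
 *	arc_c           = arc_c_max                  =  15GB
 *	arc_p           = arc_c / 2                  =  7.5GB
 *	arc_meta_limit  = arc_c_max / 4              =  3.75GB
 *	arc_c_min       = arc_meta_limit / 2         =  1.875GB
 *	                  (raised because no vfs.zfs.arc_min was given)
 */
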
4333void
4334arc_fini(void)
4335{
4336	int i;
4337
4338	mutex_enter(&arc_reclaim_thr_lock);
4339	arc_thread_exit = 1;
4340	cv_signal(&arc_reclaim_thr_cv);
4341	while (arc_thread_exit != 0)
4342		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4343	mutex_exit(&arc_reclaim_thr_lock);
4344
4345	arc_flush(NULL);
4346
4347	arc_dead = TRUE;
4348
4349	if (arc_ksp != NULL) {
4350		kstat_delete(arc_ksp);
4351		arc_ksp = NULL;
4352	}
4353
4354	mutex_destroy(&arc_eviction_mtx);
4355	mutex_destroy(&arc_reclaim_thr_lock);
4356	cv_destroy(&arc_reclaim_thr_cv);
4357
4358	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4359		list_destroy(&arc_mru->arcs_lists[i]);
4360		list_destroy(&arc_mru_ghost->arcs_lists[i]);
4361		list_destroy(&arc_mfu->arcs_lists[i]);
4362		list_destroy(&arc_mfu_ghost->arcs_lists[i]);
4363		list_destroy(&arc_l2c_only->arcs_lists[i]);
4364
4365		mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
4366		mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
4367		mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
4368		mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
4369		mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
4370		mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
4371	}
4372
4373	buf_fini();
4374
4375	ASSERT(arc_loaned_bytes == 0);
4376
4377	mutex_destroy(&arc_lowmem_lock);
4378#ifdef _KERNEL
4379	if (arc_event_lowmem != NULL)
4380		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
4381#endif
4382}
4383
4384/*
4385 * Level 2 ARC
4386 *
4387 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
4388 * It uses dedicated storage devices to hold cached data, which are populated
4389 * using large infrequent writes.  The main role of this cache is to boost
4390 * the performance of random read workloads.  The intended L2ARC devices
4391 * include short-stroked disks, solid state disks, and other media with
4392 * substantially faster read latency than disk.
4393 *
4394 *                 +-----------------------+
4395 *                 |         ARC           |
4396 *                 +-----------------------+
4397 *                    |         ^     ^
4398 *                    |         |     |
4399 *      l2arc_feed_thread()    arc_read()
4400 *                    |         |     |
4401 *                    |  l2arc read   |
4402 *                    V         |     |
4403 *               +---------------+    |
4404 *               |     L2ARC     |    |
4405 *               +---------------+    |
4406 *                   |    ^           |
4407 *          l2arc_write() |           |
4408 *                   |    |           |
4409 *                   V    |           |
4410 *                 +-------+      +-------+
4411 *                 | vdev  |      | vdev  |
4412 *                 | cache |      | cache |
4413 *                 +-------+      +-------+
4414 *                 +=========+     .-----.
4415 *                 :  L2ARC  :    |-_____-|
4416 *                 : devices :    | Disks |
4417 *                 +=========+    `-_____-'
4418 *
4419 * Read requests are satisfied from the following sources, in order:
4420 *
4421 *	1) ARC
4422 *	2) vdev cache of L2ARC devices
4423 *	3) L2ARC devices
4424 *	4) vdev cache of disks
4425 *	5) disks
4426 *
4427 * Some L2ARC device types exhibit extremely slow write performance.
4428 * To accommodate this there are some significant differences between
4429 * the L2ARC and traditional cache design:
4430 *
4431 * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
4432 * the ARC behave as usual, freeing buffers and placing headers on ghost
4433 * lists.  The ARC does not send buffers to the L2ARC during eviction as
4434 * this would add inflated write latencies for all ARC memory pressure.
4435 *
4436 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4437 * It does this by periodically scanning buffers from the eviction-end of
4438 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4439 * not already there. It scans until a headroom of buffers is satisfied,
4440 * which itself is a buffer for ARC eviction. If a compressible buffer is
4441 * found during scanning and selected for writing to an L2ARC device, we
4442 * temporarily boost scanning headroom during the next scan cycle to make
4443 * sure we adapt to compression effects (which might significantly reduce
4444 * the data volume we write to L2ARC). The thread that does this is
4445 * l2arc_feed_thread(), illustrated below; example sizes are included to
4446 * provide a better sense of ratio than this diagram:
4447 *
4448 *	       head -->                        tail
4449 *	        +---------------------+----------+
4450 *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
4451 *	        +---------------------+----------+   |   o L2ARC eligible
4452 *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
4453 *	        +---------------------+----------+   |
4454 *	             15.9 Gbytes      ^ 32 Mbytes    |
4455 *	                           headroom          |
4456 *	                                      l2arc_feed_thread()
4457 *	                                             |
4458 *	                 l2arc write hand <--[oooo]--'
4459 *	                         |           8 Mbyte
4460 *	                         |          write max
4461 *	                         V
4462 *		  +==============================+
4463 *	L2ARC dev |####|#|###|###|    |####| ... |
4464 *	          +==============================+
4465 *	                     32 Gbytes
4466 *
4467 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4468 * evicted, then the L2ARC has cached a buffer much sooner than it probably
4469 * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
4470 * safe to say that this is an uncommon case, since buffers at the end of
4471 * the ARC lists have moved there due to inactivity.
4472 *
4473 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4474 * then the L2ARC simply misses copying some buffers.  This serves as a
4475 * pressure valve to prevent heavy read workloads from both stalling the ARC
4476 * with waits and clogging the L2ARC with writes.  This also helps prevent
4477 * the potential for the L2ARC to churn if it attempts to cache content too
4478 * quickly, such as during backups of the entire pool.
4479 *
4480 * 5. After system boot and before the ARC has filled main memory, there are
4481 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4482 * lists can remain mostly static.  Instead of searching from tail of these
4483 * lists can remain mostly static.  Instead of searching from the tail of these
4484 * for eligible buffers, greatly increasing its chance of finding them.
4485 *
4486 * The L2ARC device write speed is also boosted during this time so that
4487 * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
4488 * there are no L2ARC reads, and no fear of degrading read performance
4489 * through increased writes.
4490 *
4491 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4492 * the vdev queue can aggregate them into larger and fewer writes.  Each
4493 * device is written to in a rotor fashion, sweeping writes through
4494 * available space then repeating.
4495 *
4496 * 7. The L2ARC does not store dirty content.  It never needs to flush
4497 * write buffers back to disk based storage.
4498 *
4499 * 8. If an ARC buffer is written (and dirtied) which also exists in the
4500 * L2ARC, the now stale L2ARC buffer is immediately dropped.
4501 *
4502 * The performance of the L2ARC can be tweaked by a number of tunables, which
4503 * may be necessary for different workloads:
4504 *
4505 *	l2arc_write_max		max write bytes per interval
4506 *	l2arc_write_boost	extra write bytes during device warmup
4507 *	l2arc_noprefetch	skip caching prefetched buffers
4508 *	l2arc_headroom		number of max device writes to precache
4509 *	l2arc_headroom_boost	when we find compressed buffers during ARC
4510 *				scanning, we multiply headroom by this
4511 *				percentage factor for the next scan cycle,
4512 *				since more compressed buffers are likely to
4513 *				be present
4514 *	l2arc_feed_secs		seconds between L2ARC writing
4515 *
4516 * Tunables may be removed or added as future performance improvements are
4517 * integrated, and also may become zpool properties.
4518 *
4519 * There are three key functions that control how the L2ARC warms up:
4520 *
4521 *	l2arc_write_eligible()	check if a buffer is eligible to cache
4522 *	l2arc_write_size()	calculate how much to write
4523 *	l2arc_write_interval()	calculate sleep delay between writes
4524 *
4525 * These three functions determine what to write, how much, and how quickly
4526 * to send writes.
4527 */
4528
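/*
 * On FreeBSD the tunables listed above are exported as vfs.zfs.l2arc_*
 * loader tunables and sysctls (declared earlier in this file).  A
 * hypothetical /boot/loader.conf fragment for a fast, high-endurance cache
 * device might look like:
 *
 *	vfs.zfs.l2arc_write_max="16777216"	# 16MB per feed interval
 *	vfs.zfs.l2arc_write_boost="33554432"	# 32MB while the ARC is cold
 *	vfs.zfs.l2arc_noprefetch="0"		# also cache prefetched buffers
 *
 * These values are purely illustrative; suitable settings depend on the
 * throughput and write endurance of the cache device.
 */
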
4529static boolean_t
4530l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4531{
4532	/*
4533	 * A buffer is *not* eligible for the L2ARC if it:
4534	 * 1. belongs to a different spa.
4535	 * 2. is already cached on the L2ARC.
4536	 * 3. has an I/O in progress (it may be an incomplete read).
4537	 * 4. is flagged not eligible (zfs property).
4538	 */
4539	if (ab->b_spa != spa_guid) {
4540		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
4541		return (B_FALSE);
4542	}
4543	if (ab->b_l2hdr != NULL) {
4544		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
4545		return (B_FALSE);
4546	}
4547	if (HDR_IO_IN_PROGRESS(ab)) {
4548		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
4549		return (B_FALSE);
4550	}
4551	if (!HDR_L2CACHE(ab)) {
4552		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
4553		return (B_FALSE);
4554	}
4555
4556	return (B_TRUE);
4557}
4558
4559static uint64_t
4560l2arc_write_size(void)
4561{
4562	uint64_t size;
4563
4564	/*
4565	 * Make sure our globals have meaningful values in case the user
4566	 * altered them.
4567	 */
4568	size = l2arc_write_max;
4569	if (size == 0) {
4570		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4571		    "be greater than zero, resetting it to the default (%d)",
4572		    L2ARC_WRITE_SIZE);
4573		size = l2arc_write_max = L2ARC_WRITE_SIZE;
4574	}
4575
4576	if (arc_warm == B_FALSE)
4577		size += l2arc_write_boost;
4578
4579	return (size);
4580
4581}
4582
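/*
 * For example, assuming the historical defaults suggested by the diagram
 * above (l2arc_write_max and l2arc_write_boost both 8MB), this returns
 * 8MB per feed interval once the ARC is warm and 16MB while arc_warm is
 * still B_FALSE; a zero l2arc_write_max is treated as a misconfiguration
 * and reset to L2ARC_WRITE_SIZE first.
 */
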
4583static clock_t
4584l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4585{
4586	clock_t interval, next, now;
4587
4588	/*
4589	 * If the ARC lists are busy, increase our write rate; if the
4590	 * lists are stale, idle back.  This is achieved by checking
4591	 * how much we previously wrote - if it was more than half of
4592	 * what we wanted, schedule the next write much sooner.
4593	 */
4594	if (l2arc_feed_again && wrote > (wanted / 2))
4595		interval = (hz * l2arc_feed_min_ms) / 1000;
4596	else
4597		interval = hz * l2arc_feed_secs;
4598
4599	now = ddi_get_lbolt();
4600	next = MAX(now, MIN(now + interval, began + interval));
4601
4602	return (next);
4603}
4604
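/*
 * Worked example, assuming the usual defaults of l2arc_feed_secs = 1 and
 * l2arc_feed_min_ms = 200 (illustrative only): a feed cycle that wrote
 * more than half of what it wanted is rescheduled 200ms after it began,
 * while a cycle that found little to write sleeps a full second.  The
 * MAX(now, MIN(...)) clamp keeps the next wakeup from ever being scheduled
 * in the past, even if the write itself outlasted the interval.
 */
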
4605static void
4606l2arc_hdr_stat_add(void)
4607{
4608	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4609	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4610}
4611
4612static void
4613l2arc_hdr_stat_remove(void)
4614{
4615	ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4616	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4617}
4618
4619/*
4620 * Cycle through L2ARC devices.  This is how L2ARC load balances.
4621 * If a device is returned, this also returns holding the spa config lock.
4622 */
4623static l2arc_dev_t *
4624l2arc_dev_get_next(void)
4625{
4626	l2arc_dev_t *first, *next = NULL;
4627
4628	/*
4629	 * Lock out the removal of spas (spa_namespace_lock), then removal
4630	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4631	 * both locks will be dropped and a spa config lock held instead.
4632	 */
4633	mutex_enter(&spa_namespace_lock);
4634	mutex_enter(&l2arc_dev_mtx);
4635
4636	/* if there are no vdevs, there is nothing to do */
4637	if (l2arc_ndev == 0)
4638		goto out;
4639
4640	first = NULL;
4641	next = l2arc_dev_last;
4642	do {
4643		/* loop around the list looking for a non-faulted vdev */
4644		if (next == NULL) {
4645			next = list_head(l2arc_dev_list);
4646		} else {
4647			next = list_next(l2arc_dev_list, next);
4648			if (next == NULL)
4649				next = list_head(l2arc_dev_list);
4650		}
4651
4652		/* if we have come back to the start, bail out */
4653		if (first == NULL)
4654			first = next;
4655		else if (next == first)
4656			break;
4657
4658	} while (vdev_is_dead(next->l2ad_vdev));
4659
4660	/* if we were unable to find any usable vdevs, return NULL */
4661	if (vdev_is_dead(next->l2ad_vdev))
4662		next = NULL;
4663
4664	l2arc_dev_last = next;
4665
4666out:
4667	mutex_exit(&l2arc_dev_mtx);
4668
4669	/*
4670	 * Grab the config lock to prevent the 'next' device from being
4671	 * removed while we are writing to it.
4672	 */
4673	if (next != NULL)
4674		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4675	mutex_exit(&spa_namespace_lock);
4676
4677	return (next);
4678}
4679
4680/*
4681 * Free buffers that were tagged for destruction.
4682 */
4683static void
4684l2arc_do_free_on_write()
4685{
4686	list_t *buflist;
4687	l2arc_data_free_t *df, *df_prev;
4688
4689	mutex_enter(&l2arc_free_on_write_mtx);
4690	buflist = l2arc_free_on_write;
4691
4692	for (df = list_tail(buflist); df; df = df_prev) {
4693		df_prev = list_prev(buflist, df);
4694		ASSERT(df->l2df_data != NULL);
4695		ASSERT(df->l2df_func != NULL);
4696		df->l2df_func(df->l2df_data, df->l2df_size);
4697		list_remove(buflist, df);
4698		kmem_free(df, sizeof (l2arc_data_free_t));
4699	}
4700
4701	mutex_exit(&l2arc_free_on_write_mtx);
4702}
4703
4704/*
4705 * A write to a cache device has completed.  Update all headers to allow
4706 * reads from these buffers to begin.
4707 */
4708static void
4709l2arc_write_done(zio_t *zio)
4710{
4711	l2arc_write_callback_t *cb;
4712	l2arc_dev_t *dev;
4713	list_t *buflist;
4714	arc_buf_hdr_t *head, *ab, *ab_prev;
4715	l2arc_buf_hdr_t *abl2;
4716	kmutex_t *hash_lock;
4717	int64_t bytes_dropped = 0;
4718
4719	cb = zio->io_private;
4720	ASSERT(cb != NULL);
4721	dev = cb->l2wcb_dev;
4722	ASSERT(dev != NULL);
4723	head = cb->l2wcb_head;
4724	ASSERT(head != NULL);
4725	buflist = dev->l2ad_buflist;
4726	ASSERT(buflist != NULL);
4727	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4728	    l2arc_write_callback_t *, cb);
4729
4730	if (zio->io_error != 0)
4731		ARCSTAT_BUMP(arcstat_l2_writes_error);
4732
4733	mutex_enter(&l2arc_buflist_mtx);
4734
4735	/*
4736	 * All writes completed, or an error was hit.
4737	 */
4738	for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4739		ab_prev = list_prev(buflist, ab);
4740		abl2 = ab->b_l2hdr;
4741
4742		/*
4743		 * Release the temporary compressed buffer as soon as possible.
4744		 */
4745		if (abl2->b_compress != ZIO_COMPRESS_OFF)
4746			l2arc_release_cdata_buf(ab);
4747
4748		hash_lock = HDR_LOCK(ab);
4749		if (!mutex_tryenter(hash_lock)) {
4750			/*
4751			 * This buffer misses out.  It may be in a stage
4752			 * of eviction.  Its ARC_L2_WRITING flag will be
4753			 * left set, denying reads to this buffer.
4754			 */
4755			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4756			continue;
4757		}
4758
4759		if (zio->io_error != 0) {
4760			/*
4761			 * Error - drop L2ARC entry.
4762			 */
4763			list_remove(buflist, ab);
4764			ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4765			bytes_dropped += abl2->b_asize;
4766			ab->b_l2hdr = NULL;
4767			trim_map_free(abl2->b_dev->l2ad_vdev, abl2->b_daddr,
4768			    ab->b_size, 0);
4769			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4770			ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4771		}
4772
4773		/*
4774		 * Allow ARC to begin reads to this L2ARC entry.
4775		 */
4776		ab->b_flags &= ~ARC_L2_WRITING;
4777
4778		mutex_exit(hash_lock);
4779	}
4780
4781	atomic_inc_64(&l2arc_writes_done);
4782	list_remove(buflist, head);
4783	kmem_cache_free(hdr_cache, head);
4784	mutex_exit(&l2arc_buflist_mtx);
4785
4786	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
4787
4788	l2arc_do_free_on_write();
4789
4790	kmem_free(cb, sizeof (l2arc_write_callback_t));
4791}
4792
4793/*
4794 * A read to a cache device completed.  Validate buffer contents before
4795 * handing over to the regular ARC routines.
4796 */
4797static void
4798l2arc_read_done(zio_t *zio)
4799{
4800	l2arc_read_callback_t *cb;
4801	arc_buf_hdr_t *hdr;
4802	arc_buf_t *buf;
4803	kmutex_t *hash_lock;
4804	int equal;
4805
4806	ASSERT(zio->io_vd != NULL);
4807	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4808
4809	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4810
4811	cb = zio->io_private;
4812	ASSERT(cb != NULL);
4813	buf = cb->l2rcb_buf;
4814	ASSERT(buf != NULL);
4815
4816	hash_lock = HDR_LOCK(buf->b_hdr);
4817	mutex_enter(hash_lock);
4818	hdr = buf->b_hdr;
4819	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4820
4821	/*
4822	 * If the buffer was compressed, decompress it first.
4823	 */
4824	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4825		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4826	ASSERT(zio->io_data != NULL);
4827
4828	/*
4829	 * Check this survived the L2ARC journey.
4830	 */
4831	equal = arc_cksum_equal(buf);
4832	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4833		mutex_exit(hash_lock);
4834		zio->io_private = buf;
4835		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
4836		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
4837		arc_read_done(zio);
4838	} else {
4839		mutex_exit(hash_lock);
4840		/*
4841		 * Buffer didn't survive caching.  Increment stats and
4842		 * reissue to the original storage device.
4843		 */
4844		if (zio->io_error != 0) {
4845			ARCSTAT_BUMP(arcstat_l2_io_error);
4846		} else {
4847			zio->io_error = SET_ERROR(EIO);
4848		}
4849		if (!equal)
4850			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4851
4852		/*
4853		 * If there's no waiter, issue an async i/o to the primary
4854		 * storage now.  If there *is* a waiter, the caller must
4855		 * issue the i/o in a context where it's OK to block.
4856		 */
4857		if (zio->io_waiter == NULL) {
4858			zio_t *pio = zio_unique_parent(zio);
4859
4860			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4861
4862			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4863			    buf->b_data, zio->io_size, arc_read_done, buf,
4864			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4865		}
4866	}
4867
4868	kmem_free(cb, sizeof (l2arc_read_callback_t));
4869}
4870
4871/*
4872 * This is the list priority from which the L2ARC will search for pages to
4873 * cache.  This is used within loops (0 .. 2 * ARC_BUFC_NUMLISTS - 1) to cycle
4874 * through lists in the desired order.  This order can have a significant
4875 * effect on cache performance.
4876 *
4877 * Currently the metadata lists are hit first, MFU then MRU, followed by
4878 * the data lists.  This function returns a locked list, and also returns
4879 * the lock pointer.
4880 */
4881static list_t *
4882l2arc_list_locked(int list_num, kmutex_t **lock)
4883{
4884	list_t *list = NULL;
4885	int idx;
4886
4887	ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
4888
4889	if (list_num < ARC_BUFC_NUMMETADATALISTS) {
4890		idx = list_num;
4891		list = &arc_mfu->arcs_lists[idx];
4892		*lock = ARCS_LOCK(arc_mfu, idx);
4893	} else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
4894		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4895		list = &arc_mru->arcs_lists[idx];
4896		*lock = ARCS_LOCK(arc_mru, idx);
4897	} else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
4898		ARC_BUFC_NUMDATALISTS)) {
4899		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4900		list = &arc_mfu->arcs_lists[idx];
4901		*lock = ARCS_LOCK(arc_mfu, idx);
4902	} else {
4903		idx = list_num - ARC_BUFC_NUMLISTS;
4904		list = &arc_mru->arcs_lists[idx];
4905		*lock = ARCS_LOCK(arc_mru, idx);
4906	}
4907
4908	ASSERT(!(MUTEX_HELD(*lock)));
4909	mutex_enter(*lock);
4910	return (list);
4911}
4912
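/*
 * Sketch of the mapping implemented above, writing M for
 * ARC_BUFC_NUMMETADATALISTS and D for ARC_BUFC_NUMDATALISTS
 * (so M + D == ARC_BUFC_NUMLISTS):
 *
 *	list_num		list returned
 *	[0, M)			arc_mfu->arcs_lists[list_num]		(metadata)
 *	[M, 2M)			arc_mru->arcs_lists[list_num - M]	(metadata)
 *	[2M, 2M + D)		arc_mfu->arcs_lists[list_num - M]	(data)
 *	[2M + D, 2(M + D))	arc_mru->arcs_lists[list_num - (M + D)]	(data)
 *
 * A caller that iterates list_num over [0, 2 * ARC_BUFC_NUMLISTS) therefore
 * visits every MFU and MRU sub-list exactly once, metadata before data.
 */
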
4913/*
4914 * Evict buffers from the device write hand to the distance specified in
4915 * bytes.  This distance may span populated buffers, or it may span nothing.
4916 * This is clearing a region on the L2ARC device ready for writing.
4917 * If the 'all' boolean is set, every buffer is evicted.
4918 */
4919static void
4920l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4921{
4922	list_t *buflist;
4923	l2arc_buf_hdr_t *abl2;
4924	arc_buf_hdr_t *ab, *ab_prev;
4925	kmutex_t *hash_lock;
4926	uint64_t taddr;
4927	int64_t bytes_evicted = 0;
4928
4929	buflist = dev->l2ad_buflist;
4930
4931	if (buflist == NULL)
4932		return;
4933
4934	if (!all && dev->l2ad_first) {
4935		/*
4936		 * This is the first sweep through the device.  There is
4937		 * nothing to evict.
4938		 */
4939		return;
4940	}
4941
4942	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4943		/*
4944		 * When nearing the end of the device, evict to the end
4945		 * before the device write hand jumps to the start.
4946		 */
4947		taddr = dev->l2ad_end;
4948	} else {
4949		taddr = dev->l2ad_hand + distance;
4950	}
4951	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4952	    uint64_t, taddr, boolean_t, all);
4953
4954top:
4955	mutex_enter(&l2arc_buflist_mtx);
4956	for (ab = list_tail(buflist); ab; ab = ab_prev) {
4957		ab_prev = list_prev(buflist, ab);
4958
4959		hash_lock = HDR_LOCK(ab);
4960		if (!mutex_tryenter(hash_lock)) {
4961			/*
4962			 * Missed the hash lock.  Retry.
4963			 */
4964			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4965			mutex_exit(&l2arc_buflist_mtx);
4966			mutex_enter(hash_lock);
4967			mutex_exit(hash_lock);
4968			goto top;
4969		}
4970
4971		if (HDR_L2_WRITE_HEAD(ab)) {
4972			/*
4973			 * We hit a write head node.  Leave it for
4974			 * l2arc_write_done().
4975			 */
4976			list_remove(buflist, ab);
4977			mutex_exit(hash_lock);
4978			continue;
4979		}
4980
4981		if (!all && ab->b_l2hdr != NULL &&
4982		    (ab->b_l2hdr->b_daddr > taddr ||
4983		    ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4984			/*
4985			 * We've evicted to the target address,
4986			 * or the end of the device.
4987			 */
4988			mutex_exit(hash_lock);
4989			break;
4990		}
4991
4992		if (HDR_FREE_IN_PROGRESS(ab)) {
4993			/*
4994			 * Already on the path to destruction.
4995			 */
4996			mutex_exit(hash_lock);
4997			continue;
4998		}
4999
5000		if (ab->b_state == arc_l2c_only) {
5001			ASSERT(!HDR_L2_READING(ab));
5002			/*
5003			 * This doesn't exist in the ARC.  Destroy.
5004			 * arc_hdr_destroy() will call list_remove()
5005			 * and decrement arcstat_l2_size.
5006			 */
5007			arc_change_state(arc_anon, ab, hash_lock);
5008			arc_hdr_destroy(ab);
5009		} else {
5010			/*
5011			 * Invalidate issued or about to be issued
5012			 * reads, since we may be about to write
5013			 * over this location.
5014			 */
5015			if (HDR_L2_READING(ab)) {
5016				ARCSTAT_BUMP(arcstat_l2_evict_reading);
5017				ab->b_flags |= ARC_L2_EVICTED;
5018			}
5019
5020			/*
5021			 * Tell ARC this no longer exists in L2ARC.
5022			 */
5023			if (ab->b_l2hdr != NULL) {
5024				abl2 = ab->b_l2hdr;
5025				ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
5026				bytes_evicted += abl2->b_asize;
5027				ab->b_l2hdr = NULL;
5028				/*
5029				 * We are destroying l2hdr, so ensure that
5030				 * its compressed buffer, if any, is not leaked.
5031				 */
5032				ASSERT(abl2->b_tmp_cdata == NULL);
5033				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
5034				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
5035			}
5036			list_remove(buflist, ab);
5037
5038			/*
5039			 * This may have been leftover after a
5040			 * failed write.
5041			 */
5042			ab->b_flags &= ~ARC_L2_WRITING;
5043		}
5044		mutex_exit(hash_lock);
5045	}
5046	mutex_exit(&l2arc_buflist_mtx);
5047
5048	vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0);
5049	dev->l2ad_evict = taddr;
5050}
5051
5052/*
5053 * Find and write ARC buffers to the L2ARC device.
5054 *
5055 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
5056 * for reading until they have completed writing.
5057 * The headroom_boost is an in-out parameter used to maintain headroom boost
5058 * state between calls to this function.
5059 *
5060 * Returns the number of bytes actually written (which may be smaller than
5061 * the delta by which the device hand has changed due to alignment).
5062 */
5063static uint64_t
5064l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
5065    boolean_t *headroom_boost)
5066{
5067	arc_buf_hdr_t *ab, *ab_prev, *head;
5068	list_t *list;
5069	uint64_t write_asize, write_psize, write_sz, headroom,
5070	    buf_compress_minsz;
5071	void *buf_data;
5072	kmutex_t *list_lock;
5073	boolean_t full;
5074	l2arc_write_callback_t *cb;
5075	zio_t *pio, *wzio;
5076	uint64_t guid = spa_load_guid(spa);
5077	const boolean_t do_headroom_boost = *headroom_boost;
5078	int try;
5079
5080	ASSERT(dev->l2ad_vdev != NULL);
5081
5082	/* Lower the flag now, we might want to raise it again later. */
5083	*headroom_boost = B_FALSE;
5084
5085	pio = NULL;
5086	write_sz = write_asize = write_psize = 0;
5087	full = B_FALSE;
5088	head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
5089	head->b_flags |= ARC_L2_WRITE_HEAD;
5090
5091	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
5092	/*
5093	 * We will want to try to compress buffers that are at least 2x the
5094	 * device sector size.
5095	 */
5096	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
5097
5098	/*
5099	 * Copy buffers for L2ARC writing.
5100	 */
5101	mutex_enter(&l2arc_buflist_mtx);
5102	for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
5103		uint64_t passed_sz = 0;
5104
5105		list = l2arc_list_locked(try, &list_lock);
5106		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
5107
5108		/*
5109		 * L2ARC fast warmup.
5110		 *
5111		 * Until the ARC is warm and starts to evict, read from the
5112		 * head of the ARC lists rather than the tail.
5113		 */
5114		if (arc_warm == B_FALSE)
5115			ab = list_head(list);
5116		else
5117			ab = list_tail(list);
5118		if (ab == NULL)
5119			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
5120
5121		headroom = target_sz * l2arc_headroom * 2 / ARC_BUFC_NUMLISTS;
5122		if (do_headroom_boost)
5123			headroom = (headroom * l2arc_headroom_boost) / 100;
5124
5125		for (; ab; ab = ab_prev) {
5126			l2arc_buf_hdr_t *l2hdr;
5127			kmutex_t *hash_lock;
5128			uint64_t buf_sz;
5129
5130			if (arc_warm == B_FALSE)
5131				ab_prev = list_next(list, ab);
5132			else
5133				ab_prev = list_prev(list, ab);
5134			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size);
5135
5136			hash_lock = HDR_LOCK(ab);
5137			if (!mutex_tryenter(hash_lock)) {
5138				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
5139				/*
5140				 * Skip this buffer rather than waiting.
5141				 */
5142				continue;
5143			}
5144
5145			passed_sz += ab->b_size;
5146			if (passed_sz > headroom) {
5147				/*
5148				 * Searched too far.
5149				 */
5150				mutex_exit(hash_lock);
5151				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
5152				break;
5153			}
5154
5155			if (!l2arc_write_eligible(guid, ab)) {
5156				mutex_exit(hash_lock);
5157				continue;
5158			}
5159
5160			if ((write_sz + ab->b_size) > target_sz) {
5161				full = B_TRUE;
5162				mutex_exit(hash_lock);
5163				ARCSTAT_BUMP(arcstat_l2_write_full);
5164				break;
5165			}
5166
5167			if (pio == NULL) {
5168				/*
5169				 * Insert a dummy header on the buflist so
5170				 * l2arc_write_done() can find where the
5171				 * write buffers begin without searching.
5172				 */
5173				list_insert_head(dev->l2ad_buflist, head);
5174
5175				cb = kmem_alloc(
5176				    sizeof (l2arc_write_callback_t), KM_SLEEP);
5177				cb->l2wcb_dev = dev;
5178				cb->l2wcb_head = head;
5179				pio = zio_root(spa, l2arc_write_done, cb,
5180				    ZIO_FLAG_CANFAIL);
5181				ARCSTAT_BUMP(arcstat_l2_write_pios);
5182			}
5183
5184			/*
5185			 * Create and add a new L2ARC header.
5186			 */
5187			l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
5188			l2hdr->b_dev = dev;
5189			ab->b_flags |= ARC_L2_WRITING;
5190
5191			/*
5192			 * Temporarily stash the data buffer in b_tmp_cdata.
5193			 * The subsequent write step will pick it up from
5194			 * there. This is because we can't access ab->b_buf
5195			 * without holding the hash_lock, which we in turn
5196			 * can't access without holding the ARC list locks
5197			 * (which we want to avoid during compression/writing).
5198			 */
5199			l2hdr->b_compress = ZIO_COMPRESS_OFF;
5200			l2hdr->b_asize = ab->b_size;
5201			l2hdr->b_tmp_cdata = ab->b_buf->b_data;
5202
5203			buf_sz = ab->b_size;
5204			ab->b_l2hdr = l2hdr;
5205
5206			list_insert_head(dev->l2ad_buflist, ab);
5207
5208			/*
5209			 * Compute and store the buffer cksum before
5210			 * writing.  On debug the cksum is verified first.
5211			 */
5212			arc_cksum_verify(ab->b_buf);
5213			arc_cksum_compute(ab->b_buf, B_TRUE);
5214
5215			mutex_exit(hash_lock);
5216
5217			write_sz += buf_sz;
5218		}
5219
5220		mutex_exit(list_lock);
5221
5222		if (full == B_TRUE)
5223			break;
5224	}
5225
5226	/* No buffers selected for writing? */
5227	if (pio == NULL) {
5228		ASSERT0(write_sz);
5229		mutex_exit(&l2arc_buflist_mtx);
5230		kmem_cache_free(hdr_cache, head);
5231		return (0);
5232	}
5233
5234	/*
5235	 * Now start writing the buffers. We're starting at the write head
5236	 * and work backwards, retracing the course of the buffer selector
5237	 * loop above.
5238	 */
5239	for (ab = list_prev(dev->l2ad_buflist, head); ab;
5240	    ab = list_prev(dev->l2ad_buflist, ab)) {
5241		l2arc_buf_hdr_t *l2hdr;
5242		uint64_t buf_sz;
5243
5244		/*
5245		 * We shouldn't need to lock the buffer here, since we flagged
5246		 * it as ARC_L2_WRITING in the previous step, but we must take
5247		 * care to only access its L2 cache parameters. In particular,
5248		 * ab->b_buf may be invalid by now due to ARC eviction.
5249		 */
5250		l2hdr = ab->b_l2hdr;
5251		l2hdr->b_daddr = dev->l2ad_hand;
5252
5253		if ((ab->b_flags & ARC_L2COMPRESS) &&
5254		    l2hdr->b_asize >= buf_compress_minsz) {
5255			if (l2arc_compress_buf(l2hdr)) {
5256				/*
5257				 * If compression succeeded, enable headroom
5258				 * boost on the next scan cycle.
5259				 */
5260				*headroom_boost = B_TRUE;
5261			}
5262		}
5263
5264		/*
5265		 * Pick up the buffer data we had previously stashed away
5266		 * (and now potentially also compressed).
5267		 */
5268		buf_data = l2hdr->b_tmp_cdata;
5269		buf_sz = l2hdr->b_asize;
5270
5271		/*
5272		 * If the data was not compressed, clear b_tmp_cdata: it still
5273		 * points at the ARC buffer's own b_data, and b_tmp_cdata must
5274		 * only ever refer to a temporary compression buffer.
5275		 */
5276		if (!L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress))
5277			l2hdr->b_tmp_cdata = NULL;
5278
5279		/* Compression may have squashed the buffer to zero length. */
5280		if (buf_sz != 0) {
5281			uint64_t buf_p_sz;
5282
5283			wzio = zio_write_phys(pio, dev->l2ad_vdev,
5284			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5285			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5286			    ZIO_FLAG_CANFAIL, B_FALSE);
5287
5288			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5289			    zio_t *, wzio);
5290			(void) zio_nowait(wzio);
5291
5292			write_asize += buf_sz;
5293			/*
5294			 * Keep the clock hand suitably device-aligned.
5295			 */
5296			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5297			write_psize += buf_p_sz;
5298			dev->l2ad_hand += buf_p_sz;
5299		}
5300	}
5301
5302	mutex_exit(&l2arc_buflist_mtx);
5303
5304	ASSERT3U(write_asize, <=, target_sz);
5305	ARCSTAT_BUMP(arcstat_l2_writes_sent);
5306	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5307	ARCSTAT_INCR(arcstat_l2_size, write_sz);
5308	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
5309	vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
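	/*
	 * Note that arcstat_l2_size counts logical (uncompressed) bytes,
	 * while arcstat_l2_asize and the vdev space update above use
	 * write_asize, the post-compression bytes actually issued to the
	 * device.
	 */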
5310
5311	/*
5312	 * Bump device hand to the device start if it is approaching the end.
5313	 * l2arc_evict() will already have evicted ahead for this case.
5314	 */
5315	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
5316		dev->l2ad_hand = dev->l2ad_start;
5317		dev->l2ad_evict = dev->l2ad_start;
5318		dev->l2ad_first = B_FALSE;
5319	}
5320
5321	dev->l2ad_writing = B_TRUE;
5322	(void) zio_wait(pio);
5323	dev->l2ad_writing = B_FALSE;
5324
5325	return (write_asize);
5326}
5327
5328/*
5329 * Compresses an L2ARC buffer.
5330 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
5331 * size in l2hdr->b_asize. This routine tries to compress the data and
5332 * depending on the compression result there are three possible outcomes:
5333 * *) The buffer was incompressible. The original l2hdr contents were left
5334 *    untouched and are ready for writing to an L2 device.
5335 * *) The buffer was all-zeros, so there is no need to write it to an L2
5336 *    device. To indicate this situation, b_tmp_cdata is NULL'ed, b_asize is
5337 *    set to zero, and b_compress is set to ZIO_COMPRESS_EMPTY.
5338 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
5339 *    data buffer which holds the compressed data to be written, and b_asize
5340 *    tells us how much data there is. b_compress is set to the appropriate
5341 *    compression algorithm. Once writing is done, invoke
5342 *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
5343 *
5344 * Returns B_TRUE if compression succeeded (including the all-zeros case),
5345 * or B_FALSE if it didn't (the buffer was incompressible).
5346 */
5347static boolean_t
5348l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
5349{
5350	void *cdata;
5351	size_t csize, len, rounded;
5352
5353	ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
5354	ASSERT(l2hdr->b_tmp_cdata != NULL);
5355
5356	len = l2hdr->b_asize;
5357	cdata = zio_data_buf_alloc(len);
5358	csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
5359	    cdata, l2hdr->b_asize);
5360
5361	if (csize == 0) {
5362		/* zero block, indicate that there's nothing to write */
5363		zio_data_buf_free(cdata, len);
5364		l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
5365		l2hdr->b_asize = 0;
5366		l2hdr->b_tmp_cdata = NULL;
5367		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
5368		return (B_TRUE);
5369	}
5370
5371	rounded = P2ROUNDUP(csize,
5372	    (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift);
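	/*
	 * For example, with vdev_ashift == 9 (512-byte sectors) a csize of
	 * 1300 bytes rounds up to 1536; the padding is zero-filled below so
	 * that the full aligned block can be written out.
	 */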
5373	if (rounded < len) {
5374		/*
5375		 * Compression succeeded; we'll keep the cdata around for
5376		 * writing and release it afterwards.
5377		 */
5378		if (rounded > csize) {
5379			bzero((char *)cdata + csize, rounded - csize);
5380			csize = rounded;
5381		}
5382		l2hdr->b_compress = ZIO_COMPRESS_LZ4;
5383		l2hdr->b_asize = csize;
5384		l2hdr->b_tmp_cdata = cdata;
5385		ARCSTAT_BUMP(arcstat_l2_compress_successes);
5386		return (B_TRUE);
5387	} else {
5388		/*
5389		 * Compression did not shrink the on-device size; release the
5390		 * temporary compression buffer and leave l2hdr unmodified.
5391		 */
5392		zio_data_buf_free(cdata, len);
5393		ARCSTAT_BUMP(arcstat_l2_compress_failures);
5394		return (B_FALSE);
5395	}
5396}
5397
5398/*
5399 * Decompresses a zio read back from an l2arc device. On success, the
5400 * underlying zio's io_data buffer is overwritten by the uncompressed
5401 * version. On decompression error (corrupt compressed stream), the
5402 * zio->io_error value is set to signal an I/O error.
5403 *
5404 * Please note that the compressed data stream is not checksummed, so
5405 * if the underlying device is experiencing data corruption, we may feed
5406 * corrupt data to the decompressor; it therefore needs to be
5407 * able to handle this situation (LZ4 does).
5408 */
5409static void
5410l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
5411{
5412	ASSERT(L2ARC_IS_VALID_COMPRESS(c));
5413
5414	if (zio->io_error != 0) {
5415		/*
5416		 * An I/O error has occurred; just restore the original I/O
5417		 * size in preparation for a main pool read.
5418		 */
5419		zio->io_orig_size = zio->io_size = hdr->b_size;
5420		return;
5421	}
5422
5423	if (c == ZIO_COMPRESS_EMPTY) {
5424		/*
5425		 * An empty buffer results in a null zio, which means we
5426		 * need to fill its io_data after we're done restoring the
5427		 * buffer's contents.
5428		 */
5429		ASSERT(hdr->b_buf != NULL);
5430		bzero(hdr->b_buf->b_data, hdr->b_size);
5431		zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
5432	} else {
5433		ASSERT(zio->io_data != NULL);
5434		/*
5435		 * We copy the compressed data from the start of the arc buffer
5436		 * (the zio_read will have pulled in only what we need, the
5437		 * rest is garbage which we will overwrite at decompression)
5438		 * and then decompress back to the ARC data buffer. This way we
5439		 * can minimize copying by simply decompressing back over the
5440		 * original compressed data (rather than decompressing to an
5441		 * aux buffer and then copying back the uncompressed buffer,
5442		 * which is likely to be much larger).
5443		 */
5444		uint64_t csize;
5445		void *cdata;
5446
5447		csize = zio->io_size;
5448		cdata = zio_data_buf_alloc(csize);
5449		bcopy(zio->io_data, cdata, csize);
5450		if (zio_decompress_data(c, cdata, zio->io_data, csize,
5451		    hdr->b_size) != 0)
5452			zio->io_error = EIO;
5453		zio_data_buf_free(cdata, csize);
5454	}
5455
5456	/* Restore the expected uncompressed IO size. */
5457	zio->io_orig_size = zio->io_size = hdr->b_size;
5458}
5459
5460/*
5461 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5462 * This buffer serves as a temporary holder of compressed data while
5463 * the buffer entry is being written to an l2arc device. Once that is
5464 * done, we can dispose of it.
5465 */
5466static void
5467l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
5468{
5469	l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
5470
5471	ASSERT(L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress));
5472	if (l2hdr->b_compress != ZIO_COMPRESS_EMPTY) {
5473		/*
5474		 * If the data was compressed, then we've allocated a
5475		 * temporary buffer for it, so now we need to release it.
5476		 */
5477		ASSERT(l2hdr->b_tmp_cdata != NULL);
5478		zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
5479		l2hdr->b_tmp_cdata = NULL;
5480	} else {
5481		ASSERT(l2hdr->b_tmp_cdata == NULL);
5482	}
5483}
5484
5485/*
5486 * This thread feeds the L2ARC at regular intervals.  This is the beating
5487 * heart of the L2ARC.
5488 */
5489static void
5490l2arc_feed_thread(void *dummy __unused)
5491{
5492	callb_cpr_t cpr;
5493	l2arc_dev_t *dev;
5494	spa_t *spa;
5495	uint64_t size, wrote;
5496	clock_t begin, next = ddi_get_lbolt();
5497	boolean_t headroom_boost = B_FALSE;
5498
5499	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5500
5501	mutex_enter(&l2arc_feed_thr_lock);
5502
5503	while (l2arc_thread_exit == 0) {
5504		CALLB_CPR_SAFE_BEGIN(&cpr);
5505		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
5506		    next - ddi_get_lbolt());
5507		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
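		/*
		 * Default to waking up again in roughly one second (hz
		 * ticks); if a device is actually fed below,
		 * l2arc_write_interval() recomputes the next wakeup based on
		 * how much was written.
		 */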
5508		next = ddi_get_lbolt() + hz;
5509
5510		/*
5511		 * Quick check for L2ARC devices.
5512		 */
5513		mutex_enter(&l2arc_dev_mtx);
5514		if (l2arc_ndev == 0) {
5515			mutex_exit(&l2arc_dev_mtx);
5516			continue;
5517		}
5518		mutex_exit(&l2arc_dev_mtx);
5519		begin = ddi_get_lbolt();
5520
5521		/*
5522		 * This selects the next l2arc device to write to, and in
5523		 * doing so the next spa to feed from: dev->l2ad_spa.   This
5524		 * will return NULL if there are now no l2arc devices or if
5525		 * they are all faulted.
5526		 *
5527		 * If a device is returned, its spa's config lock is also
5528		 * held to prevent device removal.  l2arc_dev_get_next()
5529		 * will grab and release l2arc_dev_mtx.
5530		 */
5531		if ((dev = l2arc_dev_get_next()) == NULL)
5532			continue;
5533
5534		spa = dev->l2ad_spa;
5535		ASSERT(spa != NULL);
5536
5537		/*
5538		 * If the pool is read-only then force the feed thread to
5539		 * sleep a little longer (five times the normal feed interval).
5540		 */
5541		if (!spa_writeable(spa)) {
5542			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5543			spa_config_exit(spa, SCL_L2ARC, dev);
5544			continue;
5545		}
5546
5547		/*
5548		 * Avoid contributing to memory pressure.
5549		 */
5550		if (arc_reclaim_needed()) {
5551			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5552			spa_config_exit(spa, SCL_L2ARC, dev);
5553			continue;
5554		}
5555
5556		ARCSTAT_BUMP(arcstat_l2_feeds);
5557
5558		size = l2arc_write_size();
5559
5560		/*
5561		 * Evict L2ARC buffers that will be overwritten.
5562		 */
5563		l2arc_evict(dev, size, B_FALSE);
5564
5565		/*
5566		 * Write ARC buffers.
5567		 */
5568		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5569
5570		/*
5571		 * Calculate interval between writes.
5572		 */
5573		next = l2arc_write_interval(begin, size, wrote);
5574		spa_config_exit(spa, SCL_L2ARC, dev);
5575	}
5576
5577	l2arc_thread_exit = 0;
5578	cv_broadcast(&l2arc_feed_thr_cv);
5579	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
5580	thread_exit();
5581}
5582
5583boolean_t
5584l2arc_vdev_present(vdev_t *vd)
5585{
5586	l2arc_dev_t *dev;
5587
5588	mutex_enter(&l2arc_dev_mtx);
5589	for (dev = list_head(l2arc_dev_list); dev != NULL;
5590	    dev = list_next(l2arc_dev_list, dev)) {
5591		if (dev->l2ad_vdev == vd)
5592			break;
5593	}
5594	mutex_exit(&l2arc_dev_mtx);
5595
5596	return (dev != NULL);
5597}
5598
5599/*
5600 * Add a vdev for use by the L2ARC.  By this point the spa has already
5601 * validated the vdev and opened it.
5602 */
5603void
5604l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5605{
5606	l2arc_dev_t *adddev;
5607
5608	ASSERT(!l2arc_vdev_present(vd));
5609
5610	vdev_ashift_optimize(vd);
5611
5612	/*
5613	 * Create a new l2arc device entry.
5614	 */
5615	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5616	adddev->l2ad_spa = spa;
5617	adddev->l2ad_vdev = vd;
5618	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5619	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5620	adddev->l2ad_hand = adddev->l2ad_start;
5621	adddev->l2ad_evict = adddev->l2ad_start;
5622	adddev->l2ad_first = B_TRUE;
5623	adddev->l2ad_writing = B_FALSE;
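	/*
	 * Writes rotate through [l2ad_start, l2ad_end), which skips the
	 * leading vdev labels; l2ad_hand is the clock hand and wraps back
	 * to l2ad_start in l2arc_write_buffers() as it nears the end.
	 */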
5624
5625	/*
5626	 * This is a list of all ARC buffers that are still valid on the
5627	 * device.
5628	 */
5629	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5630	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5631	    offsetof(arc_buf_hdr_t, b_l2node));
5632
5633	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5634
5635	/*
5636	 * Add device to global list
5637	 */
5638	mutex_enter(&l2arc_dev_mtx);
5639	list_insert_head(l2arc_dev_list, adddev);
5640	atomic_inc_64(&l2arc_ndev);
5641	mutex_exit(&l2arc_dev_mtx);
5642}
5643
5644/*
5645 * Remove a vdev from the L2ARC.
5646 */
5647void
5648l2arc_remove_vdev(vdev_t *vd)
5649{
5650	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5651
5652	/*
5653	 * Find the device by vdev
5654	 */
5655	mutex_enter(&l2arc_dev_mtx);
5656	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5657		nextdev = list_next(l2arc_dev_list, dev);
5658		if (vd == dev->l2ad_vdev) {
5659			remdev = dev;
5660			break;
5661		}
5662	}
5663	ASSERT(remdev != NULL);
5664
5665	/*
5666	 * Remove device from global list
5667	 */
5668	list_remove(l2arc_dev_list, remdev);
5669	l2arc_dev_last = NULL;		/* may have been invalidated */
5670	atomic_dec_64(&l2arc_ndev);
5671	mutex_exit(&l2arc_dev_mtx);
5672
5673	/*
5674	 * Clear all buflists and ARC references.  L2ARC device flush.
5675	 */
5676	l2arc_evict(remdev, 0, B_TRUE);
5677	list_destroy(remdev->l2ad_buflist);
5678	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5679	kmem_free(remdev, sizeof (l2arc_dev_t));
5680}
5681
5682void
5683l2arc_init(void)
5684{
5685	l2arc_thread_exit = 0;
5686	l2arc_ndev = 0;
5687	l2arc_writes_sent = 0;
5688	l2arc_writes_done = 0;
5689
5690	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5691	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5692	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5693	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5694	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5695
5696	l2arc_dev_list = &L2ARC_dev_list;
5697	l2arc_free_on_write = &L2ARC_free_on_write;
5698	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5699	    offsetof(l2arc_dev_t, l2ad_node));
5700	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5701	    offsetof(l2arc_data_free_t, l2df_list_node));
5702}
5703
5704void
5705l2arc_fini(void)
5706{
5707	/*
5708	 * This is called from dmu_fini(), which is called from spa_fini().
5709	 * Because of this, we can assume that all l2arc devices have
5710	 * already been removed when the pools themselves were removed.
5711	 */
5712
5713	l2arc_do_free_on_write();
5714
5715	mutex_destroy(&l2arc_feed_thr_lock);
5716	cv_destroy(&l2arc_feed_thr_cv);
5717	mutex_destroy(&l2arc_dev_mtx);
5718	mutex_destroy(&l2arc_buflist_mtx);
5719	mutex_destroy(&l2arc_free_on_write_mtx);
5720
5721	list_destroy(l2arc_dev_list);
5722	list_destroy(l2arc_free_on_write);
5723}
5724
5725void
5726l2arc_start(void)
5727{
5728	if (!(spa_mode_global & FWRITE))
5729		return;
5730
5731	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5732	    TS_RUN, minclsyspri);
5733}
5734
5735void
5736l2arc_stop(void)
5737{
5738	if (!(spa_mode_global & FWRITE))
5739		return;
5740
5741	mutex_enter(&l2arc_feed_thr_lock);
5742	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
5743	l2arc_thread_exit = 1;
5744	while (l2arc_thread_exit != 0)
5745		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5746	mutex_exit(&l2arc_feed_thr_lock);
5747}
5748