arc.c revision 286598
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
24 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
25 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
26 * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
27 */
28
29/*
30 * DVA-based Adjustable Replacement Cache
31 *
32 * While much of the theory of operation used here is
33 * based on the self-tuning, low overhead replacement cache
34 * presented by Megiddo and Modha at FAST 2003, there are some
35 * significant differences:
36 *
37 * 1. The Megiddo and Modha model assumes any page is evictable.
38 * Pages in its cache cannot be "locked" into memory.  This makes
39 * the eviction algorithm simple: evict the last page in the list.
40 * This also make the performance characteristics easy to reason
41 * about.  Our cache is not so simple.  At any given moment, some
42 * subset of the blocks in the cache are un-evictable because we
43 * have handed out a reference to them.  Blocks are only evictable
44 * when there are no external references active.  This makes
45 * eviction far more problematic:  we choose to evict the evictable
46 * blocks that are the "lowest" in the list.
47 *
48 * There are times when it is not possible to evict the requested
49 * space.  In these circumstances we are unable to adjust the cache
50 * size.  To prevent the cache growing unbounded at these times we
51 * implement a "cache throttle" that slows the flow of new data
52 * into the cache until we can make space available.
53 *
54 * 2. The Megiddo and Modha model assumes a fixed cache size.
55 * Pages are evicted when the cache is full and there is a cache
56 * miss.  Our model has a variable sized cache.  It grows with
57 * high use, but also tries to react to memory pressure from the
58 * operating system: decreasing its size when system memory is
59 * tight.
60 *
61 * 3. The Megiddo and Modha model assumes a fixed page size. All
62 * elements of the cache are therefore exactly the same size.  So
63 * when adjusting the cache size following a cache miss, its simply
64 * a matter of choosing a single page to evict.  In our model, we
65 * have variable sized cache blocks (rangeing from 512 bytes to
66 * 128K bytes).  We therefore choose a set of blocks to evict to make
67 * space for a cache miss that approximates as closely as possible
68 * the space used by the new block.
69 *
70 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
71 * by N. Megiddo & D. Modha, FAST 2003
72 */
73
74/*
75 * The locking model:
76 *
77 * A new reference to a cache buffer can be obtained in two
78 * ways: 1) via a hash table lookup using the DVA as a key,
79 * or 2) via one of the ARC lists.  The arc_read() interface
80 * uses method 1, while the internal arc algorithms for
81 * adjusting the cache use method 2.  We therefore provide two
82 * types of locks: 1) the hash table lock array, and 2) the
83 * arc list locks.
84 *
85 * Buffers do not have their own mutexs, rather they rely on the
86 * hash table mutexs for the bulk of their protection (i.e. most
87 * fields in the arc_buf_hdr_t are protected by these mutexs).
88 *
89 * buf_hash_find() returns the appropriate mutex (held) when it
90 * locates the requested buffer in the hash table.  It returns
91 * NULL for the mutex if the buffer was not in the table.
92 *
93 * buf_hash_remove() expects the appropriate hash mutex to be
94 * already held before it is invoked.
95 *
96 * Each arc state also has a mutex which is used to protect the
97 * buffer list associated with the state.  When attempting to
98 * obtain a hash table lock while holding an arc list lock you
99 * must use: mutex_tryenter() to avoid deadlock.  Also note that
100 * the active state mutex must be held before the ghost state mutex.
101 *
102 * Arc buffers may have an associated eviction callback function.
103 * This function will be invoked prior to removing the buffer (e.g.
104 * in arc_do_user_evicts()).  Note however that the data associated
105 * with the buffer may be evicted prior to the callback.  The callback
106 * must be made with *no locks held* (to prevent deadlock).  Additionally,
107 * the users of callbacks must ensure that their private data is
108 * protected from simultaneous callbacks from arc_clear_callback()
109 * and arc_do_user_evicts().
110 *
111 * Note that the majority of the performance stats are manipulated
112 * with atomic operations.
113 *
114 * The L2ARC uses the l2ad_mtx on each vdev for the following:
115 *
116 *	- L2ARC buflist creation
117 *	- L2ARC buflist eviction
118 *	- L2ARC write completion, which walks L2ARC buflists
119 *	- ARC header destruction, as it removes from L2ARC buflists
120 *	- ARC header release, as it removes from L2ARC buflists
121 */
122
123#include <sys/spa.h>
124#include <sys/zio.h>
125#include <sys/zio_compress.h>
126#include <sys/zfs_context.h>
127#include <sys/arc.h>
128#include <sys/refcount.h>
129#include <sys/vdev.h>
130#include <sys/vdev_impl.h>
131#include <sys/dsl_pool.h>
132#ifdef _KERNEL
133#include <sys/dnlc.h>
134#endif
135#include <sys/callb.h>
136#include <sys/kstat.h>
137#include <sys/trim_map.h>
138#include <zfs_fletcher.h>
139#include <sys/sdt.h>
140
141#include <vm/vm_pageout.h>
142#include <machine/vmparam.h>
143
144#ifdef illumos
145#ifndef _KERNEL
146/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
147boolean_t arc_watch = B_FALSE;
148int arc_procfd;
149#endif
150#endif /* illumos */
151
152static kmutex_t		arc_reclaim_thr_lock;
153static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
154static uint8_t		arc_thread_exit;
155
156#define	ARC_REDUCE_DNLC_PERCENT	3
157uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
158
159typedef enum arc_reclaim_strategy {
160	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
161	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
162} arc_reclaim_strategy_t;
163
164/*
165 * The number of iterations through arc_evict_*() before we
166 * drop & reacquire the lock.
167 */
168int arc_evict_iterations = 100;
169
170/* number of seconds before growing cache again */
171static int		arc_grow_retry = 60;
172
173/* shift of arc_c for calculating both min and max arc_p */
174static int		arc_p_min_shift = 4;
175
176/* log2(fraction of arc to reclaim) */
177static int		arc_shrink_shift = 5;
178
179/*
180 * minimum lifespan of a prefetch block in clock ticks
181 * (initialized in arc_init())
182 */
183static int		arc_min_prefetch_lifespan;
184
185/*
186 * If this percent of memory is free, don't throttle.
187 */
188int arc_lotsfree_percent = 10;
189
190static int arc_dead;
191extern int zfs_prefetch_disable;
192
193/*
194 * The arc has filled available memory and has now warmed up.
195 */
196static boolean_t arc_warm;
197
198uint64_t zfs_arc_max;
199uint64_t zfs_arc_min;
200uint64_t zfs_arc_meta_limit = 0;
201uint64_t zfs_arc_meta_min = 0;
202int zfs_arc_grow_retry = 0;
203int zfs_arc_shrink_shift = 0;
204int zfs_arc_p_min_shift = 0;
205int zfs_disable_dup_eviction = 0;
206uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
207u_int zfs_arc_free_target = 0;
208
209static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
210static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
211
212#ifdef _KERNEL
213static void
214arc_free_target_init(void *unused __unused)
215{
216
217	zfs_arc_free_target = vm_pageout_wakeup_thresh;
218}
219SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
220    arc_free_target_init, NULL);
221
222TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
223TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
224TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
225SYSCTL_DECL(_vfs_zfs);
226SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
227    "Maximum ARC size");
228SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
229    "Minimum ARC size");
230SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
231    &zfs_arc_average_blocksize, 0,
232    "ARC average blocksize");
233SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
234    &arc_shrink_shift, 0,
235    "log2(fraction of arc to reclaim)");
236
237/*
238 * We don't have a tunable for arc_free_target due to the dependency on
239 * pagedaemon initialisation.
240 */
241SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
242    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
243    sysctl_vfs_zfs_arc_free_target, "IU",
244    "Desired number of free pages below which ARC triggers reclaim");
245
246static int
247sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
248{
249	u_int val;
250	int err;
251
252	val = zfs_arc_free_target;
253	err = sysctl_handle_int(oidp, &val, 0, req);
254	if (err != 0 || req->newptr == NULL)
255		return (err);
256
257	if (val < minfree)
258		return (EINVAL);
259	if (val > vm_cnt.v_page_count)
260		return (EINVAL);
261
262	zfs_arc_free_target = val;
263
264	return (0);
265}
266
267/*
268 * Must be declared here, before the definition of corresponding kstat
269 * macro which uses the same names will confuse the compiler.
270 */
271SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
272    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
273    sysctl_vfs_zfs_arc_meta_limit, "QU",
274    "ARC metadata limit");
275#endif
276
277/*
278 * Note that buffers can be in one of 6 states:
279 *	ARC_anon	- anonymous (discussed below)
280 *	ARC_mru		- recently used, currently cached
281 *	ARC_mru_ghost	- recentely used, no longer in cache
282 *	ARC_mfu		- frequently used, currently cached
283 *	ARC_mfu_ghost	- frequently used, no longer in cache
284 *	ARC_l2c_only	- exists in L2ARC but not other states
285 * When there are no active references to the buffer, they are
286 * are linked onto a list in one of these arc states.  These are
287 * the only buffers that can be evicted or deleted.  Within each
288 * state there are multiple lists, one for meta-data and one for
289 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
290 * etc.) is tracked separately so that it can be managed more
291 * explicitly: favored over data, limited explicitly.
292 *
293 * Anonymous buffers are buffers that are not associated with
294 * a DVA.  These are buffers that hold dirty block copies
295 * before they are written to stable storage.  By definition,
296 * they are "ref'd" and are considered part of arc_mru
297 * that cannot be freed.  Generally, they will aquire a DVA
298 * as they are written and migrate onto the arc_mru list.
299 *
300 * The ARC_l2c_only state is for buffers that are in the second
301 * level ARC but no longer in any of the ARC_m* lists.  The second
302 * level ARC itself may also contain buffers that are in any of
303 * the ARC_m* states - meaning that a buffer can exist in two
304 * places.  The reason for the ARC_l2c_only state is to keep the
305 * buffer header in the hash table, so that reads that hit the
306 * second level ARC benefit from these fast lookups.
307 */
308
309#define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
310struct arcs_lock {
311	kmutex_t	arcs_lock;
312#ifdef _KERNEL
313	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
314#endif
315};
316
317/*
318 * must be power of two for mask use to work
319 *
320 */
321#define ARC_BUFC_NUMDATALISTS		16
322#define ARC_BUFC_NUMMETADATALISTS	16
323#define ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
324
325typedef struct arc_state {
326	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
327	uint64_t arcs_size;	/* total amount of data in this state */
328	list_t	arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
329	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
330} arc_state_t;
331
332#define ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))
333
334/* The 6 states: */
335static arc_state_t ARC_anon;
336static arc_state_t ARC_mru;
337static arc_state_t ARC_mru_ghost;
338static arc_state_t ARC_mfu;
339static arc_state_t ARC_mfu_ghost;
340static arc_state_t ARC_l2c_only;
341
342typedef struct arc_stats {
343	kstat_named_t arcstat_hits;
344	kstat_named_t arcstat_misses;
345	kstat_named_t arcstat_demand_data_hits;
346	kstat_named_t arcstat_demand_data_misses;
347	kstat_named_t arcstat_demand_metadata_hits;
348	kstat_named_t arcstat_demand_metadata_misses;
349	kstat_named_t arcstat_prefetch_data_hits;
350	kstat_named_t arcstat_prefetch_data_misses;
351	kstat_named_t arcstat_prefetch_metadata_hits;
352	kstat_named_t arcstat_prefetch_metadata_misses;
353	kstat_named_t arcstat_mru_hits;
354	kstat_named_t arcstat_mru_ghost_hits;
355	kstat_named_t arcstat_mfu_hits;
356	kstat_named_t arcstat_mfu_ghost_hits;
357	kstat_named_t arcstat_allocated;
358	kstat_named_t arcstat_deleted;
359	kstat_named_t arcstat_stolen;
360	kstat_named_t arcstat_recycle_miss;
361	/*
362	 * Number of buffers that could not be evicted because the hash lock
363	 * was held by another thread.  The lock may not necessarily be held
364	 * by something using the same buffer, since hash locks are shared
365	 * by multiple buffers.
366	 */
367	kstat_named_t arcstat_mutex_miss;
368	/*
369	 * Number of buffers skipped because they have I/O in progress, are
370	 * indrect prefetch buffers that have not lived long enough, or are
371	 * not from the spa we're trying to evict from.
372	 */
373	kstat_named_t arcstat_evict_skip;
374	kstat_named_t arcstat_evict_l2_cached;
375	kstat_named_t arcstat_evict_l2_eligible;
376	kstat_named_t arcstat_evict_l2_ineligible;
377	kstat_named_t arcstat_hash_elements;
378	kstat_named_t arcstat_hash_elements_max;
379	kstat_named_t arcstat_hash_collisions;
380	kstat_named_t arcstat_hash_chains;
381	kstat_named_t arcstat_hash_chain_max;
382	kstat_named_t arcstat_p;
383	kstat_named_t arcstat_c;
384	kstat_named_t arcstat_c_min;
385	kstat_named_t arcstat_c_max;
386	kstat_named_t arcstat_size;
387	/*
388	 * Number of bytes consumed by internal ARC structures necessary
389	 * for tracking purposes; these structures are not actually
390	 * backed by ARC buffers. This includes arc_buf_hdr_t structures
391	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
392	 * caches), and arc_buf_t structures (allocated via arc_buf_t
393	 * cache).
394	 */
395	kstat_named_t arcstat_hdr_size;
396	/*
397	 * Number of bytes consumed by ARC buffers of type equal to
398	 * ARC_BUFC_DATA. This is generally consumed by buffers backing
399	 * on disk user data (e.g. plain file contents).
400	 */
401	kstat_named_t arcstat_data_size;
402	/*
403	 * Number of bytes consumed by ARC buffers of type equal to
404	 * ARC_BUFC_METADATA. This is generally consumed by buffers
405	 * backing on disk data that is used for internal ZFS
406	 * structures (e.g. ZAP, dnode, indirect blocks, etc).
407	 */
408	kstat_named_t arcstat_metadata_size;
409	/*
410	 * Number of bytes consumed by various buffers and structures
411	 * not actually backed with ARC buffers. This includes bonus
412	 * buffers (allocated directly via zio_buf_* functions),
413	 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
414	 * cache), and dnode_t structures (allocated via dnode_t cache).
415	 */
416	kstat_named_t arcstat_other_size;
417	/*
418	 * Total number of bytes consumed by ARC buffers residing in the
419	 * arc_anon state. This includes *all* buffers in the arc_anon
420	 * state; e.g. data, metadata, evictable, and unevictable buffers
421	 * are all included in this value.
422	 */
423	kstat_named_t arcstat_anon_size;
424	/*
425	 * Number of bytes consumed by ARC buffers that meet the
426	 * following criteria: backing buffers of type ARC_BUFC_DATA,
427	 * residing in the arc_anon state, and are eligible for eviction
428	 * (e.g. have no outstanding holds on the buffer).
429	 */
430	kstat_named_t arcstat_anon_evictable_data;
431	/*
432	 * Number of bytes consumed by ARC buffers that meet the
433	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
434	 * residing in the arc_anon state, and are eligible for eviction
435	 * (e.g. have no outstanding holds on the buffer).
436	 */
437	kstat_named_t arcstat_anon_evictable_metadata;
438	/*
439	 * Total number of bytes consumed by ARC buffers residing in the
440	 * arc_mru state. This includes *all* buffers in the arc_mru
441	 * state; e.g. data, metadata, evictable, and unevictable buffers
442	 * are all included in this value.
443	 */
444	kstat_named_t arcstat_mru_size;
445	/*
446	 * Number of bytes consumed by ARC buffers that meet the
447	 * following criteria: backing buffers of type ARC_BUFC_DATA,
448	 * residing in the arc_mru state, and are eligible for eviction
449	 * (e.g. have no outstanding holds on the buffer).
450	 */
451	kstat_named_t arcstat_mru_evictable_data;
452	/*
453	 * Number of bytes consumed by ARC buffers that meet the
454	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
455	 * residing in the arc_mru state, and are eligible for eviction
456	 * (e.g. have no outstanding holds on the buffer).
457	 */
458	kstat_named_t arcstat_mru_evictable_metadata;
459	/*
460	 * Total number of bytes that *would have been* consumed by ARC
461	 * buffers in the arc_mru_ghost state. The key thing to note
462	 * here, is the fact that this size doesn't actually indicate
463	 * RAM consumption. The ghost lists only consist of headers and
464	 * don't actually have ARC buffers linked off of these headers.
465	 * Thus, *if* the headers had associated ARC buffers, these
466	 * buffers *would have* consumed this number of bytes.
467	 */
468	kstat_named_t arcstat_mru_ghost_size;
469	/*
470	 * Number of bytes that *would have been* consumed by ARC
471	 * buffers that are eligible for eviction, of type
472	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
473	 */
474	kstat_named_t arcstat_mru_ghost_evictable_data;
475	/*
476	 * Number of bytes that *would have been* consumed by ARC
477	 * buffers that are eligible for eviction, of type
478	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
479	 */
480	kstat_named_t arcstat_mru_ghost_evictable_metadata;
481	/*
482	 * Total number of bytes consumed by ARC buffers residing in the
483	 * arc_mfu state. This includes *all* buffers in the arc_mfu
484	 * state; e.g. data, metadata, evictable, and unevictable buffers
485	 * are all included in this value.
486	 */
487	kstat_named_t arcstat_mfu_size;
488	/*
489	 * Number of bytes consumed by ARC buffers that are eligible for
490	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
491	 * state.
492	 */
493	kstat_named_t arcstat_mfu_evictable_data;
494	/*
495	 * Number of bytes consumed by ARC buffers that are eligible for
496	 * eviction, of type ARC_BUFC_METADATA, and reside in the
497	 * arc_mfu state.
498	 */
499	kstat_named_t arcstat_mfu_evictable_metadata;
500	/*
501	 * Total number of bytes that *would have been* consumed by ARC
502	 * buffers in the arc_mfu_ghost state. See the comment above
503	 * arcstat_mru_ghost_size for more details.
504	 */
505	kstat_named_t arcstat_mfu_ghost_size;
506	/*
507	 * Number of bytes that *would have been* consumed by ARC
508	 * buffers that are eligible for eviction, of type
509	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
510	 */
511	kstat_named_t arcstat_mfu_ghost_evictable_data;
512	/*
513	 * Number of bytes that *would have been* consumed by ARC
514	 * buffers that are eligible for eviction, of type
515	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
516	 */
517	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
518	kstat_named_t arcstat_l2_hits;
519	kstat_named_t arcstat_l2_misses;
520	kstat_named_t arcstat_l2_feeds;
521	kstat_named_t arcstat_l2_rw_clash;
522	kstat_named_t arcstat_l2_read_bytes;
523	kstat_named_t arcstat_l2_write_bytes;
524	kstat_named_t arcstat_l2_writes_sent;
525	kstat_named_t arcstat_l2_writes_done;
526	kstat_named_t arcstat_l2_writes_error;
527	kstat_named_t arcstat_l2_writes_hdr_miss;
528	kstat_named_t arcstat_l2_evict_lock_retry;
529	kstat_named_t arcstat_l2_evict_reading;
530	kstat_named_t arcstat_l2_evict_l1cached;
531	kstat_named_t arcstat_l2_free_on_write;
532	kstat_named_t arcstat_l2_cdata_free_on_write;
533	kstat_named_t arcstat_l2_abort_lowmem;
534	kstat_named_t arcstat_l2_cksum_bad;
535	kstat_named_t arcstat_l2_io_error;
536	kstat_named_t arcstat_l2_size;
537	kstat_named_t arcstat_l2_asize;
538	kstat_named_t arcstat_l2_hdr_size;
539	kstat_named_t arcstat_l2_compress_successes;
540	kstat_named_t arcstat_l2_compress_zeros;
541	kstat_named_t arcstat_l2_compress_failures;
542	kstat_named_t arcstat_l2_write_trylock_fail;
543	kstat_named_t arcstat_l2_write_passed_headroom;
544	kstat_named_t arcstat_l2_write_spa_mismatch;
545	kstat_named_t arcstat_l2_write_in_l2;
546	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
547	kstat_named_t arcstat_l2_write_not_cacheable;
548	kstat_named_t arcstat_l2_write_full;
549	kstat_named_t arcstat_l2_write_buffer_iter;
550	kstat_named_t arcstat_l2_write_pios;
551	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
552	kstat_named_t arcstat_l2_write_buffer_list_iter;
553	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
554	kstat_named_t arcstat_memory_throttle_count;
555	kstat_named_t arcstat_duplicate_buffers;
556	kstat_named_t arcstat_duplicate_buffers_size;
557	kstat_named_t arcstat_duplicate_reads;
558	kstat_named_t arcstat_meta_used;
559	kstat_named_t arcstat_meta_limit;
560	kstat_named_t arcstat_meta_max;
561	kstat_named_t arcstat_meta_min;
562} arc_stats_t;
563
564static arc_stats_t arc_stats = {
565	{ "hits",			KSTAT_DATA_UINT64 },
566	{ "misses",			KSTAT_DATA_UINT64 },
567	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
568	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
569	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
570	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
571	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
572	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
573	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
574	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
575	{ "mru_hits",			KSTAT_DATA_UINT64 },
576	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
577	{ "mfu_hits",			KSTAT_DATA_UINT64 },
578	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
579	{ "allocated",			KSTAT_DATA_UINT64 },
580	{ "deleted",			KSTAT_DATA_UINT64 },
581	{ "stolen",			KSTAT_DATA_UINT64 },
582	{ "recycle_miss",		KSTAT_DATA_UINT64 },
583	{ "mutex_miss",			KSTAT_DATA_UINT64 },
584	{ "evict_skip",			KSTAT_DATA_UINT64 },
585	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
586	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
587	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
588	{ "hash_elements",		KSTAT_DATA_UINT64 },
589	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
590	{ "hash_collisions",		KSTAT_DATA_UINT64 },
591	{ "hash_chains",		KSTAT_DATA_UINT64 },
592	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
593	{ "p",				KSTAT_DATA_UINT64 },
594	{ "c",				KSTAT_DATA_UINT64 },
595	{ "c_min",			KSTAT_DATA_UINT64 },
596	{ "c_max",			KSTAT_DATA_UINT64 },
597	{ "size",			KSTAT_DATA_UINT64 },
598	{ "hdr_size",			KSTAT_DATA_UINT64 },
599	{ "data_size",			KSTAT_DATA_UINT64 },
600	{ "metadata_size",		KSTAT_DATA_UINT64 },
601	{ "other_size",			KSTAT_DATA_UINT64 },
602	{ "anon_size",			KSTAT_DATA_UINT64 },
603	{ "anon_evictable_data",	KSTAT_DATA_UINT64 },
604	{ "anon_evictable_metadata",	KSTAT_DATA_UINT64 },
605	{ "mru_size",			KSTAT_DATA_UINT64 },
606	{ "mru_evictable_data",		KSTAT_DATA_UINT64 },
607	{ "mru_evictable_metadata",	KSTAT_DATA_UINT64 },
608	{ "mru_ghost_size",		KSTAT_DATA_UINT64 },
609	{ "mru_ghost_evictable_data",	KSTAT_DATA_UINT64 },
610	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
611	{ "mfu_size",			KSTAT_DATA_UINT64 },
612	{ "mfu_evictable_data",		KSTAT_DATA_UINT64 },
613	{ "mfu_evictable_metadata",	KSTAT_DATA_UINT64 },
614	{ "mfu_ghost_size",		KSTAT_DATA_UINT64 },
615	{ "mfu_ghost_evictable_data",	KSTAT_DATA_UINT64 },
616	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
617	{ "l2_hits",			KSTAT_DATA_UINT64 },
618	{ "l2_misses",			KSTAT_DATA_UINT64 },
619	{ "l2_feeds",			KSTAT_DATA_UINT64 },
620	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
621	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
622	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
623	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
624	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
625	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
626	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
627	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
628	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
629	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
630	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
631	{ "l2_cdata_free_on_write",	KSTAT_DATA_UINT64 },
632	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
633	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
634	{ "l2_io_error",		KSTAT_DATA_UINT64 },
635	{ "l2_size",			KSTAT_DATA_UINT64 },
636	{ "l2_asize",			KSTAT_DATA_UINT64 },
637	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
638	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
639	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
640	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
641	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
642	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
643	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
644	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
645	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
646	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
647	{ "l2_write_full",		KSTAT_DATA_UINT64 },
648	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
649	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
650	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
651	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
652	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
653	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
654	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
655	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
656	{ "duplicate_reads",		KSTAT_DATA_UINT64 },
657	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
658	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
659	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
660	{ "arc_meta_min",		KSTAT_DATA_UINT64 }
661};
662
663#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
664
665#define	ARCSTAT_INCR(stat, val) \
666	atomic_add_64(&arc_stats.stat.value.ui64, (val))
667
668#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
669#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
670
671#define	ARCSTAT_MAX(stat, val) {					\
672	uint64_t m;							\
673	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
674	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
675		continue;						\
676}
677
678#define	ARCSTAT_MAXSTAT(stat) \
679	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
680
681/*
682 * We define a macro to allow ARC hits/misses to be easily broken down by
683 * two separate conditions, giving a total of four different subtypes for
684 * each of hits and misses (so eight statistics total).
685 */
686#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
687	if (cond1) {							\
688		if (cond2) {						\
689			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
690		} else {						\
691			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
692		}							\
693	} else {							\
694		if (cond2) {						\
695			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
696		} else {						\
697			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
698		}							\
699	}
700
701kstat_t			*arc_ksp;
702static arc_state_t	*arc_anon;
703static arc_state_t	*arc_mru;
704static arc_state_t	*arc_mru_ghost;
705static arc_state_t	*arc_mfu;
706static arc_state_t	*arc_mfu_ghost;
707static arc_state_t	*arc_l2c_only;
708
709/*
710 * There are several ARC variables that are critical to export as kstats --
711 * but we don't want to have to grovel around in the kstat whenever we wish to
712 * manipulate them.  For these variables, we therefore define them to be in
713 * terms of the statistic variable.  This assures that we are not introducing
714 * the possibility of inconsistency by having shadow copies of the variables,
715 * while still allowing the code to be readable.
716 */
717#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
718#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
719#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
720#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
721#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
722#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
723#define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
724#define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
725#define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */
726
727#define	L2ARC_IS_VALID_COMPRESS(_c_) \
728	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
729
730static int		arc_no_grow;	/* Don't try to grow cache size */
731static uint64_t		arc_tempreserve;
732static uint64_t		arc_loaned_bytes;
733
734typedef struct arc_callback arc_callback_t;
735
736struct arc_callback {
737	void			*acb_private;
738	arc_done_func_t		*acb_done;
739	arc_buf_t		*acb_buf;
740	zio_t			*acb_zio_dummy;
741	arc_callback_t		*acb_next;
742};
743
744typedef struct arc_write_callback arc_write_callback_t;
745
746struct arc_write_callback {
747	void		*awcb_private;
748	arc_done_func_t	*awcb_ready;
749	arc_done_func_t	*awcb_physdone;
750	arc_done_func_t	*awcb_done;
751	arc_buf_t	*awcb_buf;
752};
753
754/*
755 * ARC buffers are separated into multiple structs as a memory saving measure:
756 *   - Common fields struct, always defined, and embedded within it:
757 *       - L2-only fields, always allocated but undefined when not in L2ARC
758 *       - L1-only fields, only allocated when in L1ARC
759 *
760 *           Buffer in L1                     Buffer only in L2
761 *    +------------------------+          +------------------------+
762 *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
763 *    |                        |          |                        |
764 *    |                        |          |                        |
765 *    |                        |          |                        |
766 *    +------------------------+          +------------------------+
767 *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
768 *    | (undefined if L1-only) |          |                        |
769 *    +------------------------+          +------------------------+
770 *    | l1arc_buf_hdr_t        |
771 *    |                        |
772 *    |                        |
773 *    |                        |
774 *    |                        |
775 *    +------------------------+
776 *
777 * Because it's possible for the L2ARC to become extremely large, we can wind
778 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
779 * is minimized by only allocating the fields necessary for an L1-cached buffer
780 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
781 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
782 * words in pointers. arc_hdr_realloc() is used to switch a header between
783 * these two allocation states.
784 */
785typedef struct l1arc_buf_hdr {
786	kmutex_t		b_freeze_lock;
787#ifdef ZFS_DEBUG
788	/*
789	 * used for debugging wtih kmem_flags - by allocating and freeing
790	 * b_thawed when the buffer is thawed, we get a record of the stack
791	 * trace that thawed it.
792	 */
793	void			*b_thawed;
794#endif
795
796	arc_buf_t		*b_buf;
797	uint32_t		b_datacnt;
798	/* for waiting on writes to complete */
799	kcondvar_t		b_cv;
800
801	/* protected by arc state mutex */
802	arc_state_t		*b_state;
803	list_node_t		b_arc_node;
804
805	/* updated atomically */
806	clock_t			b_arc_access;
807
808	/* self protecting */
809	refcount_t		b_refcnt;
810
811	arc_callback_t		*b_acb;
812	/* temporary buffer holder for in-flight compressed data */
813	void			*b_tmp_cdata;
814} l1arc_buf_hdr_t;
815
816typedef struct l2arc_dev l2arc_dev_t;
817
818typedef struct l2arc_buf_hdr {
819	/* protected by arc_buf_hdr mutex */
820	l2arc_dev_t		*b_dev;		/* L2ARC device */
821	uint64_t		b_daddr;	/* disk address, offset byte */
822	/* real alloc'd buffer size depending on b_compress applied */
823	int32_t			b_asize;
824
825	list_node_t		b_l2node;
826} l2arc_buf_hdr_t;
827
828struct arc_buf_hdr {
829	/* protected by hash lock */
830	dva_t			b_dva;
831	uint64_t		b_birth;
832	/*
833	 * Even though this checksum is only set/verified when a buffer is in
834	 * the L1 cache, it needs to be in the set of common fields because it
835	 * must be preserved from the time before a buffer is written out to
836	 * L2ARC until after it is read back in.
837	 */
838	zio_cksum_t		*b_freeze_cksum;
839
840	arc_buf_hdr_t		*b_hash_next;
841	arc_flags_t		b_flags;
842
843	/* immutable */
844	int32_t			b_size;
845	uint64_t		b_spa;
846
847	/* L2ARC fields. Undefined when not in L2ARC. */
848	l2arc_buf_hdr_t		b_l2hdr;
849	/* L1ARC fields. Undefined when in l2arc_only state */
850	l1arc_buf_hdr_t		b_l1hdr;
851};
852
853#ifdef _KERNEL
854static int
855sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
856{
857	uint64_t val;
858	int err;
859
860	val = arc_meta_limit;
861	err = sysctl_handle_64(oidp, &val, 0, req);
862	if (err != 0 || req->newptr == NULL)
863		return (err);
864
865        if (val <= 0 || val > arc_c_max)
866		return (EINVAL);
867
868	arc_meta_limit = val;
869	return (0);
870}
871#endif
872
873static arc_buf_t *arc_eviction_list;
874static kmutex_t arc_eviction_mtx;
875static arc_buf_hdr_t arc_eviction_hdr;
876
877#define	GHOST_STATE(state)	\
878	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
879	(state) == arc_l2c_only)
880
881#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
882#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
883#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
884#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
885#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
886#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
887
888#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
889#define	HDR_L2COMPRESS(hdr)	((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
890#define	HDR_L2_READING(hdr)	\
891	    (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
892	    ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
893#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
894#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
895#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
896
897#define	HDR_ISTYPE_METADATA(hdr)	\
898	    ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
899#define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))
900
901#define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
902#define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
903
904/* For storing compression mode in b_flags */
905#define	HDR_COMPRESS_OFFSET	24
906#define	HDR_COMPRESS_NBITS	7
907
908#define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET(hdr->b_flags, \
909	    HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS))
910#define	HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \
911	    HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp))
912
913/*
914 * Other sizes
915 */
916
917#define	HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
918#define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
919
920/*
921 * Hash table routines
922 */
923
924#define	HT_LOCK_PAD	CACHE_LINE_SIZE
925
926struct ht_lock {
927	kmutex_t	ht_lock;
928#ifdef _KERNEL
929	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
930#endif
931};
932
933#define	BUF_LOCKS 256
934typedef struct buf_hash_table {
935	uint64_t ht_mask;
936	arc_buf_hdr_t **ht_table;
937	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
938} buf_hash_table_t;
939
940static buf_hash_table_t buf_hash_table;
941
942#define	BUF_HASH_INDEX(spa, dva, birth) \
943	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
944#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
945#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
946#define	HDR_LOCK(hdr) \
947	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
948
949uint64_t zfs_crc64_table[256];
950
951/*
952 * Level 2 ARC
953 */
954
955#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
956#define	L2ARC_HEADROOM		2			/* num of writes */
957/*
958 * If we discover during ARC scan any buffers to be compressed, we boost
959 * our headroom for the next scanning cycle by this percentage multiple.
960 */
961#define	L2ARC_HEADROOM_BOOST	200
962#define	L2ARC_FEED_SECS		1		/* caching interval secs */
963#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
964
965/*
966 * Used to distinguish headers that are being process by
967 * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk
968 * address. This can happen when the header is added to the l2arc's list
969 * of buffers to write in the first stage of l2arc_write_buffers(), but
970 * has not yet been written out which happens in the second stage of
971 * l2arc_write_buffers().
972 */
973#define	L2ARC_ADDR_UNSET	((uint64_t)(-1))
974
975#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
976#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
977
978/* L2ARC Performance Tunables */
979uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
980uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
981uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
982uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
983uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
984uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
985boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
986boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
987boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
988
989SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
990    &l2arc_write_max, 0, "max write size");
991SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
992    &l2arc_write_boost, 0, "extra write during warmup");
993SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
994    &l2arc_headroom, 0, "number of dev writes");
995SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
996    &l2arc_feed_secs, 0, "interval seconds");
997SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
998    &l2arc_feed_min_ms, 0, "min interval milliseconds");
999
1000SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
1001    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
1002SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
1003    &l2arc_feed_again, 0, "turbo warmup");
1004SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
1005    &l2arc_norw, 0, "no reads during writes");
1006
1007SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
1008    &ARC_anon.arcs_size, 0, "size of anonymous state");
1009SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
1010    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
1011SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
1012    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");
1013
1014SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
1015    &ARC_mru.arcs_size, 0, "size of mru state");
1016SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
1017    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
1018SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
1019    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
1020
1021SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
1022    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
1023SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
1024    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
1025    "size of metadata in mru ghost state");
1026SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
1027    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
1028    "size of data in mru ghost state");
1029
1030SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
1031    &ARC_mfu.arcs_size, 0, "size of mfu state");
1032SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
1033    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
1034SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
1035    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
1036
1037SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
1038    &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
1039SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
1040    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
1041    "size of metadata in mfu ghost state");
1042SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
1043    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
1044    "size of data in mfu ghost state");
1045
1046SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
1047    &ARC_l2c_only.arcs_size, 0, "size of mru state");
1048
1049/*
1050 * L2ARC Internals
1051 */
1052struct l2arc_dev {
1053	vdev_t			*l2ad_vdev;	/* vdev */
1054	spa_t			*l2ad_spa;	/* spa */
1055	uint64_t		l2ad_hand;	/* next write location */
1056	uint64_t		l2ad_start;	/* first addr on device */
1057	uint64_t		l2ad_end;	/* last addr on device */
1058	boolean_t		l2ad_first;	/* first sweep through */
1059	boolean_t		l2ad_writing;	/* currently writing */
1060	kmutex_t		l2ad_mtx;	/* lock for buffer list */
1061	list_t			l2ad_buflist;	/* buffer list */
1062	list_node_t		l2ad_node;	/* device list node */
1063	refcount_t		l2ad_alloc;	/* allocated bytes */
1064};
1065
1066static list_t L2ARC_dev_list;			/* device list */
1067static list_t *l2arc_dev_list;			/* device list pointer */
1068static kmutex_t l2arc_dev_mtx;			/* device list mutex */
1069static l2arc_dev_t *l2arc_dev_last;		/* last device used */
1070static list_t L2ARC_free_on_write;		/* free after write buf list */
1071static list_t *l2arc_free_on_write;		/* free after write list ptr */
1072static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
1073static uint64_t l2arc_ndev;			/* number of devices */
1074
1075typedef struct l2arc_read_callback {
1076	arc_buf_t		*l2rcb_buf;		/* read buffer */
1077	spa_t			*l2rcb_spa;		/* spa */
1078	blkptr_t		l2rcb_bp;		/* original blkptr */
1079	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
1080	int			l2rcb_flags;		/* original flags */
1081	enum zio_compress	l2rcb_compress;		/* applied compress */
1082} l2arc_read_callback_t;
1083
1084typedef struct l2arc_write_callback {
1085	l2arc_dev_t	*l2wcb_dev;		/* device info */
1086	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
1087} l2arc_write_callback_t;
1088
1089typedef struct l2arc_data_free {
1090	/* protected by l2arc_free_on_write_mtx */
1091	void		*l2df_data;
1092	size_t		l2df_size;
1093	void		(*l2df_func)(void *, size_t);
1094	list_node_t	l2df_list_node;
1095} l2arc_data_free_t;
1096
1097static kmutex_t l2arc_feed_thr_lock;
1098static kcondvar_t l2arc_feed_thr_cv;
1099static uint8_t l2arc_thread_exit;
1100
1101static void arc_get_data_buf(arc_buf_t *);
1102static void arc_access(arc_buf_hdr_t *, kmutex_t *);
1103static int arc_evict_needed(arc_buf_contents_t);
1104static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t);
1105static void arc_buf_watch(arc_buf_t *);
1106
1107static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
1108static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
1109
1110static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
1111static void l2arc_read_done(zio_t *);
1112
1113static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
1114static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
1115static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
1116
1117static uint64_t
1118buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
1119{
1120	uint8_t *vdva = (uint8_t *)dva;
1121	uint64_t crc = -1ULL;
1122	int i;
1123
1124	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
1125
1126	for (i = 0; i < sizeof (dva_t); i++)
1127		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
1128
1129	crc ^= (spa>>8) ^ birth;
1130
1131	return (crc);
1132}
1133
1134#define	BUF_EMPTY(buf)						\
1135	((buf)->b_dva.dva_word[0] == 0 &&			\
1136	(buf)->b_dva.dva_word[1] == 0)
1137
1138#define	BUF_EQUAL(spa, dva, birth, buf)				\
1139	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
1140	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
1141	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
1142
1143static void
1144buf_discard_identity(arc_buf_hdr_t *hdr)
1145{
1146	hdr->b_dva.dva_word[0] = 0;
1147	hdr->b_dva.dva_word[1] = 0;
1148	hdr->b_birth = 0;
1149}
1150
1151static arc_buf_hdr_t *
1152buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
1153{
1154	const dva_t *dva = BP_IDENTITY(bp);
1155	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
1156	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
1157	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1158	arc_buf_hdr_t *hdr;
1159
1160	mutex_enter(hash_lock);
1161	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
1162	    hdr = hdr->b_hash_next) {
1163		if (BUF_EQUAL(spa, dva, birth, hdr)) {
1164			*lockp = hash_lock;
1165			return (hdr);
1166		}
1167	}
1168	mutex_exit(hash_lock);
1169	*lockp = NULL;
1170	return (NULL);
1171}
1172
1173/*
1174 * Insert an entry into the hash table.  If there is already an element
1175 * equal to elem in the hash table, then the already existing element
1176 * will be returned and the new element will not be inserted.
1177 * Otherwise returns NULL.
1178 * If lockp == NULL, the caller is assumed to already hold the hash lock.
1179 */
1180static arc_buf_hdr_t *
1181buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
1182{
1183	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1184	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1185	arc_buf_hdr_t *fhdr;
1186	uint32_t i;
1187
1188	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
1189	ASSERT(hdr->b_birth != 0);
1190	ASSERT(!HDR_IN_HASH_TABLE(hdr));
1191
1192	if (lockp != NULL) {
1193		*lockp = hash_lock;
1194		mutex_enter(hash_lock);
1195	} else {
1196		ASSERT(MUTEX_HELD(hash_lock));
1197	}
1198
1199	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
1200	    fhdr = fhdr->b_hash_next, i++) {
1201		if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
1202			return (fhdr);
1203	}
1204
1205	hdr->b_hash_next = buf_hash_table.ht_table[idx];
1206	buf_hash_table.ht_table[idx] = hdr;
1207	hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
1208
1209	/* collect some hash table performance data */
1210	if (i > 0) {
1211		ARCSTAT_BUMP(arcstat_hash_collisions);
1212		if (i == 1)
1213			ARCSTAT_BUMP(arcstat_hash_chains);
1214
1215		ARCSTAT_MAX(arcstat_hash_chain_max, i);
1216	}
1217
1218	ARCSTAT_BUMP(arcstat_hash_elements);
1219	ARCSTAT_MAXSTAT(arcstat_hash_elements);
1220
1221	return (NULL);
1222}
1223
1224static void
1225buf_hash_remove(arc_buf_hdr_t *hdr)
1226{
1227	arc_buf_hdr_t *fhdr, **hdrp;
1228	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1229
1230	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
1231	ASSERT(HDR_IN_HASH_TABLE(hdr));
1232
1233	hdrp = &buf_hash_table.ht_table[idx];
1234	while ((fhdr = *hdrp) != hdr) {
1235		ASSERT(fhdr != NULL);
1236		hdrp = &fhdr->b_hash_next;
1237	}
1238	*hdrp = hdr->b_hash_next;
1239	hdr->b_hash_next = NULL;
1240	hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;
1241
1242	/* collect some hash table performance data */
1243	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1244
1245	if (buf_hash_table.ht_table[idx] &&
1246	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1247		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1248}
1249
1250/*
1251 * Global data structures and functions for the buf kmem cache.
1252 */
1253static kmem_cache_t *hdr_full_cache;
1254static kmem_cache_t *hdr_l2only_cache;
1255static kmem_cache_t *buf_cache;
1256
1257static void
1258buf_fini(void)
1259{
1260	int i;
1261
1262	kmem_free(buf_hash_table.ht_table,
1263	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
1264	for (i = 0; i < BUF_LOCKS; i++)
1265		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1266	kmem_cache_destroy(hdr_full_cache);
1267	kmem_cache_destroy(hdr_l2only_cache);
1268	kmem_cache_destroy(buf_cache);
1269}
1270
1271/*
1272 * Constructor callback - called when the cache is empty
1273 * and a new buf is requested.
1274 */
1275/* ARGSUSED */
1276static int
1277hdr_full_cons(void *vbuf, void *unused, int kmflag)
1278{
1279	arc_buf_hdr_t *hdr = vbuf;
1280
1281	bzero(hdr, HDR_FULL_SIZE);
1282	cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
1283	refcount_create(&hdr->b_l1hdr.b_refcnt);
1284	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1285	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1286
1287	return (0);
1288}
1289
1290/* ARGSUSED */
1291static int
1292hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
1293{
1294	arc_buf_hdr_t *hdr = vbuf;
1295
1296	bzero(hdr, HDR_L2ONLY_SIZE);
1297	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1298
1299	return (0);
1300}
1301
1302/* ARGSUSED */
1303static int
1304buf_cons(void *vbuf, void *unused, int kmflag)
1305{
1306	arc_buf_t *buf = vbuf;
1307
1308	bzero(buf, sizeof (arc_buf_t));
1309	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1310	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1311
1312	return (0);
1313}
1314
1315/*
1316 * Destructor callback - called when a cached buf is
1317 * no longer required.
1318 */
1319/* ARGSUSED */
1320static void
1321hdr_full_dest(void *vbuf, void *unused)
1322{
1323	arc_buf_hdr_t *hdr = vbuf;
1324
1325	ASSERT(BUF_EMPTY(hdr));
1326	cv_destroy(&hdr->b_l1hdr.b_cv);
1327	refcount_destroy(&hdr->b_l1hdr.b_refcnt);
1328	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
1329	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1330}
1331
1332/* ARGSUSED */
1333static void
1334hdr_l2only_dest(void *vbuf, void *unused)
1335{
1336	arc_buf_hdr_t *hdr = vbuf;
1337
1338	ASSERT(BUF_EMPTY(hdr));
1339	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1340}
1341
1342/* ARGSUSED */
1343static void
1344buf_dest(void *vbuf, void *unused)
1345{
1346	arc_buf_t *buf = vbuf;
1347
1348	mutex_destroy(&buf->b_evict_lock);
1349	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1350}
1351
1352/*
1353 * Reclaim callback -- invoked when memory is low.
1354 */
1355/* ARGSUSED */
1356static void
1357hdr_recl(void *unused)
1358{
1359	dprintf("hdr_recl called\n");
1360	/*
1361	 * umem calls the reclaim func when we destroy the buf cache,
1362	 * which is after we do arc_fini().
1363	 */
1364	if (!arc_dead)
1365		cv_signal(&arc_reclaim_thr_cv);
1366}
1367
1368static void
1369buf_init(void)
1370{
1371	uint64_t *ct;
1372	uint64_t hsize = 1ULL << 12;
1373	int i, j;
1374
1375	/*
1376	 * The hash table is big enough to fill all of physical memory
1377	 * with an average block size of zfs_arc_average_blocksize (default 8K).
1378	 * By default, the table will take up
1379	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1380	 */
1381	while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
1382		hsize <<= 1;
1383retry:
1384	buf_hash_table.ht_mask = hsize - 1;
1385	buf_hash_table.ht_table =
1386	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1387	if (buf_hash_table.ht_table == NULL) {
1388		ASSERT(hsize > (1ULL << 8));
1389		hsize >>= 1;
1390		goto retry;
1391	}
1392
1393	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
1394	    0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
1395	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
1396	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
1397	    NULL, NULL, 0);
1398	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1399	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1400
1401	for (i = 0; i < 256; i++)
1402		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1403			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1404
1405	for (i = 0; i < BUF_LOCKS; i++) {
1406		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1407		    NULL, MUTEX_DEFAULT, NULL);
1408	}
1409}
1410
1411/*
1412 * Transition between the two allocation states for the arc_buf_hdr struct.
1413 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
1414 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
1415 * version is used when a cache buffer is only in the L2ARC in order to reduce
1416 * memory usage.
1417 */
1418static arc_buf_hdr_t *
1419arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
1420{
1421	ASSERT(HDR_HAS_L2HDR(hdr));
1422
1423	arc_buf_hdr_t *nhdr;
1424	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
1425
1426	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
1427	    (old == hdr_l2only_cache && new == hdr_full_cache));
1428
1429	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
1430
1431	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
1432	buf_hash_remove(hdr);
1433
1434	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
1435
1436	if (new == hdr_full_cache) {
1437		nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
1438		/*
1439		 * arc_access and arc_change_state need to be aware that a
1440		 * header has just come out of L2ARC, so we set its state to
1441		 * l2c_only even though it's about to change.
1442		 */
1443		nhdr->b_l1hdr.b_state = arc_l2c_only;
1444	} else {
1445		ASSERT(hdr->b_l1hdr.b_buf == NULL);
1446		ASSERT0(hdr->b_l1hdr.b_datacnt);
1447		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
1448		/*
1449		 * We might be removing the L1hdr of a buffer which was just
1450		 * written out to L2ARC. If such a buffer is compressed then we
1451		 * need to free its b_tmp_cdata before destroying the header.
1452		 */
1453		if (hdr->b_l1hdr.b_tmp_cdata != NULL &&
1454		    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
1455			l2arc_release_cdata_buf(hdr);
1456		nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
1457	}
1458	/*
1459	 * The header has been reallocated so we need to re-insert it into any
1460	 * lists it was on.
1461	 */
1462	(void) buf_hash_insert(nhdr, NULL);
1463
1464	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
1465
1466	mutex_enter(&dev->l2ad_mtx);
1467
1468	/*
1469	 * We must place the realloc'ed header back into the list at
1470	 * the same spot. Otherwise, if it's placed earlier in the list,
1471	 * l2arc_write_buffers() could find it during the function's
1472	 * write phase, and try to write it out to the l2arc.
1473	 */
1474	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
1475	list_remove(&dev->l2ad_buflist, hdr);
1476
1477	mutex_exit(&dev->l2ad_mtx);
1478
1479	/*
1480	 * Since we're using the pointer address as the tag when
1481	 * incrementing and decrementing the l2ad_alloc refcount, we
1482	 * must remove the old pointer (that we're about to destroy) and
1483	 * add the new pointer to the refcount. Otherwise we'd remove
1484	 * the wrong pointer address when calling arc_hdr_destroy() later.
1485	 */
1486
1487	(void) refcount_remove_many(&dev->l2ad_alloc,
1488	    hdr->b_l2hdr.b_asize, hdr);
1489
1490	(void) refcount_add_many(&dev->l2ad_alloc,
1491	    nhdr->b_l2hdr.b_asize, nhdr);
1492
1493	buf_discard_identity(hdr);
1494	hdr->b_freeze_cksum = NULL;
1495	kmem_cache_free(old, hdr);
1496
1497	return (nhdr);
1498}
1499
1500
1501#define	ARC_MINTIME	(hz>>4) /* 62 ms */
1502
1503static void
1504arc_cksum_verify(arc_buf_t *buf)
1505{
1506	zio_cksum_t zc;
1507
1508	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1509		return;
1510
1511	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1512	if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
1513		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1514		return;
1515	}
1516	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1517	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1518		panic("buffer modified while frozen!");
1519	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1520}
1521
1522static int
1523arc_cksum_equal(arc_buf_t *buf)
1524{
1525	zio_cksum_t zc;
1526	int equal;
1527
1528	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1529	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1530	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1531	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1532
1533	return (equal);
1534}
1535
1536static void
1537arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1538{
1539	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1540		return;
1541
1542	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1543	if (buf->b_hdr->b_freeze_cksum != NULL) {
1544		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1545		return;
1546	}
1547	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1548	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1549	    buf->b_hdr->b_freeze_cksum);
1550	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1551#ifdef illumos
1552	arc_buf_watch(buf);
1553#endif
1554}
1555
1556#ifdef illumos
1557#ifndef _KERNEL
1558typedef struct procctl {
1559	long cmd;
1560	prwatch_t prwatch;
1561} procctl_t;
1562#endif
1563
1564/* ARGSUSED */
1565static void
1566arc_buf_unwatch(arc_buf_t *buf)
1567{
1568#ifndef _KERNEL
1569	if (arc_watch) {
1570		int result;
1571		procctl_t ctl;
1572		ctl.cmd = PCWATCH;
1573		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1574		ctl.prwatch.pr_size = 0;
1575		ctl.prwatch.pr_wflags = 0;
1576		result = write(arc_procfd, &ctl, sizeof (ctl));
1577		ASSERT3U(result, ==, sizeof (ctl));
1578	}
1579#endif
1580}
1581
1582/* ARGSUSED */
1583static void
1584arc_buf_watch(arc_buf_t *buf)
1585{
1586#ifndef _KERNEL
1587	if (arc_watch) {
1588		int result;
1589		procctl_t ctl;
1590		ctl.cmd = PCWATCH;
1591		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1592		ctl.prwatch.pr_size = buf->b_hdr->b_size;
1593		ctl.prwatch.pr_wflags = WA_WRITE;
1594		result = write(arc_procfd, &ctl, sizeof (ctl));
1595		ASSERT3U(result, ==, sizeof (ctl));
1596	}
1597#endif
1598}
1599#endif /* illumos */
1600
1601static arc_buf_contents_t
1602arc_buf_type(arc_buf_hdr_t *hdr)
1603{
1604	if (HDR_ISTYPE_METADATA(hdr)) {
1605		return (ARC_BUFC_METADATA);
1606	} else {
1607		return (ARC_BUFC_DATA);
1608	}
1609}
1610
1611static uint32_t
1612arc_bufc_to_flags(arc_buf_contents_t type)
1613{
1614	switch (type) {
1615	case ARC_BUFC_DATA:
1616		/* metadata field is 0 if buffer contains normal data */
1617		return (0);
1618	case ARC_BUFC_METADATA:
1619		return (ARC_FLAG_BUFC_METADATA);
1620	default:
1621		break;
1622	}
1623	panic("undefined ARC buffer type!");
1624	return ((uint32_t)-1);
1625}
1626
1627void
1628arc_buf_thaw(arc_buf_t *buf)
1629{
1630	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1631		if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
1632			panic("modifying non-anon buffer!");
1633		if (HDR_IO_IN_PROGRESS(buf->b_hdr))
1634			panic("modifying buffer while i/o in progress!");
1635		arc_cksum_verify(buf);
1636	}
1637
1638	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1639	if (buf->b_hdr->b_freeze_cksum != NULL) {
1640		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1641		buf->b_hdr->b_freeze_cksum = NULL;
1642	}
1643
1644#ifdef ZFS_DEBUG
1645	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1646		if (buf->b_hdr->b_l1hdr.b_thawed != NULL)
1647			kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1);
1648		buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
1649	}
1650#endif
1651
1652	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1653
1654#ifdef illumos
1655	arc_buf_unwatch(buf);
1656#endif
1657}
1658
1659void
1660arc_buf_freeze(arc_buf_t *buf)
1661{
1662	kmutex_t *hash_lock;
1663
1664	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1665		return;
1666
1667	hash_lock = HDR_LOCK(buf->b_hdr);
1668	mutex_enter(hash_lock);
1669
1670	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1671	    buf->b_hdr->b_l1hdr.b_state == arc_anon);
1672	arc_cksum_compute(buf, B_FALSE);
1673	mutex_exit(hash_lock);
1674
1675}
1676
1677static void
1678get_buf_info(arc_buf_hdr_t *hdr, arc_state_t *state, list_t **list, kmutex_t **lock)
1679{
1680	uint64_t buf_hashid = buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1681
1682	if (arc_buf_type(hdr) == ARC_BUFC_METADATA)
1683		buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
1684	else {
1685		buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
1686		buf_hashid += ARC_BUFC_NUMMETADATALISTS;
1687	}
1688
1689	*list = &state->arcs_lists[buf_hashid];
1690	*lock = ARCS_LOCK(state, buf_hashid);
1691}
1692
1693
1694static void
1695add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1696{
1697	ASSERT(HDR_HAS_L1HDR(hdr));
1698	ASSERT(MUTEX_HELD(hash_lock));
1699	arc_state_t *state = hdr->b_l1hdr.b_state;
1700
1701	if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
1702	    (state != arc_anon)) {
1703		/* We don't use the L2-only state list. */
1704		if (state != arc_l2c_only) {
1705			uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
1706			uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
1707			list_t *list;
1708			kmutex_t *lock;
1709
1710			get_buf_info(hdr, state, &list, &lock);
1711			ASSERT(!MUTEX_HELD(lock));
1712			mutex_enter(lock);
1713			ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
1714			list_remove(list, hdr);
1715			if (GHOST_STATE(state)) {
1716				ASSERT0(hdr->b_l1hdr.b_datacnt);
1717				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
1718				delta = hdr->b_size;
1719			}
1720			ASSERT(delta > 0);
1721			ASSERT3U(*size, >=, delta);
1722			atomic_add_64(size, -delta);
1723			mutex_exit(lock);
1724		}
1725		/* remove the prefetch flag if we get a reference */
1726		hdr->b_flags &= ~ARC_FLAG_PREFETCH;
1727	}
1728}
1729
1730static int
1731remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1732{
1733	int cnt;
1734	arc_state_t *state = hdr->b_l1hdr.b_state;
1735
1736	ASSERT(HDR_HAS_L1HDR(hdr));
1737	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1738	ASSERT(!GHOST_STATE(state));
1739
1740	/*
1741	 * arc_l2c_only counts as a ghost state so we don't need to explicitly
1742	 * check to prevent usage of the arc_l2c_only list.
1743	 */
1744	if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
1745	    (state != arc_anon)) {
1746		uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
1747		list_t *list;
1748		kmutex_t *lock;
1749
1750		get_buf_info(hdr, state, &list, &lock);
1751		ASSERT(!MUTEX_HELD(lock));
1752		mutex_enter(lock);
1753		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
1754		list_insert_head(list, hdr);
1755		ASSERT(hdr->b_l1hdr.b_datacnt > 0);
1756		atomic_add_64(size, hdr->b_size *
1757		    hdr->b_l1hdr.b_datacnt);
1758		mutex_exit(lock);
1759	}
1760	return (cnt);
1761}
1762
1763/*
1764 * Move the supplied buffer to the indicated state.  The mutex
1765 * for the buffer must be held by the caller.
1766 */
1767static void
1768arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
1769    kmutex_t *hash_lock)
1770{
1771	arc_state_t *old_state;
1772	int64_t refcnt;
1773	uint32_t datacnt;
1774	uint64_t from_delta, to_delta;
1775	arc_buf_contents_t buftype = arc_buf_type(hdr);
1776	list_t *list;
1777	kmutex_t *lock;
1778
1779	/*
1780	 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
1781	 * in arc_read() when bringing a buffer out of the L2ARC.  However, the
1782	 * L1 hdr doesn't always exist when we change state to arc_anon before
1783	 * destroying a header, in which case reallocating to add the L1 hdr is
1784	 * pointless.
1785	 */
1786	if (HDR_HAS_L1HDR(hdr)) {
1787		old_state = hdr->b_l1hdr.b_state;
1788		refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
1789		datacnt = hdr->b_l1hdr.b_datacnt;
1790	} else {
1791		old_state = arc_l2c_only;
1792		refcnt = 0;
1793		datacnt = 0;
1794	}
1795
1796	ASSERT(MUTEX_HELD(hash_lock));
1797	ASSERT3P(new_state, !=, old_state);
1798	ASSERT(refcnt == 0 || datacnt > 0);
1799	ASSERT(!GHOST_STATE(new_state) || datacnt == 0);
1800	ASSERT(old_state != arc_anon || datacnt <= 1);
1801
1802	from_delta = to_delta = datacnt * hdr->b_size;
1803
1804	/*
1805	 * If this buffer is evictable, transfer it from the
1806	 * old state list to the new state list.
1807	 */
1808	if (refcnt == 0) {
1809		if (old_state != arc_anon && old_state != arc_l2c_only) {
1810			int use_mutex;
1811			uint64_t *size = &old_state->arcs_lsize[buftype];
1812
1813			get_buf_info(hdr, old_state, &list, &lock);
1814			use_mutex = !MUTEX_HELD(lock);
1815			if (use_mutex)
1816				mutex_enter(lock);
1817
1818			ASSERT(HDR_HAS_L1HDR(hdr));
1819			ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
1820			list_remove(list, hdr);
1821
1822			/*
1823			 * If prefetching out of the ghost cache,
1824			 * we will have a non-zero datacnt.
1825			 */
1826			if (GHOST_STATE(old_state) && datacnt == 0) {
1827				/* ghost elements have a ghost size */
1828				ASSERT(hdr->b_l1hdr.b_buf == NULL);
1829				from_delta = hdr->b_size;
1830			}
1831			ASSERT3U(*size, >=, from_delta);
1832			atomic_add_64(size, -from_delta);
1833
1834			if (use_mutex)
1835				mutex_exit(lock);
1836		}
1837		if (new_state != arc_anon && new_state != arc_l2c_only) {
1838			int use_mutex;
1839			uint64_t *size = &new_state->arcs_lsize[buftype];
1840
1841			/*
1842			 * An L1 header always exists here, since if we're
1843			 * moving to some L1-cached state (i.e. not l2c_only or
1844			 * anonymous), we realloc the header to add an L1hdr
1845			 * beforehand.
1846			 */
1847			ASSERT(HDR_HAS_L1HDR(hdr));
1848			get_buf_info(hdr, new_state, &list, &lock);
1849			use_mutex = !MUTEX_HELD(lock);
1850			if (use_mutex)
1851				mutex_enter(lock);
1852
1853			list_insert_head(list, hdr);
1854
1855			/* ghost elements have a ghost size */
1856			if (GHOST_STATE(new_state)) {
1857				ASSERT(datacnt == 0);
1858				ASSERT(hdr->b_l1hdr.b_buf == NULL);
1859				to_delta = hdr->b_size;
1860			}
1861			atomic_add_64(size, to_delta);
1862
1863			if (use_mutex)
1864				mutex_exit(lock);
1865		}
1866	}
1867
1868	ASSERT(!BUF_EMPTY(hdr));
1869	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
1870		buf_hash_remove(hdr);
1871
1872	/* adjust state sizes (ignore arc_l2c_only) */
1873	if (to_delta && new_state != arc_l2c_only)
1874		atomic_add_64(&new_state->arcs_size, to_delta);
1875	if (from_delta && old_state != arc_l2c_only) {
1876		ASSERT3U(old_state->arcs_size, >=, from_delta);
1877		atomic_add_64(&old_state->arcs_size, -from_delta);
1878	}
1879	if (HDR_HAS_L1HDR(hdr))
1880		hdr->b_l1hdr.b_state = new_state;
1881
1882	/*
1883	 * L2 headers should never be on the L2 state list since they don't
1884	 * have L1 headers allocated.
1885	 */
1886#ifdef illumos
1887	ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
1888	    list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
1889#endif
1890}
1891
1892void
1893arc_space_consume(uint64_t space, arc_space_type_t type)
1894{
1895	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1896
1897	switch (type) {
1898	case ARC_SPACE_DATA:
1899		ARCSTAT_INCR(arcstat_data_size, space);
1900		break;
1901	case ARC_SPACE_META:
1902		ARCSTAT_INCR(arcstat_metadata_size, space);
1903		break;
1904	case ARC_SPACE_OTHER:
1905		ARCSTAT_INCR(arcstat_other_size, space);
1906		break;
1907	case ARC_SPACE_HDRS:
1908		ARCSTAT_INCR(arcstat_hdr_size, space);
1909		break;
1910	case ARC_SPACE_L2HDRS:
1911		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1912		break;
1913	}
1914
1915	if (type != ARC_SPACE_DATA)
1916		ARCSTAT_INCR(arcstat_meta_used, space);
1917
1918	atomic_add_64(&arc_size, space);
1919}
1920
1921void
1922arc_space_return(uint64_t space, arc_space_type_t type)
1923{
1924	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1925
1926	switch (type) {
1927	case ARC_SPACE_DATA:
1928		ARCSTAT_INCR(arcstat_data_size, -space);
1929		break;
1930	case ARC_SPACE_META:
1931		ARCSTAT_INCR(arcstat_metadata_size, -space);
1932		break;
1933	case ARC_SPACE_OTHER:
1934		ARCSTAT_INCR(arcstat_other_size, -space);
1935		break;
1936	case ARC_SPACE_HDRS:
1937		ARCSTAT_INCR(arcstat_hdr_size, -space);
1938		break;
1939	case ARC_SPACE_L2HDRS:
1940		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1941		break;
1942	}
1943
1944	if (type != ARC_SPACE_DATA) {
1945		ASSERT(arc_meta_used >= space);
1946		if (arc_meta_max < arc_meta_used)
1947			arc_meta_max = arc_meta_used;
1948		ARCSTAT_INCR(arcstat_meta_used, -space);
1949	}
1950
1951	ASSERT(arc_size >= space);
1952	atomic_add_64(&arc_size, -space);
1953}
1954
1955arc_buf_t *
1956arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type)
1957{
1958	arc_buf_hdr_t *hdr;
1959	arc_buf_t *buf;
1960
1961	ASSERT3U(size, >, 0);
1962	hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
1963	ASSERT(BUF_EMPTY(hdr));
1964	ASSERT3P(hdr->b_freeze_cksum, ==, NULL);
1965	hdr->b_size = size;
1966	hdr->b_spa = spa_load_guid(spa);
1967
1968	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1969	buf->b_hdr = hdr;
1970	buf->b_data = NULL;
1971	buf->b_efunc = NULL;
1972	buf->b_private = NULL;
1973	buf->b_next = NULL;
1974
1975	hdr->b_flags = arc_bufc_to_flags(type);
1976	hdr->b_flags |= ARC_FLAG_HAS_L1HDR;
1977
1978	hdr->b_l1hdr.b_buf = buf;
1979	hdr->b_l1hdr.b_state = arc_anon;
1980	hdr->b_l1hdr.b_arc_access = 0;
1981	hdr->b_l1hdr.b_datacnt = 1;
1982
1983	arc_get_data_buf(buf);
1984	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
1985	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
1986
1987	return (buf);
1988}
1989
1990static char *arc_onloan_tag = "onloan";
1991
1992/*
1993 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1994 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1995 * buffers must be returned to the arc before they can be used by the DMU or
1996 * freed.
1997 */
1998arc_buf_t *
1999arc_loan_buf(spa_t *spa, int size)
2000{
2001	arc_buf_t *buf;
2002
2003	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
2004
2005	atomic_add_64(&arc_loaned_bytes, size);
2006	return (buf);
2007}
2008
2009/*
2010 * Return a loaned arc buffer to the arc.
2011 */
2012void
2013arc_return_buf(arc_buf_t *buf, void *tag)
2014{
2015	arc_buf_hdr_t *hdr = buf->b_hdr;
2016
2017	ASSERT(buf->b_data != NULL);
2018	ASSERT(HDR_HAS_L1HDR(hdr));
2019	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
2020	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2021
2022	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
2023}
2024
2025/* Detach an arc_buf from a dbuf (tag) */
2026void
2027arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
2028{
2029	arc_buf_hdr_t *hdr = buf->b_hdr;
2030
2031	ASSERT(buf->b_data != NULL);
2032	ASSERT(HDR_HAS_L1HDR(hdr));
2033	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2034	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
2035	buf->b_efunc = NULL;
2036	buf->b_private = NULL;
2037
2038	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
2039}
2040
2041static arc_buf_t *
2042arc_buf_clone(arc_buf_t *from)
2043{
2044	arc_buf_t *buf;
2045	arc_buf_hdr_t *hdr = from->b_hdr;
2046	uint64_t size = hdr->b_size;
2047
2048	ASSERT(HDR_HAS_L1HDR(hdr));
2049	ASSERT(hdr->b_l1hdr.b_state != arc_anon);
2050
2051	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2052	buf->b_hdr = hdr;
2053	buf->b_data = NULL;
2054	buf->b_efunc = NULL;
2055	buf->b_private = NULL;
2056	buf->b_next = hdr->b_l1hdr.b_buf;
2057	hdr->b_l1hdr.b_buf = buf;
2058	arc_get_data_buf(buf);
2059	bcopy(from->b_data, buf->b_data, size);
2060
2061	/*
2062	 * This buffer already exists in the arc so create a duplicate
2063	 * copy for the caller.  If the buffer is associated with user data
2064	 * then track the size and number of duplicates.  These stats will be
2065	 * updated as duplicate buffers are created and destroyed.
2066	 */
2067	if (HDR_ISTYPE_DATA(hdr)) {
2068		ARCSTAT_BUMP(arcstat_duplicate_buffers);
2069		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
2070	}
2071	hdr->b_l1hdr.b_datacnt += 1;
2072	return (buf);
2073}
2074
2075void
2076arc_buf_add_ref(arc_buf_t *buf, void* tag)
2077{
2078	arc_buf_hdr_t *hdr;
2079	kmutex_t *hash_lock;
2080
2081	/*
2082	 * Check to see if this buffer is evicted.  Callers
2083	 * must verify b_data != NULL to know if the add_ref
2084	 * was successful.
2085	 */
2086	mutex_enter(&buf->b_evict_lock);
2087	if (buf->b_data == NULL) {
2088		mutex_exit(&buf->b_evict_lock);
2089		return;
2090	}
2091	hash_lock = HDR_LOCK(buf->b_hdr);
2092	mutex_enter(hash_lock);
2093	hdr = buf->b_hdr;
2094	ASSERT(HDR_HAS_L1HDR(hdr));
2095	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2096	mutex_exit(&buf->b_evict_lock);
2097
2098	ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
2099	    hdr->b_l1hdr.b_state == arc_mfu);
2100
2101	add_reference(hdr, hash_lock, tag);
2102	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2103	arc_access(hdr, hash_lock);
2104	mutex_exit(hash_lock);
2105	ARCSTAT_BUMP(arcstat_hits);
2106	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
2107	    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
2108	    data, metadata, hits);
2109}
2110
2111static void
2112arc_buf_free_on_write(void *data, size_t size,
2113    void (*free_func)(void *, size_t))
2114{
2115	l2arc_data_free_t *df;
2116
2117	df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
2118	df->l2df_data = data;
2119	df->l2df_size = size;
2120	df->l2df_func = free_func;
2121	mutex_enter(&l2arc_free_on_write_mtx);
2122	list_insert_head(l2arc_free_on_write, df);
2123	mutex_exit(&l2arc_free_on_write_mtx);
2124}
2125
2126/*
2127 * Free the arc data buffer.  If it is an l2arc write in progress,
2128 * the buffer is placed on l2arc_free_on_write to be freed later.
2129 */
2130static void
2131arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
2132{
2133	arc_buf_hdr_t *hdr = buf->b_hdr;
2134
2135	if (HDR_L2_WRITING(hdr)) {
2136		arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
2137		ARCSTAT_BUMP(arcstat_l2_free_on_write);
2138	} else {
2139		free_func(buf->b_data, hdr->b_size);
2140	}
2141}
2142
2143/*
2144 * Free up buf->b_data and if 'remove' is set, then pull the
2145 * arc_buf_t off of the the arc_buf_hdr_t's list and free it.
2146 */
2147static void
2148arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
2149{
2150	ASSERT(HDR_HAS_L2HDR(hdr));
2151	ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));
2152
2153	/*
2154	 * The b_tmp_cdata field is linked off of the b_l1hdr, so if
2155	 * that doesn't exist, the header is in the arc_l2c_only state,
2156	 * and there isn't anything to free (it's already been freed).
2157	 */
2158	if (!HDR_HAS_L1HDR(hdr))
2159		return;
2160
2161	if (hdr->b_l1hdr.b_tmp_cdata == NULL)
2162		return;
2163
2164	ASSERT(HDR_L2_WRITING(hdr));
2165	arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size,
2166	    zio_data_buf_free);
2167
2168	ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
2169	hdr->b_l1hdr.b_tmp_cdata = NULL;
2170}
2171
2172static void
2173arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
2174{
2175	arc_buf_t **bufp;
2176
2177	/* free up data associated with the buf */
2178	if (buf->b_data != NULL) {
2179		arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
2180		uint64_t size = buf->b_hdr->b_size;
2181		arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
2182
2183		arc_cksum_verify(buf);
2184#ifdef illumos
2185		arc_buf_unwatch(buf);
2186#endif
2187
2188		if (!recycle) {
2189			if (type == ARC_BUFC_METADATA) {
2190				arc_buf_data_free(buf, zio_buf_free);
2191				arc_space_return(size, ARC_SPACE_META);
2192			} else {
2193				ASSERT(type == ARC_BUFC_DATA);
2194				arc_buf_data_free(buf, zio_data_buf_free);
2195				arc_space_return(size, ARC_SPACE_DATA);
2196			}
2197		}
2198		if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
2199			uint64_t *cnt = &state->arcs_lsize[type];
2200
2201			ASSERT(refcount_is_zero(
2202			    &buf->b_hdr->b_l1hdr.b_refcnt));
2203			ASSERT(state != arc_anon && state != arc_l2c_only);
2204
2205			ASSERT3U(*cnt, >=, size);
2206			atomic_add_64(cnt, -size);
2207		}
2208		ASSERT3U(state->arcs_size, >=, size);
2209		atomic_add_64(&state->arcs_size, -size);
2210		buf->b_data = NULL;
2211
2212		/*
2213		 * If we're destroying a duplicate buffer make sure
2214		 * that the appropriate statistics are updated.
2215		 */
2216		if (buf->b_hdr->b_l1hdr.b_datacnt > 1 &&
2217		    HDR_ISTYPE_DATA(buf->b_hdr)) {
2218			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
2219			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
2220		}
2221		ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0);
2222		buf->b_hdr->b_l1hdr.b_datacnt -= 1;
2223	}
2224
2225	/* only remove the buf if requested */
2226	if (!remove)
2227		return;
2228
2229	/* remove the buf from the hdr list */
2230	for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf;
2231	    bufp = &(*bufp)->b_next)
2232		continue;
2233	*bufp = buf->b_next;
2234	buf->b_next = NULL;
2235
2236	ASSERT(buf->b_efunc == NULL);
2237
2238	/* clean up the buf */
2239	buf->b_hdr = NULL;
2240	kmem_cache_free(buf_cache, buf);
2241}
2242
2243static void
2244arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
2245{
2246	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
2247	l2arc_dev_t *dev = l2hdr->b_dev;
2248
2249	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
2250	ASSERT(HDR_HAS_L2HDR(hdr));
2251
2252	list_remove(&dev->l2ad_buflist, hdr);
2253
2254	/*
2255	 * We don't want to leak the b_tmp_cdata buffer that was
2256	 * allocated in l2arc_write_buffers()
2257	 */
2258	arc_buf_l2_cdata_free(hdr);
2259
2260	/*
2261	 * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then
2262	 * this header is being processed by l2arc_write_buffers() (i.e.
2263	 * it's in the first stage of l2arc_write_buffers()).
2264	 * Re-affirming that truth here, just to serve as a reminder. If
2265	 * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or
2266	 * may not have its HDR_L2_WRITING flag set. (the write may have
2267	 * completed, in which case HDR_L2_WRITING will be false and the
2268	 * b_daddr field will point to the address of the buffer on disk).
2269	 */
2270	IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr));
2271
2272	/*
2273	 * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with
2274	 * l2arc_write_buffers(). Since we've just removed this header
2275	 * from the l2arc buffer list, this header will never reach the
2276	 * second stage of l2arc_write_buffers(), which increments the
2277	 * accounting stats for this header. Thus, we must be careful
2278	 * not to decrement them for this header either.
2279	 */
2280	if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) {
2281		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
2282		ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
2283
2284		vdev_space_update(dev->l2ad_vdev,
2285		    -l2hdr->b_asize, 0, 0);
2286
2287		(void) refcount_remove_many(&dev->l2ad_alloc,
2288		    l2hdr->b_asize, hdr);
2289	}
2290
2291	hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
2292}
2293
2294static void
2295arc_hdr_destroy(arc_buf_hdr_t *hdr)
2296{
2297	if (HDR_HAS_L1HDR(hdr)) {
2298		ASSERT(hdr->b_l1hdr.b_buf == NULL ||
2299		    hdr->b_l1hdr.b_datacnt > 0);
2300		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2301		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
2302	}
2303	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2304	ASSERT(!HDR_IN_HASH_TABLE(hdr));
2305
2306	if (HDR_HAS_L2HDR(hdr)) {
2307		l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
2308		boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
2309
2310		if (!buflist_held)
2311			mutex_enter(&dev->l2ad_mtx);
2312
2313		/*
2314		 * Even though we checked this conditional above, we
2315		 * need to check this again now that we have the
2316		 * l2ad_mtx. This is because we could be racing with
2317		 * another thread calling l2arc_evict() which might have
2318		 * destroyed this header's L2 portion as we were waiting
2319		 * to acquire the l2ad_mtx. If that happens, we don't
2320		 * want to re-destroy the header's L2 portion.
2321		 */
2322		if (HDR_HAS_L2HDR(hdr)) {
2323			trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr,
2324			    hdr->b_l2hdr.b_asize, 0);
2325			arc_hdr_l2hdr_destroy(hdr);
2326		}
2327
2328		if (!buflist_held)
2329			mutex_exit(&dev->l2ad_mtx);
2330	}
2331
2332	if (!BUF_EMPTY(hdr))
2333		buf_discard_identity(hdr);
2334	if (hdr->b_freeze_cksum != NULL) {
2335		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
2336		hdr->b_freeze_cksum = NULL;
2337	}
2338
2339	if (HDR_HAS_L1HDR(hdr)) {
2340		while (hdr->b_l1hdr.b_buf) {
2341			arc_buf_t *buf = hdr->b_l1hdr.b_buf;
2342
2343			if (buf->b_efunc != NULL) {
2344				mutex_enter(&arc_eviction_mtx);
2345				mutex_enter(&buf->b_evict_lock);
2346				ASSERT(buf->b_hdr != NULL);
2347				arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
2348				    FALSE);
2349				hdr->b_l1hdr.b_buf = buf->b_next;
2350				buf->b_hdr = &arc_eviction_hdr;
2351				buf->b_next = arc_eviction_list;
2352				arc_eviction_list = buf;
2353				mutex_exit(&buf->b_evict_lock);
2354				mutex_exit(&arc_eviction_mtx);
2355			} else {
2356				arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
2357				    TRUE);
2358			}
2359		}
2360#ifdef ZFS_DEBUG
2361		if (hdr->b_l1hdr.b_thawed != NULL) {
2362			kmem_free(hdr->b_l1hdr.b_thawed, 1);
2363			hdr->b_l1hdr.b_thawed = NULL;
2364		}
2365#endif
2366	}
2367
2368	ASSERT3P(hdr->b_hash_next, ==, NULL);
2369	if (HDR_HAS_L1HDR(hdr)) {
2370		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
2371		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
2372		kmem_cache_free(hdr_full_cache, hdr);
2373	} else {
2374		kmem_cache_free(hdr_l2only_cache, hdr);
2375	}
2376}
2377
2378void
2379arc_buf_free(arc_buf_t *buf, void *tag)
2380{
2381	arc_buf_hdr_t *hdr = buf->b_hdr;
2382	int hashed = hdr->b_l1hdr.b_state != arc_anon;
2383
2384	ASSERT(buf->b_efunc == NULL);
2385	ASSERT(buf->b_data != NULL);
2386
2387	if (hashed) {
2388		kmutex_t *hash_lock = HDR_LOCK(hdr);
2389
2390		mutex_enter(hash_lock);
2391		hdr = buf->b_hdr;
2392		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2393
2394		(void) remove_reference(hdr, hash_lock, tag);
2395		if (hdr->b_l1hdr.b_datacnt > 1) {
2396			arc_buf_destroy(buf, FALSE, TRUE);
2397		} else {
2398			ASSERT(buf == hdr->b_l1hdr.b_buf);
2399			ASSERT(buf->b_efunc == NULL);
2400			hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
2401		}
2402		mutex_exit(hash_lock);
2403	} else if (HDR_IO_IN_PROGRESS(hdr)) {
2404		int destroy_hdr;
2405		/*
2406		 * We are in the middle of an async write.  Don't destroy
2407		 * this buffer unless the write completes before we finish
2408		 * decrementing the reference count.
2409		 */
2410		mutex_enter(&arc_eviction_mtx);
2411		(void) remove_reference(hdr, NULL, tag);
2412		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2413		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
2414		mutex_exit(&arc_eviction_mtx);
2415		if (destroy_hdr)
2416			arc_hdr_destroy(hdr);
2417	} else {
2418		if (remove_reference(hdr, NULL, tag) > 0)
2419			arc_buf_destroy(buf, FALSE, TRUE);
2420		else
2421			arc_hdr_destroy(hdr);
2422	}
2423}
2424
2425boolean_t
2426arc_buf_remove_ref(arc_buf_t *buf, void* tag)
2427{
2428	arc_buf_hdr_t *hdr = buf->b_hdr;
2429	kmutex_t *hash_lock = HDR_LOCK(hdr);
2430	boolean_t no_callback = (buf->b_efunc == NULL);
2431
2432	if (hdr->b_l1hdr.b_state == arc_anon) {
2433		ASSERT(hdr->b_l1hdr.b_datacnt == 1);
2434		arc_buf_free(buf, tag);
2435		return (no_callback);
2436	}
2437
2438	mutex_enter(hash_lock);
2439	hdr = buf->b_hdr;
2440	ASSERT(hdr->b_l1hdr.b_datacnt > 0);
2441	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2442	ASSERT(hdr->b_l1hdr.b_state != arc_anon);
2443	ASSERT(buf->b_data != NULL);
2444
2445	(void) remove_reference(hdr, hash_lock, tag);
2446	if (hdr->b_l1hdr.b_datacnt > 1) {
2447		if (no_callback)
2448			arc_buf_destroy(buf, FALSE, TRUE);
2449	} else if (no_callback) {
2450		ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
2451		ASSERT(buf->b_efunc == NULL);
2452		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
2453	}
2454	ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 ||
2455	    refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2456	mutex_exit(hash_lock);
2457	return (no_callback);
2458}
2459
2460int32_t
2461arc_buf_size(arc_buf_t *buf)
2462{
2463	return (buf->b_hdr->b_size);
2464}
2465
2466/*
2467 * Called from the DMU to determine if the current buffer should be
2468 * evicted. In order to ensure proper locking, the eviction must be initiated
2469 * from the DMU. Return true if the buffer is associated with user data and
2470 * duplicate buffers still exist.
2471 */
2472boolean_t
2473arc_buf_eviction_needed(arc_buf_t *buf)
2474{
2475	arc_buf_hdr_t *hdr;
2476	boolean_t evict_needed = B_FALSE;
2477
2478	if (zfs_disable_dup_eviction)
2479		return (B_FALSE);
2480
2481	mutex_enter(&buf->b_evict_lock);
2482	hdr = buf->b_hdr;
2483	if (hdr == NULL) {
2484		/*
2485		 * We are in arc_do_user_evicts(); let that function
2486		 * perform the eviction.
2487		 */
2488		ASSERT(buf->b_data == NULL);
2489		mutex_exit(&buf->b_evict_lock);
2490		return (B_FALSE);
2491	} else if (buf->b_data == NULL) {
2492		/*
2493		 * We have already been added to the arc eviction list;
2494		 * recommend eviction.
2495		 */
2496		ASSERT3P(hdr, ==, &arc_eviction_hdr);
2497		mutex_exit(&buf->b_evict_lock);
2498		return (B_TRUE);
2499	}
2500
2501	if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
2502		evict_needed = B_TRUE;
2503
2504	mutex_exit(&buf->b_evict_lock);
2505	return (evict_needed);
2506}
2507
2508/*
2509 * Evict buffers from list until we've removed the specified number of
2510 * bytes.  Move the removed buffers to the appropriate evict state.
2511 * If the recycle flag is set, then attempt to "recycle" a buffer:
2512 * - look for a buffer to evict that is `bytes' long.
2513 * - return the data block from this buffer rather than freeing it.
2514 * This flag is used by callers that are trying to make space for a
2515 * new buffer in a full arc cache.
2516 *
2517 * This function makes a "best effort".  It skips over any buffers
2518 * it can't get a hash_lock on, and so may not catch all candidates.
2519 * It may also return without evicting as much space as requested.
2520 */
2521static void *
2522arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
2523    arc_buf_contents_t type)
2524{
2525	arc_state_t *evicted_state;
2526	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
2527	int64_t bytes_remaining;
2528	arc_buf_hdr_t *hdr, *hdr_prev = NULL;
2529	list_t *evicted_list, *list, *evicted_list_start, *list_start;
2530	kmutex_t *lock, *evicted_lock;
2531	kmutex_t *hash_lock;
2532	boolean_t have_lock;
2533	void *stolen = NULL;
2534	arc_buf_hdr_t marker = { 0 };
2535	int count = 0;
2536	static int evict_metadata_offset, evict_data_offset;
2537	int i, idx, offset, list_count, lists;
2538
2539	ASSERT(state == arc_mru || state == arc_mfu);
2540
2541	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2542
2543	/*
2544	 * Decide which "type" (data vs metadata) to recycle from.
2545	 *
2546	 * If we are over the metadata limit, recycle from metadata.
2547	 * If we are under the metadata minimum, recycle from data.
2548	 * Otherwise, recycle from whichever type has the oldest (least
2549	 * recently accessed) header.  This is not yet implemented.
2550	 */
2551	if (recycle) {
2552		arc_buf_contents_t realtype;
2553		if (state->arcs_lsize[ARC_BUFC_DATA] == 0) {
2554			realtype = ARC_BUFC_METADATA;
2555		} else if (state->arcs_lsize[ARC_BUFC_METADATA] == 0) {
2556			realtype = ARC_BUFC_DATA;
2557		} else if (arc_meta_used >= arc_meta_limit) {
2558			realtype = ARC_BUFC_METADATA;
2559		} else if (arc_meta_used <= arc_meta_min) {
2560			realtype = ARC_BUFC_DATA;
2561#ifdef illumos
2562		} else if (HDR_HAS_L1HDR(data_hdr) &&
2563		    HDR_HAS_L1HDR(metadata_hdr) &&
2564		    data_hdr->b_l1hdr.b_arc_access <
2565		    metadata_hdr->b_l1hdr.b_arc_access) {
2566			realtype = ARC_BUFC_DATA;
2567		} else {
2568			realtype = ARC_BUFC_METADATA;
2569#else
2570		} else {
2571			/* TODO */
2572			realtype = type;
2573#endif
2574		}
2575		if (realtype != type) {
2576			/*
2577			 * If we want to evict from a different list,
2578			 * we can not recycle, because DATA vs METADATA
2579			 * buffers are segregated into different kmem
2580			 * caches (and vmem arenas).
2581			 */
2582			type = realtype;
2583			recycle = B_FALSE;
2584		}
2585	}
2586
2587	if (type == ARC_BUFC_METADATA) {
2588		offset = 0;
2589		list_count = ARC_BUFC_NUMMETADATALISTS;
2590		list_start = &state->arcs_lists[0];
2591		evicted_list_start = &evicted_state->arcs_lists[0];
2592		idx = evict_metadata_offset;
2593	} else {
2594		offset = ARC_BUFC_NUMMETADATALISTS;
2595		list_start = &state->arcs_lists[offset];
2596		evicted_list_start = &evicted_state->arcs_lists[offset];
2597		list_count = ARC_BUFC_NUMDATALISTS;
2598		idx = evict_data_offset;
2599	}
2600	bytes_remaining = evicted_state->arcs_lsize[type];
2601	lists = 0;
2602
2603evict_start:
2604	list = &list_start[idx];
2605	evicted_list = &evicted_list_start[idx];
2606	lock = ARCS_LOCK(state, (offset + idx));
2607	evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
2608
2609	/*
2610	 * The ghost list lock must be acquired first in order to prevent
2611	 * a 3 party deadlock:
2612	 *
2613	 *  - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by
2614	 *    l2ad_mtx in arc_hdr_realloc
2615	 *  - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx
2616	 *  - arc_evict acquires arc_*_ghost->arcs_mtx, followed by
2617	 *    arc_*_ghost->arcs_mtx and forms a deadlock cycle.
2618	 *
2619	 * This situation is avoided by acquiring the ghost list lock first.
2620	 */
2621	mutex_enter(evicted_lock);
2622	mutex_enter(lock);
2623
2624	for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
2625		hdr_prev = list_prev(list, hdr);
2626		if (HDR_HAS_L1HDR(hdr)) {
2627			bytes_remaining -=
2628			    (hdr->b_size * hdr->b_l1hdr.b_datacnt);
2629		}
2630		/* prefetch buffers have a minimum lifespan */
2631		if (HDR_IO_IN_PROGRESS(hdr) ||
2632		    (spa && hdr->b_spa != spa) ||
2633		    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
2634		    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
2635		    arc_min_prefetch_lifespan)) {
2636			skipped++;
2637			continue;
2638		}
2639		/* "lookahead" for better eviction candidate */
2640		if (recycle && hdr->b_size != bytes &&
2641		    hdr_prev && hdr_prev->b_size == bytes)
2642			continue;
2643
2644		/* ignore markers */
2645		if (hdr->b_spa == 0)
2646			continue;
2647
2648		/*
2649		 * It may take a long time to evict all the bufs requested.
2650		 * To avoid blocking all arc activity, periodically drop
2651		 * the arcs_mtx and give other threads a chance to run
2652		 * before reacquiring the lock.
2653		 *
2654		 * If we are looking for a buffer to recycle, we are in
2655		 * the hot code path, so don't sleep.
2656		 */
2657		if (!recycle && count++ > arc_evict_iterations) {
2658			list_insert_after(list, hdr, &marker);
2659			mutex_exit(lock);
2660			mutex_exit(evicted_lock);
2661			kpreempt(KPREEMPT_SYNC);
2662			mutex_enter(evicted_lock);
2663			mutex_enter(lock);
2664			hdr_prev = list_prev(list, &marker);
2665			list_remove(list, &marker);
2666			count = 0;
2667			continue;
2668		}
2669
2670		hash_lock = HDR_LOCK(hdr);
2671		have_lock = MUTEX_HELD(hash_lock);
2672		if (have_lock || mutex_tryenter(hash_lock)) {
2673			ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
2674			ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
2675			while (hdr->b_l1hdr.b_buf) {
2676				arc_buf_t *buf = hdr->b_l1hdr.b_buf;
2677				if (!mutex_tryenter(&buf->b_evict_lock)) {
2678					missed += 1;
2679					break;
2680				}
2681				if (buf->b_data != NULL) {
2682					bytes_evicted += hdr->b_size;
2683					if (recycle &&
2684					    arc_buf_type(hdr) == type &&
2685					    hdr->b_size == bytes &&
2686					    !HDR_L2_WRITING(hdr)) {
2687						stolen = buf->b_data;
2688						recycle = FALSE;
2689					}
2690				}
2691				if (buf->b_efunc != NULL) {
2692					mutex_enter(&arc_eviction_mtx);
2693					arc_buf_destroy(buf,
2694					    buf->b_data == stolen, FALSE);
2695					hdr->b_l1hdr.b_buf = buf->b_next;
2696					buf->b_hdr = &arc_eviction_hdr;
2697					buf->b_next = arc_eviction_list;
2698					arc_eviction_list = buf;
2699					mutex_exit(&arc_eviction_mtx);
2700					mutex_exit(&buf->b_evict_lock);
2701				} else {
2702					mutex_exit(&buf->b_evict_lock);
2703					arc_buf_destroy(buf,
2704					    buf->b_data == stolen, TRUE);
2705				}
2706			}
2707
2708			if (HDR_HAS_L2HDR(hdr)) {
2709				ARCSTAT_INCR(arcstat_evict_l2_cached,
2710				    hdr->b_size);
2711			} else {
2712				if (l2arc_write_eligible(hdr->b_spa, hdr)) {
2713					ARCSTAT_INCR(arcstat_evict_l2_eligible,
2714					    hdr->b_size);
2715				} else {
2716					ARCSTAT_INCR(
2717					    arcstat_evict_l2_ineligible,
2718					    hdr->b_size);
2719				}
2720			}
2721
2722			if (hdr->b_l1hdr.b_datacnt == 0) {
2723				arc_change_state(evicted_state, hdr, hash_lock);
2724				ASSERT(HDR_IN_HASH_TABLE(hdr));
2725				hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
2726				hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
2727				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
2728			}
2729			if (!have_lock)
2730				mutex_exit(hash_lock);
2731			if (bytes >= 0 && bytes_evicted >= bytes)
2732				break;
2733			if (bytes_remaining > 0) {
2734				mutex_exit(evicted_lock);
2735				mutex_exit(lock);
2736				idx  = ((idx + 1) & (list_count - 1));
2737				lists++;
2738				goto evict_start;
2739			}
2740		} else {
2741			missed += 1;
2742		}
2743	}
2744
2745	mutex_exit(lock);
2746	mutex_exit(evicted_lock);
2747
2748	idx  = ((idx + 1) & (list_count - 1));
2749	lists++;
2750
2751	if (bytes_evicted < bytes) {
2752		if (lists < list_count)
2753			goto evict_start;
2754		else
2755			dprintf("only evicted %lld bytes from %x",
2756			    (longlong_t)bytes_evicted, state);
2757	}
2758	if (type == ARC_BUFC_METADATA)
2759		evict_metadata_offset = idx;
2760	else
2761		evict_data_offset = idx;
2762
2763	if (skipped)
2764		ARCSTAT_INCR(arcstat_evict_skip, skipped);
2765
2766	if (missed)
2767		ARCSTAT_INCR(arcstat_mutex_miss, missed);
2768
2769	/*
2770	 * Note: we have just evicted some data into the ghost state,
2771	 * potentially putting the ghost size over the desired size.  Rather
2772	 * that evicting from the ghost list in this hot code path, leave
2773	 * this chore to the arc_reclaim_thread().
2774	 */
2775
2776	if (stolen)
2777		ARCSTAT_BUMP(arcstat_stolen);
2778	return (stolen);
2779}
2780
2781/*
2782 * Remove buffers from list until we've removed the specified number of
2783 * bytes.  Destroy the buffers that are removed.
2784 */
2785static void
2786arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
2787{
2788	arc_buf_hdr_t *hdr, *hdr_prev;
2789	arc_buf_hdr_t marker = { 0 };
2790	list_t *list, *list_start;
2791	kmutex_t *hash_lock, *lock;
2792	uint64_t bytes_deleted = 0;
2793	uint64_t bufs_skipped = 0;
2794	int count = 0;
2795	static int evict_offset;
2796	int list_count, idx = evict_offset;
2797	int offset, lists = 0;
2798
2799	ASSERT(GHOST_STATE(state));
2800
2801	/*
2802	 * data lists come after metadata lists
2803	 */
2804	list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
2805	list_count = ARC_BUFC_NUMDATALISTS;
2806	offset = ARC_BUFC_NUMMETADATALISTS;
2807
2808evict_start:
2809	list = &list_start[idx];
2810	lock = ARCS_LOCK(state, idx + offset);
2811
2812	mutex_enter(lock);
2813	for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
2814		hdr_prev = list_prev(list, hdr);
2815		if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES)
2816			panic("invalid hdr=%p", (void *)hdr);
2817		if (spa && hdr->b_spa != spa)
2818			continue;
2819
2820		/* ignore markers */
2821		if (hdr->b_spa == 0)
2822			continue;
2823
2824		hash_lock = HDR_LOCK(hdr);
2825		/* caller may be trying to modify this buffer, skip it */
2826		if (MUTEX_HELD(hash_lock))
2827			continue;
2828
2829		/*
2830		 * It may take a long time to evict all the bufs requested.
2831		 * To avoid blocking all arc activity, periodically drop
2832		 * the arcs_mtx and give other threads a chance to run
2833		 * before reacquiring the lock.
2834		 */
2835		if (count++ > arc_evict_iterations) {
2836			list_insert_after(list, hdr, &marker);
2837			mutex_exit(lock);
2838			kpreempt(KPREEMPT_SYNC);
2839			mutex_enter(lock);
2840			hdr_prev = list_prev(list, &marker);
2841			list_remove(list, &marker);
2842			count = 0;
2843			continue;
2844		}
2845		if (mutex_tryenter(hash_lock)) {
2846			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2847			ASSERT(!HDR_HAS_L1HDR(hdr) ||
2848			    hdr->b_l1hdr.b_buf == NULL);
2849			ARCSTAT_BUMP(arcstat_deleted);
2850			bytes_deleted += hdr->b_size;
2851
2852			if (HDR_HAS_L2HDR(hdr)) {
2853				/*
2854				 * This buffer is cached on the 2nd Level ARC;
2855				 * don't destroy the header.
2856				 */
2857				arc_change_state(arc_l2c_only, hdr, hash_lock);
2858				/*
2859				 * dropping from L1+L2 cached to L2-only,
2860				 * realloc to remove the L1 header.
2861				 */
2862				hdr = arc_hdr_realloc(hdr, hdr_full_cache,
2863				    hdr_l2only_cache);
2864				mutex_exit(hash_lock);
2865			} else {
2866				arc_change_state(arc_anon, hdr, hash_lock);
2867				mutex_exit(hash_lock);
2868				arc_hdr_destroy(hdr);
2869			}
2870
2871			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
2872			if (bytes >= 0 && bytes_deleted >= bytes)
2873				break;
2874		} else if (bytes < 0) {
2875			/*
2876			 * Insert a list marker and then wait for the
2877			 * hash lock to become available. Once its
2878			 * available, restart from where we left off.
2879			 */
2880			list_insert_after(list, hdr, &marker);
2881			mutex_exit(lock);
2882			mutex_enter(hash_lock);
2883			mutex_exit(hash_lock);
2884			mutex_enter(lock);
2885			hdr_prev = list_prev(list, &marker);
2886			list_remove(list, &marker);
2887		} else {
2888			bufs_skipped += 1;
2889		}
2890
2891	}
2892	mutex_exit(lock);
2893	idx  = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
2894	lists++;
2895
2896	if (lists < list_count)
2897		goto evict_start;
2898
2899	evict_offset = idx;
2900	if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
2901	    (bytes < 0 || bytes_deleted < bytes)) {
2902		list_start = &state->arcs_lists[0];
2903		list_count = ARC_BUFC_NUMMETADATALISTS;
2904		offset = lists = 0;
2905		goto evict_start;
2906	}
2907
2908	if (bufs_skipped) {
2909		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2910		ASSERT(bytes >= 0);
2911	}
2912
2913	if (bytes_deleted < bytes)
2914		dprintf("only deleted %lld bytes from %p",
2915		    (longlong_t)bytes_deleted, state);
2916}
2917
2918static void
2919arc_adjust(void)
2920{
2921	int64_t adjustment, delta;
2922
2923	/*
2924	 * Adjust MRU size
2925	 */
2926
2927	adjustment = MIN((int64_t)(arc_size - arc_c),
2928	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2929	    arc_p));
2930
2931	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2932		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2933		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
2934		adjustment -= delta;
2935	}
2936
2937	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2938		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2939		(void) arc_evict(arc_mru, 0, delta, FALSE,
2940		    ARC_BUFC_METADATA);
2941	}
2942
2943	/*
2944	 * Adjust MFU size
2945	 */
2946
2947	adjustment = arc_size - arc_c;
2948
2949	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2950		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2951		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2952		adjustment -= delta;
2953	}
2954
2955	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2956		int64_t delta = MIN(adjustment,
2957		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2958		(void) arc_evict(arc_mfu, 0, delta, FALSE,
2959		    ARC_BUFC_METADATA);
2960	}
2961
2962	/*
2963	 * Adjust ghost lists
2964	 */
2965
2966	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2967
2968	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2969		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2970		arc_evict_ghost(arc_mru_ghost, 0, delta);
2971	}
2972
2973	adjustment =
2974	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2975
2976	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2977		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2978		arc_evict_ghost(arc_mfu_ghost, 0, delta);
2979	}
2980}
2981
2982static void
2983arc_do_user_evicts(void)
2984{
2985	static arc_buf_t *tmp_arc_eviction_list;
2986
2987	/*
2988	 * Move list over to avoid LOR
2989	 */
2990restart:
2991	mutex_enter(&arc_eviction_mtx);
2992	tmp_arc_eviction_list = arc_eviction_list;
2993	arc_eviction_list = NULL;
2994	mutex_exit(&arc_eviction_mtx);
2995
2996	while (tmp_arc_eviction_list != NULL) {
2997		arc_buf_t *buf = tmp_arc_eviction_list;
2998		tmp_arc_eviction_list = buf->b_next;
2999		mutex_enter(&buf->b_evict_lock);
3000		buf->b_hdr = NULL;
3001		mutex_exit(&buf->b_evict_lock);
3002
3003		if (buf->b_efunc != NULL)
3004			VERIFY0(buf->b_efunc(buf->b_private));
3005
3006		buf->b_efunc = NULL;
3007		buf->b_private = NULL;
3008		kmem_cache_free(buf_cache, buf);
3009	}
3010
3011	if (arc_eviction_list != NULL)
3012		goto restart;
3013}
3014
3015/*
3016 * Flush all *evictable* data from the cache for the given spa.
3017 * NOTE: this will not touch "active" (i.e. referenced) data.
3018 */
3019void
3020arc_flush(spa_t *spa)
3021{
3022	uint64_t guid = 0;
3023
3024	if (spa != NULL)
3025		guid = spa_load_guid(spa);
3026
3027	while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
3028		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
3029		if (spa != NULL)
3030			break;
3031	}
3032	while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
3033		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
3034		if (spa != NULL)
3035			break;
3036	}
3037	while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
3038		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
3039		if (spa != NULL)
3040			break;
3041	}
3042	while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
3043		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
3044		if (spa != NULL)
3045			break;
3046	}
3047
3048	arc_evict_ghost(arc_mru_ghost, guid, -1);
3049	arc_evict_ghost(arc_mfu_ghost, guid, -1);
3050
3051	mutex_enter(&arc_reclaim_thr_lock);
3052	arc_do_user_evicts();
3053	mutex_exit(&arc_reclaim_thr_lock);
3054	ASSERT(spa || arc_eviction_list == NULL);
3055}
3056
3057void
3058arc_shrink(void)
3059{
3060
3061	if (arc_c > arc_c_min) {
3062		uint64_t to_free;
3063
3064		to_free = arc_c >> arc_shrink_shift;
3065		DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
3066			arc_c_min, uint64_t, arc_p, uint64_t, to_free);
3067		if (arc_c > arc_c_min + to_free)
3068			atomic_add_64(&arc_c, -to_free);
3069		else
3070			arc_c = arc_c_min;
3071
3072		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
3073		if (arc_c > arc_size)
3074			arc_c = MAX(arc_size, arc_c_min);
3075		if (arc_p > arc_c)
3076			arc_p = (arc_c >> 1);
3077
3078		DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
3079			arc_p);
3080
3081		ASSERT(arc_c >= arc_c_min);
3082		ASSERT((int64_t)arc_p >= 0);
3083	}
3084
3085	if (arc_size > arc_c) {
3086		DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
3087			uint64_t, arc_c);
3088		arc_adjust();
3089	}
3090}
3091
3092static int needfree = 0;
3093
3094static int
3095arc_reclaim_needed(void)
3096{
3097
3098#ifdef _KERNEL
3099
3100	if (needfree) {
3101		DTRACE_PROBE(arc__reclaim_needfree);
3102		return (1);
3103	}
3104
3105	/*
3106	 * Cooperate with pagedaemon when it's time for it to scan
3107	 * and reclaim some pages.
3108	 */
3109	if (freemem < zfs_arc_free_target) {
3110		DTRACE_PROBE2(arc__reclaim_freemem, uint64_t,
3111		    freemem, uint64_t, zfs_arc_free_target);
3112		return (1);
3113	}
3114
3115#ifdef illumos
3116	/*
3117	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
3118	 */
3119	extra = desfree;
3120
3121	/*
3122	 * check that we're out of range of the pageout scanner.  It starts to
3123	 * schedule paging if freemem is less than lotsfree and needfree.
3124	 * lotsfree is the high-water mark for pageout, and needfree is the
3125	 * number of needed free pages.  We add extra pages here to make sure
3126	 * the scanner doesn't start up while we're freeing memory.
3127	 */
3128	if (freemem < lotsfree + needfree + extra)
3129		return (1);
3130
3131	/*
3132	 * check to make sure that swapfs has enough space so that anon
3133	 * reservations can still succeed. anon_resvmem() checks that the
3134	 * availrmem is greater than swapfs_minfree, and the number of reserved
3135	 * swap pages.  We also add a bit of extra here just to prevent
3136	 * circumstances from getting really dire.
3137	 */
3138	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
3139		return (1);
3140
3141	/*
3142	 * Check that we have enough availrmem that memory locking (e.g., via
3143	 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
3144	 * stores the number of pages that cannot be locked; when availrmem
3145	 * drops below pages_pp_maximum, page locking mechanisms such as
3146	 * page_pp_lock() will fail.)
3147	 */
3148	if (availrmem <= pages_pp_maximum)
3149		return (1);
3150
3151#endif	/* illumos */
3152#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
3153	/*
3154	 * If we're on an i386 platform, it's possible that we'll exhaust the
3155	 * kernel heap space before we ever run out of available physical
3156	 * memory.  Most checks of the size of the heap_area compare against
3157	 * tune.t_minarmem, which is the minimum available real memory that we
3158	 * can have in the system.  However, this is generally fixed at 25 pages
3159	 * which is so low that it's useless.  In this comparison, we seek to
3160	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
3161	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
3162	 * free)
3163	 */
3164	if (vmem_size(heap_arena, VMEM_FREE) <
3165	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) {
3166		DTRACE_PROBE2(arc__reclaim_used, uint64_t,
3167		    vmem_size(heap_arena, VMEM_FREE), uint64_t,
3168		    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2);
3169		return (1);
3170	}
3171#define	zio_arena	NULL
3172#else
3173#define	zio_arena	heap_arena
3174#endif
3175
3176	/*
3177	 * If zio data pages are being allocated out of a separate heap segment,
3178	 * then enforce that the size of available vmem for this arena remains
3179	 * above about 1/16th free.
3180	 *
3181	 * Note: The 1/16th arena free requirement was put in place
3182	 * to aggressively evict memory from the arc in order to avoid
3183	 * memory fragmentation issues.
3184	 */
3185	if (zio_arena != NULL &&
3186	    vmem_size(zio_arena, VMEM_FREE) <
3187	    (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
3188		return (1);
3189
3190	/*
3191	 * Above limits know nothing about real level of KVA fragmentation.
3192	 * Start aggressive reclamation if too little sequential KVA left.
3193	 */
3194	if (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) {
3195		DTRACE_PROBE2(arc__reclaim_maxfree, uint64_t,
3196		    vmem_size(heap_arena, VMEM_MAXFREE),
3197		    uint64_t, zfs_max_recordsize);
3198		return (1);
3199	}
3200
3201#else	/* _KERNEL */
3202	if (spa_get_random(100) == 0)
3203		return (1);
3204#endif	/* _KERNEL */
3205	DTRACE_PROBE(arc__reclaim_no);
3206
3207	return (0);
3208}
3209
3210extern kmem_cache_t	*zio_buf_cache[];
3211extern kmem_cache_t	*zio_data_buf_cache[];
3212extern kmem_cache_t	*range_seg_cache;
3213
3214static __noinline void
3215arc_kmem_reap_now(arc_reclaim_strategy_t strat)
3216{
3217	size_t			i;
3218	kmem_cache_t		*prev_cache = NULL;
3219	kmem_cache_t		*prev_data_cache = NULL;
3220
3221	DTRACE_PROBE(arc__kmem_reap_start);
3222#ifdef _KERNEL
3223	if (arc_meta_used >= arc_meta_limit) {
3224		/*
3225		 * We are exceeding our meta-data cache limit.
3226		 * Purge some DNLC entries to release holds on meta-data.
3227		 */
3228		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
3229	}
3230#if defined(__i386)
3231	/*
3232	 * Reclaim unused memory from all kmem caches.
3233	 */
3234	kmem_reap();
3235#endif
3236#endif
3237
3238	/*
3239	 * An aggressive reclamation will shrink the cache size as well as
3240	 * reap free buffers from the arc kmem caches.
3241	 */
3242	if (strat == ARC_RECLAIM_AGGR)
3243		arc_shrink();
3244
3245	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
3246		if (zio_buf_cache[i] != prev_cache) {
3247			prev_cache = zio_buf_cache[i];
3248			kmem_cache_reap_now(zio_buf_cache[i]);
3249		}
3250		if (zio_data_buf_cache[i] != prev_data_cache) {
3251			prev_data_cache = zio_data_buf_cache[i];
3252			kmem_cache_reap_now(zio_data_buf_cache[i]);
3253		}
3254	}
3255	kmem_cache_reap_now(buf_cache);
3256	kmem_cache_reap_now(hdr_full_cache);
3257	kmem_cache_reap_now(hdr_l2only_cache);
3258	kmem_cache_reap_now(range_seg_cache);
3259
3260#ifdef illumos
3261	/*
3262	 * Ask the vmem arena to reclaim unused memory from its
3263	 * quantum caches.
3264	 */
3265	if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
3266		vmem_qcache_reap(zio_arena);
3267#endif
3268	DTRACE_PROBE(arc__kmem_reap_end);
3269}
3270
3271static void
3272arc_reclaim_thread(void *dummy __unused)
3273{
3274	clock_t			growtime = 0;
3275	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
3276	callb_cpr_t		cpr;
3277
3278	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
3279
3280	mutex_enter(&arc_reclaim_thr_lock);
3281	while (arc_thread_exit == 0) {
3282		if (arc_reclaim_needed()) {
3283
3284			if (arc_no_grow) {
3285				if (last_reclaim == ARC_RECLAIM_CONS) {
3286					DTRACE_PROBE(arc__reclaim_aggr_no_grow);
3287					last_reclaim = ARC_RECLAIM_AGGR;
3288				} else {
3289					last_reclaim = ARC_RECLAIM_CONS;
3290				}
3291			} else {
3292				arc_no_grow = TRUE;
3293				last_reclaim = ARC_RECLAIM_AGGR;
3294				DTRACE_PROBE(arc__reclaim_aggr);
3295				membar_producer();
3296			}
3297
3298			/* reset the growth delay for every reclaim */
3299			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
3300
3301			if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
3302				/*
3303				 * If needfree is TRUE our vm_lowmem hook
3304				 * was called and in that case we must free some
3305				 * memory, so switch to aggressive mode.
3306				 */
3307				arc_no_grow = TRUE;
3308				last_reclaim = ARC_RECLAIM_AGGR;
3309			}
3310			arc_kmem_reap_now(last_reclaim);
3311			arc_warm = B_TRUE;
3312
3313		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
3314			arc_no_grow = FALSE;
3315		}
3316
3317		arc_adjust();
3318
3319		if (arc_eviction_list != NULL)
3320			arc_do_user_evicts();
3321
3322#ifdef _KERNEL
3323		if (needfree) {
3324			needfree = 0;
3325			wakeup(&needfree);
3326		}
3327#endif
3328
3329		/*
3330		 * This is necessary in order for the mdb ::arc dcmd to
3331		 * show up to date information. Since the ::arc command
3332		 * does not call the kstat's update function, without
3333		 * this call, the command may show stale stats for the
3334		 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
3335		 * with this change, the data might be up to 1 second
3336		 * out of date; but that should suffice. The arc_state_t
3337		 * structures can be queried directly if more accurate
3338		 * information is needed.
3339		 */
3340		if (arc_ksp != NULL)
3341			arc_ksp->ks_update(arc_ksp, KSTAT_READ);
3342
3343		/* block until needed, or one second, whichever is shorter */
3344		CALLB_CPR_SAFE_BEGIN(&cpr);
3345		(void) cv_timedwait(&arc_reclaim_thr_cv,
3346		    &arc_reclaim_thr_lock, hz);
3347		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
3348	}
3349
3350	arc_thread_exit = 0;
3351	cv_broadcast(&arc_reclaim_thr_cv);
3352	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
3353	thread_exit();
3354}
3355
3356/*
3357 * Adapt arc info given the number of bytes we are trying to add and
3358 * the state that we are comming from.  This function is only called
3359 * when we are adding new content to the cache.
3360 */
3361static void
3362arc_adapt(int bytes, arc_state_t *state)
3363{
3364	int mult;
3365	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
3366
3367	if (state == arc_l2c_only)
3368		return;
3369
3370	ASSERT(bytes > 0);
3371	/*
3372	 * Adapt the target size of the MRU list:
3373	 *	- if we just hit in the MRU ghost list, then increase
3374	 *	  the target size of the MRU list.
3375	 *	- if we just hit in the MFU ghost list, then increase
3376	 *	  the target size of the MFU list by decreasing the
3377	 *	  target size of the MRU list.
3378	 */
3379	if (state == arc_mru_ghost) {
3380		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
3381		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
3382		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
3383
3384		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
3385	} else if (state == arc_mfu_ghost) {
3386		uint64_t delta;
3387
3388		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
3389		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
3390		mult = MIN(mult, 10);
3391
3392		delta = MIN(bytes * mult, arc_p);
3393		arc_p = MAX(arc_p_min, arc_p - delta);
3394	}
3395	ASSERT((int64_t)arc_p >= 0);
3396
3397	if (arc_reclaim_needed()) {
3398		cv_signal(&arc_reclaim_thr_cv);
3399		return;
3400	}
3401
3402	if (arc_no_grow)
3403		return;
3404
3405	if (arc_c >= arc_c_max)
3406		return;
3407
3408	/*
3409	 * If we're within (2 * maxblocksize) bytes of the target
3410	 * cache size, increment the target cache size
3411	 */
3412	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
3413		DTRACE_PROBE1(arc__inc_adapt, int, bytes);
3414		atomic_add_64(&arc_c, (int64_t)bytes);
3415		if (arc_c > arc_c_max)
3416			arc_c = arc_c_max;
3417		else if (state == arc_anon)
3418			atomic_add_64(&arc_p, (int64_t)bytes);
3419		if (arc_p > arc_c)
3420			arc_p = arc_c;
3421	}
3422	ASSERT((int64_t)arc_p >= 0);
3423}
3424
3425/*
3426 * Check if the cache has reached its limits and eviction is required
3427 * prior to insert.
3428 */
3429static int
3430arc_evict_needed(arc_buf_contents_t type)
3431{
3432	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
3433		return (1);
3434
3435	if (arc_reclaim_needed())
3436		return (1);
3437
3438	return (arc_size > arc_c);
3439}
3440
3441/*
3442 * The buffer, supplied as the first argument, needs a data block.
3443 * So, if we are at cache max, determine which cache should be victimized.
3444 * We have the following cases:
3445 *
3446 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
3447 * In this situation if we're out of space, but the resident size of the MFU is
3448 * under the limit, victimize the MFU cache to satisfy this insertion request.
3449 *
3450 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
3451 * Here, we've used up all of the available space for the MRU, so we need to
3452 * evict from our own cache instead.  Evict from the set of resident MRU
3453 * entries.
3454 *
3455 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
3456 * c minus p represents the MFU space in the cache, since p is the size of the
3457 * cache that is dedicated to the MRU.  In this situation there's still space on
3458 * the MFU side, so the MRU side needs to be victimized.
3459 *
3460 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
3461 * MFU's resident set is consuming more space than it has been allotted.  In
3462 * this situation, we must victimize our own cache, the MFU, for this insertion.
3463 */
3464static void
3465arc_get_data_buf(arc_buf_t *buf)
3466{
3467	arc_state_t		*state = buf->b_hdr->b_l1hdr.b_state;
3468	uint64_t		size = buf->b_hdr->b_size;
3469	arc_buf_contents_t	type = arc_buf_type(buf->b_hdr);
3470
3471	arc_adapt(size, state);
3472
3473	/*
3474	 * We have not yet reached cache maximum size,
3475	 * just allocate a new buffer.
3476	 */
3477	if (!arc_evict_needed(type)) {
3478		if (type == ARC_BUFC_METADATA) {
3479			buf->b_data = zio_buf_alloc(size);
3480			arc_space_consume(size, ARC_SPACE_META);
3481		} else {
3482			ASSERT(type == ARC_BUFC_DATA);
3483			buf->b_data = zio_data_buf_alloc(size);
3484			arc_space_consume(size, ARC_SPACE_DATA);
3485		}
3486		goto out;
3487	}
3488
3489	/*
3490	 * If we are prefetching from the mfu ghost list, this buffer
3491	 * will end up on the mru list; so steal space from there.
3492	 */
3493	if (state == arc_mfu_ghost)
3494		state = HDR_PREFETCH(buf->b_hdr) ? arc_mru : arc_mfu;
3495	else if (state == arc_mru_ghost)
3496		state = arc_mru;
3497
3498	if (state == arc_mru || state == arc_anon) {
3499		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
3500		state = (arc_mfu->arcs_lsize[type] >= size &&
3501		    arc_p > mru_used) ? arc_mfu : arc_mru;
3502	} else {
3503		/* MFU cases */
3504		uint64_t mfu_space = arc_c - arc_p;
3505		state =  (arc_mru->arcs_lsize[type] >= size &&
3506		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
3507	}
3508	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
3509		if (type == ARC_BUFC_METADATA) {
3510			buf->b_data = zio_buf_alloc(size);
3511			arc_space_consume(size, ARC_SPACE_META);
3512		} else {
3513			ASSERT(type == ARC_BUFC_DATA);
3514			buf->b_data = zio_data_buf_alloc(size);
3515			arc_space_consume(size, ARC_SPACE_DATA);
3516		}
3517		ARCSTAT_BUMP(arcstat_recycle_miss);
3518	}
3519	ASSERT(buf->b_data != NULL);
3520out:
3521	/*
3522	 * Update the state size.  Note that ghost states have a
3523	 * "ghost size" and so don't need to be updated.
3524	 */
3525	if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
3526		arc_buf_hdr_t *hdr = buf->b_hdr;
3527
3528		atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size);
3529		if (list_link_active(&hdr->b_l1hdr.b_arc_node)) {
3530			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3531			atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type],
3532			    size);
3533		}
3534		/*
3535		 * If we are growing the cache, and we are adding anonymous
3536		 * data, and we have outgrown arc_p, update arc_p
3537		 */
3538		if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
3539		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
3540			arc_p = MIN(arc_c, arc_p + size);
3541	}
3542	ARCSTAT_BUMP(arcstat_allocated);
3543}
3544
3545/*
3546 * This routine is called whenever a buffer is accessed.
3547 * NOTE: the hash lock is dropped in this function.
3548 */
3549static void
3550arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
3551{
3552	clock_t now;
3553
3554	ASSERT(MUTEX_HELD(hash_lock));
3555	ASSERT(HDR_HAS_L1HDR(hdr));
3556
3557	if (hdr->b_l1hdr.b_state == arc_anon) {
3558		/*
3559		 * This buffer is not in the cache, and does not
3560		 * appear in our "ghost" list.  Add the new buffer
3561		 * to the MRU state.
3562		 */
3563
3564		ASSERT0(hdr->b_l1hdr.b_arc_access);
3565		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3566		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
3567		arc_change_state(arc_mru, hdr, hash_lock);
3568
3569	} else if (hdr->b_l1hdr.b_state == arc_mru) {
3570		now = ddi_get_lbolt();
3571
3572		/*
3573		 * If this buffer is here because of a prefetch, then either:
3574		 * - clear the flag if this is a "referencing" read
3575		 *   (any subsequent access will bump this into the MFU state).
3576		 * or
3577		 * - move the buffer to the head of the list if this is
3578		 *   another prefetch (to make it less likely to be evicted).
3579		 */
3580		if (HDR_PREFETCH(hdr)) {
3581			if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
3582				ASSERT(list_link_active(
3583				    &hdr->b_l1hdr.b_arc_node));
3584			} else {
3585				hdr->b_flags &= ~ARC_FLAG_PREFETCH;
3586				ARCSTAT_BUMP(arcstat_mru_hits);
3587			}
3588			hdr->b_l1hdr.b_arc_access = now;
3589			return;
3590		}
3591
3592		/*
3593		 * This buffer has been "accessed" only once so far,
3594		 * but it is still in the cache. Move it to the MFU
3595		 * state.
3596		 */
3597		if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
3598			/*
3599			 * More than 125ms have passed since we
3600			 * instantiated this buffer.  Move it to the
3601			 * most frequently used state.
3602			 */
3603			hdr->b_l1hdr.b_arc_access = now;
3604			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3605			arc_change_state(arc_mfu, hdr, hash_lock);
3606		}
3607		ARCSTAT_BUMP(arcstat_mru_hits);
3608	} else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
3609		arc_state_t	*new_state;
3610		/*
3611		 * This buffer has been "accessed" recently, but
3612		 * was evicted from the cache.  Move it to the
3613		 * MFU state.
3614		 */
3615
3616		if (HDR_PREFETCH(hdr)) {
3617			new_state = arc_mru;
3618			if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
3619				hdr->b_flags &= ~ARC_FLAG_PREFETCH;
3620			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
3621		} else {
3622			new_state = arc_mfu;
3623			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3624		}
3625
3626		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3627		arc_change_state(new_state, hdr, hash_lock);
3628
3629		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
3630	} else if (hdr->b_l1hdr.b_state == arc_mfu) {
3631		/*
3632		 * This buffer has been accessed more than once and is
3633		 * still in the cache.  Keep it in the MFU state.
3634		 *
3635		 * NOTE: an add_reference() that occurred when we did
3636		 * the arc_read() will have kicked this off the list.
3637		 * If it was a prefetch, we will explicitly move it to
3638		 * the head of the list now.
3639		 */
3640		if ((HDR_PREFETCH(hdr)) != 0) {
3641			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3642			ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
3643		}
3644		ARCSTAT_BUMP(arcstat_mfu_hits);
3645		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3646	} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
3647		arc_state_t	*new_state = arc_mfu;
3648		/*
3649		 * This buffer has been accessed more than once but has
3650		 * been evicted from the cache.  Move it back to the
3651		 * MFU state.
3652		 */
3653
3654		if (HDR_PREFETCH(hdr)) {
3655			/*
3656			 * This is a prefetch access...
3657			 * move this block back to the MRU state.
3658			 */
3659			ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
3660			new_state = arc_mru;
3661		}
3662
3663		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3664		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3665		arc_change_state(new_state, hdr, hash_lock);
3666
3667		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
3668	} else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
3669		/*
3670		 * This buffer is on the 2nd Level ARC.
3671		 */
3672
3673		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3674		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3675		arc_change_state(arc_mfu, hdr, hash_lock);
3676	} else {
3677		ASSERT(!"invalid arc state");
3678	}
3679}
3680
3681/* a generic arc_done_func_t which you can use */
3682/* ARGSUSED */
3683void
3684arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
3685{
3686	if (zio == NULL || zio->io_error == 0)
3687		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
3688	VERIFY(arc_buf_remove_ref(buf, arg));
3689}
3690
3691/* a generic arc_done_func_t */
3692void
3693arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
3694{
3695	arc_buf_t **bufp = arg;
3696	if (zio && zio->io_error) {
3697		VERIFY(arc_buf_remove_ref(buf, arg));
3698		*bufp = NULL;
3699	} else {
3700		*bufp = buf;
3701		ASSERT(buf->b_data);
3702	}
3703}
3704
3705static void
3706arc_read_done(zio_t *zio)
3707{
3708	arc_buf_hdr_t	*hdr;
3709	arc_buf_t	*buf;
3710	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
3711	kmutex_t	*hash_lock = NULL;
3712	arc_callback_t	*callback_list, *acb;
3713	int		freeable = FALSE;
3714
3715	buf = zio->io_private;
3716	hdr = buf->b_hdr;
3717
3718	/*
3719	 * The hdr was inserted into hash-table and removed from lists
3720	 * prior to starting I/O.  We should find this header, since
3721	 * it's in the hash table, and it should be legit since it's
3722	 * not possible to evict it during the I/O.  The only possible
3723	 * reason for it not to be found is if we were freed during the
3724	 * read.
3725	 */
3726	if (HDR_IN_HASH_TABLE(hdr)) {
3727		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
3728		ASSERT3U(hdr->b_dva.dva_word[0], ==,
3729		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
3730		ASSERT3U(hdr->b_dva.dva_word[1], ==,
3731		    BP_IDENTITY(zio->io_bp)->dva_word[1]);
3732
3733		arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
3734		    &hash_lock);
3735
3736		ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
3737		    hash_lock == NULL) ||
3738		    (found == hdr &&
3739		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
3740		    (found == hdr && HDR_L2_READING(hdr)));
3741	}
3742
3743	hdr->b_flags &= ~ARC_FLAG_L2_EVICTED;
3744	if (l2arc_noprefetch && HDR_PREFETCH(hdr))
3745		hdr->b_flags &= ~ARC_FLAG_L2CACHE;
3746
3747	/* byteswap if necessary */
3748	callback_list = hdr->b_l1hdr.b_acb;
3749	ASSERT(callback_list != NULL);
3750	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
3751		dmu_object_byteswap_t bswap =
3752		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
3753		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
3754		    byteswap_uint64_array :
3755		    dmu_ot_byteswap[bswap].ob_func;
3756		func(buf->b_data, hdr->b_size);
3757	}
3758
3759	arc_cksum_compute(buf, B_FALSE);
3760#ifdef illumos
3761	arc_buf_watch(buf);
3762#endif
3763
3764	if (hash_lock && zio->io_error == 0 &&
3765	    hdr->b_l1hdr.b_state == arc_anon) {
3766		/*
3767		 * Only call arc_access on anonymous buffers.  This is because
3768		 * if we've issued an I/O for an evicted buffer, we've already
3769		 * called arc_access (to prevent any simultaneous readers from
3770		 * getting confused).
3771		 */
3772		arc_access(hdr, hash_lock);
3773	}
3774
3775	/* create copies of the data buffer for the callers */
3776	abuf = buf;
3777	for (acb = callback_list; acb; acb = acb->acb_next) {
3778		if (acb->acb_done) {
3779			if (abuf == NULL) {
3780				ARCSTAT_BUMP(arcstat_duplicate_reads);
3781				abuf = arc_buf_clone(buf);
3782			}
3783			acb->acb_buf = abuf;
3784			abuf = NULL;
3785		}
3786	}
3787	hdr->b_l1hdr.b_acb = NULL;
3788	hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
3789	ASSERT(!HDR_BUF_AVAILABLE(hdr));
3790	if (abuf == buf) {
3791		ASSERT(buf->b_efunc == NULL);
3792		ASSERT(hdr->b_l1hdr.b_datacnt == 1);
3793		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
3794	}
3795
3796	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
3797	    callback_list != NULL);
3798
3799	if (zio->io_error != 0) {
3800		hdr->b_flags |= ARC_FLAG_IO_ERROR;
3801		if (hdr->b_l1hdr.b_state != arc_anon)
3802			arc_change_state(arc_anon, hdr, hash_lock);
3803		if (HDR_IN_HASH_TABLE(hdr))
3804			buf_hash_remove(hdr);
3805		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
3806	}
3807
3808	/*
3809	 * Broadcast before we drop the hash_lock to avoid the possibility
3810	 * that the hdr (and hence the cv) might be freed before we get to
3811	 * the cv_broadcast().
3812	 */
3813	cv_broadcast(&hdr->b_l1hdr.b_cv);
3814
3815	if (hash_lock != NULL) {
3816		mutex_exit(hash_lock);
3817	} else {
3818		/*
3819		 * This block was freed while we waited for the read to
3820		 * complete.  It has been removed from the hash table and
3821		 * moved to the anonymous state (so that it won't show up
3822		 * in the cache).
3823		 */
3824		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3825		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
3826	}
3827
3828	/* execute each callback and free its structure */
3829	while ((acb = callback_list) != NULL) {
3830		if (acb->acb_done)
3831			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3832
3833		if (acb->acb_zio_dummy != NULL) {
3834			acb->acb_zio_dummy->io_error = zio->io_error;
3835			zio_nowait(acb->acb_zio_dummy);
3836		}
3837
3838		callback_list = acb->acb_next;
3839		kmem_free(acb, sizeof (arc_callback_t));
3840	}
3841
3842	if (freeable)
3843		arc_hdr_destroy(hdr);
3844}
3845
3846/*
3847 * "Read" the block block at the specified DVA (in bp) via the
3848 * cache.  If the block is found in the cache, invoke the provided
3849 * callback immediately and return.  Note that the `zio' parameter
3850 * in the callback will be NULL in this case, since no IO was
3851 * required.  If the block is not in the cache pass the read request
3852 * on to the spa with a substitute callback function, so that the
3853 * requested block will be added to the cache.
3854 *
3855 * If a read request arrives for a block that has a read in-progress,
3856 * either wait for the in-progress read to complete (and return the
3857 * results); or, if this is a read with a "done" func, add a record
3858 * to the read to invoke the "done" func when the read completes,
3859 * and return; or just return.
3860 *
3861 * arc_read_done() will invoke all the requested "done" functions
3862 * for readers of this block.
3863 */
3864int
3865arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3866    void *private, zio_priority_t priority, int zio_flags,
3867    arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
3868{
3869	arc_buf_hdr_t *hdr = NULL;
3870	arc_buf_t *buf = NULL;
3871	kmutex_t *hash_lock = NULL;
3872	zio_t *rzio;
3873	uint64_t guid = spa_load_guid(spa);
3874
3875	ASSERT(!BP_IS_EMBEDDED(bp) ||
3876	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
3877
3878top:
3879	if (!BP_IS_EMBEDDED(bp)) {
3880		/*
3881		 * Embedded BP's have no DVA and require no I/O to "read".
3882		 * Create an anonymous arc buf to back it.
3883		 */
3884		hdr = buf_hash_find(guid, bp, &hash_lock);
3885	}
3886
3887	if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) {
3888
3889		*arc_flags |= ARC_FLAG_CACHED;
3890
3891		if (HDR_IO_IN_PROGRESS(hdr)) {
3892
3893			if (*arc_flags & ARC_FLAG_WAIT) {
3894				cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
3895				mutex_exit(hash_lock);
3896				goto top;
3897			}
3898			ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
3899
3900			if (done) {
3901				arc_callback_t	*acb = NULL;
3902
3903				acb = kmem_zalloc(sizeof (arc_callback_t),
3904				    KM_SLEEP);
3905				acb->acb_done = done;
3906				acb->acb_private = private;
3907				if (pio != NULL)
3908					acb->acb_zio_dummy = zio_null(pio,
3909					    spa, NULL, NULL, NULL, zio_flags);
3910
3911				ASSERT(acb->acb_done != NULL);
3912				acb->acb_next = hdr->b_l1hdr.b_acb;
3913				hdr->b_l1hdr.b_acb = acb;
3914				add_reference(hdr, hash_lock, private);
3915				mutex_exit(hash_lock);
3916				return (0);
3917			}
3918			mutex_exit(hash_lock);
3919			return (0);
3920		}
3921
3922		ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
3923		    hdr->b_l1hdr.b_state == arc_mfu);
3924
3925		if (done) {
3926			add_reference(hdr, hash_lock, private);
3927			/*
3928			 * If this block is already in use, create a new
3929			 * copy of the data so that we will be guaranteed
3930			 * that arc_release() will always succeed.
3931			 */
3932			buf = hdr->b_l1hdr.b_buf;
3933			ASSERT(buf);
3934			ASSERT(buf->b_data);
3935			if (HDR_BUF_AVAILABLE(hdr)) {
3936				ASSERT(buf->b_efunc == NULL);
3937				hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
3938			} else {
3939				buf = arc_buf_clone(buf);
3940			}
3941
3942		} else if (*arc_flags & ARC_FLAG_PREFETCH &&
3943		    refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
3944			hdr->b_flags |= ARC_FLAG_PREFETCH;
3945		}
3946		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3947		arc_access(hdr, hash_lock);
3948		if (*arc_flags & ARC_FLAG_L2CACHE)
3949			hdr->b_flags |= ARC_FLAG_L2CACHE;
3950		if (*arc_flags & ARC_FLAG_L2COMPRESS)
3951			hdr->b_flags |= ARC_FLAG_L2COMPRESS;
3952		mutex_exit(hash_lock);
3953		ARCSTAT_BUMP(arcstat_hits);
3954		ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
3955		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
3956		    data, metadata, hits);
3957
3958		if (done)
3959			done(NULL, buf, private);
3960	} else {
3961		uint64_t size = BP_GET_LSIZE(bp);
3962		arc_callback_t *acb;
3963		vdev_t *vd = NULL;
3964		uint64_t addr = 0;
3965		boolean_t devw = B_FALSE;
3966		enum zio_compress b_compress = ZIO_COMPRESS_OFF;
3967		int32_t b_asize = 0;
3968
3969		if (hdr == NULL) {
3970			/* this block is not in the cache */
3971			arc_buf_hdr_t *exists = NULL;
3972			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3973			buf = arc_buf_alloc(spa, size, private, type);
3974			hdr = buf->b_hdr;
3975			if (!BP_IS_EMBEDDED(bp)) {
3976				hdr->b_dva = *BP_IDENTITY(bp);
3977				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3978				exists = buf_hash_insert(hdr, &hash_lock);
3979			}
3980			if (exists != NULL) {
3981				/* somebody beat us to the hash insert */
3982				mutex_exit(hash_lock);
3983				buf_discard_identity(hdr);
3984				(void) arc_buf_remove_ref(buf, private);
3985				goto top; /* restart the IO request */
3986			}
3987
3988			/* if this is a prefetch, we don't have a reference */
3989			if (*arc_flags & ARC_FLAG_PREFETCH) {
3990				(void) remove_reference(hdr, hash_lock,
3991				    private);
3992				hdr->b_flags |= ARC_FLAG_PREFETCH;
3993			}
3994			if (*arc_flags & ARC_FLAG_L2CACHE)
3995				hdr->b_flags |= ARC_FLAG_L2CACHE;
3996			if (*arc_flags & ARC_FLAG_L2COMPRESS)
3997				hdr->b_flags |= ARC_FLAG_L2COMPRESS;
3998			if (BP_GET_LEVEL(bp) > 0)
3999				hdr->b_flags |= ARC_FLAG_INDIRECT;
4000		} else {
4001			/*
4002			 * This block is in the ghost cache. If it was L2-only
4003			 * (and thus didn't have an L1 hdr), we realloc the
4004			 * header to add an L1 hdr.
4005			 */
4006			if (!HDR_HAS_L1HDR(hdr)) {
4007				hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
4008				    hdr_full_cache);
4009			}
4010
4011			ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
4012			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4013			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4014			ASSERT(hdr->b_l1hdr.b_buf == NULL);
4015
4016			/* if this is a prefetch, we don't have a reference */
4017			if (*arc_flags & ARC_FLAG_PREFETCH)
4018				hdr->b_flags |= ARC_FLAG_PREFETCH;
4019			else
4020				add_reference(hdr, hash_lock, private);
4021			if (*arc_flags & ARC_FLAG_L2CACHE)
4022				hdr->b_flags |= ARC_FLAG_L2CACHE;
4023			if (*arc_flags & ARC_FLAG_L2COMPRESS)
4024				hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4025			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
4026			buf->b_hdr = hdr;
4027			buf->b_data = NULL;
4028			buf->b_efunc = NULL;
4029			buf->b_private = NULL;
4030			buf->b_next = NULL;
4031			hdr->b_l1hdr.b_buf = buf;
4032			ASSERT0(hdr->b_l1hdr.b_datacnt);
4033			hdr->b_l1hdr.b_datacnt = 1;
4034			arc_get_data_buf(buf);
4035			arc_access(hdr, hash_lock);
4036		}
4037
4038		ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
4039
4040		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
4041		acb->acb_done = done;
4042		acb->acb_private = private;
4043
4044		ASSERT(hdr->b_l1hdr.b_acb == NULL);
4045		hdr->b_l1hdr.b_acb = acb;
4046		hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
4047
4048		if (HDR_HAS_L2HDR(hdr) &&
4049		    (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
4050			devw = hdr->b_l2hdr.b_dev->l2ad_writing;
4051			addr = hdr->b_l2hdr.b_daddr;
4052			b_compress = HDR_GET_COMPRESS(hdr);
4053			b_asize = hdr->b_l2hdr.b_asize;
4054			/*
4055			 * Lock out device removal.
4056			 */
4057			if (vdev_is_dead(vd) ||
4058			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
4059				vd = NULL;
4060		}
4061
4062		if (hash_lock != NULL)
4063			mutex_exit(hash_lock);
4064
4065		/*
4066		 * At this point, we have a level 1 cache miss.  Try again in
4067		 * L2ARC if possible.
4068		 */
4069		ASSERT3U(hdr->b_size, ==, size);
4070		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
4071		    uint64_t, size, zbookmark_phys_t *, zb);
4072		ARCSTAT_BUMP(arcstat_misses);
4073		ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
4074		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
4075		    data, metadata, misses);
4076#ifdef _KERNEL
4077		curthread->td_ru.ru_inblock++;
4078#endif
4079
4080		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
4081			/*
4082			 * Read from the L2ARC if the following are true:
4083			 * 1. The L2ARC vdev was previously cached.
4084			 * 2. This buffer still has L2ARC metadata.
4085			 * 3. This buffer isn't currently writing to the L2ARC.
4086			 * 4. The L2ARC entry wasn't evicted, which may
4087			 *    also have invalidated the vdev.
4088			 * 5. This isn't prefetch and l2arc_noprefetch is set.
4089			 */
4090			if (HDR_HAS_L2HDR(hdr) &&
4091			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
4092			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
4093				l2arc_read_callback_t *cb;
4094
4095				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
4096				ARCSTAT_BUMP(arcstat_l2_hits);
4097
4098				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
4099				    KM_SLEEP);
4100				cb->l2rcb_buf = buf;
4101				cb->l2rcb_spa = spa;
4102				cb->l2rcb_bp = *bp;
4103				cb->l2rcb_zb = *zb;
4104				cb->l2rcb_flags = zio_flags;
4105				cb->l2rcb_compress = b_compress;
4106
4107				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
4108				    addr + size < vd->vdev_psize -
4109				    VDEV_LABEL_END_SIZE);
4110
4111				/*
4112				 * l2arc read.  The SCL_L2ARC lock will be
4113				 * released by l2arc_read_done().
4114				 * Issue a null zio if the underlying buffer
4115				 * was squashed to zero size by compression.
4116				 */
4117				if (b_compress == ZIO_COMPRESS_EMPTY) {
4118					rzio = zio_null(pio, spa, vd,
4119					    l2arc_read_done, cb,
4120					    zio_flags | ZIO_FLAG_DONT_CACHE |
4121					    ZIO_FLAG_CANFAIL |
4122					    ZIO_FLAG_DONT_PROPAGATE |
4123					    ZIO_FLAG_DONT_RETRY);
4124				} else {
4125					rzio = zio_read_phys(pio, vd, addr,
4126					    b_asize, buf->b_data,
4127					    ZIO_CHECKSUM_OFF,
4128					    l2arc_read_done, cb, priority,
4129					    zio_flags | ZIO_FLAG_DONT_CACHE |
4130					    ZIO_FLAG_CANFAIL |
4131					    ZIO_FLAG_DONT_PROPAGATE |
4132					    ZIO_FLAG_DONT_RETRY, B_FALSE);
4133				}
4134				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
4135				    zio_t *, rzio);
4136				ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
4137
4138				if (*arc_flags & ARC_FLAG_NOWAIT) {
4139					zio_nowait(rzio);
4140					return (0);
4141				}
4142
4143				ASSERT(*arc_flags & ARC_FLAG_WAIT);
4144				if (zio_wait(rzio) == 0)
4145					return (0);
4146
4147				/* l2arc read error; goto zio_read() */
4148			} else {
4149				DTRACE_PROBE1(l2arc__miss,
4150				    arc_buf_hdr_t *, hdr);
4151				ARCSTAT_BUMP(arcstat_l2_misses);
4152				if (HDR_L2_WRITING(hdr))
4153					ARCSTAT_BUMP(arcstat_l2_rw_clash);
4154				spa_config_exit(spa, SCL_L2ARC, vd);
4155			}
4156		} else {
4157			if (vd != NULL)
4158				spa_config_exit(spa, SCL_L2ARC, vd);
4159			if (l2arc_ndev != 0) {
4160				DTRACE_PROBE1(l2arc__miss,
4161				    arc_buf_hdr_t *, hdr);
4162				ARCSTAT_BUMP(arcstat_l2_misses);
4163			}
4164		}
4165
4166		rzio = zio_read(pio, spa, bp, buf->b_data, size,
4167		    arc_read_done, buf, priority, zio_flags, zb);
4168
4169		if (*arc_flags & ARC_FLAG_WAIT)
4170			return (zio_wait(rzio));
4171
4172		ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
4173		zio_nowait(rzio);
4174	}
4175	return (0);
4176}
4177
4178void
4179arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
4180{
4181	ASSERT(buf->b_hdr != NULL);
4182	ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon);
4183	ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) ||
4184	    func == NULL);
4185	ASSERT(buf->b_efunc == NULL);
4186	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
4187
4188	buf->b_efunc = func;
4189	buf->b_private = private;
4190}
4191
4192/*
4193 * Notify the arc that a block was freed, and thus will never be used again.
4194 */
4195void
4196arc_freed(spa_t *spa, const blkptr_t *bp)
4197{
4198	arc_buf_hdr_t *hdr;
4199	kmutex_t *hash_lock;
4200	uint64_t guid = spa_load_guid(spa);
4201
4202	ASSERT(!BP_IS_EMBEDDED(bp));
4203
4204	hdr = buf_hash_find(guid, bp, &hash_lock);
4205	if (hdr == NULL)
4206		return;
4207	if (HDR_BUF_AVAILABLE(hdr)) {
4208		arc_buf_t *buf = hdr->b_l1hdr.b_buf;
4209		add_reference(hdr, hash_lock, FTAG);
4210		hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
4211		mutex_exit(hash_lock);
4212
4213		arc_release(buf, FTAG);
4214		(void) arc_buf_remove_ref(buf, FTAG);
4215	} else {
4216		mutex_exit(hash_lock);
4217	}
4218
4219}
4220
4221/*
4222 * Clear the user eviction callback set by arc_set_callback(), first calling
4223 * it if it exists.  Because the presence of a callback keeps an arc_buf cached
4224 * clearing the callback may result in the arc_buf being destroyed.  However,
4225 * it will not result in the *last* arc_buf being destroyed, hence the data
4226 * will remain cached in the ARC. We make a copy of the arc buffer here so
4227 * that we can process the callback without holding any locks.
4228 *
4229 * It's possible that the callback is already in the process of being cleared
4230 * by another thread.  In this case we can not clear the callback.
4231 *
4232 * Returns B_TRUE if the callback was successfully called and cleared.
4233 */
4234boolean_t
4235arc_clear_callback(arc_buf_t *buf)
4236{
4237	arc_buf_hdr_t *hdr;
4238	kmutex_t *hash_lock;
4239	arc_evict_func_t *efunc = buf->b_efunc;
4240	void *private = buf->b_private;
4241	list_t *list, *evicted_list;
4242	kmutex_t *lock, *evicted_lock;
4243
4244	mutex_enter(&buf->b_evict_lock);
4245	hdr = buf->b_hdr;
4246	if (hdr == NULL) {
4247		/*
4248		 * We are in arc_do_user_evicts().
4249		 */
4250		ASSERT(buf->b_data == NULL);
4251		mutex_exit(&buf->b_evict_lock);
4252		return (B_FALSE);
4253	} else if (buf->b_data == NULL) {
4254		/*
4255		 * We are on the eviction list; process this buffer now
4256		 * but let arc_do_user_evicts() do the reaping.
4257		 */
4258		buf->b_efunc = NULL;
4259		mutex_exit(&buf->b_evict_lock);
4260		VERIFY0(efunc(private));
4261		return (B_TRUE);
4262	}
4263	hash_lock = HDR_LOCK(hdr);
4264	mutex_enter(hash_lock);
4265	hdr = buf->b_hdr;
4266	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4267
4268	ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <,
4269	    hdr->b_l1hdr.b_datacnt);
4270	ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
4271	    hdr->b_l1hdr.b_state == arc_mfu);
4272
4273	buf->b_efunc = NULL;
4274	buf->b_private = NULL;
4275
4276	if (hdr->b_l1hdr.b_datacnt > 1) {
4277		mutex_exit(&buf->b_evict_lock);
4278		arc_buf_destroy(buf, FALSE, TRUE);
4279	} else {
4280		ASSERT(buf == hdr->b_l1hdr.b_buf);
4281		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
4282		mutex_exit(&buf->b_evict_lock);
4283	}
4284
4285	mutex_exit(hash_lock);
4286	VERIFY0(efunc(private));
4287	return (B_TRUE);
4288}
4289
4290/*
4291 * Release this buffer from the cache, making it an anonymous buffer.  This
4292 * must be done after a read and prior to modifying the buffer contents.
4293 * If the buffer has more than one reference, we must make
4294 * a new hdr for the buffer.
4295 */
4296void
4297arc_release(arc_buf_t *buf, void *tag)
4298{
4299	arc_buf_hdr_t *hdr = buf->b_hdr;
4300
4301	/*
4302	 * It would be nice to assert that if it's DMU metadata (level >
4303	 * 0 || it's the dnode file), then it must be syncing context.
4304	 * But we don't know that information at this level.
4305	 */
4306
4307	mutex_enter(&buf->b_evict_lock);
4308	/*
4309	 * We don't grab the hash lock prior to this check, because if
4310	 * the buffer's header is in the arc_anon state, it won't be
4311	 * linked into the hash table.
4312	 */
4313	if (hdr->b_l1hdr.b_state == arc_anon) {
4314		mutex_exit(&buf->b_evict_lock);
4315		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4316		ASSERT(!HDR_IN_HASH_TABLE(hdr));
4317		ASSERT(!HDR_HAS_L2HDR(hdr));
4318		ASSERT(BUF_EMPTY(hdr));
4319		ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1);
4320		ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
4321		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
4322
4323		ASSERT3P(buf->b_efunc, ==, NULL);
4324		ASSERT3P(buf->b_private, ==, NULL);
4325
4326		hdr->b_l1hdr.b_arc_access = 0;
4327		arc_buf_thaw(buf);
4328
4329		return;
4330	}
4331
4332	kmutex_t *hash_lock = HDR_LOCK(hdr);
4333	mutex_enter(hash_lock);
4334
4335	/*
4336	 * This assignment is only valid as long as the hash_lock is
4337	 * held, we must be careful not to reference state or the
4338	 * b_state field after dropping the lock.
4339	 */
4340	arc_state_t *state = hdr->b_l1hdr.b_state;
4341	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4342	ASSERT3P(state, !=, arc_anon);
4343
4344	/* this buffer is not on any list */
4345	ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
4346
4347	if (HDR_HAS_L2HDR(hdr)) {
4348		mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
4349
4350		/*
4351		 * We have to recheck this conditional again now that
4352		 * we're holding the l2ad_mtx to prevent a race with
4353		 * another thread which might be concurrently calling
4354		 * l2arc_evict(). In that case, l2arc_evict() might have
4355		 * destroyed the header's L2 portion as we were waiting
4356		 * to acquire the l2ad_mtx.
4357		 */
4358		if (HDR_HAS_L2HDR(hdr)) {
4359			trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev,
4360			    hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0);
4361			arc_hdr_l2hdr_destroy(hdr);
4362		}
4363
4364		mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
4365	}
4366
4367	/*
4368	 * Do we have more than one buf?
4369	 */
4370	if (hdr->b_l1hdr.b_datacnt > 1) {
4371		arc_buf_hdr_t *nhdr;
4372		arc_buf_t **bufp;
4373		uint64_t blksz = hdr->b_size;
4374		uint64_t spa = hdr->b_spa;
4375		arc_buf_contents_t type = arc_buf_type(hdr);
4376		uint32_t flags = hdr->b_flags;
4377
4378		ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
4379		/*
4380		 * Pull the data off of this hdr and attach it to
4381		 * a new anonymous hdr.
4382		 */
4383		(void) remove_reference(hdr, hash_lock, tag);
4384		bufp = &hdr->b_l1hdr.b_buf;
4385		while (*bufp != buf)
4386			bufp = &(*bufp)->b_next;
4387		*bufp = buf->b_next;
4388		buf->b_next = NULL;
4389
4390		ASSERT3P(state, !=, arc_l2c_only);
4391		ASSERT3U(state->arcs_size, >=, hdr->b_size);
4392		atomic_add_64(&state->arcs_size, -hdr->b_size);
4393		if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
4394			ASSERT3P(state, !=, arc_l2c_only);
4395			uint64_t *size = &state->arcs_lsize[type];
4396			ASSERT3U(*size, >=, hdr->b_size);
4397			atomic_add_64(size, -hdr->b_size);
4398		}
4399
4400		/*
4401		 * We're releasing a duplicate user data buffer, update
4402		 * our statistics accordingly.
4403		 */
4404		if (HDR_ISTYPE_DATA(hdr)) {
4405			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
4406			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
4407			    -hdr->b_size);
4408		}
4409		hdr->b_l1hdr.b_datacnt -= 1;
4410		arc_cksum_verify(buf);
4411#ifdef illumos
4412		arc_buf_unwatch(buf);
4413#endif
4414
4415		mutex_exit(hash_lock);
4416
4417		nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
4418		nhdr->b_size = blksz;
4419		nhdr->b_spa = spa;
4420
4421		nhdr->b_flags = flags & ARC_FLAG_L2_WRITING;
4422		nhdr->b_flags |= arc_bufc_to_flags(type);
4423		nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
4424
4425		nhdr->b_l1hdr.b_buf = buf;
4426		nhdr->b_l1hdr.b_datacnt = 1;
4427		nhdr->b_l1hdr.b_state = arc_anon;
4428		nhdr->b_l1hdr.b_arc_access = 0;
4429		nhdr->b_freeze_cksum = NULL;
4430
4431		(void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
4432		buf->b_hdr = nhdr;
4433		mutex_exit(&buf->b_evict_lock);
4434		atomic_add_64(&arc_anon->arcs_size, blksz);
4435	} else {
4436		mutex_exit(&buf->b_evict_lock);
4437		ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
4438		/* protected by hash lock */
4439		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
4440		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4441		arc_change_state(arc_anon, hdr, hash_lock);
4442		hdr->b_l1hdr.b_arc_access = 0;
4443		mutex_exit(hash_lock);
4444
4445		buf_discard_identity(hdr);
4446		arc_buf_thaw(buf);
4447	}
4448	buf->b_efunc = NULL;
4449	buf->b_private = NULL;
4450}
4451
4452int
4453arc_released(arc_buf_t *buf)
4454{
4455	int released;
4456
4457	mutex_enter(&buf->b_evict_lock);
4458	released = (buf->b_data != NULL &&
4459	    buf->b_hdr->b_l1hdr.b_state == arc_anon);
4460	mutex_exit(&buf->b_evict_lock);
4461	return (released);
4462}
4463
4464#ifdef ZFS_DEBUG
4465int
4466arc_referenced(arc_buf_t *buf)
4467{
4468	int referenced;
4469
4470	mutex_enter(&buf->b_evict_lock);
4471	referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
4472	mutex_exit(&buf->b_evict_lock);
4473	return (referenced);
4474}
4475#endif
4476
4477static void
4478arc_write_ready(zio_t *zio)
4479{
4480	arc_write_callback_t *callback = zio->io_private;
4481	arc_buf_t *buf = callback->awcb_buf;
4482	arc_buf_hdr_t *hdr = buf->b_hdr;
4483
4484	ASSERT(HDR_HAS_L1HDR(hdr));
4485	ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
4486	ASSERT(hdr->b_l1hdr.b_datacnt > 0);
4487	callback->awcb_ready(zio, buf, callback->awcb_private);
4488
4489	/*
4490	 * If the IO is already in progress, then this is a re-write
4491	 * attempt, so we need to thaw and re-compute the cksum.
4492	 * It is the responsibility of the callback to handle the
4493	 * accounting for any re-write attempt.
4494	 */
4495	if (HDR_IO_IN_PROGRESS(hdr)) {
4496		mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
4497		if (hdr->b_freeze_cksum != NULL) {
4498			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
4499			hdr->b_freeze_cksum = NULL;
4500		}
4501		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
4502	}
4503	arc_cksum_compute(buf, B_FALSE);
4504	hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
4505}
4506
4507/*
4508 * The SPA calls this callback for each physical write that happens on behalf
4509 * of a logical write.  See the comment in dbuf_write_physdone() for details.
4510 */
4511static void
4512arc_write_physdone(zio_t *zio)
4513{
4514	arc_write_callback_t *cb = zio->io_private;
4515	if (cb->awcb_physdone != NULL)
4516		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
4517}
4518
4519static void
4520arc_write_done(zio_t *zio)
4521{
4522	arc_write_callback_t *callback = zio->io_private;
4523	arc_buf_t *buf = callback->awcb_buf;
4524	arc_buf_hdr_t *hdr = buf->b_hdr;
4525
4526	ASSERT(hdr->b_l1hdr.b_acb == NULL);
4527
4528	if (zio->io_error == 0) {
4529		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
4530			buf_discard_identity(hdr);
4531		} else {
4532			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
4533			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
4534		}
4535	} else {
4536		ASSERT(BUF_EMPTY(hdr));
4537	}
4538
4539	/*
4540	 * If the block to be written was all-zero or compressed enough to be
4541	 * embedded in the BP, no write was performed so there will be no
4542	 * dva/birth/checksum.  The buffer must therefore remain anonymous
4543	 * (and uncached).
4544	 */
4545	if (!BUF_EMPTY(hdr)) {
4546		arc_buf_hdr_t *exists;
4547		kmutex_t *hash_lock;
4548
4549		ASSERT(zio->io_error == 0);
4550
4551		arc_cksum_verify(buf);
4552
4553		exists = buf_hash_insert(hdr, &hash_lock);
4554		if (exists != NULL) {
4555			/*
4556			 * This can only happen if we overwrite for
4557			 * sync-to-convergence, because we remove
4558			 * buffers from the hash table when we arc_free().
4559			 */
4560			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
4561				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
4562					panic("bad overwrite, hdr=%p exists=%p",
4563					    (void *)hdr, (void *)exists);
4564				ASSERT(refcount_is_zero(
4565				    &exists->b_l1hdr.b_refcnt));
4566				arc_change_state(arc_anon, exists, hash_lock);
4567				mutex_exit(hash_lock);
4568				arc_hdr_destroy(exists);
4569				exists = buf_hash_insert(hdr, &hash_lock);
4570				ASSERT3P(exists, ==, NULL);
4571			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
4572				/* nopwrite */
4573				ASSERT(zio->io_prop.zp_nopwrite);
4574				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
4575					panic("bad nopwrite, hdr=%p exists=%p",
4576					    (void *)hdr, (void *)exists);
4577			} else {
4578				/* Dedup */
4579				ASSERT(hdr->b_l1hdr.b_datacnt == 1);
4580				ASSERT(hdr->b_l1hdr.b_state == arc_anon);
4581				ASSERT(BP_GET_DEDUP(zio->io_bp));
4582				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
4583			}
4584		}
4585		hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
4586		/* if it's not anon, we are doing a scrub */
4587		if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
4588			arc_access(hdr, hash_lock);
4589		mutex_exit(hash_lock);
4590	} else {
4591		hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
4592	}
4593
4594	ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4595	callback->awcb_done(zio, buf, callback->awcb_private);
4596
4597	kmem_free(callback, sizeof (arc_write_callback_t));
4598}
4599
4600zio_t *
4601arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
4602    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
4603    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
4604    arc_done_func_t *done, void *private, zio_priority_t priority,
4605    int zio_flags, const zbookmark_phys_t *zb)
4606{
4607	arc_buf_hdr_t *hdr = buf->b_hdr;
4608	arc_write_callback_t *callback;
4609	zio_t *zio;
4610
4611	ASSERT(ready != NULL);
4612	ASSERT(done != NULL);
4613	ASSERT(!HDR_IO_ERROR(hdr));
4614	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4615	ASSERT(hdr->b_l1hdr.b_acb == NULL);
4616	ASSERT(hdr->b_l1hdr.b_datacnt > 0);
4617	if (l2arc)
4618		hdr->b_flags |= ARC_FLAG_L2CACHE;
4619	if (l2arc_compress)
4620		hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4621	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
4622	callback->awcb_ready = ready;
4623	callback->awcb_physdone = physdone;
4624	callback->awcb_done = done;
4625	callback->awcb_private = private;
4626	callback->awcb_buf = buf;
4627
4628	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
4629	    arc_write_ready, arc_write_physdone, arc_write_done, callback,
4630	    priority, zio_flags, zb);
4631
4632	return (zio);
4633}
4634
4635static int
4636arc_memory_throttle(uint64_t reserve, uint64_t txg)
4637{
4638#ifdef _KERNEL
4639	uint64_t available_memory = ptob(freemem);
4640	static uint64_t page_load = 0;
4641	static uint64_t last_txg = 0;
4642
4643#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
4644	available_memory =
4645	    MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE)));
4646#endif
4647
4648	if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
4649		return (0);
4650
4651	if (txg > last_txg) {
4652		last_txg = txg;
4653		page_load = 0;
4654	}
4655	/*
4656	 * If we are in pageout, we know that memory is already tight,
4657	 * the arc is already going to be evicting, so we just want to
4658	 * continue to let page writes occur as quickly as possible.
4659	 */
4660	if (curproc == pageproc) {
4661		if (page_load > MAX(ptob(minfree), available_memory) / 4)
4662			return (SET_ERROR(ERESTART));
4663		/* Note: reserve is inflated, so we deflate */
4664		page_load += reserve / 8;
4665		return (0);
4666	} else if (page_load > 0 && arc_reclaim_needed()) {
4667		/* memory is low, delay before restarting */
4668		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
4669		return (SET_ERROR(EAGAIN));
4670	}
4671	page_load = 0;
4672#endif
4673	return (0);
4674}
4675
4676static void
4677arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
4678    kstat_named_t *evict_data, kstat_named_t *evict_metadata)
4679{
4680	size->value.ui64 = state->arcs_size;
4681	evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
4682	evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
4683}
4684
4685static int
4686arc_kstat_update(kstat_t *ksp, int rw)
4687{
4688	arc_stats_t *as = ksp->ks_data;
4689
4690	if (rw == KSTAT_WRITE) {
4691		return (EACCES);
4692	} else {
4693		arc_kstat_update_state(arc_anon,
4694		    &as->arcstat_anon_size,
4695		    &as->arcstat_anon_evictable_data,
4696		    &as->arcstat_anon_evictable_metadata);
4697		arc_kstat_update_state(arc_mru,
4698		    &as->arcstat_mru_size,
4699		    &as->arcstat_mru_evictable_data,
4700		    &as->arcstat_mru_evictable_metadata);
4701		arc_kstat_update_state(arc_mru_ghost,
4702		    &as->arcstat_mru_ghost_size,
4703		    &as->arcstat_mru_ghost_evictable_data,
4704		    &as->arcstat_mru_ghost_evictable_metadata);
4705		arc_kstat_update_state(arc_mfu,
4706		    &as->arcstat_mfu_size,
4707		    &as->arcstat_mfu_evictable_data,
4708		    &as->arcstat_mfu_evictable_metadata);
4709		arc_kstat_update_state(arc_mfu_ghost,
4710		    &as->arcstat_mfu_ghost_size,
4711		    &as->arcstat_mfu_ghost_evictable_data,
4712		    &as->arcstat_mfu_ghost_evictable_metadata);
4713	}
4714
4715	return (0);
4716}
4717
4718void
4719arc_tempreserve_clear(uint64_t reserve)
4720{
4721	atomic_add_64(&arc_tempreserve, -reserve);
4722	ASSERT((int64_t)arc_tempreserve >= 0);
4723}
4724
4725int
4726arc_tempreserve_space(uint64_t reserve, uint64_t txg)
4727{
4728	int error;
4729	uint64_t anon_size;
4730
4731	if (reserve > arc_c/4 && !arc_no_grow) {
4732		arc_c = MIN(arc_c_max, reserve * 4);
4733		DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
4734	}
4735	if (reserve > arc_c)
4736		return (SET_ERROR(ENOMEM));
4737
4738	/*
4739	 * Don't count loaned bufs as in flight dirty data to prevent long
4740	 * network delays from blocking transactions that are ready to be
4741	 * assigned to a txg.
4742	 */
4743	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
4744
4745	/*
4746	 * Writes will, almost always, require additional memory allocations
4747	 * in order to compress/encrypt/etc the data.  We therefore need to
4748	 * make sure that there is sufficient available memory for this.
4749	 */
4750	error = arc_memory_throttle(reserve, txg);
4751	if (error != 0)
4752		return (error);
4753
4754	/*
4755	 * Throttle writes when the amount of dirty data in the cache
4756	 * gets too large.  We try to keep the cache less than half full
4757	 * of dirty blocks so that our sync times don't grow too large.
4758	 * Note: if two requests come in concurrently, we might let them
4759	 * both succeed, when one of them should fail.  Not a huge deal.
4760	 */
4761
4762	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
4763	    anon_size > arc_c / 4) {
4764		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
4765		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
4766		    arc_tempreserve>>10,
4767		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
4768		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
4769		    reserve>>10, arc_c>>10);
4770		return (SET_ERROR(ERESTART));
4771	}
4772	atomic_add_64(&arc_tempreserve, reserve);
4773	return (0);
4774}
4775
4776static kmutex_t arc_lowmem_lock;
4777#ifdef _KERNEL
4778static eventhandler_tag arc_event_lowmem = NULL;
4779
4780static void
4781arc_lowmem(void *arg __unused, int howto __unused)
4782{
4783
4784	/* Serialize access via arc_lowmem_lock. */
4785	mutex_enter(&arc_lowmem_lock);
4786	mutex_enter(&arc_reclaim_thr_lock);
4787	needfree = 1;
4788	DTRACE_PROBE(arc__needfree);
4789	cv_signal(&arc_reclaim_thr_cv);
4790
4791	/*
4792	 * It is unsafe to block here in arbitrary threads, because we can come
4793	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
4794	 * with ARC reclaim thread.
4795	 */
4796	if (curproc == pageproc) {
4797		while (needfree)
4798			msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
4799	}
4800	mutex_exit(&arc_reclaim_thr_lock);
4801	mutex_exit(&arc_lowmem_lock);
4802}
4803#endif
4804
4805void
4806arc_init(void)
4807{
4808	int i, prefetch_tunable_set = 0;
4809
4810	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4811	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
4812	mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
4813
4814	/* Convert seconds to clock ticks */
4815	arc_min_prefetch_lifespan = 1 * hz;
4816
4817	/* Start out with 1/8 of all memory */
4818	arc_c = kmem_size() / 8;
4819
4820#ifdef illumos
4821#ifdef _KERNEL
4822	/*
4823	 * On architectures where the physical memory can be larger
4824	 * than the addressable space (intel in 32-bit mode), we may
4825	 * need to limit the cache to 1/8 of VM size.
4826	 */
4827	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
4828#endif
4829#endif	/* illumos */
4830	/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
4831	arc_c_min = MAX(arc_c / 4, 16 << 20);
4832	/* set max to 1/2 of all memory, or all but 1GB, whichever is more */
4833	if (arc_c * 8 >= 1 << 30)
4834		arc_c_max = (arc_c * 8) - (1 << 30);
4835	else
4836		arc_c_max = arc_c_min;
4837	arc_c_max = MAX(arc_c * 5, arc_c_max);
4838
4839#ifdef _KERNEL
4840	/*
4841	 * Allow the tunables to override our calculations if they are
4842	 * reasonable (ie. over 16MB)
4843	 */
4844	if (zfs_arc_max > 16 << 20 && zfs_arc_max < kmem_size())
4845		arc_c_max = zfs_arc_max;
4846	if (zfs_arc_min > 16 << 20 && zfs_arc_min <= arc_c_max)
4847		arc_c_min = zfs_arc_min;
4848#endif
4849
4850	arc_c = arc_c_max;
4851	arc_p = (arc_c >> 1);
4852
4853	/* limit meta-data to 1/4 of the arc capacity */
4854	arc_meta_limit = arc_c_max / 4;
4855
4856	/* Allow the tunable to override if it is reasonable */
4857	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4858		arc_meta_limit = zfs_arc_meta_limit;
4859
4860	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
4861		arc_c_min = arc_meta_limit / 2;
4862
4863	if (zfs_arc_meta_min > 0) {
4864		arc_meta_min = zfs_arc_meta_min;
4865	} else {
4866		arc_meta_min = arc_c_min / 2;
4867	}
4868
4869	if (zfs_arc_grow_retry > 0)
4870		arc_grow_retry = zfs_arc_grow_retry;
4871
4872	if (zfs_arc_shrink_shift > 0)
4873		arc_shrink_shift = zfs_arc_shrink_shift;
4874
4875	if (zfs_arc_p_min_shift > 0)
4876		arc_p_min_shift = zfs_arc_p_min_shift;
4877
4878	/* if kmem_flags are set, lets try to use less memory */
4879	if (kmem_debugging())
4880		arc_c = arc_c / 2;
4881	if (arc_c < arc_c_min)
4882		arc_c = arc_c_min;
4883
4884	zfs_arc_min = arc_c_min;
4885	zfs_arc_max = arc_c_max;
4886
4887	arc_anon = &ARC_anon;
4888	arc_mru = &ARC_mru;
4889	arc_mru_ghost = &ARC_mru_ghost;
4890	arc_mfu = &ARC_mfu;
4891	arc_mfu_ghost = &ARC_mfu_ghost;
4892	arc_l2c_only = &ARC_l2c_only;
4893	arc_size = 0;
4894
4895	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4896		mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
4897		    NULL, MUTEX_DEFAULT, NULL);
4898		mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
4899		    NULL, MUTEX_DEFAULT, NULL);
4900		mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
4901		    NULL, MUTEX_DEFAULT, NULL);
4902		mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
4903		    NULL, MUTEX_DEFAULT, NULL);
4904		mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
4905		    NULL, MUTEX_DEFAULT, NULL);
4906		mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
4907		    NULL, MUTEX_DEFAULT, NULL);
4908
4909		list_create(&arc_mru->arcs_lists[i],
4910		    sizeof (arc_buf_hdr_t),
4911		    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4912		list_create(&arc_mru_ghost->arcs_lists[i],
4913		    sizeof (arc_buf_hdr_t),
4914		    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4915		list_create(&arc_mfu->arcs_lists[i],
4916		    sizeof (arc_buf_hdr_t),
4917		    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4918		list_create(&arc_mfu_ghost->arcs_lists[i],
4919		    sizeof (arc_buf_hdr_t),
4920		    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4921		list_create(&arc_mfu_ghost->arcs_lists[i],
4922		    sizeof (arc_buf_hdr_t),
4923		    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4924		list_create(&arc_l2c_only->arcs_lists[i],
4925		    sizeof (arc_buf_hdr_t),
4926		    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4927	}
4928
4929	buf_init();
4930
4931	arc_thread_exit = 0;
4932	arc_eviction_list = NULL;
4933	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4934	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4935
4936	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4937	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4938
4939	if (arc_ksp != NULL) {
4940		arc_ksp->ks_data = &arc_stats;
4941		arc_ksp->ks_update = arc_kstat_update;
4942		kstat_install(arc_ksp);
4943	}
4944
4945	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4946	    TS_RUN, minclsyspri);
4947
4948#ifdef _KERNEL
4949	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
4950	    EVENTHANDLER_PRI_FIRST);
4951#endif
4952
4953	arc_dead = FALSE;
4954	arc_warm = B_FALSE;
4955
4956	/*
4957	 * Calculate maximum amount of dirty data per pool.
4958	 *
4959	 * If it has been set by /etc/system, take that.
4960	 * Otherwise, use a percentage of physical memory defined by
4961	 * zfs_dirty_data_max_percent (default 10%) with a cap at
4962	 * zfs_dirty_data_max_max (default 4GB).
4963	 */
4964	if (zfs_dirty_data_max == 0) {
4965		zfs_dirty_data_max = ptob(physmem) *
4966		    zfs_dirty_data_max_percent / 100;
4967		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
4968		    zfs_dirty_data_max_max);
4969	}
4970
4971#ifdef _KERNEL
4972	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
4973		prefetch_tunable_set = 1;
4974
4975#ifdef __i386__
4976	if (prefetch_tunable_set == 0) {
4977		printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
4978		    "-- to enable,\n");
4979		printf("            add \"vfs.zfs.prefetch_disable=0\" "
4980		    "to /boot/loader.conf.\n");
4981		zfs_prefetch_disable = 1;
4982	}
4983#else
4984	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
4985	    prefetch_tunable_set == 0) {
4986		printf("ZFS NOTICE: Prefetch is disabled by default if less "
4987		    "than 4GB of RAM is present;\n"
4988		    "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
4989		    "to /boot/loader.conf.\n");
4990		zfs_prefetch_disable = 1;
4991	}
4992#endif
4993	/* Warn about ZFS memory and address space requirements. */
4994	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
4995		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
4996		    "expect unstable behavior.\n");
4997	}
4998	if (kmem_size() < 512 * (1 << 20)) {
4999		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
5000		    "expect unstable behavior.\n");
5001		printf("             Consider tuning vm.kmem_size and "
5002		    "vm.kmem_size_max\n");
5003		printf("             in /boot/loader.conf.\n");
5004	}
5005#endif
5006}
5007
5008void
5009arc_fini(void)
5010{
5011	int i;
5012
5013	mutex_enter(&arc_reclaim_thr_lock);
5014	arc_thread_exit = 1;
5015	cv_signal(&arc_reclaim_thr_cv);
5016	while (arc_thread_exit != 0)
5017		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
5018	mutex_exit(&arc_reclaim_thr_lock);
5019
5020	arc_flush(NULL);
5021
5022	arc_dead = TRUE;
5023
5024	if (arc_ksp != NULL) {
5025		kstat_delete(arc_ksp);
5026		arc_ksp = NULL;
5027	}
5028
5029	mutex_destroy(&arc_eviction_mtx);
5030	mutex_destroy(&arc_reclaim_thr_lock);
5031	cv_destroy(&arc_reclaim_thr_cv);
5032
5033	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
5034		list_destroy(&arc_mru->arcs_lists[i]);
5035		list_destroy(&arc_mru_ghost->arcs_lists[i]);
5036		list_destroy(&arc_mfu->arcs_lists[i]);
5037		list_destroy(&arc_mfu_ghost->arcs_lists[i]);
5038		list_destroy(&arc_l2c_only->arcs_lists[i]);
5039
5040		mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
5041		mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
5042		mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
5043		mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
5044		mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
5045		mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
5046	}
5047
5048	buf_fini();
5049
5050	ASSERT0(arc_loaned_bytes);
5051
5052	mutex_destroy(&arc_lowmem_lock);
5053#ifdef _KERNEL
5054	if (arc_event_lowmem != NULL)
5055		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
5056#endif
5057}
5058
5059/*
5060 * Level 2 ARC
5061 *
5062 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
5063 * It uses dedicated storage devices to hold cached data, which are populated
5064 * using large infrequent writes.  The main role of this cache is to boost
5065 * the performance of random read workloads.  The intended L2ARC devices
5066 * include short-stroked disks, solid state disks, and other media with
5067 * substantially faster read latency than disk.
5068 *
5069 *                 +-----------------------+
5070 *                 |         ARC           |
5071 *                 +-----------------------+
5072 *                    |         ^     ^
5073 *                    |         |     |
5074 *      l2arc_feed_thread()    arc_read()
5075 *                    |         |     |
5076 *                    |  l2arc read   |
5077 *                    V         |     |
5078 *               +---------------+    |
5079 *               |     L2ARC     |    |
5080 *               +---------------+    |
5081 *                   |    ^           |
5082 *          l2arc_write() |           |
5083 *                   |    |           |
5084 *                   V    |           |
5085 *                 +-------+      +-------+
5086 *                 | vdev  |      | vdev  |
5087 *                 | cache |      | cache |
5088 *                 +-------+      +-------+
5089 *                 +=========+     .-----.
5090 *                 :  L2ARC  :    |-_____-|
5091 *                 : devices :    | Disks |
5092 *                 +=========+    `-_____-'
5093 *
5094 * Read requests are satisfied from the following sources, in order:
5095 *
5096 *	1) ARC
5097 *	2) vdev cache of L2ARC devices
5098 *	3) L2ARC devices
5099 *	4) vdev cache of disks
5100 *	5) disks
5101 *
5102 * Some L2ARC device types exhibit extremely slow write performance.
5103 * To accommodate for this there are some significant differences between
5104 * the L2ARC and traditional cache design:
5105 *
5106 * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
5107 * the ARC behave as usual, freeing buffers and placing headers on ghost
5108 * lists.  The ARC does not send buffers to the L2ARC during eviction as
5109 * this would add inflated write latencies for all ARC memory pressure.
5110 *
5111 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
5112 * It does this by periodically scanning buffers from the eviction-end of
5113 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
5114 * not already there. It scans until a headroom of buffers is satisfied,
5115 * which itself is a buffer for ARC eviction. If a compressible buffer is
5116 * found during scanning and selected for writing to an L2ARC device, we
5117 * temporarily boost scanning headroom during the next scan cycle to make
5118 * sure we adapt to compression effects (which might significantly reduce
5119 * the data volume we write to L2ARC). The thread that does this is
5120 * l2arc_feed_thread(), illustrated below; example sizes are included to
5121 * provide a better sense of ratio than this diagram:
5122 *
5123 *	       head -->                        tail
5124 *	        +---------------------+----------+
5125 *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
5126 *	        +---------------------+----------+   |   o L2ARC eligible
5127 *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
5128 *	        +---------------------+----------+   |
5129 *	             15.9 Gbytes      ^ 32 Mbytes    |
5130 *	                           headroom          |
5131 *	                                      l2arc_feed_thread()
5132 *	                                             |
5133 *	                 l2arc write hand <--[oooo]--'
5134 *	                         |           8 Mbyte
5135 *	                         |          write max
5136 *	                         V
5137 *		  +==============================+
5138 *	L2ARC dev |####|#|###|###|    |####| ... |
5139 *	          +==============================+
5140 *	                     32 Gbytes
5141 *
5142 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
5143 * evicted, then the L2ARC has cached a buffer much sooner than it probably
5144 * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
5145 * safe to say that this is an uncommon case, since buffers at the end of
5146 * the ARC lists have moved there due to inactivity.
5147 *
5148 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
5149 * then the L2ARC simply misses copying some buffers.  This serves as a
5150 * pressure valve to prevent heavy read workloads from both stalling the ARC
5151 * with waits and clogging the L2ARC with writes.  This also helps prevent
5152 * the potential for the L2ARC to churn if it attempts to cache content too
5153 * quickly, such as during backups of the entire pool.
5154 *
5155 * 5. After system boot and before the ARC has filled main memory, there are
5156 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
5157 * lists can remain mostly static.  Instead of searching from tail of these
5158 * lists as pictured, the l2arc_feed_thread() will search from the list heads
5159 * for eligible buffers, greatly increasing its chance of finding them.
5160 *
5161 * The L2ARC device write speed is also boosted during this time so that
5162 * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
5163 * there are no L2ARC reads, and no fear of degrading read performance
5164 * through increased writes.
5165 *
5166 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
5167 * the vdev queue can aggregate them into larger and fewer writes.  Each
5168 * device is written to in a rotor fashion, sweeping writes through
5169 * available space then repeating.
5170 *
5171 * 7. The L2ARC does not store dirty content.  It never needs to flush
5172 * write buffers back to disk based storage.
5173 *
5174 * 8. If an ARC buffer is written (and dirtied) which also exists in the
5175 * L2ARC, the now stale L2ARC buffer is immediately dropped.
5176 *
5177 * The performance of the L2ARC can be tweaked by a number of tunables, which
5178 * may be necessary for different workloads:
5179 *
5180 *	l2arc_write_max		max write bytes per interval
5181 *	l2arc_write_boost	extra write bytes during device warmup
5182 *	l2arc_noprefetch	skip caching prefetched buffers
5183 *	l2arc_headroom		number of max device writes to precache
5184 *	l2arc_headroom_boost	when we find compressed buffers during ARC
5185 *				scanning, we multiply headroom by this
5186 *				percentage factor for the next scan cycle,
5187 *				since more compressed buffers are likely to
5188 *				be present
5189 *	l2arc_feed_secs		seconds between L2ARC writing
5190 *
5191 * Tunables may be removed or added as future performance improvements are
5192 * integrated, and also may become zpool properties.
5193 *
5194 * There are three key functions that control how the L2ARC warms up:
5195 *
5196 *	l2arc_write_eligible()	check if a buffer is eligible to cache
5197 *	l2arc_write_size()	calculate how much to write
5198 *	l2arc_write_interval()	calculate sleep delay between writes
5199 *
5200 * These three functions determine what to write, how much, and how quickly
5201 * to send writes.
5202 */
5203
5204static boolean_t
5205l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
5206{
5207	/*
5208	 * A buffer is *not* eligible for the L2ARC if it:
5209	 * 1. belongs to a different spa.
5210	 * 2. is already cached on the L2ARC.
5211	 * 3. has an I/O in progress (it may be an incomplete read).
5212	 * 4. is flagged not eligible (zfs property).
5213	 */
5214	if (hdr->b_spa != spa_guid) {
5215		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
5216		return (B_FALSE);
5217	}
5218	if (HDR_HAS_L2HDR(hdr)) {
5219		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
5220		return (B_FALSE);
5221	}
5222	if (HDR_IO_IN_PROGRESS(hdr)) {
5223		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
5224		return (B_FALSE);
5225	}
5226	if (!HDR_L2CACHE(hdr)) {
5227		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
5228		return (B_FALSE);
5229	}
5230
5231	return (B_TRUE);
5232}
5233
5234static uint64_t
5235l2arc_write_size(void)
5236{
5237	uint64_t size;
5238
5239	/*
5240	 * Make sure our globals have meaningful values in case the user
5241	 * altered them.
5242	 */
5243	size = l2arc_write_max;
5244	if (size == 0) {
5245		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
5246		    "be greater than zero, resetting it to the default (%d)",
5247		    L2ARC_WRITE_SIZE);
5248		size = l2arc_write_max = L2ARC_WRITE_SIZE;
5249	}
5250
5251	if (arc_warm == B_FALSE)
5252		size += l2arc_write_boost;
5253
5254	return (size);
5255
5256}
5257
5258static clock_t
5259l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
5260{
5261	clock_t interval, next, now;
5262
5263	/*
5264	 * If the ARC lists are busy, increase our write rate; if the
5265	 * lists are stale, idle back.  This is achieved by checking
5266	 * how much we previously wrote - if it was more than half of
5267	 * what we wanted, schedule the next write much sooner.
5268	 */
5269	if (l2arc_feed_again && wrote > (wanted / 2))
5270		interval = (hz * l2arc_feed_min_ms) / 1000;
5271	else
5272		interval = hz * l2arc_feed_secs;
5273
5274	now = ddi_get_lbolt();
5275	next = MAX(now, MIN(now + interval, began + interval));
5276
5277	return (next);
5278}
5279
5280/*
5281 * Cycle through L2ARC devices.  This is how L2ARC load balances.
5282 * If a device is returned, this also returns holding the spa config lock.
5283 */
5284static l2arc_dev_t *
5285l2arc_dev_get_next(void)
5286{
5287	l2arc_dev_t *first, *next = NULL;
5288
5289	/*
5290	 * Lock out the removal of spas (spa_namespace_lock), then removal
5291	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
5292	 * both locks will be dropped and a spa config lock held instead.
5293	 */
5294	mutex_enter(&spa_namespace_lock);
5295	mutex_enter(&l2arc_dev_mtx);
5296
5297	/* if there are no vdevs, there is nothing to do */
5298	if (l2arc_ndev == 0)
5299		goto out;
5300
5301	first = NULL;
5302	next = l2arc_dev_last;
5303	do {
5304		/* loop around the list looking for a non-faulted vdev */
5305		if (next == NULL) {
5306			next = list_head(l2arc_dev_list);
5307		} else {
5308			next = list_next(l2arc_dev_list, next);
5309			if (next == NULL)
5310				next = list_head(l2arc_dev_list);
5311		}
5312
5313		/* if we have come back to the start, bail out */
5314		if (first == NULL)
5315			first = next;
5316		else if (next == first)
5317			break;
5318
5319	} while (vdev_is_dead(next->l2ad_vdev));
5320
5321	/* if we were unable to find any usable vdevs, return NULL */
5322	if (vdev_is_dead(next->l2ad_vdev))
5323		next = NULL;
5324
5325	l2arc_dev_last = next;
5326
5327out:
5328	mutex_exit(&l2arc_dev_mtx);
5329
5330	/*
5331	 * Grab the config lock to prevent the 'next' device from being
5332	 * removed while we are writing to it.
5333	 */
5334	if (next != NULL)
5335		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
5336	mutex_exit(&spa_namespace_lock);
5337
5338	return (next);
5339}
5340
5341/*
5342 * Free buffers that were tagged for destruction.
5343 */
5344static void
5345l2arc_do_free_on_write()
5346{
5347	list_t *buflist;
5348	l2arc_data_free_t *df, *df_prev;
5349
5350	mutex_enter(&l2arc_free_on_write_mtx);
5351	buflist = l2arc_free_on_write;
5352
5353	for (df = list_tail(buflist); df; df = df_prev) {
5354		df_prev = list_prev(buflist, df);
5355		ASSERT(df->l2df_data != NULL);
5356		ASSERT(df->l2df_func != NULL);
5357		df->l2df_func(df->l2df_data, df->l2df_size);
5358		list_remove(buflist, df);
5359		kmem_free(df, sizeof (l2arc_data_free_t));
5360	}
5361
5362	mutex_exit(&l2arc_free_on_write_mtx);
5363}
5364
5365/*
5366 * A write to a cache device has completed.  Update all headers to allow
5367 * reads from these buffers to begin.
5368 */
5369static void
5370l2arc_write_done(zio_t *zio)
5371{
5372	l2arc_write_callback_t *cb;
5373	l2arc_dev_t *dev;
5374	list_t *buflist;
5375	arc_buf_hdr_t *head, *hdr, *hdr_prev;
5376	kmutex_t *hash_lock;
5377	int64_t bytes_dropped = 0;
5378
5379	cb = zio->io_private;
5380	ASSERT(cb != NULL);
5381	dev = cb->l2wcb_dev;
5382	ASSERT(dev != NULL);
5383	head = cb->l2wcb_head;
5384	ASSERT(head != NULL);
5385	buflist = &dev->l2ad_buflist;
5386	ASSERT(buflist != NULL);
5387	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
5388	    l2arc_write_callback_t *, cb);
5389
5390	if (zio->io_error != 0)
5391		ARCSTAT_BUMP(arcstat_l2_writes_error);
5392
5393	mutex_enter(&dev->l2ad_mtx);
5394
5395	/*
5396	 * All writes completed, or an error was hit.
5397	 */
5398	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
5399		hdr_prev = list_prev(buflist, hdr);
5400
5401		hash_lock = HDR_LOCK(hdr);
5402		if (!mutex_tryenter(hash_lock)) {
5403			/*
5404			 * This buffer misses out.  It may be in a stage
5405			 * of eviction.  Its ARC_FLAG_L2_WRITING flag will be
5406			 * left set, denying reads to this buffer.
5407			 */
5408			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
5409			continue;
5410		}
5411
5412		/*
5413		 * It's possible that this buffer got evicted from the L1 cache
5414		 * before we grabbed the vdev + hash locks, in which case
5415		 * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated.
5416		 * Only free the buffer if we still have an L1 hdr.
5417		 */
5418		if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL &&
5419		    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
5420			l2arc_release_cdata_buf(hdr);
5421
5422		if (zio->io_error != 0) {
5423			/*
5424			 * Error - drop L2ARC entry.
5425			 */
5426			trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev,
5427			    hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0);
5428			hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
5429
5430			ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
5431			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
5432
5433			bytes_dropped += hdr->b_l2hdr.b_asize;
5434			(void) refcount_remove_many(&dev->l2ad_alloc,
5435			    hdr->b_l2hdr.b_asize, hdr);
5436		}
5437
5438		/*
5439		 * Allow ARC to begin reads to this L2ARC entry.
5440		 */
5441		hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
5442
5443		mutex_exit(hash_lock);
5444	}
5445
5446	atomic_inc_64(&l2arc_writes_done);
5447	list_remove(buflist, head);
5448	ASSERT(!HDR_HAS_L1HDR(head));
5449	kmem_cache_free(hdr_l2only_cache, head);
5450	mutex_exit(&dev->l2ad_mtx);
5451
5452	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
5453
5454	l2arc_do_free_on_write();
5455
5456	kmem_free(cb, sizeof (l2arc_write_callback_t));
5457}
5458
5459/*
5460 * A read to a cache device completed.  Validate buffer contents before
5461 * handing over to the regular ARC routines.
5462 */
5463static void
5464l2arc_read_done(zio_t *zio)
5465{
5466	l2arc_read_callback_t *cb;
5467	arc_buf_hdr_t *hdr;
5468	arc_buf_t *buf;
5469	kmutex_t *hash_lock;
5470	int equal;
5471
5472	ASSERT(zio->io_vd != NULL);
5473	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
5474
5475	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
5476
5477	cb = zio->io_private;
5478	ASSERT(cb != NULL);
5479	buf = cb->l2rcb_buf;
5480	ASSERT(buf != NULL);
5481
5482	hash_lock = HDR_LOCK(buf->b_hdr);
5483	mutex_enter(hash_lock);
5484	hdr = buf->b_hdr;
5485	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
5486
5487	/*
5488	 * If the buffer was compressed, decompress it first.
5489	 */
5490	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
5491		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
5492	ASSERT(zio->io_data != NULL);
5493
5494	/*
5495	 * Check this survived the L2ARC journey.
5496	 */
5497	equal = arc_cksum_equal(buf);
5498	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
5499		mutex_exit(hash_lock);
5500		zio->io_private = buf;
5501		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
5502		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
5503		arc_read_done(zio);
5504	} else {
5505		mutex_exit(hash_lock);
5506		/*
5507		 * Buffer didn't survive caching.  Increment stats and
5508		 * reissue to the original storage device.
5509		 */
5510		if (zio->io_error != 0) {
5511			ARCSTAT_BUMP(arcstat_l2_io_error);
5512		} else {
5513			zio->io_error = SET_ERROR(EIO);
5514		}
5515		if (!equal)
5516			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
5517
5518		/*
5519		 * If there's no waiter, issue an async i/o to the primary
5520		 * storage now.  If there *is* a waiter, the caller must
5521		 * issue the i/o in a context where it's OK to block.
5522		 */
5523		if (zio->io_waiter == NULL) {
5524			zio_t *pio = zio_unique_parent(zio);
5525
5526			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
5527
5528			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
5529			    buf->b_data, zio->io_size, arc_read_done, buf,
5530			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
5531		}
5532	}
5533
5534	kmem_free(cb, sizeof (l2arc_read_callback_t));
5535}
5536
5537/*
5538 * This is the list priority from which the L2ARC will search for pages to
5539 * cache.  This is used within loops (0..3) to cycle through lists in the
5540 * desired order.  This order can have a significant effect on cache
5541 * performance.
5542 *
5543 * Currently the metadata lists are hit first, MFU then MRU, followed by
5544 * the data lists.  This function returns a locked list, and also returns
5545 * the lock pointer.
5546 */
5547static list_t *
5548l2arc_list_locked(int list_num, kmutex_t **lock)
5549{
5550	list_t *list = NULL;
5551	int idx;
5552
5553	ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
5554
5555	if (list_num < ARC_BUFC_NUMMETADATALISTS) {
5556		idx = list_num;
5557		list = &arc_mfu->arcs_lists[idx];
5558		*lock = ARCS_LOCK(arc_mfu, idx);
5559	} else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
5560		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
5561		list = &arc_mru->arcs_lists[idx];
5562		*lock = ARCS_LOCK(arc_mru, idx);
5563	} else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
5564		ARC_BUFC_NUMDATALISTS)) {
5565		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
5566		list = &arc_mfu->arcs_lists[idx];
5567		*lock = ARCS_LOCK(arc_mfu, idx);
5568	} else {
5569		idx = list_num - ARC_BUFC_NUMLISTS;
5570		list = &arc_mru->arcs_lists[idx];
5571		*lock = ARCS_LOCK(arc_mru, idx);
5572	}
5573
5574	ASSERT(!(MUTEX_HELD(*lock)));
5575	mutex_enter(*lock);
5576	return (list);
5577}
5578
5579/*
5580 * Evict buffers from the device write hand to the distance specified in
5581 * bytes.  This distance may span populated buffers, it may span nothing.
5582 * This is clearing a region on the L2ARC device ready for writing.
5583 * If the 'all' boolean is set, every buffer is evicted.
5584 */
5585static void
5586l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
5587{
5588	list_t *buflist;
5589	arc_buf_hdr_t *hdr, *hdr_prev;
5590	kmutex_t *hash_lock;
5591	uint64_t taddr;
5592
5593	buflist = &dev->l2ad_buflist;
5594
5595	if (!all && dev->l2ad_first) {
5596		/*
5597		 * This is the first sweep through the device.  There is
5598		 * nothing to evict.
5599		 */
5600		return;
5601	}
5602
5603	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
5604		/*
5605		 * When nearing the end of the device, evict to the end
5606		 * before the device write hand jumps to the start.
5607		 */
5608		taddr = dev->l2ad_end;
5609	} else {
5610		taddr = dev->l2ad_hand + distance;
5611	}
5612	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
5613	    uint64_t, taddr, boolean_t, all);
5614
5615top:
5616	mutex_enter(&dev->l2ad_mtx);
5617	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
5618		hdr_prev = list_prev(buflist, hdr);
5619
5620		hash_lock = HDR_LOCK(hdr);
5621		if (!mutex_tryenter(hash_lock)) {
5622			/*
5623			 * Missed the hash lock.  Retry.
5624			 */
5625			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
5626			mutex_exit(&dev->l2ad_mtx);
5627			mutex_enter(hash_lock);
5628			mutex_exit(hash_lock);
5629			goto top;
5630		}
5631
5632		if (HDR_L2_WRITE_HEAD(hdr)) {
5633			/*
5634			 * We hit a write head node.  Leave it for
5635			 * l2arc_write_done().
5636			 */
5637			list_remove(buflist, hdr);
5638			mutex_exit(hash_lock);
5639			continue;
5640		}
5641
5642		if (!all && HDR_HAS_L2HDR(hdr) &&
5643		    (hdr->b_l2hdr.b_daddr > taddr ||
5644		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
5645			/*
5646			 * We've evicted to the target address,
5647			 * or the end of the device.
5648			 */
5649			mutex_exit(hash_lock);
5650			break;
5651		}
5652
5653		ASSERT(HDR_HAS_L2HDR(hdr));
5654		if (!HDR_HAS_L1HDR(hdr)) {
5655			ASSERT(!HDR_L2_READING(hdr));
5656			/*
5657			 * This doesn't exist in the ARC.  Destroy.
5658			 * arc_hdr_destroy() will call list_remove()
5659			 * and decrement arcstat_l2_size.
5660			 */
5661			arc_change_state(arc_anon, hdr, hash_lock);
5662			arc_hdr_destroy(hdr);
5663		} else {
5664			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
5665			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
5666			/*
5667			 * Invalidate issued or about to be issued
5668			 * reads, since we may be about to write
5669			 * over this location.
5670			 */
5671			if (HDR_L2_READING(hdr)) {
5672				ARCSTAT_BUMP(arcstat_l2_evict_reading);
5673				hdr->b_flags |= ARC_FLAG_L2_EVICTED;
5674			}
5675
5676			arc_hdr_l2hdr_destroy(hdr);
5677		}
5678		mutex_exit(hash_lock);
5679	}
5680	mutex_exit(&dev->l2ad_mtx);
5681}
5682
5683/*
5684 * Find and write ARC buffers to the L2ARC device.
5685 *
5686 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
5687 * for reading until they have completed writing.
5688 * The headroom_boost is an in-out parameter used to maintain headroom boost
5689 * state between calls to this function.
5690 *
5691 * Returns the number of bytes actually written (which may be smaller than
5692 * the delta by which the device hand has changed due to alignment).
5693 */
5694static uint64_t
5695l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
5696    boolean_t *headroom_boost)
5697{
5698	arc_buf_hdr_t *hdr, *hdr_prev, *head;
5699	list_t *list;
5700	uint64_t write_asize, write_psize, write_sz, headroom,
5701	    buf_compress_minsz;
5702	void *buf_data;
5703	kmutex_t *list_lock;
5704	boolean_t full;
5705	l2arc_write_callback_t *cb;
5706	zio_t *pio, *wzio;
5707	uint64_t guid = spa_load_guid(spa);
5708	const boolean_t do_headroom_boost = *headroom_boost;
5709	int try;
5710
5711	ASSERT(dev->l2ad_vdev != NULL);
5712
5713	/* Lower the flag now, we might want to raise it again later. */
5714	*headroom_boost = B_FALSE;
5715
5716	pio = NULL;
5717	write_sz = write_asize = write_psize = 0;
5718	full = B_FALSE;
5719	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
5720	head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
5721	head->b_flags |= ARC_FLAG_HAS_L2HDR;
5722
5723	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
5724	/*
5725	 * We will want to try to compress buffers that are at least 2x the
5726	 * device sector size.
5727	 */
5728	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
5729
5730	/*
5731	 * Copy buffers for L2ARC writing.
5732	 */
5733	mutex_enter(&dev->l2ad_mtx);
5734	for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
5735		uint64_t passed_sz = 0;
5736
5737		list = l2arc_list_locked(try, &list_lock);
5738		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
5739
5740		/*
5741		 * L2ARC fast warmup.
5742		 *
5743		 * Until the ARC is warm and starts to evict, read from the
5744		 * head of the ARC lists rather than the tail.
5745		 */
5746		if (arc_warm == B_FALSE)
5747			hdr = list_head(list);
5748		else
5749			hdr = list_tail(list);
5750		if (hdr == NULL)
5751			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
5752
5753		headroom = target_sz * l2arc_headroom * 2 / ARC_BUFC_NUMLISTS;
5754		if (do_headroom_boost)
5755			headroom = (headroom * l2arc_headroom_boost) / 100;
5756
5757		for (; hdr; hdr = hdr_prev) {
5758			kmutex_t *hash_lock;
5759			uint64_t buf_sz;
5760
5761			if (arc_warm == B_FALSE)
5762				hdr_prev = list_next(list, hdr);
5763			else
5764				hdr_prev = list_prev(list, hdr);
5765			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size);
5766
5767			hash_lock = HDR_LOCK(hdr);
5768			if (!mutex_tryenter(hash_lock)) {
5769				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
5770				/*
5771				 * Skip this buffer rather than waiting.
5772				 */
5773				continue;
5774			}
5775
5776			passed_sz += hdr->b_size;
5777			if (passed_sz > headroom) {
5778				/*
5779				 * Searched too far.
5780				 */
5781				mutex_exit(hash_lock);
5782				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
5783				break;
5784			}
5785
5786			if (!l2arc_write_eligible(guid, hdr)) {
5787				mutex_exit(hash_lock);
5788				continue;
5789			}
5790
5791			if ((write_sz + hdr->b_size) > target_sz) {
5792				full = B_TRUE;
5793				mutex_exit(hash_lock);
5794				ARCSTAT_BUMP(arcstat_l2_write_full);
5795				break;
5796			}
5797
5798			if (pio == NULL) {
5799				/*
5800				 * Insert a dummy header on the buflist so
5801				 * l2arc_write_done() can find where the
5802				 * write buffers begin without searching.
5803				 */
5804				list_insert_head(&dev->l2ad_buflist, head);
5805
5806				cb = kmem_alloc(
5807				    sizeof (l2arc_write_callback_t), KM_SLEEP);
5808				cb->l2wcb_dev = dev;
5809				cb->l2wcb_head = head;
5810				pio = zio_root(spa, l2arc_write_done, cb,
5811				    ZIO_FLAG_CANFAIL);
5812				ARCSTAT_BUMP(arcstat_l2_write_pios);
5813			}
5814
5815			/*
5816			 * Create and add a new L2ARC header.
5817			 */
5818			hdr->b_l2hdr.b_dev = dev;
5819			hdr->b_flags |= ARC_FLAG_L2_WRITING;
5820			/*
5821			 * Temporarily stash the data buffer in b_tmp_cdata.
5822			 * The subsequent write step will pick it up from
5823			 * there. This is because can't access b_l1hdr.b_buf
5824			 * without holding the hash_lock, which we in turn
5825			 * can't access without holding the ARC list locks
5826			 * (which we want to avoid during compression/writing).
5827			 */
5828			HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
5829			hdr->b_l2hdr.b_asize = hdr->b_size;
5830			hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
5831
5832			/*
5833			 * Explicitly set the b_daddr field to a known
5834			 * value which means "invalid address". This
5835			 * enables us to differentiate which stage of
5836			 * l2arc_write_buffers() the particular header
5837			 * is in (e.g. this loop, or the one below).
5838			 * ARC_FLAG_L2_WRITING is not enough to make
5839			 * this distinction, and we need to know in
5840			 * order to do proper l2arc vdev accounting in
5841			 * arc_release() and arc_hdr_destroy().
5842			 *
5843			 * Note, we can't use a new flag to distinguish
5844			 * the two stages because we don't hold the
5845			 * header's hash_lock below, in the second stage
5846			 * of this function. Thus, we can't simply
5847			 * change the b_flags field to denote that the
5848			 * IO has been sent. We can change the b_daddr
5849			 * field of the L2 portion, though, since we'll
5850			 * be holding the l2ad_mtx; which is why we're
5851			 * using it to denote the header's state change.
5852			 */
5853			hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
5854
5855			buf_sz = hdr->b_size;
5856			hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
5857
5858			list_insert_head(&dev->l2ad_buflist, hdr);
5859
5860			/*
5861			 * Compute and store the buffer cksum before
5862			 * writing.  On debug the cksum is verified first.
5863			 */
5864			arc_cksum_verify(hdr->b_l1hdr.b_buf);
5865			arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
5866
5867			mutex_exit(hash_lock);
5868
5869			write_sz += buf_sz;
5870		}
5871
5872		mutex_exit(list_lock);
5873
5874		if (full == B_TRUE)
5875			break;
5876	}
5877
5878	/* No buffers selected for writing? */
5879	if (pio == NULL) {
5880		ASSERT0(write_sz);
5881		mutex_exit(&dev->l2ad_mtx);
5882		ASSERT(!HDR_HAS_L1HDR(head));
5883		kmem_cache_free(hdr_l2only_cache, head);
5884		return (0);
5885	}
5886
5887	/*
5888	 * Now start writing the buffers. We're starting at the write head
5889	 * and work backwards, retracing the course of the buffer selector
5890	 * loop above.
5891	 */
5892	for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
5893	    hdr = list_prev(&dev->l2ad_buflist, hdr)) {
5894		uint64_t buf_sz;
5895
5896		/*
5897		 * We shouldn't need to lock the buffer here, since we flagged
5898		 * it as ARC_FLAG_L2_WRITING in the previous step, but we must
5899		 * take care to only access its L2 cache parameters. In
5900		 * particular, hdr->l1hdr.b_buf may be invalid by now due to
5901		 * ARC eviction.
5902		 */
5903		hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
5904
5905		if ((HDR_L2COMPRESS(hdr)) &&
5906		    hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
5907			if (l2arc_compress_buf(hdr)) {
5908				/*
5909				 * If compression succeeded, enable headroom
5910				 * boost on the next scan cycle.
5911				 */
5912				*headroom_boost = B_TRUE;
5913			}
5914		}
5915
5916		/*
5917		 * Pick up the buffer data we had previously stashed away
5918		 * (and now potentially also compressed).
5919		 */
5920		buf_data = hdr->b_l1hdr.b_tmp_cdata;
5921		buf_sz = hdr->b_l2hdr.b_asize;
5922
5923		/*
5924		 * If the data has not been compressed, then clear b_tmp_cdata
5925		 * to make sure that it points only to a temporary compression
5926		 * buffer.
5927		 */
5928		if (!L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)))
5929			hdr->b_l1hdr.b_tmp_cdata = NULL;
5930
5931		/*
5932		 * We need to do this regardless if buf_sz is zero or
5933		 * not, otherwise, when this l2hdr is evicted we'll
5934		 * remove a reference that was never added.
5935		 */
5936		(void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);
5937
5938		/* Compression may have squashed the buffer to zero length. */
5939		if (buf_sz != 0) {
5940			uint64_t buf_p_sz;
5941
5942			wzio = zio_write_phys(pio, dev->l2ad_vdev,
5943			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5944			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5945			    ZIO_FLAG_CANFAIL, B_FALSE);
5946
5947			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5948			    zio_t *, wzio);
5949			(void) zio_nowait(wzio);
5950
5951			write_asize += buf_sz;
5952
5953			/*
5954			 * Keep the clock hand suitably device-aligned.
5955			 */
5956			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5957			write_psize += buf_p_sz;
5958			dev->l2ad_hand += buf_p_sz;
5959		}
5960	}
5961
5962	mutex_exit(&dev->l2ad_mtx);
5963
5964	ASSERT3U(write_asize, <=, target_sz);
5965	ARCSTAT_BUMP(arcstat_l2_writes_sent);
5966	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5967	ARCSTAT_INCR(arcstat_l2_size, write_sz);
5968	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
5969	vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
5970
5971	/*
5972	 * Bump device hand to the device start if it is approaching the end.
5973	 * l2arc_evict() will already have evicted ahead for this case.
5974	 */
5975	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
5976		dev->l2ad_hand = dev->l2ad_start;
5977		dev->l2ad_first = B_FALSE;
5978	}
5979
5980	dev->l2ad_writing = B_TRUE;
5981	(void) zio_wait(pio);
5982	dev->l2ad_writing = B_FALSE;
5983
5984	return (write_asize);
5985}
5986
5987/*
5988 * Compresses an L2ARC buffer.
5989 * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its
5990 * size in l2hdr->b_asize. This routine tries to compress the data and
5991 * depending on the compression result there are three possible outcomes:
5992 * *) The buffer was incompressible. The original l2hdr contents were left
5993 *    untouched and are ready for writing to an L2 device.
5994 * *) The buffer was all-zeros, so there is no need to write it to an L2
5995 *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
5996 *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
5997 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
5998 *    data buffer which holds the compressed data to be written, and b_asize
5999 *    tells us how much data there is. b_compress is set to the appropriate
6000 *    compression algorithm. Once writing is done, invoke
6001 *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
6002 *
6003 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
6004 * buffer was incompressible).
6005 */
6006static boolean_t
6007l2arc_compress_buf(arc_buf_hdr_t *hdr)
6008{
6009	void *cdata;
6010	size_t csize, len, rounded;
6011	ASSERT(HDR_HAS_L2HDR(hdr));
6012	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
6013
6014	ASSERT(HDR_HAS_L1HDR(hdr));
6015	ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF);
6016	ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6017
6018	len = l2hdr->b_asize;
6019	cdata = zio_data_buf_alloc(len);
6020	ASSERT3P(cdata, !=, NULL);
6021	csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
6022	    cdata, l2hdr->b_asize);
6023
6024	if (csize == 0) {
6025		/* zero block, indicate that there's nothing to write */
6026		zio_data_buf_free(cdata, len);
6027		HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY);
6028		l2hdr->b_asize = 0;
6029		hdr->b_l1hdr.b_tmp_cdata = NULL;
6030		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
6031		return (B_TRUE);
6032	}
6033
6034	rounded = P2ROUNDUP(csize,
6035	    (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift);
6036	if (rounded < len) {
6037		/*
6038		 * Compression succeeded, we'll keep the cdata around for
6039		 * writing and release it afterwards.
6040		 */
6041		if (rounded > csize) {
6042			bzero((char *)cdata + csize, rounded - csize);
6043			csize = rounded;
6044		}
6045		HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4);
6046		l2hdr->b_asize = csize;
6047		hdr->b_l1hdr.b_tmp_cdata = cdata;
6048		ARCSTAT_BUMP(arcstat_l2_compress_successes);
6049		return (B_TRUE);
6050	} else {
6051		/*
6052		 * Compression failed, release the compressed buffer.
6053		 * l2hdr will be left unmodified.
6054		 */
6055		zio_data_buf_free(cdata, len);
6056		ARCSTAT_BUMP(arcstat_l2_compress_failures);
6057		return (B_FALSE);
6058	}
6059}
6060
6061/*
6062 * Decompresses a zio read back from an l2arc device. On success, the
6063 * underlying zio's io_data buffer is overwritten by the uncompressed
6064 * version. On decompression error (corrupt compressed stream), the
6065 * zio->io_error value is set to signal an I/O error.
6066 *
6067 * Please note that the compressed data stream is not checksummed, so
6068 * if the underlying device is experiencing data corruption, we may feed
6069 * corrupt data to the decompressor, so the decompressor needs to be
6070 * able to handle this situation (LZ4 does).
6071 */
6072static void
6073l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
6074{
6075	ASSERT(L2ARC_IS_VALID_COMPRESS(c));
6076
6077	if (zio->io_error != 0) {
6078		/*
6079		 * An io error has occured, just restore the original io
6080		 * size in preparation for a main pool read.
6081		 */
6082		zio->io_orig_size = zio->io_size = hdr->b_size;
6083		return;
6084	}
6085
6086	if (c == ZIO_COMPRESS_EMPTY) {
6087		/*
6088		 * An empty buffer results in a null zio, which means we
6089		 * need to fill its io_data after we're done restoring the
6090		 * buffer's contents.
6091		 */
6092		ASSERT(hdr->b_l1hdr.b_buf != NULL);
6093		bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
6094		zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
6095	} else {
6096		ASSERT(zio->io_data != NULL);
6097		/*
6098		 * We copy the compressed data from the start of the arc buffer
6099		 * (the zio_read will have pulled in only what we need, the
6100		 * rest is garbage which we will overwrite at decompression)
6101		 * and then decompress back to the ARC data buffer. This way we
6102		 * can minimize copying by simply decompressing back over the
6103		 * original compressed data (rather than decompressing to an
6104		 * aux buffer and then copying back the uncompressed buffer,
6105		 * which is likely to be much larger).
6106		 */
6107		uint64_t csize;
6108		void *cdata;
6109
6110		csize = zio->io_size;
6111		cdata = zio_data_buf_alloc(csize);
6112		bcopy(zio->io_data, cdata, csize);
6113		if (zio_decompress_data(c, cdata, zio->io_data, csize,
6114		    hdr->b_size) != 0)
6115			zio->io_error = EIO;
6116		zio_data_buf_free(cdata, csize);
6117	}
6118
6119	/* Restore the expected uncompressed IO size. */
6120	zio->io_orig_size = zio->io_size = hdr->b_size;
6121}
6122
6123/*
6124 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
6125 * This buffer serves as a temporary holder of compressed data while
6126 * the buffer entry is being written to an l2arc device. Once that is
6127 * done, we can dispose of it.
6128 */
6129static void
6130l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
6131{
6132	ASSERT(HDR_HAS_L1HDR(hdr));
6133	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) {
6134		/*
6135		 * If the data was compressed, then we've allocated a
6136		 * temporary buffer for it, so now we need to release it.
6137		 */
6138		ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6139		zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
6140		    hdr->b_size);
6141		hdr->b_l1hdr.b_tmp_cdata = NULL;
6142	} else {
6143		ASSERT(hdr->b_l1hdr.b_tmp_cdata == NULL);
6144	}
6145}
6146
6147/*
6148 * This thread feeds the L2ARC at regular intervals.  This is the beating
6149 * heart of the L2ARC.
6150 */
6151static void
6152l2arc_feed_thread(void *dummy __unused)
6153{
6154	callb_cpr_t cpr;
6155	l2arc_dev_t *dev;
6156	spa_t *spa;
6157	uint64_t size, wrote;
6158	clock_t begin, next = ddi_get_lbolt();
6159	boolean_t headroom_boost = B_FALSE;
6160
6161	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
6162
6163	mutex_enter(&l2arc_feed_thr_lock);
6164
6165	while (l2arc_thread_exit == 0) {
6166		CALLB_CPR_SAFE_BEGIN(&cpr);
6167		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
6168		    next - ddi_get_lbolt());
6169		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
6170		next = ddi_get_lbolt() + hz;
6171
6172		/*
6173		 * Quick check for L2ARC devices.
6174		 */
6175		mutex_enter(&l2arc_dev_mtx);
6176		if (l2arc_ndev == 0) {
6177			mutex_exit(&l2arc_dev_mtx);
6178			continue;
6179		}
6180		mutex_exit(&l2arc_dev_mtx);
6181		begin = ddi_get_lbolt();
6182
6183		/*
6184		 * This selects the next l2arc device to write to, and in
6185		 * doing so the next spa to feed from: dev->l2ad_spa.   This
6186		 * will return NULL if there are now no l2arc devices or if
6187		 * they are all faulted.
6188		 *
6189		 * If a device is returned, its spa's config lock is also
6190		 * held to prevent device removal.  l2arc_dev_get_next()
6191		 * will grab and release l2arc_dev_mtx.
6192		 */
6193		if ((dev = l2arc_dev_get_next()) == NULL)
6194			continue;
6195
6196		spa = dev->l2ad_spa;
6197		ASSERT(spa != NULL);
6198
6199		/*
6200		 * If the pool is read-only then force the feed thread to
6201		 * sleep a little longer.
6202		 */
6203		if (!spa_writeable(spa)) {
6204			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
6205			spa_config_exit(spa, SCL_L2ARC, dev);
6206			continue;
6207		}
6208
6209		/*
6210		 * Avoid contributing to memory pressure.
6211		 */
6212		if (arc_reclaim_needed()) {
6213			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
6214			spa_config_exit(spa, SCL_L2ARC, dev);
6215			continue;
6216		}
6217
6218		ARCSTAT_BUMP(arcstat_l2_feeds);
6219
6220		size = l2arc_write_size();
6221
6222		/*
6223		 * Evict L2ARC buffers that will be overwritten.
6224		 */
6225		l2arc_evict(dev, size, B_FALSE);
6226
6227		/*
6228		 * Write ARC buffers.
6229		 */
6230		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
6231
6232		/*
6233		 * Calculate interval between writes.
6234		 */
6235		next = l2arc_write_interval(begin, size, wrote);
6236		spa_config_exit(spa, SCL_L2ARC, dev);
6237	}
6238
6239	l2arc_thread_exit = 0;
6240	cv_broadcast(&l2arc_feed_thr_cv);
6241	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
6242	thread_exit();
6243}
6244
6245boolean_t
6246l2arc_vdev_present(vdev_t *vd)
6247{
6248	l2arc_dev_t *dev;
6249
6250	mutex_enter(&l2arc_dev_mtx);
6251	for (dev = list_head(l2arc_dev_list); dev != NULL;
6252	    dev = list_next(l2arc_dev_list, dev)) {
6253		if (dev->l2ad_vdev == vd)
6254			break;
6255	}
6256	mutex_exit(&l2arc_dev_mtx);
6257
6258	return (dev != NULL);
6259}
6260
6261/*
6262 * Add a vdev for use by the L2ARC.  By this point the spa has already
6263 * validated the vdev and opened it.
6264 */
6265void
6266l2arc_add_vdev(spa_t *spa, vdev_t *vd)
6267{
6268	l2arc_dev_t *adddev;
6269
6270	ASSERT(!l2arc_vdev_present(vd));
6271
6272	vdev_ashift_optimize(vd);
6273
6274	/*
6275	 * Create a new l2arc device entry.
6276	 */
6277	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
6278	adddev->l2ad_spa = spa;
6279	adddev->l2ad_vdev = vd;
6280	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
6281	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
6282	adddev->l2ad_hand = adddev->l2ad_start;
6283	adddev->l2ad_first = B_TRUE;
6284	adddev->l2ad_writing = B_FALSE;
6285
6286	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
6287	/*
6288	 * This is a list of all ARC buffers that are still valid on the
6289	 * device.
6290	 */
6291	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
6292	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
6293
6294	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
6295	refcount_create(&adddev->l2ad_alloc);
6296
6297	/*
6298	 * Add device to global list
6299	 */
6300	mutex_enter(&l2arc_dev_mtx);
6301	list_insert_head(l2arc_dev_list, adddev);
6302	atomic_inc_64(&l2arc_ndev);
6303	mutex_exit(&l2arc_dev_mtx);
6304}
6305
6306/*
6307 * Remove a vdev from the L2ARC.
6308 */
6309void
6310l2arc_remove_vdev(vdev_t *vd)
6311{
6312	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
6313
6314	/*
6315	 * Find the device by vdev
6316	 */
6317	mutex_enter(&l2arc_dev_mtx);
6318	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
6319		nextdev = list_next(l2arc_dev_list, dev);
6320		if (vd == dev->l2ad_vdev) {
6321			remdev = dev;
6322			break;
6323		}
6324	}
6325	ASSERT(remdev != NULL);
6326
6327	/*
6328	 * Remove device from global list
6329	 */
6330	list_remove(l2arc_dev_list, remdev);
6331	l2arc_dev_last = NULL;		/* may have been invalidated */
6332	atomic_dec_64(&l2arc_ndev);
6333	mutex_exit(&l2arc_dev_mtx);
6334
6335	/*
6336	 * Clear all buflists and ARC references.  L2ARC device flush.
6337	 */
6338	l2arc_evict(remdev, 0, B_TRUE);
6339	list_destroy(&remdev->l2ad_buflist);
6340	mutex_destroy(&remdev->l2ad_mtx);
6341	refcount_destroy(&remdev->l2ad_alloc);
6342	kmem_free(remdev, sizeof (l2arc_dev_t));
6343}
6344
6345void
6346l2arc_init(void)
6347{
6348	l2arc_thread_exit = 0;
6349	l2arc_ndev = 0;
6350	l2arc_writes_sent = 0;
6351	l2arc_writes_done = 0;
6352
6353	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
6354	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
6355	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
6356	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
6357
6358	l2arc_dev_list = &L2ARC_dev_list;
6359	l2arc_free_on_write = &L2ARC_free_on_write;
6360	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
6361	    offsetof(l2arc_dev_t, l2ad_node));
6362	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
6363	    offsetof(l2arc_data_free_t, l2df_list_node));
6364}
6365
6366void
6367l2arc_fini(void)
6368{
6369	/*
6370	 * This is called from dmu_fini(), which is called from spa_fini();
6371	 * Because of this, we can assume that all l2arc devices have
6372	 * already been removed when the pools themselves were removed.
6373	 */
6374
6375	l2arc_do_free_on_write();
6376
6377	mutex_destroy(&l2arc_feed_thr_lock);
6378	cv_destroy(&l2arc_feed_thr_cv);
6379	mutex_destroy(&l2arc_dev_mtx);
6380	mutex_destroy(&l2arc_free_on_write_mtx);
6381
6382	list_destroy(l2arc_dev_list);
6383	list_destroy(l2arc_free_on_write);
6384}
6385
6386void
6387l2arc_start(void)
6388{
6389	if (!(spa_mode_global & FWRITE))
6390		return;
6391
6392	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
6393	    TS_RUN, minclsyspri);
6394}
6395
6396void
6397l2arc_stop(void)
6398{
6399	if (!(spa_mode_global & FWRITE))
6400		return;
6401
6402	mutex_enter(&l2arc_feed_thr_lock);
6403	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
6404	l2arc_thread_exit = 1;
6405	while (l2arc_thread_exit != 0)
6406		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
6407	mutex_exit(&l2arc_feed_thr_lock);
6408}
6409