arc.c revision 286626
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
24 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
25 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
26 * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
27 */
28
29/*
30 * DVA-based Adjustable Replacement Cache
31 *
32 * While much of the theory of operation used here is
33 * based on the self-tuning, low overhead replacement cache
34 * presented by Megiddo and Modha at FAST 2003, there are some
35 * significant differences:
36 *
37 * 1. The Megiddo and Modha model assumes any page is evictable.
38 * Pages in its cache cannot be "locked" into memory.  This makes
39 * the eviction algorithm simple: evict the last page in the list.
40 * This also make the performance characteristics easy to reason
41 * about.  Our cache is not so simple.  At any given moment, some
42 * subset of the blocks in the cache are un-evictable because we
43 * have handed out a reference to them.  Blocks are only evictable
44 * when there are no external references active.  This makes
45 * eviction far more problematic:  we choose to evict the evictable
46 * blocks that are the "lowest" in the list.
47 *
48 * There are times when it is not possible to evict the requested
49 * space.  In these circumstances we are unable to adjust the cache
50 * size.  To prevent the cache growing unbounded at these times we
51 * implement a "cache throttle" that slows the flow of new data
52 * into the cache until we can make space available.
53 *
54 * 2. The Megiddo and Modha model assumes a fixed cache size.
55 * Pages are evicted when the cache is full and there is a cache
56 * miss.  Our model has a variable sized cache.  It grows with
57 * high use, but also tries to react to memory pressure from the
58 * operating system: decreasing its size when system memory is
59 * tight.
60 *
61 * 3. The Megiddo and Modha model assumes a fixed page size. All
62 * elements of the cache are therefore exactly the same size.  So
63 * when adjusting the cache size following a cache miss, its simply
64 * a matter of choosing a single page to evict.  In our model, we
65 * have variable sized cache blocks (rangeing from 512 bytes to
66 * 128K bytes).  We therefore choose a set of blocks to evict to make
67 * space for a cache miss that approximates as closely as possible
68 * the space used by the new block.
69 *
70 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
71 * by N. Megiddo & D. Modha, FAST 2003
72 */
73
74/*
75 * The locking model:
76 *
77 * A new reference to a cache buffer can be obtained in two
78 * ways: 1) via a hash table lookup using the DVA as a key,
79 * or 2) via one of the ARC lists.  The arc_read() interface
80 * uses method 1, while the internal arc algorithms for
81 * adjusting the cache use method 2.  We therefore provide two
82 * types of locks: 1) the hash table lock array, and 2) the
83 * arc list locks.
84 *
85 * Buffers do not have their own mutexs, rather they rely on the
86 * hash table mutexs for the bulk of their protection (i.e. most
87 * fields in the arc_buf_hdr_t are protected by these mutexs).
88 *
89 * buf_hash_find() returns the appropriate mutex (held) when it
90 * locates the requested buffer in the hash table.  It returns
91 * NULL for the mutex if the buffer was not in the table.
92 *
93 * buf_hash_remove() expects the appropriate hash mutex to be
94 * already held before it is invoked.
95 *
96 * Each arc state also has a mutex which is used to protect the
97 * buffer list associated with the state.  When attempting to
98 * obtain a hash table lock while holding an arc list lock you
99 * must use: mutex_tryenter() to avoid deadlock.  Also note that
100 * the active state mutex must be held before the ghost state mutex.
101 *
102 * Arc buffers may have an associated eviction callback function.
103 * This function will be invoked prior to removing the buffer (e.g.
104 * in arc_do_user_evicts()).  Note however that the data associated
105 * with the buffer may be evicted prior to the callback.  The callback
106 * must be made with *no locks held* (to prevent deadlock).  Additionally,
107 * the users of callbacks must ensure that their private data is
108 * protected from simultaneous callbacks from arc_clear_callback()
109 * and arc_do_user_evicts().
110 *
111 * Note that the majority of the performance stats are manipulated
112 * with atomic operations.
113 *
114 * The L2ARC uses the l2ad_mtx on each vdev for the following:
115 *
116 *	- L2ARC buflist creation
117 *	- L2ARC buflist eviction
118 *	- L2ARC write completion, which walks L2ARC buflists
119 *	- ARC header destruction, as it removes from L2ARC buflists
120 *	- ARC header release, as it removes from L2ARC buflists
121 */
122
123#include <sys/spa.h>
124#include <sys/zio.h>
125#include <sys/zio_compress.h>
126#include <sys/zfs_context.h>
127#include <sys/arc.h>
128#include <sys/refcount.h>
129#include <sys/vdev.h>
130#include <sys/vdev_impl.h>
131#include <sys/dsl_pool.h>
132#ifdef _KERNEL
133#include <sys/dnlc.h>
134#endif
135#include <sys/callb.h>
136#include <sys/kstat.h>
137#include <sys/trim_map.h>
138#include <zfs_fletcher.h>
139#include <sys/sdt.h>
140
141#include <vm/vm_pageout.h>
142#include <machine/vmparam.h>
143
144#ifdef illumos
145#ifndef _KERNEL
146/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
147boolean_t arc_watch = B_FALSE;
148int arc_procfd;
149#endif
150#endif /* illumos */
151
152static kmutex_t		arc_reclaim_thr_lock;
153static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
154static uint8_t		arc_thread_exit;
155
156uint_t arc_reduce_dnlc_percent = 3;
157
158/*
159 * The number of iterations through arc_evict_*() before we
160 * drop & reacquire the lock.
161 */
162int arc_evict_iterations = 100;
163
164/* number of seconds before growing cache again */
165static int		arc_grow_retry = 60;
166
167/* shift of arc_c for calculating both min and max arc_p */
168static int		arc_p_min_shift = 4;
169
170/* log2(fraction of arc to reclaim) */
171static int		arc_shrink_shift = 7;
172
173/*
174 * log2(fraction of ARC which must be free to allow growing).
175 * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
176 * when reading a new block into the ARC, we will evict an equal-sized block
177 * from the ARC.
178 *
179 * This must be less than arc_shrink_shift, so that when we shrink the ARC,
180 * we will still not allow it to grow.
181 */
182int			arc_no_grow_shift = 5;
183
184
185/*
186 * minimum lifespan of a prefetch block in clock ticks
187 * (initialized in arc_init())
188 */
189static int		arc_min_prefetch_lifespan;
190
191/*
192 * If this percent of memory is free, don't throttle.
193 */
194int arc_lotsfree_percent = 10;
195
196static int arc_dead;
197extern int zfs_prefetch_disable;
198
199/*
200 * The arc has filled available memory and has now warmed up.
201 */
202static boolean_t arc_warm;
203
204uint64_t zfs_arc_max;
205uint64_t zfs_arc_min;
206uint64_t zfs_arc_meta_limit = 0;
207uint64_t zfs_arc_meta_min = 0;
208int zfs_arc_grow_retry = 0;
209int zfs_arc_shrink_shift = 0;
210int zfs_arc_p_min_shift = 0;
211int zfs_disable_dup_eviction = 0;
212uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
213u_int zfs_arc_free_target = 0;
214
215static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
216static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
217
218#ifdef _KERNEL
219static void
220arc_free_target_init(void *unused __unused)
221{
222
223	zfs_arc_free_target = vm_pageout_wakeup_thresh;
224}
225SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
226    arc_free_target_init, NULL);
227
228TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
229TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
230TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
231SYSCTL_DECL(_vfs_zfs);
232SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
233    "Maximum ARC size");
234SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
235    "Minimum ARC size");
236SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
237    &zfs_arc_average_blocksize, 0,
238    "ARC average blocksize");
239SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
240    &arc_shrink_shift, 0,
241    "log2(fraction of arc to reclaim)");
242
243/*
244 * We don't have a tunable for arc_free_target due to the dependency on
245 * pagedaemon initialisation.
246 */
247SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
248    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
249    sysctl_vfs_zfs_arc_free_target, "IU",
250    "Desired number of free pages below which ARC triggers reclaim");
251
252static int
253sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
254{
255	u_int val;
256	int err;
257
258	val = zfs_arc_free_target;
259	err = sysctl_handle_int(oidp, &val, 0, req);
260	if (err != 0 || req->newptr == NULL)
261		return (err);
262
263	if (val < minfree)
264		return (EINVAL);
265	if (val > vm_cnt.v_page_count)
266		return (EINVAL);
267
268	zfs_arc_free_target = val;
269
270	return (0);
271}
272
273/*
274 * Must be declared here, before the definition of corresponding kstat
275 * macro which uses the same names will confuse the compiler.
276 */
277SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
278    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
279    sysctl_vfs_zfs_arc_meta_limit, "QU",
280    "ARC metadata limit");
281#endif
282
283/*
284 * Note that buffers can be in one of 6 states:
285 *	ARC_anon	- anonymous (discussed below)
286 *	ARC_mru		- recently used, currently cached
287 *	ARC_mru_ghost	- recentely used, no longer in cache
288 *	ARC_mfu		- frequently used, currently cached
289 *	ARC_mfu_ghost	- frequently used, no longer in cache
290 *	ARC_l2c_only	- exists in L2ARC but not other states
291 * When there are no active references to the buffer, they are
292 * are linked onto a list in one of these arc states.  These are
293 * the only buffers that can be evicted or deleted.  Within each
294 * state there are multiple lists, one for meta-data and one for
295 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
296 * etc.) is tracked separately so that it can be managed more
297 * explicitly: favored over data, limited explicitly.
298 *
299 * Anonymous buffers are buffers that are not associated with
300 * a DVA.  These are buffers that hold dirty block copies
301 * before they are written to stable storage.  By definition,
302 * they are "ref'd" and are considered part of arc_mru
303 * that cannot be freed.  Generally, they will aquire a DVA
304 * as they are written and migrate onto the arc_mru list.
305 *
306 * The ARC_l2c_only state is for buffers that are in the second
307 * level ARC but no longer in any of the ARC_m* lists.  The second
308 * level ARC itself may also contain buffers that are in any of
309 * the ARC_m* states - meaning that a buffer can exist in two
310 * places.  The reason for the ARC_l2c_only state is to keep the
311 * buffer header in the hash table, so that reads that hit the
312 * second level ARC benefit from these fast lookups.
313 */
314
315#define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
316struct arcs_lock {
317	kmutex_t	arcs_lock;
318#ifdef _KERNEL
319	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
320#endif
321};
322
323/*
324 * must be power of two for mask use to work
325 *
326 */
327#define ARC_BUFC_NUMDATALISTS		16
328#define ARC_BUFC_NUMMETADATALISTS	16
329#define ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
330
331typedef struct arc_state {
332	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
333	uint64_t arcs_size;	/* total amount of data in this state */
334	list_t	arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
335	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
336} arc_state_t;
337
338#define ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))
339
340/* The 6 states: */
341static arc_state_t ARC_anon;
342static arc_state_t ARC_mru;
343static arc_state_t ARC_mru_ghost;
344static arc_state_t ARC_mfu;
345static arc_state_t ARC_mfu_ghost;
346static arc_state_t ARC_l2c_only;
347
348typedef struct arc_stats {
349	kstat_named_t arcstat_hits;
350	kstat_named_t arcstat_misses;
351	kstat_named_t arcstat_demand_data_hits;
352	kstat_named_t arcstat_demand_data_misses;
353	kstat_named_t arcstat_demand_metadata_hits;
354	kstat_named_t arcstat_demand_metadata_misses;
355	kstat_named_t arcstat_prefetch_data_hits;
356	kstat_named_t arcstat_prefetch_data_misses;
357	kstat_named_t arcstat_prefetch_metadata_hits;
358	kstat_named_t arcstat_prefetch_metadata_misses;
359	kstat_named_t arcstat_mru_hits;
360	kstat_named_t arcstat_mru_ghost_hits;
361	kstat_named_t arcstat_mfu_hits;
362	kstat_named_t arcstat_mfu_ghost_hits;
363	kstat_named_t arcstat_allocated;
364	kstat_named_t arcstat_deleted;
365	kstat_named_t arcstat_stolen;
366	kstat_named_t arcstat_recycle_miss;
367	/*
368	 * Number of buffers that could not be evicted because the hash lock
369	 * was held by another thread.  The lock may not necessarily be held
370	 * by something using the same buffer, since hash locks are shared
371	 * by multiple buffers.
372	 */
373	kstat_named_t arcstat_mutex_miss;
374	/*
375	 * Number of buffers skipped because they have I/O in progress, are
376	 * indrect prefetch buffers that have not lived long enough, or are
377	 * not from the spa we're trying to evict from.
378	 */
379	kstat_named_t arcstat_evict_skip;
380	kstat_named_t arcstat_evict_l2_cached;
381	kstat_named_t arcstat_evict_l2_eligible;
382	kstat_named_t arcstat_evict_l2_ineligible;
383	kstat_named_t arcstat_hash_elements;
384	kstat_named_t arcstat_hash_elements_max;
385	kstat_named_t arcstat_hash_collisions;
386	kstat_named_t arcstat_hash_chains;
387	kstat_named_t arcstat_hash_chain_max;
388	kstat_named_t arcstat_p;
389	kstat_named_t arcstat_c;
390	kstat_named_t arcstat_c_min;
391	kstat_named_t arcstat_c_max;
392	kstat_named_t arcstat_size;
393	/*
394	 * Number of bytes consumed by internal ARC structures necessary
395	 * for tracking purposes; these structures are not actually
396	 * backed by ARC buffers. This includes arc_buf_hdr_t structures
397	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
398	 * caches), and arc_buf_t structures (allocated via arc_buf_t
399	 * cache).
400	 */
401	kstat_named_t arcstat_hdr_size;
402	/*
403	 * Number of bytes consumed by ARC buffers of type equal to
404	 * ARC_BUFC_DATA. This is generally consumed by buffers backing
405	 * on disk user data (e.g. plain file contents).
406	 */
407	kstat_named_t arcstat_data_size;
408	/*
409	 * Number of bytes consumed by ARC buffers of type equal to
410	 * ARC_BUFC_METADATA. This is generally consumed by buffers
411	 * backing on disk data that is used for internal ZFS
412	 * structures (e.g. ZAP, dnode, indirect blocks, etc).
413	 */
414	kstat_named_t arcstat_metadata_size;
415	/*
416	 * Number of bytes consumed by various buffers and structures
417	 * not actually backed with ARC buffers. This includes bonus
418	 * buffers (allocated directly via zio_buf_* functions),
419	 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
420	 * cache), and dnode_t structures (allocated via dnode_t cache).
421	 */
422	kstat_named_t arcstat_other_size;
423	/*
424	 * Total number of bytes consumed by ARC buffers residing in the
425	 * arc_anon state. This includes *all* buffers in the arc_anon
426	 * state; e.g. data, metadata, evictable, and unevictable buffers
427	 * are all included in this value.
428	 */
429	kstat_named_t arcstat_anon_size;
430	/*
431	 * Number of bytes consumed by ARC buffers that meet the
432	 * following criteria: backing buffers of type ARC_BUFC_DATA,
433	 * residing in the arc_anon state, and are eligible for eviction
434	 * (e.g. have no outstanding holds on the buffer).
435	 */
436	kstat_named_t arcstat_anon_evictable_data;
437	/*
438	 * Number of bytes consumed by ARC buffers that meet the
439	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
440	 * residing in the arc_anon state, and are eligible for eviction
441	 * (e.g. have no outstanding holds on the buffer).
442	 */
443	kstat_named_t arcstat_anon_evictable_metadata;
444	/*
445	 * Total number of bytes consumed by ARC buffers residing in the
446	 * arc_mru state. This includes *all* buffers in the arc_mru
447	 * state; e.g. data, metadata, evictable, and unevictable buffers
448	 * are all included in this value.
449	 */
450	kstat_named_t arcstat_mru_size;
451	/*
452	 * Number of bytes consumed by ARC buffers that meet the
453	 * following criteria: backing buffers of type ARC_BUFC_DATA,
454	 * residing in the arc_mru state, and are eligible for eviction
455	 * (e.g. have no outstanding holds on the buffer).
456	 */
457	kstat_named_t arcstat_mru_evictable_data;
458	/*
459	 * Number of bytes consumed by ARC buffers that meet the
460	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
461	 * residing in the arc_mru state, and are eligible for eviction
462	 * (e.g. have no outstanding holds on the buffer).
463	 */
464	kstat_named_t arcstat_mru_evictable_metadata;
465	/*
466	 * Total number of bytes that *would have been* consumed by ARC
467	 * buffers in the arc_mru_ghost state. The key thing to note
468	 * here, is the fact that this size doesn't actually indicate
469	 * RAM consumption. The ghost lists only consist of headers and
470	 * don't actually have ARC buffers linked off of these headers.
471	 * Thus, *if* the headers had associated ARC buffers, these
472	 * buffers *would have* consumed this number of bytes.
473	 */
474	kstat_named_t arcstat_mru_ghost_size;
475	/*
476	 * Number of bytes that *would have been* consumed by ARC
477	 * buffers that are eligible for eviction, of type
478	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
479	 */
480	kstat_named_t arcstat_mru_ghost_evictable_data;
481	/*
482	 * Number of bytes that *would have been* consumed by ARC
483	 * buffers that are eligible for eviction, of type
484	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
485	 */
486	kstat_named_t arcstat_mru_ghost_evictable_metadata;
487	/*
488	 * Total number of bytes consumed by ARC buffers residing in the
489	 * arc_mfu state. This includes *all* buffers in the arc_mfu
490	 * state; e.g. data, metadata, evictable, and unevictable buffers
491	 * are all included in this value.
492	 */
493	kstat_named_t arcstat_mfu_size;
494	/*
495	 * Number of bytes consumed by ARC buffers that are eligible for
496	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
497	 * state.
498	 */
499	kstat_named_t arcstat_mfu_evictable_data;
500	/*
501	 * Number of bytes consumed by ARC buffers that are eligible for
502	 * eviction, of type ARC_BUFC_METADATA, and reside in the
503	 * arc_mfu state.
504	 */
505	kstat_named_t arcstat_mfu_evictable_metadata;
506	/*
507	 * Total number of bytes that *would have been* consumed by ARC
508	 * buffers in the arc_mfu_ghost state. See the comment above
509	 * arcstat_mru_ghost_size for more details.
510	 */
511	kstat_named_t arcstat_mfu_ghost_size;
512	/*
513	 * Number of bytes that *would have been* consumed by ARC
514	 * buffers that are eligible for eviction, of type
515	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
516	 */
517	kstat_named_t arcstat_mfu_ghost_evictable_data;
518	/*
519	 * Number of bytes that *would have been* consumed by ARC
520	 * buffers that are eligible for eviction, of type
521	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
522	 */
523	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
524	kstat_named_t arcstat_l2_hits;
525	kstat_named_t arcstat_l2_misses;
526	kstat_named_t arcstat_l2_feeds;
527	kstat_named_t arcstat_l2_rw_clash;
528	kstat_named_t arcstat_l2_read_bytes;
529	kstat_named_t arcstat_l2_write_bytes;
530	kstat_named_t arcstat_l2_writes_sent;
531	kstat_named_t arcstat_l2_writes_done;
532	kstat_named_t arcstat_l2_writes_error;
533	kstat_named_t arcstat_l2_writes_hdr_miss;
534	kstat_named_t arcstat_l2_evict_lock_retry;
535	kstat_named_t arcstat_l2_evict_reading;
536	kstat_named_t arcstat_l2_evict_l1cached;
537	kstat_named_t arcstat_l2_free_on_write;
538	kstat_named_t arcstat_l2_cdata_free_on_write;
539	kstat_named_t arcstat_l2_abort_lowmem;
540	kstat_named_t arcstat_l2_cksum_bad;
541	kstat_named_t arcstat_l2_io_error;
542	kstat_named_t arcstat_l2_size;
543	kstat_named_t arcstat_l2_asize;
544	kstat_named_t arcstat_l2_hdr_size;
545	kstat_named_t arcstat_l2_compress_successes;
546	kstat_named_t arcstat_l2_compress_zeros;
547	kstat_named_t arcstat_l2_compress_failures;
548	kstat_named_t arcstat_l2_write_trylock_fail;
549	kstat_named_t arcstat_l2_write_passed_headroom;
550	kstat_named_t arcstat_l2_write_spa_mismatch;
551	kstat_named_t arcstat_l2_write_in_l2;
552	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
553	kstat_named_t arcstat_l2_write_not_cacheable;
554	kstat_named_t arcstat_l2_write_full;
555	kstat_named_t arcstat_l2_write_buffer_iter;
556	kstat_named_t arcstat_l2_write_pios;
557	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
558	kstat_named_t arcstat_l2_write_buffer_list_iter;
559	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
560	kstat_named_t arcstat_memory_throttle_count;
561	kstat_named_t arcstat_duplicate_buffers;
562	kstat_named_t arcstat_duplicate_buffers_size;
563	kstat_named_t arcstat_duplicate_reads;
564	kstat_named_t arcstat_meta_used;
565	kstat_named_t arcstat_meta_limit;
566	kstat_named_t arcstat_meta_max;
567	kstat_named_t arcstat_meta_min;
568} arc_stats_t;
569
570static arc_stats_t arc_stats = {
571	{ "hits",			KSTAT_DATA_UINT64 },
572	{ "misses",			KSTAT_DATA_UINT64 },
573	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
574	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
575	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
576	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
577	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
578	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
579	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
580	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
581	{ "mru_hits",			KSTAT_DATA_UINT64 },
582	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
583	{ "mfu_hits",			KSTAT_DATA_UINT64 },
584	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
585	{ "allocated",			KSTAT_DATA_UINT64 },
586	{ "deleted",			KSTAT_DATA_UINT64 },
587	{ "stolen",			KSTAT_DATA_UINT64 },
588	{ "recycle_miss",		KSTAT_DATA_UINT64 },
589	{ "mutex_miss",			KSTAT_DATA_UINT64 },
590	{ "evict_skip",			KSTAT_DATA_UINT64 },
591	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
592	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
593	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
594	{ "hash_elements",		KSTAT_DATA_UINT64 },
595	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
596	{ "hash_collisions",		KSTAT_DATA_UINT64 },
597	{ "hash_chains",		KSTAT_DATA_UINT64 },
598	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
599	{ "p",				KSTAT_DATA_UINT64 },
600	{ "c",				KSTAT_DATA_UINT64 },
601	{ "c_min",			KSTAT_DATA_UINT64 },
602	{ "c_max",			KSTAT_DATA_UINT64 },
603	{ "size",			KSTAT_DATA_UINT64 },
604	{ "hdr_size",			KSTAT_DATA_UINT64 },
605	{ "data_size",			KSTAT_DATA_UINT64 },
606	{ "metadata_size",		KSTAT_DATA_UINT64 },
607	{ "other_size",			KSTAT_DATA_UINT64 },
608	{ "anon_size",			KSTAT_DATA_UINT64 },
609	{ "anon_evictable_data",	KSTAT_DATA_UINT64 },
610	{ "anon_evictable_metadata",	KSTAT_DATA_UINT64 },
611	{ "mru_size",			KSTAT_DATA_UINT64 },
612	{ "mru_evictable_data",		KSTAT_DATA_UINT64 },
613	{ "mru_evictable_metadata",	KSTAT_DATA_UINT64 },
614	{ "mru_ghost_size",		KSTAT_DATA_UINT64 },
615	{ "mru_ghost_evictable_data",	KSTAT_DATA_UINT64 },
616	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
617	{ "mfu_size",			KSTAT_DATA_UINT64 },
618	{ "mfu_evictable_data",		KSTAT_DATA_UINT64 },
619	{ "mfu_evictable_metadata",	KSTAT_DATA_UINT64 },
620	{ "mfu_ghost_size",		KSTAT_DATA_UINT64 },
621	{ "mfu_ghost_evictable_data",	KSTAT_DATA_UINT64 },
622	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
623	{ "l2_hits",			KSTAT_DATA_UINT64 },
624	{ "l2_misses",			KSTAT_DATA_UINT64 },
625	{ "l2_feeds",			KSTAT_DATA_UINT64 },
626	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
627	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
628	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
629	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
630	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
631	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
632	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
633	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
634	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
635	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
636	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
637	{ "l2_cdata_free_on_write",	KSTAT_DATA_UINT64 },
638	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
639	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
640	{ "l2_io_error",		KSTAT_DATA_UINT64 },
641	{ "l2_size",			KSTAT_DATA_UINT64 },
642	{ "l2_asize",			KSTAT_DATA_UINT64 },
643	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
644	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
645	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
646	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
647	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
648	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
649	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
650	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
651	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
652	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
653	{ "l2_write_full",		KSTAT_DATA_UINT64 },
654	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
655	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
656	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
657	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
658	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
659	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
660	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
661	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
662	{ "duplicate_reads",		KSTAT_DATA_UINT64 },
663	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
664	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
665	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
666	{ "arc_meta_min",		KSTAT_DATA_UINT64 }
667};
668
669#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
670
671#define	ARCSTAT_INCR(stat, val) \
672	atomic_add_64(&arc_stats.stat.value.ui64, (val))
673
674#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
675#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
676
677#define	ARCSTAT_MAX(stat, val) {					\
678	uint64_t m;							\
679	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
680	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
681		continue;						\
682}
683
684#define	ARCSTAT_MAXSTAT(stat) \
685	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
686
687/*
688 * We define a macro to allow ARC hits/misses to be easily broken down by
689 * two separate conditions, giving a total of four different subtypes for
690 * each of hits and misses (so eight statistics total).
691 */
692#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
693	if (cond1) {							\
694		if (cond2) {						\
695			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
696		} else {						\
697			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
698		}							\
699	} else {							\
700		if (cond2) {						\
701			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
702		} else {						\
703			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
704		}							\
705	}
706
707kstat_t			*arc_ksp;
708static arc_state_t	*arc_anon;
709static arc_state_t	*arc_mru;
710static arc_state_t	*arc_mru_ghost;
711static arc_state_t	*arc_mfu;
712static arc_state_t	*arc_mfu_ghost;
713static arc_state_t	*arc_l2c_only;
714
715/*
716 * There are several ARC variables that are critical to export as kstats --
717 * but we don't want to have to grovel around in the kstat whenever we wish to
718 * manipulate them.  For these variables, we therefore define them to be in
719 * terms of the statistic variable.  This assures that we are not introducing
720 * the possibility of inconsistency by having shadow copies of the variables,
721 * while still allowing the code to be readable.
722 */
723#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
724#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
725#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
726#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
727#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
728#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
729#define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
730#define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
731#define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */
732
733#define	L2ARC_IS_VALID_COMPRESS(_c_) \
734	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
735
736static int		arc_no_grow;	/* Don't try to grow cache size */
737static uint64_t		arc_tempreserve;
738static uint64_t		arc_loaned_bytes;
739
740typedef struct arc_callback arc_callback_t;
741
742struct arc_callback {
743	void			*acb_private;
744	arc_done_func_t		*acb_done;
745	arc_buf_t		*acb_buf;
746	zio_t			*acb_zio_dummy;
747	arc_callback_t		*acb_next;
748};
749
750typedef struct arc_write_callback arc_write_callback_t;
751
752struct arc_write_callback {
753	void		*awcb_private;
754	arc_done_func_t	*awcb_ready;
755	arc_done_func_t	*awcb_physdone;
756	arc_done_func_t	*awcb_done;
757	arc_buf_t	*awcb_buf;
758};
759
760/*
761 * ARC buffers are separated into multiple structs as a memory saving measure:
762 *   - Common fields struct, always defined, and embedded within it:
763 *       - L2-only fields, always allocated but undefined when not in L2ARC
764 *       - L1-only fields, only allocated when in L1ARC
765 *
766 *           Buffer in L1                     Buffer only in L2
767 *    +------------------------+          +------------------------+
768 *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
769 *    |                        |          |                        |
770 *    |                        |          |                        |
771 *    |                        |          |                        |
772 *    +------------------------+          +------------------------+
773 *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
774 *    | (undefined if L1-only) |          |                        |
775 *    +------------------------+          +------------------------+
776 *    | l1arc_buf_hdr_t        |
777 *    |                        |
778 *    |                        |
779 *    |                        |
780 *    |                        |
781 *    +------------------------+
782 *
783 * Because it's possible for the L2ARC to become extremely large, we can wind
784 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
785 * is minimized by only allocating the fields necessary for an L1-cached buffer
786 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
787 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
788 * words in pointers. arc_hdr_realloc() is used to switch a header between
789 * these two allocation states.
790 */
791typedef struct l1arc_buf_hdr {
792	kmutex_t		b_freeze_lock;
793#ifdef ZFS_DEBUG
794	/*
795	 * used for debugging wtih kmem_flags - by allocating and freeing
796	 * b_thawed when the buffer is thawed, we get a record of the stack
797	 * trace that thawed it.
798	 */
799	void			*b_thawed;
800#endif
801
802	arc_buf_t		*b_buf;
803	uint32_t		b_datacnt;
804	/* for waiting on writes to complete */
805	kcondvar_t		b_cv;
806
807	/* protected by arc state mutex */
808	arc_state_t		*b_state;
809	list_node_t		b_arc_node;
810
811	/* updated atomically */
812	clock_t			b_arc_access;
813
814	/* self protecting */
815	refcount_t		b_refcnt;
816
817	arc_callback_t		*b_acb;
818	/* temporary buffer holder for in-flight compressed data */
819	void			*b_tmp_cdata;
820} l1arc_buf_hdr_t;
821
822typedef struct l2arc_dev l2arc_dev_t;
823
824typedef struct l2arc_buf_hdr {
825	/* protected by arc_buf_hdr mutex */
826	l2arc_dev_t		*b_dev;		/* L2ARC device */
827	uint64_t		b_daddr;	/* disk address, offset byte */
828	/* real alloc'd buffer size depending on b_compress applied */
829	int32_t			b_asize;
830
831	list_node_t		b_l2node;
832} l2arc_buf_hdr_t;
833
834struct arc_buf_hdr {
835	/* protected by hash lock */
836	dva_t			b_dva;
837	uint64_t		b_birth;
838	/*
839	 * Even though this checksum is only set/verified when a buffer is in
840	 * the L1 cache, it needs to be in the set of common fields because it
841	 * must be preserved from the time before a buffer is written out to
842	 * L2ARC until after it is read back in.
843	 */
844	zio_cksum_t		*b_freeze_cksum;
845
846	arc_buf_hdr_t		*b_hash_next;
847	arc_flags_t		b_flags;
848
849	/* immutable */
850	int32_t			b_size;
851	uint64_t		b_spa;
852
853	/* L2ARC fields. Undefined when not in L2ARC. */
854	l2arc_buf_hdr_t		b_l2hdr;
855	/* L1ARC fields. Undefined when in l2arc_only state */
856	l1arc_buf_hdr_t		b_l1hdr;
857};
858
859#ifdef _KERNEL
860static int
861sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
862{
863	uint64_t val;
864	int err;
865
866	val = arc_meta_limit;
867	err = sysctl_handle_64(oidp, &val, 0, req);
868	if (err != 0 || req->newptr == NULL)
869		return (err);
870
871        if (val <= 0 || val > arc_c_max)
872		return (EINVAL);
873
874	arc_meta_limit = val;
875	return (0);
876}
877#endif
878
879static arc_buf_t *arc_eviction_list;
880static kmutex_t arc_eviction_mtx;
881static arc_buf_hdr_t arc_eviction_hdr;
882
883#define	GHOST_STATE(state)	\
884	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
885	(state) == arc_l2c_only)
886
887#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
888#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
889#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
890#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
891#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
892#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
893
894#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
895#define	HDR_L2COMPRESS(hdr)	((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
896#define	HDR_L2_READING(hdr)	\
897	    (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
898	    ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
899#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
900#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
901#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
902
903#define	HDR_ISTYPE_METADATA(hdr)	\
904	    ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
905#define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))
906
907#define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
908#define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
909
910/* For storing compression mode in b_flags */
911#define	HDR_COMPRESS_OFFSET	24
912#define	HDR_COMPRESS_NBITS	7
913
914#define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET(hdr->b_flags, \
915	    HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS))
916#define	HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \
917	    HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp))
918
919/*
920 * Other sizes
921 */
922
923#define	HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
924#define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
925
926/*
927 * Hash table routines
928 */
929
930#define	HT_LOCK_PAD	CACHE_LINE_SIZE
931
932struct ht_lock {
933	kmutex_t	ht_lock;
934#ifdef _KERNEL
935	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
936#endif
937};
938
939#define	BUF_LOCKS 256
940typedef struct buf_hash_table {
941	uint64_t ht_mask;
942	arc_buf_hdr_t **ht_table;
943	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
944} buf_hash_table_t;
945
946static buf_hash_table_t buf_hash_table;
947
948#define	BUF_HASH_INDEX(spa, dva, birth) \
949	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
950#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
951#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
952#define	HDR_LOCK(hdr) \
953	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
954
955uint64_t zfs_crc64_table[256];
956
957/*
958 * Level 2 ARC
959 */
960
961#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
962#define	L2ARC_HEADROOM		2			/* num of writes */
963/*
964 * If we discover during ARC scan any buffers to be compressed, we boost
965 * our headroom for the next scanning cycle by this percentage multiple.
966 */
967#define	L2ARC_HEADROOM_BOOST	200
968#define	L2ARC_FEED_SECS		1		/* caching interval secs */
969#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
970
971/*
972 * Used to distinguish headers that are being process by
973 * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk
974 * address. This can happen when the header is added to the l2arc's list
975 * of buffers to write in the first stage of l2arc_write_buffers(), but
976 * has not yet been written out which happens in the second stage of
977 * l2arc_write_buffers().
978 */
979#define	L2ARC_ADDR_UNSET	((uint64_t)(-1))
980
981#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
982#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
983
984/* L2ARC Performance Tunables */
985uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
986uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
987uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
988uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
989uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
990uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
991boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
992boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
993boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
994
995SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
996    &l2arc_write_max, 0, "max write size");
997SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
998    &l2arc_write_boost, 0, "extra write during warmup");
999SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
1000    &l2arc_headroom, 0, "number of dev writes");
1001SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
1002    &l2arc_feed_secs, 0, "interval seconds");
1003SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
1004    &l2arc_feed_min_ms, 0, "min interval milliseconds");
1005
1006SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
1007    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
1008SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
1009    &l2arc_feed_again, 0, "turbo warmup");
1010SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
1011    &l2arc_norw, 0, "no reads during writes");
1012
1013SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
1014    &ARC_anon.arcs_size, 0, "size of anonymous state");
1015SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
1016    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
1017SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
1018    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");
1019
1020SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
1021    &ARC_mru.arcs_size, 0, "size of mru state");
1022SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
1023    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
1024SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
1025    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
1026
1027SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
1028    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
1029SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
1030    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
1031    "size of metadata in mru ghost state");
1032SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
1033    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
1034    "size of data in mru ghost state");
1035
1036SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
1037    &ARC_mfu.arcs_size, 0, "size of mfu state");
1038SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
1039    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
1040SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
1041    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
1042
1043SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
1044    &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
1045SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
1046    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
1047    "size of metadata in mfu ghost state");
1048SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
1049    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
1050    "size of data in mfu ghost state");
1051
1052SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
1053    &ARC_l2c_only.arcs_size, 0, "size of mru state");
1054
1055/*
1056 * L2ARC Internals
1057 */
1058struct l2arc_dev {
1059	vdev_t			*l2ad_vdev;	/* vdev */
1060	spa_t			*l2ad_spa;	/* spa */
1061	uint64_t		l2ad_hand;	/* next write location */
1062	uint64_t		l2ad_start;	/* first addr on device */
1063	uint64_t		l2ad_end;	/* last addr on device */
1064	boolean_t		l2ad_first;	/* first sweep through */
1065	boolean_t		l2ad_writing;	/* currently writing */
1066	kmutex_t		l2ad_mtx;	/* lock for buffer list */
1067	list_t			l2ad_buflist;	/* buffer list */
1068	list_node_t		l2ad_node;	/* device list node */
1069	refcount_t		l2ad_alloc;	/* allocated bytes */
1070};
1071
1072static list_t L2ARC_dev_list;			/* device list */
1073static list_t *l2arc_dev_list;			/* device list pointer */
1074static kmutex_t l2arc_dev_mtx;			/* device list mutex */
1075static l2arc_dev_t *l2arc_dev_last;		/* last device used */
1076static list_t L2ARC_free_on_write;		/* free after write buf list */
1077static list_t *l2arc_free_on_write;		/* free after write list ptr */
1078static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
1079static uint64_t l2arc_ndev;			/* number of devices */
1080
1081typedef struct l2arc_read_callback {
1082	arc_buf_t		*l2rcb_buf;		/* read buffer */
1083	spa_t			*l2rcb_spa;		/* spa */
1084	blkptr_t		l2rcb_bp;		/* original blkptr */
1085	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
1086	int			l2rcb_flags;		/* original flags */
1087	enum zio_compress	l2rcb_compress;		/* applied compress */
1088} l2arc_read_callback_t;
1089
1090typedef struct l2arc_write_callback {
1091	l2arc_dev_t	*l2wcb_dev;		/* device info */
1092	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
1093} l2arc_write_callback_t;
1094
1095typedef struct l2arc_data_free {
1096	/* protected by l2arc_free_on_write_mtx */
1097	void		*l2df_data;
1098	size_t		l2df_size;
1099	void		(*l2df_func)(void *, size_t);
1100	list_node_t	l2df_list_node;
1101} l2arc_data_free_t;
1102
1103static kmutex_t l2arc_feed_thr_lock;
1104static kcondvar_t l2arc_feed_thr_cv;
1105static uint8_t l2arc_thread_exit;
1106
1107static void arc_get_data_buf(arc_buf_t *);
1108static void arc_access(arc_buf_hdr_t *, kmutex_t *);
1109static int arc_evict_needed(arc_buf_contents_t);
1110static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t);
1111static void arc_buf_watch(arc_buf_t *);
1112
1113static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
1114static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
1115
1116static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
1117static void l2arc_read_done(zio_t *);
1118
1119static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
1120static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
1121static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
1122
1123static uint64_t
1124buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
1125{
1126	uint8_t *vdva = (uint8_t *)dva;
1127	uint64_t crc = -1ULL;
1128	int i;
1129
1130	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
1131
1132	for (i = 0; i < sizeof (dva_t); i++)
1133		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
1134
1135	crc ^= (spa>>8) ^ birth;
1136
1137	return (crc);
1138}
1139
1140#define	BUF_EMPTY(buf)						\
1141	((buf)->b_dva.dva_word[0] == 0 &&			\
1142	(buf)->b_dva.dva_word[1] == 0)
1143
1144#define	BUF_EQUAL(spa, dva, birth, buf)				\
1145	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
1146	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
1147	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
1148
1149static void
1150buf_discard_identity(arc_buf_hdr_t *hdr)
1151{
1152	hdr->b_dva.dva_word[0] = 0;
1153	hdr->b_dva.dva_word[1] = 0;
1154	hdr->b_birth = 0;
1155}
1156
1157static arc_buf_hdr_t *
1158buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
1159{
1160	const dva_t *dva = BP_IDENTITY(bp);
1161	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
1162	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
1163	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1164	arc_buf_hdr_t *hdr;
1165
1166	mutex_enter(hash_lock);
1167	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
1168	    hdr = hdr->b_hash_next) {
1169		if (BUF_EQUAL(spa, dva, birth, hdr)) {
1170			*lockp = hash_lock;
1171			return (hdr);
1172		}
1173	}
1174	mutex_exit(hash_lock);
1175	*lockp = NULL;
1176	return (NULL);
1177}
1178
1179/*
1180 * Insert an entry into the hash table.  If there is already an element
1181 * equal to elem in the hash table, then the already existing element
1182 * will be returned and the new element will not be inserted.
1183 * Otherwise returns NULL.
1184 * If lockp == NULL, the caller is assumed to already hold the hash lock.
1185 */
1186static arc_buf_hdr_t *
1187buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
1188{
1189	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1190	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1191	arc_buf_hdr_t *fhdr;
1192	uint32_t i;
1193
1194	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
1195	ASSERT(hdr->b_birth != 0);
1196	ASSERT(!HDR_IN_HASH_TABLE(hdr));
1197
1198	if (lockp != NULL) {
1199		*lockp = hash_lock;
1200		mutex_enter(hash_lock);
1201	} else {
1202		ASSERT(MUTEX_HELD(hash_lock));
1203	}
1204
1205	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
1206	    fhdr = fhdr->b_hash_next, i++) {
1207		if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
1208			return (fhdr);
1209	}
1210
1211	hdr->b_hash_next = buf_hash_table.ht_table[idx];
1212	buf_hash_table.ht_table[idx] = hdr;
1213	hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
1214
1215	/* collect some hash table performance data */
1216	if (i > 0) {
1217		ARCSTAT_BUMP(arcstat_hash_collisions);
1218		if (i == 1)
1219			ARCSTAT_BUMP(arcstat_hash_chains);
1220
1221		ARCSTAT_MAX(arcstat_hash_chain_max, i);
1222	}
1223
1224	ARCSTAT_BUMP(arcstat_hash_elements);
1225	ARCSTAT_MAXSTAT(arcstat_hash_elements);
1226
1227	return (NULL);
1228}
1229
1230static void
1231buf_hash_remove(arc_buf_hdr_t *hdr)
1232{
1233	arc_buf_hdr_t *fhdr, **hdrp;
1234	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1235
1236	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
1237	ASSERT(HDR_IN_HASH_TABLE(hdr));
1238
1239	hdrp = &buf_hash_table.ht_table[idx];
1240	while ((fhdr = *hdrp) != hdr) {
1241		ASSERT(fhdr != NULL);
1242		hdrp = &fhdr->b_hash_next;
1243	}
1244	*hdrp = hdr->b_hash_next;
1245	hdr->b_hash_next = NULL;
1246	hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;
1247
1248	/* collect some hash table performance data */
1249	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1250
1251	if (buf_hash_table.ht_table[idx] &&
1252	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1253		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1254}
1255
1256/*
1257 * Global data structures and functions for the buf kmem cache.
1258 */
1259static kmem_cache_t *hdr_full_cache;
1260static kmem_cache_t *hdr_l2only_cache;
1261static kmem_cache_t *buf_cache;
1262
1263static void
1264buf_fini(void)
1265{
1266	int i;
1267
1268	kmem_free(buf_hash_table.ht_table,
1269	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
1270	for (i = 0; i < BUF_LOCKS; i++)
1271		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1272	kmem_cache_destroy(hdr_full_cache);
1273	kmem_cache_destroy(hdr_l2only_cache);
1274	kmem_cache_destroy(buf_cache);
1275}
1276
1277/*
1278 * Constructor callback - called when the cache is empty
1279 * and a new buf is requested.
1280 */
1281/* ARGSUSED */
1282static int
1283hdr_full_cons(void *vbuf, void *unused, int kmflag)
1284{
1285	arc_buf_hdr_t *hdr = vbuf;
1286
1287	bzero(hdr, HDR_FULL_SIZE);
1288	cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
1289	refcount_create(&hdr->b_l1hdr.b_refcnt);
1290	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1291	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1292
1293	return (0);
1294}
1295
1296/* ARGSUSED */
1297static int
1298hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
1299{
1300	arc_buf_hdr_t *hdr = vbuf;
1301
1302	bzero(hdr, HDR_L2ONLY_SIZE);
1303	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1304
1305	return (0);
1306}
1307
1308/* ARGSUSED */
1309static int
1310buf_cons(void *vbuf, void *unused, int kmflag)
1311{
1312	arc_buf_t *buf = vbuf;
1313
1314	bzero(buf, sizeof (arc_buf_t));
1315	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1316	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1317
1318	return (0);
1319}
1320
1321/*
1322 * Destructor callback - called when a cached buf is
1323 * no longer required.
1324 */
1325/* ARGSUSED */
1326static void
1327hdr_full_dest(void *vbuf, void *unused)
1328{
1329	arc_buf_hdr_t *hdr = vbuf;
1330
1331	ASSERT(BUF_EMPTY(hdr));
1332	cv_destroy(&hdr->b_l1hdr.b_cv);
1333	refcount_destroy(&hdr->b_l1hdr.b_refcnt);
1334	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
1335	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1336}
1337
1338/* ARGSUSED */
1339static void
1340hdr_l2only_dest(void *vbuf, void *unused)
1341{
1342	arc_buf_hdr_t *hdr = vbuf;
1343
1344	ASSERT(BUF_EMPTY(hdr));
1345	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1346}
1347
1348/* ARGSUSED */
1349static void
1350buf_dest(void *vbuf, void *unused)
1351{
1352	arc_buf_t *buf = vbuf;
1353
1354	mutex_destroy(&buf->b_evict_lock);
1355	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1356}
1357
1358/*
1359 * Reclaim callback -- invoked when memory is low.
1360 */
1361/* ARGSUSED */
1362static void
1363hdr_recl(void *unused)
1364{
1365	dprintf("hdr_recl called\n");
1366	/*
1367	 * umem calls the reclaim func when we destroy the buf cache,
1368	 * which is after we do arc_fini().
1369	 */
1370	if (!arc_dead)
1371		cv_signal(&arc_reclaim_thr_cv);
1372}
1373
1374static void
1375buf_init(void)
1376{
1377	uint64_t *ct;
1378	uint64_t hsize = 1ULL << 12;
1379	int i, j;
1380
1381	/*
1382	 * The hash table is big enough to fill all of physical memory
1383	 * with an average block size of zfs_arc_average_blocksize (default 8K).
1384	 * By default, the table will take up
1385	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1386	 */
1387	while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
1388		hsize <<= 1;
1389retry:
1390	buf_hash_table.ht_mask = hsize - 1;
1391	buf_hash_table.ht_table =
1392	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1393	if (buf_hash_table.ht_table == NULL) {
1394		ASSERT(hsize > (1ULL << 8));
1395		hsize >>= 1;
1396		goto retry;
1397	}
1398
1399	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
1400	    0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
1401	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
1402	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
1403	    NULL, NULL, 0);
1404	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1405	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1406
1407	for (i = 0; i < 256; i++)
1408		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1409			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1410
1411	for (i = 0; i < BUF_LOCKS; i++) {
1412		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1413		    NULL, MUTEX_DEFAULT, NULL);
1414	}
1415}
1416
1417/*
1418 * Transition between the two allocation states for the arc_buf_hdr struct.
1419 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
1420 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
1421 * version is used when a cache buffer is only in the L2ARC in order to reduce
1422 * memory usage.
1423 */
1424static arc_buf_hdr_t *
1425arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
1426{
1427	ASSERT(HDR_HAS_L2HDR(hdr));
1428
1429	arc_buf_hdr_t *nhdr;
1430	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
1431
1432	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
1433	    (old == hdr_l2only_cache && new == hdr_full_cache));
1434
1435	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
1436
1437	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
1438	buf_hash_remove(hdr);
1439
1440	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
1441
1442	if (new == hdr_full_cache) {
1443		nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
1444		/*
1445		 * arc_access and arc_change_state need to be aware that a
1446		 * header has just come out of L2ARC, so we set its state to
1447		 * l2c_only even though it's about to change.
1448		 */
1449		nhdr->b_l1hdr.b_state = arc_l2c_only;
1450	} else {
1451		ASSERT(hdr->b_l1hdr.b_buf == NULL);
1452		ASSERT0(hdr->b_l1hdr.b_datacnt);
1453		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
1454		/*
1455		 * We might be removing the L1hdr of a buffer which was just
1456		 * written out to L2ARC. If such a buffer is compressed then we
1457		 * need to free its b_tmp_cdata before destroying the header.
1458		 */
1459		if (hdr->b_l1hdr.b_tmp_cdata != NULL &&
1460		    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
1461			l2arc_release_cdata_buf(hdr);
1462		nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
1463	}
1464	/*
1465	 * The header has been reallocated so we need to re-insert it into any
1466	 * lists it was on.
1467	 */
1468	(void) buf_hash_insert(nhdr, NULL);
1469
1470	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
1471
1472	mutex_enter(&dev->l2ad_mtx);
1473
1474	/*
1475	 * We must place the realloc'ed header back into the list at
1476	 * the same spot. Otherwise, if it's placed earlier in the list,
1477	 * l2arc_write_buffers() could find it during the function's
1478	 * write phase, and try to write it out to the l2arc.
1479	 */
1480	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
1481	list_remove(&dev->l2ad_buflist, hdr);
1482
1483	mutex_exit(&dev->l2ad_mtx);
1484
1485	/*
1486	 * Since we're using the pointer address as the tag when
1487	 * incrementing and decrementing the l2ad_alloc refcount, we
1488	 * must remove the old pointer (that we're about to destroy) and
1489	 * add the new pointer to the refcount. Otherwise we'd remove
1490	 * the wrong pointer address when calling arc_hdr_destroy() later.
1491	 */
1492
1493	(void) refcount_remove_many(&dev->l2ad_alloc,
1494	    hdr->b_l2hdr.b_asize, hdr);
1495
1496	(void) refcount_add_many(&dev->l2ad_alloc,
1497	    nhdr->b_l2hdr.b_asize, nhdr);
1498
1499	buf_discard_identity(hdr);
1500	hdr->b_freeze_cksum = NULL;
1501	kmem_cache_free(old, hdr);
1502
1503	return (nhdr);
1504}
1505
1506
1507#define	ARC_MINTIME	(hz>>4) /* 62 ms */
1508
1509static void
1510arc_cksum_verify(arc_buf_t *buf)
1511{
1512	zio_cksum_t zc;
1513
1514	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1515		return;
1516
1517	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1518	if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
1519		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1520		return;
1521	}
1522	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1523	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1524		panic("buffer modified while frozen!");
1525	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1526}
1527
1528static int
1529arc_cksum_equal(arc_buf_t *buf)
1530{
1531	zio_cksum_t zc;
1532	int equal;
1533
1534	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1535	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1536	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1537	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1538
1539	return (equal);
1540}
1541
1542static void
1543arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1544{
1545	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1546		return;
1547
1548	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1549	if (buf->b_hdr->b_freeze_cksum != NULL) {
1550		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1551		return;
1552	}
1553	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1554	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1555	    buf->b_hdr->b_freeze_cksum);
1556	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1557#ifdef illumos
1558	arc_buf_watch(buf);
1559#endif
1560}
1561
1562#ifdef illumos
1563#ifndef _KERNEL
1564typedef struct procctl {
1565	long cmd;
1566	prwatch_t prwatch;
1567} procctl_t;
1568#endif
1569
1570/* ARGSUSED */
1571static void
1572arc_buf_unwatch(arc_buf_t *buf)
1573{
1574#ifndef _KERNEL
1575	if (arc_watch) {
1576		int result;
1577		procctl_t ctl;
1578		ctl.cmd = PCWATCH;
1579		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1580		ctl.prwatch.pr_size = 0;
1581		ctl.prwatch.pr_wflags = 0;
1582		result = write(arc_procfd, &ctl, sizeof (ctl));
1583		ASSERT3U(result, ==, sizeof (ctl));
1584	}
1585#endif
1586}
1587
1588/* ARGSUSED */
1589static void
1590arc_buf_watch(arc_buf_t *buf)
1591{
1592#ifndef _KERNEL
1593	if (arc_watch) {
1594		int result;
1595		procctl_t ctl;
1596		ctl.cmd = PCWATCH;
1597		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1598		ctl.prwatch.pr_size = buf->b_hdr->b_size;
1599		ctl.prwatch.pr_wflags = WA_WRITE;
1600		result = write(arc_procfd, &ctl, sizeof (ctl));
1601		ASSERT3U(result, ==, sizeof (ctl));
1602	}
1603#endif
1604}
1605#endif /* illumos */
1606
1607static arc_buf_contents_t
1608arc_buf_type(arc_buf_hdr_t *hdr)
1609{
1610	if (HDR_ISTYPE_METADATA(hdr)) {
1611		return (ARC_BUFC_METADATA);
1612	} else {
1613		return (ARC_BUFC_DATA);
1614	}
1615}
1616
1617static uint32_t
1618arc_bufc_to_flags(arc_buf_contents_t type)
1619{
1620	switch (type) {
1621	case ARC_BUFC_DATA:
1622		/* metadata field is 0 if buffer contains normal data */
1623		return (0);
1624	case ARC_BUFC_METADATA:
1625		return (ARC_FLAG_BUFC_METADATA);
1626	default:
1627		break;
1628	}
1629	panic("undefined ARC buffer type!");
1630	return ((uint32_t)-1);
1631}
1632
1633void
1634arc_buf_thaw(arc_buf_t *buf)
1635{
1636	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1637		if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
1638			panic("modifying non-anon buffer!");
1639		if (HDR_IO_IN_PROGRESS(buf->b_hdr))
1640			panic("modifying buffer while i/o in progress!");
1641		arc_cksum_verify(buf);
1642	}
1643
1644	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1645	if (buf->b_hdr->b_freeze_cksum != NULL) {
1646		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1647		buf->b_hdr->b_freeze_cksum = NULL;
1648	}
1649
1650#ifdef ZFS_DEBUG
1651	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1652		if (buf->b_hdr->b_l1hdr.b_thawed != NULL)
1653			kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1);
1654		buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
1655	}
1656#endif
1657
1658	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1659
1660#ifdef illumos
1661	arc_buf_unwatch(buf);
1662#endif
1663}
1664
1665void
1666arc_buf_freeze(arc_buf_t *buf)
1667{
1668	kmutex_t *hash_lock;
1669
1670	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1671		return;
1672
1673	hash_lock = HDR_LOCK(buf->b_hdr);
1674	mutex_enter(hash_lock);
1675
1676	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1677	    buf->b_hdr->b_l1hdr.b_state == arc_anon);
1678	arc_cksum_compute(buf, B_FALSE);
1679	mutex_exit(hash_lock);
1680
1681}
1682
1683static void
1684get_buf_info(arc_buf_hdr_t *hdr, arc_state_t *state, list_t **list, kmutex_t **lock)
1685{
1686	uint64_t buf_hashid = buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1687
1688	if (arc_buf_type(hdr) == ARC_BUFC_METADATA)
1689		buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
1690	else {
1691		buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
1692		buf_hashid += ARC_BUFC_NUMMETADATALISTS;
1693	}
1694
1695	*list = &state->arcs_lists[buf_hashid];
1696	*lock = ARCS_LOCK(state, buf_hashid);
1697}
1698
1699
1700static void
1701add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1702{
1703	ASSERT(HDR_HAS_L1HDR(hdr));
1704	ASSERT(MUTEX_HELD(hash_lock));
1705	arc_state_t *state = hdr->b_l1hdr.b_state;
1706
1707	if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
1708	    (state != arc_anon)) {
1709		/* We don't use the L2-only state list. */
1710		if (state != arc_l2c_only) {
1711			uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
1712			uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
1713			list_t *list;
1714			kmutex_t *lock;
1715
1716			get_buf_info(hdr, state, &list, &lock);
1717			ASSERT(!MUTEX_HELD(lock));
1718			mutex_enter(lock);
1719			ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
1720			list_remove(list, hdr);
1721			if (GHOST_STATE(state)) {
1722				ASSERT0(hdr->b_l1hdr.b_datacnt);
1723				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
1724				delta = hdr->b_size;
1725			}
1726			ASSERT(delta > 0);
1727			ASSERT3U(*size, >=, delta);
1728			atomic_add_64(size, -delta);
1729			mutex_exit(lock);
1730		}
1731		/* remove the prefetch flag if we get a reference */
1732		hdr->b_flags &= ~ARC_FLAG_PREFETCH;
1733	}
1734}
1735
1736static int
1737remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1738{
1739	int cnt;
1740	arc_state_t *state = hdr->b_l1hdr.b_state;
1741
1742	ASSERT(HDR_HAS_L1HDR(hdr));
1743	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1744	ASSERT(!GHOST_STATE(state));
1745
1746	/*
1747	 * arc_l2c_only counts as a ghost state so we don't need to explicitly
1748	 * check to prevent usage of the arc_l2c_only list.
1749	 */
1750	if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
1751	    (state != arc_anon)) {
1752		uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
1753		list_t *list;
1754		kmutex_t *lock;
1755
1756		get_buf_info(hdr, state, &list, &lock);
1757		ASSERT(!MUTEX_HELD(lock));
1758		mutex_enter(lock);
1759		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
1760		list_insert_head(list, hdr);
1761		ASSERT(hdr->b_l1hdr.b_datacnt > 0);
1762		atomic_add_64(size, hdr->b_size *
1763		    hdr->b_l1hdr.b_datacnt);
1764		mutex_exit(lock);
1765	}
1766	return (cnt);
1767}
1768
1769/*
1770 * Move the supplied buffer to the indicated state.  The mutex
1771 * for the buffer must be held by the caller.
1772 */
1773static void
1774arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
1775    kmutex_t *hash_lock)
1776{
1777	arc_state_t *old_state;
1778	int64_t refcnt;
1779	uint32_t datacnt;
1780	uint64_t from_delta, to_delta;
1781	arc_buf_contents_t buftype = arc_buf_type(hdr);
1782	list_t *list;
1783	kmutex_t *lock;
1784
1785	/*
1786	 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
1787	 * in arc_read() when bringing a buffer out of the L2ARC.  However, the
1788	 * L1 hdr doesn't always exist when we change state to arc_anon before
1789	 * destroying a header, in which case reallocating to add the L1 hdr is
1790	 * pointless.
1791	 */
1792	if (HDR_HAS_L1HDR(hdr)) {
1793		old_state = hdr->b_l1hdr.b_state;
1794		refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
1795		datacnt = hdr->b_l1hdr.b_datacnt;
1796	} else {
1797		old_state = arc_l2c_only;
1798		refcnt = 0;
1799		datacnt = 0;
1800	}
1801
1802	ASSERT(MUTEX_HELD(hash_lock));
1803	ASSERT3P(new_state, !=, old_state);
1804	ASSERT(refcnt == 0 || datacnt > 0);
1805	ASSERT(!GHOST_STATE(new_state) || datacnt == 0);
1806	ASSERT(old_state != arc_anon || datacnt <= 1);
1807
1808	from_delta = to_delta = datacnt * hdr->b_size;
1809
1810	/*
1811	 * If this buffer is evictable, transfer it from the
1812	 * old state list to the new state list.
1813	 */
1814	if (refcnt == 0) {
1815		if (old_state != arc_anon && old_state != arc_l2c_only) {
1816			int use_mutex;
1817			uint64_t *size = &old_state->arcs_lsize[buftype];
1818
1819			get_buf_info(hdr, old_state, &list, &lock);
1820			use_mutex = !MUTEX_HELD(lock);
1821			if (use_mutex)
1822				mutex_enter(lock);
1823
1824			ASSERT(HDR_HAS_L1HDR(hdr));
1825			ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
1826			list_remove(list, hdr);
1827
1828			/*
1829			 * If prefetching out of the ghost cache,
1830			 * we will have a non-zero datacnt.
1831			 */
1832			if (GHOST_STATE(old_state) && datacnt == 0) {
1833				/* ghost elements have a ghost size */
1834				ASSERT(hdr->b_l1hdr.b_buf == NULL);
1835				from_delta = hdr->b_size;
1836			}
1837			ASSERT3U(*size, >=, from_delta);
1838			atomic_add_64(size, -from_delta);
1839
1840			if (use_mutex)
1841				mutex_exit(lock);
1842		}
1843		if (new_state != arc_anon && new_state != arc_l2c_only) {
1844			int use_mutex;
1845			uint64_t *size = &new_state->arcs_lsize[buftype];
1846
1847			/*
1848			 * An L1 header always exists here, since if we're
1849			 * moving to some L1-cached state (i.e. not l2c_only or
1850			 * anonymous), we realloc the header to add an L1hdr
1851			 * beforehand.
1852			 */
1853			ASSERT(HDR_HAS_L1HDR(hdr));
1854			get_buf_info(hdr, new_state, &list, &lock);
1855			use_mutex = !MUTEX_HELD(lock);
1856			if (use_mutex)
1857				mutex_enter(lock);
1858
1859			list_insert_head(list, hdr);
1860
1861			/* ghost elements have a ghost size */
1862			if (GHOST_STATE(new_state)) {
1863				ASSERT(datacnt == 0);
1864				ASSERT(hdr->b_l1hdr.b_buf == NULL);
1865				to_delta = hdr->b_size;
1866			}
1867			atomic_add_64(size, to_delta);
1868
1869			if (use_mutex)
1870				mutex_exit(lock);
1871		}
1872	}
1873
1874	ASSERT(!BUF_EMPTY(hdr));
1875	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
1876		buf_hash_remove(hdr);
1877
1878	/* adjust state sizes (ignore arc_l2c_only) */
1879	if (to_delta && new_state != arc_l2c_only)
1880		atomic_add_64(&new_state->arcs_size, to_delta);
1881	if (from_delta && old_state != arc_l2c_only) {
1882		ASSERT3U(old_state->arcs_size, >=, from_delta);
1883		atomic_add_64(&old_state->arcs_size, -from_delta);
1884	}
1885	if (HDR_HAS_L1HDR(hdr))
1886		hdr->b_l1hdr.b_state = new_state;
1887
1888	/*
1889	 * L2 headers should never be on the L2 state list since they don't
1890	 * have L1 headers allocated.
1891	 */
1892#ifdef illumos
1893	ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
1894	    list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
1895#endif
1896}
1897
1898void
1899arc_space_consume(uint64_t space, arc_space_type_t type)
1900{
1901	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1902
1903	switch (type) {
1904	case ARC_SPACE_DATA:
1905		ARCSTAT_INCR(arcstat_data_size, space);
1906		break;
1907	case ARC_SPACE_META:
1908		ARCSTAT_INCR(arcstat_metadata_size, space);
1909		break;
1910	case ARC_SPACE_OTHER:
1911		ARCSTAT_INCR(arcstat_other_size, space);
1912		break;
1913	case ARC_SPACE_HDRS:
1914		ARCSTAT_INCR(arcstat_hdr_size, space);
1915		break;
1916	case ARC_SPACE_L2HDRS:
1917		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1918		break;
1919	}
1920
1921	if (type != ARC_SPACE_DATA)
1922		ARCSTAT_INCR(arcstat_meta_used, space);
1923
1924	atomic_add_64(&arc_size, space);
1925}
1926
1927void
1928arc_space_return(uint64_t space, arc_space_type_t type)
1929{
1930	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1931
1932	switch (type) {
1933	case ARC_SPACE_DATA:
1934		ARCSTAT_INCR(arcstat_data_size, -space);
1935		break;
1936	case ARC_SPACE_META:
1937		ARCSTAT_INCR(arcstat_metadata_size, -space);
1938		break;
1939	case ARC_SPACE_OTHER:
1940		ARCSTAT_INCR(arcstat_other_size, -space);
1941		break;
1942	case ARC_SPACE_HDRS:
1943		ARCSTAT_INCR(arcstat_hdr_size, -space);
1944		break;
1945	case ARC_SPACE_L2HDRS:
1946		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1947		break;
1948	}
1949
1950	if (type != ARC_SPACE_DATA) {
1951		ASSERT(arc_meta_used >= space);
1952		if (arc_meta_max < arc_meta_used)
1953			arc_meta_max = arc_meta_used;
1954		ARCSTAT_INCR(arcstat_meta_used, -space);
1955	}
1956
1957	ASSERT(arc_size >= space);
1958	atomic_add_64(&arc_size, -space);
1959}
1960
1961arc_buf_t *
1962arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type)
1963{
1964	arc_buf_hdr_t *hdr;
1965	arc_buf_t *buf;
1966
1967	ASSERT3U(size, >, 0);
1968	hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
1969	ASSERT(BUF_EMPTY(hdr));
1970	ASSERT3P(hdr->b_freeze_cksum, ==, NULL);
1971	hdr->b_size = size;
1972	hdr->b_spa = spa_load_guid(spa);
1973
1974	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1975	buf->b_hdr = hdr;
1976	buf->b_data = NULL;
1977	buf->b_efunc = NULL;
1978	buf->b_private = NULL;
1979	buf->b_next = NULL;
1980
1981	hdr->b_flags = arc_bufc_to_flags(type);
1982	hdr->b_flags |= ARC_FLAG_HAS_L1HDR;
1983
1984	hdr->b_l1hdr.b_buf = buf;
1985	hdr->b_l1hdr.b_state = arc_anon;
1986	hdr->b_l1hdr.b_arc_access = 0;
1987	hdr->b_l1hdr.b_datacnt = 1;
1988
1989	arc_get_data_buf(buf);
1990	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
1991	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
1992
1993	return (buf);
1994}
1995
1996static char *arc_onloan_tag = "onloan";
1997
1998/*
1999 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
2000 * flight data by arc_tempreserve_space() until they are "returned". Loaned
2001 * buffers must be returned to the arc before they can be used by the DMU or
2002 * freed.
2003 */
2004arc_buf_t *
2005arc_loan_buf(spa_t *spa, int size)
2006{
2007	arc_buf_t *buf;
2008
2009	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
2010
2011	atomic_add_64(&arc_loaned_bytes, size);
2012	return (buf);
2013}
2014
2015/*
2016 * Return a loaned arc buffer to the arc.
2017 */
2018void
2019arc_return_buf(arc_buf_t *buf, void *tag)
2020{
2021	arc_buf_hdr_t *hdr = buf->b_hdr;
2022
2023	ASSERT(buf->b_data != NULL);
2024	ASSERT(HDR_HAS_L1HDR(hdr));
2025	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
2026	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2027
2028	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
2029}
2030
2031/* Detach an arc_buf from a dbuf (tag) */
2032void
2033arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
2034{
2035	arc_buf_hdr_t *hdr = buf->b_hdr;
2036
2037	ASSERT(buf->b_data != NULL);
2038	ASSERT(HDR_HAS_L1HDR(hdr));
2039	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2040	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
2041	buf->b_efunc = NULL;
2042	buf->b_private = NULL;
2043
2044	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
2045}
2046
2047static arc_buf_t *
2048arc_buf_clone(arc_buf_t *from)
2049{
2050	arc_buf_t *buf;
2051	arc_buf_hdr_t *hdr = from->b_hdr;
2052	uint64_t size = hdr->b_size;
2053
2054	ASSERT(HDR_HAS_L1HDR(hdr));
2055	ASSERT(hdr->b_l1hdr.b_state != arc_anon);
2056
2057	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2058	buf->b_hdr = hdr;
2059	buf->b_data = NULL;
2060	buf->b_efunc = NULL;
2061	buf->b_private = NULL;
2062	buf->b_next = hdr->b_l1hdr.b_buf;
2063	hdr->b_l1hdr.b_buf = buf;
2064	arc_get_data_buf(buf);
2065	bcopy(from->b_data, buf->b_data, size);
2066
2067	/*
2068	 * This buffer already exists in the arc so create a duplicate
2069	 * copy for the caller.  If the buffer is associated with user data
2070	 * then track the size and number of duplicates.  These stats will be
2071	 * updated as duplicate buffers are created and destroyed.
2072	 */
2073	if (HDR_ISTYPE_DATA(hdr)) {
2074		ARCSTAT_BUMP(arcstat_duplicate_buffers);
2075		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
2076	}
2077	hdr->b_l1hdr.b_datacnt += 1;
2078	return (buf);
2079}
2080
2081void
2082arc_buf_add_ref(arc_buf_t *buf, void* tag)
2083{
2084	arc_buf_hdr_t *hdr;
2085	kmutex_t *hash_lock;
2086
2087	/*
2088	 * Check to see if this buffer is evicted.  Callers
2089	 * must verify b_data != NULL to know if the add_ref
2090	 * was successful.
2091	 */
2092	mutex_enter(&buf->b_evict_lock);
2093	if (buf->b_data == NULL) {
2094		mutex_exit(&buf->b_evict_lock);
2095		return;
2096	}
2097	hash_lock = HDR_LOCK(buf->b_hdr);
2098	mutex_enter(hash_lock);
2099	hdr = buf->b_hdr;
2100	ASSERT(HDR_HAS_L1HDR(hdr));
2101	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2102	mutex_exit(&buf->b_evict_lock);
2103
2104	ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
2105	    hdr->b_l1hdr.b_state == arc_mfu);
2106
2107	add_reference(hdr, hash_lock, tag);
2108	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2109	arc_access(hdr, hash_lock);
2110	mutex_exit(hash_lock);
2111	ARCSTAT_BUMP(arcstat_hits);
2112	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
2113	    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
2114	    data, metadata, hits);
2115}
2116
2117static void
2118arc_buf_free_on_write(void *data, size_t size,
2119    void (*free_func)(void *, size_t))
2120{
2121	l2arc_data_free_t *df;
2122
2123	df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
2124	df->l2df_data = data;
2125	df->l2df_size = size;
2126	df->l2df_func = free_func;
2127	mutex_enter(&l2arc_free_on_write_mtx);
2128	list_insert_head(l2arc_free_on_write, df);
2129	mutex_exit(&l2arc_free_on_write_mtx);
2130}
2131
2132/*
2133 * Free the arc data buffer.  If it is an l2arc write in progress,
2134 * the buffer is placed on l2arc_free_on_write to be freed later.
2135 */
2136static void
2137arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
2138{
2139	arc_buf_hdr_t *hdr = buf->b_hdr;
2140
2141	if (HDR_L2_WRITING(hdr)) {
2142		arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
2143		ARCSTAT_BUMP(arcstat_l2_free_on_write);
2144	} else {
2145		free_func(buf->b_data, hdr->b_size);
2146	}
2147}
2148
2149/*
2150 * Free up buf->b_data and if 'remove' is set, then pull the
2151 * arc_buf_t off of the the arc_buf_hdr_t's list and free it.
2152 */
2153static void
2154arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
2155{
2156	ASSERT(HDR_HAS_L2HDR(hdr));
2157	ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));
2158
2159	/*
2160	 * The b_tmp_cdata field is linked off of the b_l1hdr, so if
2161	 * that doesn't exist, the header is in the arc_l2c_only state,
2162	 * and there isn't anything to free (it's already been freed).
2163	 */
2164	if (!HDR_HAS_L1HDR(hdr))
2165		return;
2166
2167	if (hdr->b_l1hdr.b_tmp_cdata == NULL)
2168		return;
2169
2170	ASSERT(HDR_L2_WRITING(hdr));
2171	arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size,
2172	    zio_data_buf_free);
2173
2174	ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
2175	hdr->b_l1hdr.b_tmp_cdata = NULL;
2176}
2177
2178static void
2179arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
2180{
2181	arc_buf_t **bufp;
2182
2183	/* free up data associated with the buf */
2184	if (buf->b_data != NULL) {
2185		arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
2186		uint64_t size = buf->b_hdr->b_size;
2187		arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
2188
2189		arc_cksum_verify(buf);
2190#ifdef illumos
2191		arc_buf_unwatch(buf);
2192#endif
2193
2194		if (!recycle) {
2195			if (type == ARC_BUFC_METADATA) {
2196				arc_buf_data_free(buf, zio_buf_free);
2197				arc_space_return(size, ARC_SPACE_META);
2198			} else {
2199				ASSERT(type == ARC_BUFC_DATA);
2200				arc_buf_data_free(buf, zio_data_buf_free);
2201				arc_space_return(size, ARC_SPACE_DATA);
2202			}
2203		}
2204		if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
2205			uint64_t *cnt = &state->arcs_lsize[type];
2206
2207			ASSERT(refcount_is_zero(
2208			    &buf->b_hdr->b_l1hdr.b_refcnt));
2209			ASSERT(state != arc_anon && state != arc_l2c_only);
2210
2211			ASSERT3U(*cnt, >=, size);
2212			atomic_add_64(cnt, -size);
2213		}
2214		ASSERT3U(state->arcs_size, >=, size);
2215		atomic_add_64(&state->arcs_size, -size);
2216		buf->b_data = NULL;
2217
2218		/*
2219		 * If we're destroying a duplicate buffer make sure
2220		 * that the appropriate statistics are updated.
2221		 */
2222		if (buf->b_hdr->b_l1hdr.b_datacnt > 1 &&
2223		    HDR_ISTYPE_DATA(buf->b_hdr)) {
2224			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
2225			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
2226		}
2227		ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0);
2228		buf->b_hdr->b_l1hdr.b_datacnt -= 1;
2229	}
2230
2231	/* only remove the buf if requested */
2232	if (!remove)
2233		return;
2234
2235	/* remove the buf from the hdr list */
2236	for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf;
2237	    bufp = &(*bufp)->b_next)
2238		continue;
2239	*bufp = buf->b_next;
2240	buf->b_next = NULL;
2241
2242	ASSERT(buf->b_efunc == NULL);
2243
2244	/* clean up the buf */
2245	buf->b_hdr = NULL;
2246	kmem_cache_free(buf_cache, buf);
2247}
2248
2249static void
2250arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
2251{
2252	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
2253	l2arc_dev_t *dev = l2hdr->b_dev;
2254
2255	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
2256	ASSERT(HDR_HAS_L2HDR(hdr));
2257
2258	list_remove(&dev->l2ad_buflist, hdr);
2259
2260	/*
2261	 * We don't want to leak the b_tmp_cdata buffer that was
2262	 * allocated in l2arc_write_buffers()
2263	 */
2264	arc_buf_l2_cdata_free(hdr);
2265
2266	/*
2267	 * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then
2268	 * this header is being processed by l2arc_write_buffers() (i.e.
2269	 * it's in the first stage of l2arc_write_buffers()).
2270	 * Re-affirming that truth here, just to serve as a reminder. If
2271	 * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or
2272	 * may not have its HDR_L2_WRITING flag set. (the write may have
2273	 * completed, in which case HDR_L2_WRITING will be false and the
2274	 * b_daddr field will point to the address of the buffer on disk).
2275	 */
2276	IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr));
2277
2278	/*
2279	 * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with
2280	 * l2arc_write_buffers(). Since we've just removed this header
2281	 * from the l2arc buffer list, this header will never reach the
2282	 * second stage of l2arc_write_buffers(), which increments the
2283	 * accounting stats for this header. Thus, we must be careful
2284	 * not to decrement them for this header either.
2285	 */
2286	if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) {
2287		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
2288		ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
2289
2290		vdev_space_update(dev->l2ad_vdev,
2291		    -l2hdr->b_asize, 0, 0);
2292
2293		(void) refcount_remove_many(&dev->l2ad_alloc,
2294		    l2hdr->b_asize, hdr);
2295	}
2296
2297	hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
2298}
2299
2300static void
2301arc_hdr_destroy(arc_buf_hdr_t *hdr)
2302{
2303	if (HDR_HAS_L1HDR(hdr)) {
2304		ASSERT(hdr->b_l1hdr.b_buf == NULL ||
2305		    hdr->b_l1hdr.b_datacnt > 0);
2306		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2307		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
2308	}
2309	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2310	ASSERT(!HDR_IN_HASH_TABLE(hdr));
2311
2312	if (HDR_HAS_L2HDR(hdr)) {
2313		l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
2314		boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
2315
2316		if (!buflist_held)
2317			mutex_enter(&dev->l2ad_mtx);
2318
2319		/*
2320		 * Even though we checked this conditional above, we
2321		 * need to check this again now that we have the
2322		 * l2ad_mtx. This is because we could be racing with
2323		 * another thread calling l2arc_evict() which might have
2324		 * destroyed this header's L2 portion as we were waiting
2325		 * to acquire the l2ad_mtx. If that happens, we don't
2326		 * want to re-destroy the header's L2 portion.
2327		 */
2328		if (HDR_HAS_L2HDR(hdr)) {
2329			trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr,
2330			    hdr->b_l2hdr.b_asize, 0);
2331			arc_hdr_l2hdr_destroy(hdr);
2332		}
2333
2334		if (!buflist_held)
2335			mutex_exit(&dev->l2ad_mtx);
2336	}
2337
2338	if (!BUF_EMPTY(hdr))
2339		buf_discard_identity(hdr);
2340	if (hdr->b_freeze_cksum != NULL) {
2341		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
2342		hdr->b_freeze_cksum = NULL;
2343	}
2344
2345	if (HDR_HAS_L1HDR(hdr)) {
2346		while (hdr->b_l1hdr.b_buf) {
2347			arc_buf_t *buf = hdr->b_l1hdr.b_buf;
2348
2349			if (buf->b_efunc != NULL) {
2350				mutex_enter(&arc_eviction_mtx);
2351				mutex_enter(&buf->b_evict_lock);
2352				ASSERT(buf->b_hdr != NULL);
2353				arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
2354				    FALSE);
2355				hdr->b_l1hdr.b_buf = buf->b_next;
2356				buf->b_hdr = &arc_eviction_hdr;
2357				buf->b_next = arc_eviction_list;
2358				arc_eviction_list = buf;
2359				mutex_exit(&buf->b_evict_lock);
2360				mutex_exit(&arc_eviction_mtx);
2361			} else {
2362				arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
2363				    TRUE);
2364			}
2365		}
2366#ifdef ZFS_DEBUG
2367		if (hdr->b_l1hdr.b_thawed != NULL) {
2368			kmem_free(hdr->b_l1hdr.b_thawed, 1);
2369			hdr->b_l1hdr.b_thawed = NULL;
2370		}
2371#endif
2372	}
2373
2374	ASSERT3P(hdr->b_hash_next, ==, NULL);
2375	if (HDR_HAS_L1HDR(hdr)) {
2376		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
2377		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
2378		kmem_cache_free(hdr_full_cache, hdr);
2379	} else {
2380		kmem_cache_free(hdr_l2only_cache, hdr);
2381	}
2382}
2383
2384void
2385arc_buf_free(arc_buf_t *buf, void *tag)
2386{
2387	arc_buf_hdr_t *hdr = buf->b_hdr;
2388	int hashed = hdr->b_l1hdr.b_state != arc_anon;
2389
2390	ASSERT(buf->b_efunc == NULL);
2391	ASSERT(buf->b_data != NULL);
2392
2393	if (hashed) {
2394		kmutex_t *hash_lock = HDR_LOCK(hdr);
2395
2396		mutex_enter(hash_lock);
2397		hdr = buf->b_hdr;
2398		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2399
2400		(void) remove_reference(hdr, hash_lock, tag);
2401		if (hdr->b_l1hdr.b_datacnt > 1) {
2402			arc_buf_destroy(buf, FALSE, TRUE);
2403		} else {
2404			ASSERT(buf == hdr->b_l1hdr.b_buf);
2405			ASSERT(buf->b_efunc == NULL);
2406			hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
2407		}
2408		mutex_exit(hash_lock);
2409	} else if (HDR_IO_IN_PROGRESS(hdr)) {
2410		int destroy_hdr;
2411		/*
2412		 * We are in the middle of an async write.  Don't destroy
2413		 * this buffer unless the write completes before we finish
2414		 * decrementing the reference count.
2415		 */
2416		mutex_enter(&arc_eviction_mtx);
2417		(void) remove_reference(hdr, NULL, tag);
2418		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2419		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
2420		mutex_exit(&arc_eviction_mtx);
2421		if (destroy_hdr)
2422			arc_hdr_destroy(hdr);
2423	} else {
2424		if (remove_reference(hdr, NULL, tag) > 0)
2425			arc_buf_destroy(buf, FALSE, TRUE);
2426		else
2427			arc_hdr_destroy(hdr);
2428	}
2429}
2430
2431boolean_t
2432arc_buf_remove_ref(arc_buf_t *buf, void* tag)
2433{
2434	arc_buf_hdr_t *hdr = buf->b_hdr;
2435	kmutex_t *hash_lock = HDR_LOCK(hdr);
2436	boolean_t no_callback = (buf->b_efunc == NULL);
2437
2438	if (hdr->b_l1hdr.b_state == arc_anon) {
2439		ASSERT(hdr->b_l1hdr.b_datacnt == 1);
2440		arc_buf_free(buf, tag);
2441		return (no_callback);
2442	}
2443
2444	mutex_enter(hash_lock);
2445	hdr = buf->b_hdr;
2446	ASSERT(hdr->b_l1hdr.b_datacnt > 0);
2447	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2448	ASSERT(hdr->b_l1hdr.b_state != arc_anon);
2449	ASSERT(buf->b_data != NULL);
2450
2451	(void) remove_reference(hdr, hash_lock, tag);
2452	if (hdr->b_l1hdr.b_datacnt > 1) {
2453		if (no_callback)
2454			arc_buf_destroy(buf, FALSE, TRUE);
2455	} else if (no_callback) {
2456		ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
2457		ASSERT(buf->b_efunc == NULL);
2458		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
2459	}
2460	ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 ||
2461	    refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2462	mutex_exit(hash_lock);
2463	return (no_callback);
2464}
2465
2466int32_t
2467arc_buf_size(arc_buf_t *buf)
2468{
2469	return (buf->b_hdr->b_size);
2470}
2471
2472/*
2473 * Called from the DMU to determine if the current buffer should be
2474 * evicted. In order to ensure proper locking, the eviction must be initiated
2475 * from the DMU. Return true if the buffer is associated with user data and
2476 * duplicate buffers still exist.
2477 */
2478boolean_t
2479arc_buf_eviction_needed(arc_buf_t *buf)
2480{
2481	arc_buf_hdr_t *hdr;
2482	boolean_t evict_needed = B_FALSE;
2483
2484	if (zfs_disable_dup_eviction)
2485		return (B_FALSE);
2486
2487	mutex_enter(&buf->b_evict_lock);
2488	hdr = buf->b_hdr;
2489	if (hdr == NULL) {
2490		/*
2491		 * We are in arc_do_user_evicts(); let that function
2492		 * perform the eviction.
2493		 */
2494		ASSERT(buf->b_data == NULL);
2495		mutex_exit(&buf->b_evict_lock);
2496		return (B_FALSE);
2497	} else if (buf->b_data == NULL) {
2498		/*
2499		 * We have already been added to the arc eviction list;
2500		 * recommend eviction.
2501		 */
2502		ASSERT3P(hdr, ==, &arc_eviction_hdr);
2503		mutex_exit(&buf->b_evict_lock);
2504		return (B_TRUE);
2505	}
2506
2507	if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
2508		evict_needed = B_TRUE;
2509
2510	mutex_exit(&buf->b_evict_lock);
2511	return (evict_needed);
2512}
2513
2514/*
2515 * Evict buffers from list until we've removed the specified number of
2516 * bytes.  Move the removed buffers to the appropriate evict state.
2517 * If the recycle flag is set, then attempt to "recycle" a buffer:
2518 * - look for a buffer to evict that is `bytes' long.
2519 * - return the data block from this buffer rather than freeing it.
2520 * This flag is used by callers that are trying to make space for a
2521 * new buffer in a full arc cache.
2522 *
2523 * This function makes a "best effort".  It skips over any buffers
2524 * it can't get a hash_lock on, and so may not catch all candidates.
2525 * It may also return without evicting as much space as requested.
2526 */
2527static void *
2528arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
2529    arc_buf_contents_t type)
2530{
2531	arc_state_t *evicted_state;
2532	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
2533	int64_t bytes_remaining;
2534	arc_buf_hdr_t *hdr, *hdr_prev = NULL;
2535	list_t *evicted_list, *list, *evicted_list_start, *list_start;
2536	kmutex_t *lock, *evicted_lock;
2537	kmutex_t *hash_lock;
2538	boolean_t have_lock;
2539	void *stolen = NULL;
2540	arc_buf_hdr_t marker = { 0 };
2541	int count = 0;
2542	static int evict_metadata_offset, evict_data_offset;
2543	int i, idx, offset, list_count, lists;
2544
2545	ASSERT(state == arc_mru || state == arc_mfu);
2546
2547	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2548
2549	/*
2550	 * Decide which "type" (data vs metadata) to recycle from.
2551	 *
2552	 * If we are over the metadata limit, recycle from metadata.
2553	 * If we are under the metadata minimum, recycle from data.
2554	 * Otherwise, recycle from whichever type has the oldest (least
2555	 * recently accessed) header.  This is not yet implemented.
2556	 */
2557	if (recycle) {
2558		arc_buf_contents_t realtype;
2559		if (state->arcs_lsize[ARC_BUFC_DATA] == 0) {
2560			realtype = ARC_BUFC_METADATA;
2561		} else if (state->arcs_lsize[ARC_BUFC_METADATA] == 0) {
2562			realtype = ARC_BUFC_DATA;
2563		} else if (arc_meta_used >= arc_meta_limit) {
2564			realtype = ARC_BUFC_METADATA;
2565		} else if (arc_meta_used <= arc_meta_min) {
2566			realtype = ARC_BUFC_DATA;
2567#ifdef illumos
2568		} else if (HDR_HAS_L1HDR(data_hdr) &&
2569		    HDR_HAS_L1HDR(metadata_hdr) &&
2570		    data_hdr->b_l1hdr.b_arc_access <
2571		    metadata_hdr->b_l1hdr.b_arc_access) {
2572			realtype = ARC_BUFC_DATA;
2573		} else {
2574			realtype = ARC_BUFC_METADATA;
2575#else
2576		} else {
2577			/* TODO */
2578			realtype = type;
2579#endif
2580		}
2581		if (realtype != type) {
2582			/*
2583			 * If we want to evict from a different list,
2584			 * we can not recycle, because DATA vs METADATA
2585			 * buffers are segregated into different kmem
2586			 * caches (and vmem arenas).
2587			 */
2588			type = realtype;
2589			recycle = B_FALSE;
2590		}
2591	}
2592
2593	if (type == ARC_BUFC_METADATA) {
2594		offset = 0;
2595		list_count = ARC_BUFC_NUMMETADATALISTS;
2596		list_start = &state->arcs_lists[0];
2597		evicted_list_start = &evicted_state->arcs_lists[0];
2598		idx = evict_metadata_offset;
2599	} else {
2600		offset = ARC_BUFC_NUMMETADATALISTS;
2601		list_start = &state->arcs_lists[offset];
2602		evicted_list_start = &evicted_state->arcs_lists[offset];
2603		list_count = ARC_BUFC_NUMDATALISTS;
2604		idx = evict_data_offset;
2605	}
2606	bytes_remaining = evicted_state->arcs_lsize[type];
2607	lists = 0;
2608
2609evict_start:
2610	list = &list_start[idx];
2611	evicted_list = &evicted_list_start[idx];
2612	lock = ARCS_LOCK(state, (offset + idx));
2613	evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
2614
2615	/*
2616	 * The ghost list lock must be acquired first in order to prevent
2617	 * a 3 party deadlock:
2618	 *
2619	 *  - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by
2620	 *    l2ad_mtx in arc_hdr_realloc
2621	 *  - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx
2622	 *  - arc_evict acquires arc_*_ghost->arcs_mtx, followed by
2623	 *    arc_*_ghost->arcs_mtx and forms a deadlock cycle.
2624	 *
2625	 * This situation is avoided by acquiring the ghost list lock first.
2626	 */
2627	mutex_enter(evicted_lock);
2628	mutex_enter(lock);
2629
2630	for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
2631		hdr_prev = list_prev(list, hdr);
2632		if (HDR_HAS_L1HDR(hdr)) {
2633			bytes_remaining -=
2634			    (hdr->b_size * hdr->b_l1hdr.b_datacnt);
2635		}
2636		/* prefetch buffers have a minimum lifespan */
2637		if (HDR_IO_IN_PROGRESS(hdr) ||
2638		    (spa && hdr->b_spa != spa) ||
2639		    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
2640		    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
2641		    arc_min_prefetch_lifespan)) {
2642			skipped++;
2643			continue;
2644		}
2645		/* "lookahead" for better eviction candidate */
2646		if (recycle && hdr->b_size != bytes &&
2647		    hdr_prev && hdr_prev->b_size == bytes)
2648			continue;
2649
2650		/* ignore markers */
2651		if (hdr->b_spa == 0)
2652			continue;
2653
2654		/*
2655		 * It may take a long time to evict all the bufs requested.
2656		 * To avoid blocking all arc activity, periodically drop
2657		 * the arcs_mtx and give other threads a chance to run
2658		 * before reacquiring the lock.
2659		 *
2660		 * If we are looking for a buffer to recycle, we are in
2661		 * the hot code path, so don't sleep.
2662		 */
2663		if (!recycle && count++ > arc_evict_iterations) {
2664			list_insert_after(list, hdr, &marker);
2665			mutex_exit(lock);
2666			mutex_exit(evicted_lock);
2667			kpreempt(KPREEMPT_SYNC);
2668			mutex_enter(evicted_lock);
2669			mutex_enter(lock);
2670			hdr_prev = list_prev(list, &marker);
2671			list_remove(list, &marker);
2672			count = 0;
2673			continue;
2674		}
2675
2676		hash_lock = HDR_LOCK(hdr);
2677		have_lock = MUTEX_HELD(hash_lock);
2678		if (have_lock || mutex_tryenter(hash_lock)) {
2679			ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
2680			ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
2681			while (hdr->b_l1hdr.b_buf) {
2682				arc_buf_t *buf = hdr->b_l1hdr.b_buf;
2683				if (!mutex_tryenter(&buf->b_evict_lock)) {
2684					missed += 1;
2685					break;
2686				}
2687				if (buf->b_data != NULL) {
2688					bytes_evicted += hdr->b_size;
2689					if (recycle &&
2690					    arc_buf_type(hdr) == type &&
2691					    hdr->b_size == bytes &&
2692					    !HDR_L2_WRITING(hdr)) {
2693						stolen = buf->b_data;
2694						recycle = FALSE;
2695					}
2696				}
2697				if (buf->b_efunc != NULL) {
2698					mutex_enter(&arc_eviction_mtx);
2699					arc_buf_destroy(buf,
2700					    buf->b_data == stolen, FALSE);
2701					hdr->b_l1hdr.b_buf = buf->b_next;
2702					buf->b_hdr = &arc_eviction_hdr;
2703					buf->b_next = arc_eviction_list;
2704					arc_eviction_list = buf;
2705					mutex_exit(&arc_eviction_mtx);
2706					mutex_exit(&buf->b_evict_lock);
2707				} else {
2708					mutex_exit(&buf->b_evict_lock);
2709					arc_buf_destroy(buf,
2710					    buf->b_data == stolen, TRUE);
2711				}
2712			}
2713
2714			if (HDR_HAS_L2HDR(hdr)) {
2715				ARCSTAT_INCR(arcstat_evict_l2_cached,
2716				    hdr->b_size);
2717			} else {
2718				if (l2arc_write_eligible(hdr->b_spa, hdr)) {
2719					ARCSTAT_INCR(arcstat_evict_l2_eligible,
2720					    hdr->b_size);
2721				} else {
2722					ARCSTAT_INCR(
2723					    arcstat_evict_l2_ineligible,
2724					    hdr->b_size);
2725				}
2726			}
2727
2728			if (hdr->b_l1hdr.b_datacnt == 0) {
2729				arc_change_state(evicted_state, hdr, hash_lock);
2730				ASSERT(HDR_IN_HASH_TABLE(hdr));
2731				hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
2732				hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
2733				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
2734			}
2735			if (!have_lock)
2736				mutex_exit(hash_lock);
2737			if (bytes >= 0 && bytes_evicted >= bytes)
2738				break;
2739			if (bytes_remaining > 0) {
2740				mutex_exit(evicted_lock);
2741				mutex_exit(lock);
2742				idx  = ((idx + 1) & (list_count - 1));
2743				lists++;
2744				goto evict_start;
2745			}
2746		} else {
2747			missed += 1;
2748		}
2749	}
2750
2751	mutex_exit(lock);
2752	mutex_exit(evicted_lock);
2753
2754	idx  = ((idx + 1) & (list_count - 1));
2755	lists++;
2756
2757	if (bytes_evicted < bytes) {
2758		if (lists < list_count)
2759			goto evict_start;
2760		else
2761			dprintf("only evicted %lld bytes from %x",
2762			    (longlong_t)bytes_evicted, state);
2763	}
2764	if (type == ARC_BUFC_METADATA)
2765		evict_metadata_offset = idx;
2766	else
2767		evict_data_offset = idx;
2768
2769	if (skipped)
2770		ARCSTAT_INCR(arcstat_evict_skip, skipped);
2771
2772	if (missed)
2773		ARCSTAT_INCR(arcstat_mutex_miss, missed);
2774
2775	/*
2776	 * Note: we have just evicted some data into the ghost state,
2777	 * potentially putting the ghost size over the desired size.  Rather
2778	 * that evicting from the ghost list in this hot code path, leave
2779	 * this chore to the arc_reclaim_thread().
2780	 */
2781
2782	if (stolen)
2783		ARCSTAT_BUMP(arcstat_stolen);
2784	return (stolen);
2785}
2786
2787/*
2788 * Remove buffers from list until we've removed the specified number of
2789 * bytes.  Destroy the buffers that are removed.
2790 */
2791static void
2792arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
2793{
2794	arc_buf_hdr_t *hdr, *hdr_prev;
2795	arc_buf_hdr_t marker = { 0 };
2796	list_t *list, *list_start;
2797	kmutex_t *hash_lock, *lock;
2798	uint64_t bytes_deleted = 0;
2799	uint64_t bufs_skipped = 0;
2800	int count = 0;
2801	static int evict_offset;
2802	int list_count, idx = evict_offset;
2803	int offset, lists = 0;
2804
2805	ASSERT(GHOST_STATE(state));
2806
2807	/*
2808	 * data lists come after metadata lists
2809	 */
2810	list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
2811	list_count = ARC_BUFC_NUMDATALISTS;
2812	offset = ARC_BUFC_NUMMETADATALISTS;
2813
2814evict_start:
2815	list = &list_start[idx];
2816	lock = ARCS_LOCK(state, idx + offset);
2817
2818	mutex_enter(lock);
2819	for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
2820		hdr_prev = list_prev(list, hdr);
2821		if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES)
2822			panic("invalid hdr=%p", (void *)hdr);
2823		if (spa && hdr->b_spa != spa)
2824			continue;
2825
2826		/* ignore markers */
2827		if (hdr->b_spa == 0)
2828			continue;
2829
2830		hash_lock = HDR_LOCK(hdr);
2831		/* caller may be trying to modify this buffer, skip it */
2832		if (MUTEX_HELD(hash_lock))
2833			continue;
2834
2835		/*
2836		 * It may take a long time to evict all the bufs requested.
2837		 * To avoid blocking all arc activity, periodically drop
2838		 * the arcs_mtx and give other threads a chance to run
2839		 * before reacquiring the lock.
2840		 */
2841		if (count++ > arc_evict_iterations) {
2842			list_insert_after(list, hdr, &marker);
2843			mutex_exit(lock);
2844			kpreempt(KPREEMPT_SYNC);
2845			mutex_enter(lock);
2846			hdr_prev = list_prev(list, &marker);
2847			list_remove(list, &marker);
2848			count = 0;
2849			continue;
2850		}
2851		if (mutex_tryenter(hash_lock)) {
2852			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2853			ASSERT(!HDR_HAS_L1HDR(hdr) ||
2854			    hdr->b_l1hdr.b_buf == NULL);
2855			ARCSTAT_BUMP(arcstat_deleted);
2856			bytes_deleted += hdr->b_size;
2857
2858			if (HDR_HAS_L2HDR(hdr)) {
2859				/*
2860				 * This buffer is cached on the 2nd Level ARC;
2861				 * don't destroy the header.
2862				 */
2863				arc_change_state(arc_l2c_only, hdr, hash_lock);
2864				/*
2865				 * dropping from L1+L2 cached to L2-only,
2866				 * realloc to remove the L1 header.
2867				 */
2868				hdr = arc_hdr_realloc(hdr, hdr_full_cache,
2869				    hdr_l2only_cache);
2870				mutex_exit(hash_lock);
2871			} else {
2872				arc_change_state(arc_anon, hdr, hash_lock);
2873				mutex_exit(hash_lock);
2874				arc_hdr_destroy(hdr);
2875			}
2876
2877			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
2878			if (bytes >= 0 && bytes_deleted >= bytes)
2879				break;
2880		} else if (bytes < 0) {
2881			/*
2882			 * Insert a list marker and then wait for the
2883			 * hash lock to become available. Once its
2884			 * available, restart from where we left off.
2885			 */
2886			list_insert_after(list, hdr, &marker);
2887			mutex_exit(lock);
2888			mutex_enter(hash_lock);
2889			mutex_exit(hash_lock);
2890			mutex_enter(lock);
2891			hdr_prev = list_prev(list, &marker);
2892			list_remove(list, &marker);
2893		} else {
2894			bufs_skipped += 1;
2895		}
2896
2897	}
2898	mutex_exit(lock);
2899	idx  = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
2900	lists++;
2901
2902	if (lists < list_count)
2903		goto evict_start;
2904
2905	evict_offset = idx;
2906	if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
2907	    (bytes < 0 || bytes_deleted < bytes)) {
2908		list_start = &state->arcs_lists[0];
2909		list_count = ARC_BUFC_NUMMETADATALISTS;
2910		offset = lists = 0;
2911		goto evict_start;
2912	}
2913
2914	if (bufs_skipped) {
2915		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2916		ASSERT(bytes >= 0);
2917	}
2918
2919	if (bytes_deleted < bytes)
2920		dprintf("only deleted %lld bytes from %p",
2921		    (longlong_t)bytes_deleted, state);
2922}
2923
2924static void
2925arc_adjust(void)
2926{
2927	int64_t adjustment, delta;
2928
2929	/*
2930	 * Adjust MRU size
2931	 */
2932
2933	adjustment = MIN((int64_t)(arc_size - arc_c),
2934	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2935	    arc_p));
2936
2937	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2938		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2939		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
2940		adjustment -= delta;
2941	}
2942
2943	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2944		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2945		(void) arc_evict(arc_mru, 0, delta, FALSE,
2946		    ARC_BUFC_METADATA);
2947	}
2948
2949	/*
2950	 * Adjust MFU size
2951	 */
2952
2953	adjustment = arc_size - arc_c;
2954
2955	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2956		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2957		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2958		adjustment -= delta;
2959	}
2960
2961	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2962		int64_t delta = MIN(adjustment,
2963		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2964		(void) arc_evict(arc_mfu, 0, delta, FALSE,
2965		    ARC_BUFC_METADATA);
2966	}
2967
2968	/*
2969	 * Adjust ghost lists
2970	 */
2971
2972	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2973
2974	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2975		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2976		arc_evict_ghost(arc_mru_ghost, 0, delta);
2977	}
2978
2979	adjustment =
2980	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2981
2982	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2983		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2984		arc_evict_ghost(arc_mfu_ghost, 0, delta);
2985	}
2986}
2987
2988static void
2989arc_do_user_evicts(void)
2990{
2991	static arc_buf_t *tmp_arc_eviction_list;
2992
2993	/*
2994	 * Move list over to avoid LOR
2995	 */
2996restart:
2997	mutex_enter(&arc_eviction_mtx);
2998	tmp_arc_eviction_list = arc_eviction_list;
2999	arc_eviction_list = NULL;
3000	mutex_exit(&arc_eviction_mtx);
3001
3002	while (tmp_arc_eviction_list != NULL) {
3003		arc_buf_t *buf = tmp_arc_eviction_list;
3004		tmp_arc_eviction_list = buf->b_next;
3005		mutex_enter(&buf->b_evict_lock);
3006		buf->b_hdr = NULL;
3007		mutex_exit(&buf->b_evict_lock);
3008
3009		if (buf->b_efunc != NULL)
3010			VERIFY0(buf->b_efunc(buf->b_private));
3011
3012		buf->b_efunc = NULL;
3013		buf->b_private = NULL;
3014		kmem_cache_free(buf_cache, buf);
3015	}
3016
3017	if (arc_eviction_list != NULL)
3018		goto restart;
3019}
3020
3021/*
3022 * Flush all *evictable* data from the cache for the given spa.
3023 * NOTE: this will not touch "active" (i.e. referenced) data.
3024 */
3025void
3026arc_flush(spa_t *spa)
3027{
3028	uint64_t guid = 0;
3029
3030	if (spa != NULL)
3031		guid = spa_load_guid(spa);
3032
3033	while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
3034		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
3035		if (spa != NULL)
3036			break;
3037	}
3038	while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
3039		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
3040		if (spa != NULL)
3041			break;
3042	}
3043	while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
3044		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
3045		if (spa != NULL)
3046			break;
3047	}
3048	while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
3049		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
3050		if (spa != NULL)
3051			break;
3052	}
3053
3054	arc_evict_ghost(arc_mru_ghost, guid, -1);
3055	arc_evict_ghost(arc_mfu_ghost, guid, -1);
3056
3057	mutex_enter(&arc_reclaim_thr_lock);
3058	arc_do_user_evicts();
3059	mutex_exit(&arc_reclaim_thr_lock);
3060	ASSERT(spa || arc_eviction_list == NULL);
3061}
3062
3063void
3064arc_shrink(int64_t to_free)
3065{
3066
3067	if (arc_c > arc_c_min) {
3068		DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
3069			arc_c_min, uint64_t, arc_p, uint64_t, to_free);
3070		if (arc_c > arc_c_min + to_free)
3071			atomic_add_64(&arc_c, -to_free);
3072		else
3073			arc_c = arc_c_min;
3074
3075		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
3076		if (arc_c > arc_size)
3077			arc_c = MAX(arc_size, arc_c_min);
3078		if (arc_p > arc_c)
3079			arc_p = (arc_c >> 1);
3080
3081		DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
3082			arc_p);
3083
3084		ASSERT(arc_c >= arc_c_min);
3085		ASSERT((int64_t)arc_p >= 0);
3086	}
3087
3088	if (arc_size > arc_c) {
3089		DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
3090			uint64_t, arc_c);
3091		arc_adjust();
3092	}
3093}
3094
3095static long needfree = 0;
3096
3097typedef enum free_memory_reason_t {
3098	FMR_UNKNOWN,
3099	FMR_NEEDFREE,
3100	FMR_LOTSFREE,
3101	FMR_SWAPFS_MINFREE,
3102	FMR_PAGES_PP_MAXIMUM,
3103	FMR_HEAP_ARENA,
3104	FMR_ZIO_ARENA,
3105	FMR_ZIO_FRAG,
3106} free_memory_reason_t;
3107
3108int64_t last_free_memory;
3109free_memory_reason_t last_free_reason;
3110
3111/*
3112 * Additional reserve of pages for pp_reserve.
3113 */
3114int64_t arc_pages_pp_reserve = 64;
3115
3116/*
3117 * Additional reserve of pages for swapfs.
3118 */
3119int64_t arc_swapfs_reserve = 64;
3120
3121/*
3122 * Return the amount of memory that can be consumed before reclaim will be
3123 * needed.  Positive if there is sufficient free memory, negative indicates
3124 * the amount of memory that needs to be freed up.
3125 */
3126static int64_t
3127arc_available_memory(void)
3128{
3129	int64_t lowest = INT64_MAX;
3130	int64_t n;
3131	free_memory_reason_t r = FMR_UNKNOWN;
3132
3133#ifdef _KERNEL
3134	if (needfree > 0) {
3135		n = PAGESIZE * (-needfree);
3136		if (n < lowest) {
3137			lowest = n;
3138			r = FMR_NEEDFREE;
3139		}
3140	}
3141
3142	/*
3143	 * Cooperate with pagedaemon when it's time for it to scan
3144	 * and reclaim some pages.
3145	 */
3146	n = PAGESIZE * (int64_t)(freemem - zfs_arc_free_target);
3147	if (n < lowest) {
3148		lowest = n;
3149		r = FMR_LOTSFREE;
3150	}
3151
3152#ifdef illumos
3153	/*
3154	 * check that we're out of range of the pageout scanner.  It starts to
3155	 * schedule paging if freemem is less than lotsfree and needfree.
3156	 * lotsfree is the high-water mark for pageout, and needfree is the
3157	 * number of needed free pages.  We add extra pages here to make sure
3158	 * the scanner doesn't start up while we're freeing memory.
3159	 */
3160	n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
3161	if (n < lowest) {
3162		lowest = n;
3163		r = FMR_LOTSFREE;
3164	}
3165
3166	/*
3167	 * check to make sure that swapfs has enough space so that anon
3168	 * reservations can still succeed. anon_resvmem() checks that the
3169	 * availrmem is greater than swapfs_minfree, and the number of reserved
3170	 * swap pages.  We also add a bit of extra here just to prevent
3171	 * circumstances from getting really dire.
3172	 */
3173	n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
3174	    desfree - arc_swapfs_reserve);
3175	if (n < lowest) {
3176		lowest = n;
3177		r = FMR_SWAPFS_MINFREE;
3178	}
3179
3180
3181	/*
3182	 * Check that we have enough availrmem that memory locking (e.g., via
3183	 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
3184	 * stores the number of pages that cannot be locked; when availrmem
3185	 * drops below pages_pp_maximum, page locking mechanisms such as
3186	 * page_pp_lock() will fail.)
3187	 */
3188	n = PAGESIZE * (availrmem - pages_pp_maximum -
3189	    arc_pages_pp_reserve);
3190	if (n < lowest) {
3191		lowest = n;
3192		r = FMR_PAGES_PP_MAXIMUM;
3193	}
3194
3195#endif	/* illumos */
3196#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
3197	/*
3198	 * If we're on an i386 platform, it's possible that we'll exhaust the
3199	 * kernel heap space before we ever run out of available physical
3200	 * memory.  Most checks of the size of the heap_area compare against
3201	 * tune.t_minarmem, which is the minimum available real memory that we
3202	 * can have in the system.  However, this is generally fixed at 25 pages
3203	 * which is so low that it's useless.  In this comparison, we seek to
3204	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
3205	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
3206	 * free)
3207	 */
3208	n = vmem_size(heap_arena, VMEM_FREE) -
3209	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)
3210	if (n < lowest) {
3211		lowest = n;
3212		r = FMR_HEAP_ARENA;
3213	}
3214#define	zio_arena	NULL
3215#else
3216#define	zio_arena	heap_arena
3217#endif
3218
3219	/*
3220	 * If zio data pages are being allocated out of a separate heap segment,
3221	 * then enforce that the size of available vmem for this arena remains
3222	 * above about 1/16th free.
3223	 *
3224	 * Note: The 1/16th arena free requirement was put in place
3225	 * to aggressively evict memory from the arc in order to avoid
3226	 * memory fragmentation issues.
3227	 */
3228	if (zio_arena != NULL) {
3229		n = vmem_size(zio_arena, VMEM_FREE) -
3230		    (vmem_size(zio_arena, VMEM_ALLOC) >> 4);
3231		if (n < lowest) {
3232			lowest = n;
3233			r = FMR_ZIO_ARENA;
3234		}
3235	}
3236
3237	/*
3238	 * Above limits know nothing about real level of KVA fragmentation.
3239	 * Start aggressive reclamation if too little sequential KVA left.
3240	 */
3241	if (lowest > 0) {
3242		n = (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) ?
3243		    -(vmem_size(heap_arena, VMEM_ALLOC) >> 4) : INT64_MAX;
3244		if (n < lowest) {
3245			lowest = n;
3246			r = FMR_ZIO_FRAG;
3247		}
3248	}
3249
3250#else	/* _KERNEL */
3251	/* Every 100 calls, free a small amount */
3252	if (spa_get_random(100) == 0)
3253		lowest = -1024;
3254#endif	/* _KERNEL */
3255
3256	last_free_memory = lowest;
3257	last_free_reason = r;
3258	DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
3259	return (lowest);
3260}
3261
3262
3263/*
3264 * Determine if the system is under memory pressure and is asking
3265 * to reclaim memory. A return value of TRUE indicates that the system
3266 * is under memory pressure and that the arc should adjust accordingly.
3267 */
3268static boolean_t
3269arc_reclaim_needed(void)
3270{
3271	return (arc_available_memory() < 0);
3272}
3273
3274extern kmem_cache_t	*zio_buf_cache[];
3275extern kmem_cache_t	*zio_data_buf_cache[];
3276extern kmem_cache_t	*range_seg_cache;
3277
3278static __noinline void
3279arc_kmem_reap_now(void)
3280{
3281	size_t			i;
3282	kmem_cache_t		*prev_cache = NULL;
3283	kmem_cache_t		*prev_data_cache = NULL;
3284
3285	DTRACE_PROBE(arc__kmem_reap_start);
3286#ifdef _KERNEL
3287	if (arc_meta_used >= arc_meta_limit) {
3288		/*
3289		 * We are exceeding our meta-data cache limit.
3290		 * Purge some DNLC entries to release holds on meta-data.
3291		 */
3292		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
3293	}
3294#if defined(__i386)
3295	/*
3296	 * Reclaim unused memory from all kmem caches.
3297	 */
3298	kmem_reap();
3299#endif
3300#endif
3301
3302	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
3303		if (zio_buf_cache[i] != prev_cache) {
3304			prev_cache = zio_buf_cache[i];
3305			kmem_cache_reap_now(zio_buf_cache[i]);
3306		}
3307		if (zio_data_buf_cache[i] != prev_data_cache) {
3308			prev_data_cache = zio_data_buf_cache[i];
3309			kmem_cache_reap_now(zio_data_buf_cache[i]);
3310		}
3311	}
3312	kmem_cache_reap_now(buf_cache);
3313	kmem_cache_reap_now(hdr_full_cache);
3314	kmem_cache_reap_now(hdr_l2only_cache);
3315	kmem_cache_reap_now(range_seg_cache);
3316
3317#ifdef illumos
3318	if (zio_arena != NULL) {
3319		/*
3320		 * Ask the vmem arena to reclaim unused memory from its
3321		 * quantum caches.
3322		 */
3323		vmem_qcache_reap(zio_arena);
3324	}
3325#endif
3326	DTRACE_PROBE(arc__kmem_reap_end);
3327}
3328
3329static void
3330arc_reclaim_thread(void *dummy __unused)
3331{
3332	clock_t			growtime = 0;
3333	callb_cpr_t		cpr;
3334
3335	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
3336
3337	mutex_enter(&arc_reclaim_thr_lock);
3338	while (arc_thread_exit == 0) {
3339		int64_t free_memory = arc_available_memory();
3340		if (free_memory < 0) {
3341
3342			arc_no_grow = B_TRUE;
3343			arc_warm = B_TRUE;
3344
3345			/*
3346			 * Wait at least zfs_grow_retry (default 60) seconds
3347			 * before considering growing.
3348			 */
3349			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
3350
3351			arc_kmem_reap_now();
3352
3353			/*
3354			 * If we are still low on memory, shrink the ARC
3355			 * so that we have arc_shrink_min free space.
3356			 */
3357			free_memory = arc_available_memory();
3358
3359			int64_t to_free =
3360			    (arc_c >> arc_shrink_shift) - free_memory;
3361			if (to_free > 0) {
3362#ifdef _KERNEL
3363				to_free = MAX(to_free, ptob(needfree));
3364#endif
3365				arc_shrink(to_free);
3366			}
3367		} else if (free_memory < arc_c >> arc_no_grow_shift) {
3368			arc_no_grow = B_TRUE;
3369		} else if (ddi_get_lbolt() >= growtime) {
3370			arc_no_grow = B_FALSE;
3371		}
3372
3373		arc_adjust();
3374
3375		if (arc_eviction_list != NULL)
3376			arc_do_user_evicts();
3377
3378#ifdef _KERNEL
3379		if (needfree) {
3380			needfree = 0;
3381			wakeup(&needfree);
3382		}
3383#endif
3384
3385		/*
3386		 * This is necessary in order for the mdb ::arc dcmd to
3387		 * show up to date information. Since the ::arc command
3388		 * does not call the kstat's update function, without
3389		 * this call, the command may show stale stats for the
3390		 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
3391		 * with this change, the data might be up to 1 second
3392		 * out of date; but that should suffice. The arc_state_t
3393		 * structures can be queried directly if more accurate
3394		 * information is needed.
3395		 */
3396		if (arc_ksp != NULL)
3397			arc_ksp->ks_update(arc_ksp, KSTAT_READ);
3398
3399		/* block until needed, or one second, whichever is shorter */
3400		CALLB_CPR_SAFE_BEGIN(&cpr);
3401		(void) cv_timedwait(&arc_reclaim_thr_cv,
3402		    &arc_reclaim_thr_lock, hz);
3403		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
3404	}
3405
3406	arc_thread_exit = 0;
3407	cv_broadcast(&arc_reclaim_thr_cv);
3408	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
3409	thread_exit();
3410}
3411
3412/*
3413 * Adapt arc info given the number of bytes we are trying to add and
3414 * the state that we are comming from.  This function is only called
3415 * when we are adding new content to the cache.
3416 */
3417static void
3418arc_adapt(int bytes, arc_state_t *state)
3419{
3420	int mult;
3421	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
3422
3423	if (state == arc_l2c_only)
3424		return;
3425
3426	ASSERT(bytes > 0);
3427	/*
3428	 * Adapt the target size of the MRU list:
3429	 *	- if we just hit in the MRU ghost list, then increase
3430	 *	  the target size of the MRU list.
3431	 *	- if we just hit in the MFU ghost list, then increase
3432	 *	  the target size of the MFU list by decreasing the
3433	 *	  target size of the MRU list.
3434	 */
3435	if (state == arc_mru_ghost) {
3436		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
3437		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
3438		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
3439
3440		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
3441	} else if (state == arc_mfu_ghost) {
3442		uint64_t delta;
3443
3444		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
3445		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
3446		mult = MIN(mult, 10);
3447
3448		delta = MIN(bytes * mult, arc_p);
3449		arc_p = MAX(arc_p_min, arc_p - delta);
3450	}
3451	ASSERT((int64_t)arc_p >= 0);
3452
3453	if (arc_reclaim_needed()) {
3454		cv_signal(&arc_reclaim_thr_cv);
3455		return;
3456	}
3457
3458	if (arc_no_grow)
3459		return;
3460
3461	if (arc_c >= arc_c_max)
3462		return;
3463
3464	/*
3465	 * If we're within (2 * maxblocksize) bytes of the target
3466	 * cache size, increment the target cache size
3467	 */
3468	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
3469		DTRACE_PROBE1(arc__inc_adapt, int, bytes);
3470		atomic_add_64(&arc_c, (int64_t)bytes);
3471		if (arc_c > arc_c_max)
3472			arc_c = arc_c_max;
3473		else if (state == arc_anon)
3474			atomic_add_64(&arc_p, (int64_t)bytes);
3475		if (arc_p > arc_c)
3476			arc_p = arc_c;
3477	}
3478	ASSERT((int64_t)arc_p >= 0);
3479}
3480
3481/*
3482 * Check if the cache has reached its limits and eviction is required
3483 * prior to insert.
3484 */
3485static int
3486arc_evict_needed(arc_buf_contents_t type)
3487{
3488	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
3489		return (1);
3490
3491	if (arc_reclaim_needed())
3492		return (1);
3493
3494	return (arc_size > arc_c);
3495}
3496
3497/*
3498 * The buffer, supplied as the first argument, needs a data block.
3499 * So, if we are at cache max, determine which cache should be victimized.
3500 * We have the following cases:
3501 *
3502 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
3503 * In this situation if we're out of space, but the resident size of the MFU is
3504 * under the limit, victimize the MFU cache to satisfy this insertion request.
3505 *
3506 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
3507 * Here, we've used up all of the available space for the MRU, so we need to
3508 * evict from our own cache instead.  Evict from the set of resident MRU
3509 * entries.
3510 *
3511 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
3512 * c minus p represents the MFU space in the cache, since p is the size of the
3513 * cache that is dedicated to the MRU.  In this situation there's still space on
3514 * the MFU side, so the MRU side needs to be victimized.
3515 *
3516 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
3517 * MFU's resident set is consuming more space than it has been allotted.  In
3518 * this situation, we must victimize our own cache, the MFU, for this insertion.
3519 */
3520static void
3521arc_get_data_buf(arc_buf_t *buf)
3522{
3523	arc_state_t		*state = buf->b_hdr->b_l1hdr.b_state;
3524	uint64_t		size = buf->b_hdr->b_size;
3525	arc_buf_contents_t	type = arc_buf_type(buf->b_hdr);
3526
3527	arc_adapt(size, state);
3528
3529	/*
3530	 * We have not yet reached cache maximum size,
3531	 * just allocate a new buffer.
3532	 */
3533	if (!arc_evict_needed(type)) {
3534		if (type == ARC_BUFC_METADATA) {
3535			buf->b_data = zio_buf_alloc(size);
3536			arc_space_consume(size, ARC_SPACE_META);
3537		} else {
3538			ASSERT(type == ARC_BUFC_DATA);
3539			buf->b_data = zio_data_buf_alloc(size);
3540			arc_space_consume(size, ARC_SPACE_DATA);
3541		}
3542		goto out;
3543	}
3544
3545	/*
3546	 * If we are prefetching from the mfu ghost list, this buffer
3547	 * will end up on the mru list; so steal space from there.
3548	 */
3549	if (state == arc_mfu_ghost)
3550		state = HDR_PREFETCH(buf->b_hdr) ? arc_mru : arc_mfu;
3551	else if (state == arc_mru_ghost)
3552		state = arc_mru;
3553
3554	if (state == arc_mru || state == arc_anon) {
3555		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
3556		state = (arc_mfu->arcs_lsize[type] >= size &&
3557		    arc_p > mru_used) ? arc_mfu : arc_mru;
3558	} else {
3559		/* MFU cases */
3560		uint64_t mfu_space = arc_c - arc_p;
3561		state =  (arc_mru->arcs_lsize[type] >= size &&
3562		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
3563	}
3564	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
3565		if (type == ARC_BUFC_METADATA) {
3566			buf->b_data = zio_buf_alloc(size);
3567			arc_space_consume(size, ARC_SPACE_META);
3568		} else {
3569			ASSERT(type == ARC_BUFC_DATA);
3570			buf->b_data = zio_data_buf_alloc(size);
3571			arc_space_consume(size, ARC_SPACE_DATA);
3572		}
3573		ARCSTAT_BUMP(arcstat_recycle_miss);
3574	}
3575	ASSERT(buf->b_data != NULL);
3576out:
3577	/*
3578	 * Update the state size.  Note that ghost states have a
3579	 * "ghost size" and so don't need to be updated.
3580	 */
3581	if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
3582		arc_buf_hdr_t *hdr = buf->b_hdr;
3583
3584		atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size);
3585		if (list_link_active(&hdr->b_l1hdr.b_arc_node)) {
3586			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3587			atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type],
3588			    size);
3589		}
3590		/*
3591		 * If we are growing the cache, and we are adding anonymous
3592		 * data, and we have outgrown arc_p, update arc_p
3593		 */
3594		if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
3595		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
3596			arc_p = MIN(arc_c, arc_p + size);
3597	}
3598	ARCSTAT_BUMP(arcstat_allocated);
3599}
3600
3601/*
3602 * This routine is called whenever a buffer is accessed.
3603 * NOTE: the hash lock is dropped in this function.
3604 */
3605static void
3606arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
3607{
3608	clock_t now;
3609
3610	ASSERT(MUTEX_HELD(hash_lock));
3611	ASSERT(HDR_HAS_L1HDR(hdr));
3612
3613	if (hdr->b_l1hdr.b_state == arc_anon) {
3614		/*
3615		 * This buffer is not in the cache, and does not
3616		 * appear in our "ghost" list.  Add the new buffer
3617		 * to the MRU state.
3618		 */
3619
3620		ASSERT0(hdr->b_l1hdr.b_arc_access);
3621		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3622		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
3623		arc_change_state(arc_mru, hdr, hash_lock);
3624
3625	} else if (hdr->b_l1hdr.b_state == arc_mru) {
3626		now = ddi_get_lbolt();
3627
3628		/*
3629		 * If this buffer is here because of a prefetch, then either:
3630		 * - clear the flag if this is a "referencing" read
3631		 *   (any subsequent access will bump this into the MFU state).
3632		 * or
3633		 * - move the buffer to the head of the list if this is
3634		 *   another prefetch (to make it less likely to be evicted).
3635		 */
3636		if (HDR_PREFETCH(hdr)) {
3637			if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
3638				ASSERT(list_link_active(
3639				    &hdr->b_l1hdr.b_arc_node));
3640			} else {
3641				hdr->b_flags &= ~ARC_FLAG_PREFETCH;
3642				ARCSTAT_BUMP(arcstat_mru_hits);
3643			}
3644			hdr->b_l1hdr.b_arc_access = now;
3645			return;
3646		}
3647
3648		/*
3649		 * This buffer has been "accessed" only once so far,
3650		 * but it is still in the cache. Move it to the MFU
3651		 * state.
3652		 */
3653		if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
3654			/*
3655			 * More than 125ms have passed since we
3656			 * instantiated this buffer.  Move it to the
3657			 * most frequently used state.
3658			 */
3659			hdr->b_l1hdr.b_arc_access = now;
3660			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3661			arc_change_state(arc_mfu, hdr, hash_lock);
3662		}
3663		ARCSTAT_BUMP(arcstat_mru_hits);
3664	} else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
3665		arc_state_t	*new_state;
3666		/*
3667		 * This buffer has been "accessed" recently, but
3668		 * was evicted from the cache.  Move it to the
3669		 * MFU state.
3670		 */
3671
3672		if (HDR_PREFETCH(hdr)) {
3673			new_state = arc_mru;
3674			if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
3675				hdr->b_flags &= ~ARC_FLAG_PREFETCH;
3676			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
3677		} else {
3678			new_state = arc_mfu;
3679			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3680		}
3681
3682		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3683		arc_change_state(new_state, hdr, hash_lock);
3684
3685		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
3686	} else if (hdr->b_l1hdr.b_state == arc_mfu) {
3687		/*
3688		 * This buffer has been accessed more than once and is
3689		 * still in the cache.  Keep it in the MFU state.
3690		 *
3691		 * NOTE: an add_reference() that occurred when we did
3692		 * the arc_read() will have kicked this off the list.
3693		 * If it was a prefetch, we will explicitly move it to
3694		 * the head of the list now.
3695		 */
3696		if ((HDR_PREFETCH(hdr)) != 0) {
3697			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3698			ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
3699		}
3700		ARCSTAT_BUMP(arcstat_mfu_hits);
3701		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3702	} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
3703		arc_state_t	*new_state = arc_mfu;
3704		/*
3705		 * This buffer has been accessed more than once but has
3706		 * been evicted from the cache.  Move it back to the
3707		 * MFU state.
3708		 */
3709
3710		if (HDR_PREFETCH(hdr)) {
3711			/*
3712			 * This is a prefetch access...
3713			 * move this block back to the MRU state.
3714			 */
3715			ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
3716			new_state = arc_mru;
3717		}
3718
3719		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3720		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3721		arc_change_state(new_state, hdr, hash_lock);
3722
3723		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
3724	} else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
3725		/*
3726		 * This buffer is on the 2nd Level ARC.
3727		 */
3728
3729		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3730		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3731		arc_change_state(arc_mfu, hdr, hash_lock);
3732	} else {
3733		ASSERT(!"invalid arc state");
3734	}
3735}
3736
3737/* a generic arc_done_func_t which you can use */
3738/* ARGSUSED */
3739void
3740arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
3741{
3742	if (zio == NULL || zio->io_error == 0)
3743		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
3744	VERIFY(arc_buf_remove_ref(buf, arg));
3745}
3746
3747/* a generic arc_done_func_t */
3748void
3749arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
3750{
3751	arc_buf_t **bufp = arg;
3752	if (zio && zio->io_error) {
3753		VERIFY(arc_buf_remove_ref(buf, arg));
3754		*bufp = NULL;
3755	} else {
3756		*bufp = buf;
3757		ASSERT(buf->b_data);
3758	}
3759}
3760
3761static void
3762arc_read_done(zio_t *zio)
3763{
3764	arc_buf_hdr_t	*hdr;
3765	arc_buf_t	*buf;
3766	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
3767	kmutex_t	*hash_lock = NULL;
3768	arc_callback_t	*callback_list, *acb;
3769	int		freeable = FALSE;
3770
3771	buf = zio->io_private;
3772	hdr = buf->b_hdr;
3773
3774	/*
3775	 * The hdr was inserted into hash-table and removed from lists
3776	 * prior to starting I/O.  We should find this header, since
3777	 * it's in the hash table, and it should be legit since it's
3778	 * not possible to evict it during the I/O.  The only possible
3779	 * reason for it not to be found is if we were freed during the
3780	 * read.
3781	 */
3782	if (HDR_IN_HASH_TABLE(hdr)) {
3783		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
3784		ASSERT3U(hdr->b_dva.dva_word[0], ==,
3785		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
3786		ASSERT3U(hdr->b_dva.dva_word[1], ==,
3787		    BP_IDENTITY(zio->io_bp)->dva_word[1]);
3788
3789		arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
3790		    &hash_lock);
3791
3792		ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
3793		    hash_lock == NULL) ||
3794		    (found == hdr &&
3795		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
3796		    (found == hdr && HDR_L2_READING(hdr)));
3797	}
3798
3799	hdr->b_flags &= ~ARC_FLAG_L2_EVICTED;
3800	if (l2arc_noprefetch && HDR_PREFETCH(hdr))
3801		hdr->b_flags &= ~ARC_FLAG_L2CACHE;
3802
3803	/* byteswap if necessary */
3804	callback_list = hdr->b_l1hdr.b_acb;
3805	ASSERT(callback_list != NULL);
3806	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
3807		dmu_object_byteswap_t bswap =
3808		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
3809		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
3810		    byteswap_uint64_array :
3811		    dmu_ot_byteswap[bswap].ob_func;
3812		func(buf->b_data, hdr->b_size);
3813	}
3814
3815	arc_cksum_compute(buf, B_FALSE);
3816#ifdef illumos
3817	arc_buf_watch(buf);
3818#endif
3819
3820	if (hash_lock && zio->io_error == 0 &&
3821	    hdr->b_l1hdr.b_state == arc_anon) {
3822		/*
3823		 * Only call arc_access on anonymous buffers.  This is because
3824		 * if we've issued an I/O for an evicted buffer, we've already
3825		 * called arc_access (to prevent any simultaneous readers from
3826		 * getting confused).
3827		 */
3828		arc_access(hdr, hash_lock);
3829	}
3830
3831	/* create copies of the data buffer for the callers */
3832	abuf = buf;
3833	for (acb = callback_list; acb; acb = acb->acb_next) {
3834		if (acb->acb_done) {
3835			if (abuf == NULL) {
3836				ARCSTAT_BUMP(arcstat_duplicate_reads);
3837				abuf = arc_buf_clone(buf);
3838			}
3839			acb->acb_buf = abuf;
3840			abuf = NULL;
3841		}
3842	}
3843	hdr->b_l1hdr.b_acb = NULL;
3844	hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
3845	ASSERT(!HDR_BUF_AVAILABLE(hdr));
3846	if (abuf == buf) {
3847		ASSERT(buf->b_efunc == NULL);
3848		ASSERT(hdr->b_l1hdr.b_datacnt == 1);
3849		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
3850	}
3851
3852	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
3853	    callback_list != NULL);
3854
3855	if (zio->io_error != 0) {
3856		hdr->b_flags |= ARC_FLAG_IO_ERROR;
3857		if (hdr->b_l1hdr.b_state != arc_anon)
3858			arc_change_state(arc_anon, hdr, hash_lock);
3859		if (HDR_IN_HASH_TABLE(hdr))
3860			buf_hash_remove(hdr);
3861		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
3862	}
3863
3864	/*
3865	 * Broadcast before we drop the hash_lock to avoid the possibility
3866	 * that the hdr (and hence the cv) might be freed before we get to
3867	 * the cv_broadcast().
3868	 */
3869	cv_broadcast(&hdr->b_l1hdr.b_cv);
3870
3871	if (hash_lock != NULL) {
3872		mutex_exit(hash_lock);
3873	} else {
3874		/*
3875		 * This block was freed while we waited for the read to
3876		 * complete.  It has been removed from the hash table and
3877		 * moved to the anonymous state (so that it won't show up
3878		 * in the cache).
3879		 */
3880		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3881		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
3882	}
3883
3884	/* execute each callback and free its structure */
3885	while ((acb = callback_list) != NULL) {
3886		if (acb->acb_done)
3887			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3888
3889		if (acb->acb_zio_dummy != NULL) {
3890			acb->acb_zio_dummy->io_error = zio->io_error;
3891			zio_nowait(acb->acb_zio_dummy);
3892		}
3893
3894		callback_list = acb->acb_next;
3895		kmem_free(acb, sizeof (arc_callback_t));
3896	}
3897
3898	if (freeable)
3899		arc_hdr_destroy(hdr);
3900}
3901
3902/*
3903 * "Read" the block block at the specified DVA (in bp) via the
3904 * cache.  If the block is found in the cache, invoke the provided
3905 * callback immediately and return.  Note that the `zio' parameter
3906 * in the callback will be NULL in this case, since no IO was
3907 * required.  If the block is not in the cache pass the read request
3908 * on to the spa with a substitute callback function, so that the
3909 * requested block will be added to the cache.
3910 *
3911 * If a read request arrives for a block that has a read in-progress,
3912 * either wait for the in-progress read to complete (and return the
3913 * results); or, if this is a read with a "done" func, add a record
3914 * to the read to invoke the "done" func when the read completes,
3915 * and return; or just return.
3916 *
3917 * arc_read_done() will invoke all the requested "done" functions
3918 * for readers of this block.
3919 */
3920int
3921arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3922    void *private, zio_priority_t priority, int zio_flags,
3923    arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
3924{
3925	arc_buf_hdr_t *hdr = NULL;
3926	arc_buf_t *buf = NULL;
3927	kmutex_t *hash_lock = NULL;
3928	zio_t *rzio;
3929	uint64_t guid = spa_load_guid(spa);
3930
3931	ASSERT(!BP_IS_EMBEDDED(bp) ||
3932	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
3933
3934top:
3935	if (!BP_IS_EMBEDDED(bp)) {
3936		/*
3937		 * Embedded BP's have no DVA and require no I/O to "read".
3938		 * Create an anonymous arc buf to back it.
3939		 */
3940		hdr = buf_hash_find(guid, bp, &hash_lock);
3941	}
3942
3943	if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) {
3944
3945		*arc_flags |= ARC_FLAG_CACHED;
3946
3947		if (HDR_IO_IN_PROGRESS(hdr)) {
3948
3949			if (*arc_flags & ARC_FLAG_WAIT) {
3950				cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
3951				mutex_exit(hash_lock);
3952				goto top;
3953			}
3954			ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
3955
3956			if (done) {
3957				arc_callback_t	*acb = NULL;
3958
3959				acb = kmem_zalloc(sizeof (arc_callback_t),
3960				    KM_SLEEP);
3961				acb->acb_done = done;
3962				acb->acb_private = private;
3963				if (pio != NULL)
3964					acb->acb_zio_dummy = zio_null(pio,
3965					    spa, NULL, NULL, NULL, zio_flags);
3966
3967				ASSERT(acb->acb_done != NULL);
3968				acb->acb_next = hdr->b_l1hdr.b_acb;
3969				hdr->b_l1hdr.b_acb = acb;
3970				add_reference(hdr, hash_lock, private);
3971				mutex_exit(hash_lock);
3972				return (0);
3973			}
3974			mutex_exit(hash_lock);
3975			return (0);
3976		}
3977
3978		ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
3979		    hdr->b_l1hdr.b_state == arc_mfu);
3980
3981		if (done) {
3982			add_reference(hdr, hash_lock, private);
3983			/*
3984			 * If this block is already in use, create a new
3985			 * copy of the data so that we will be guaranteed
3986			 * that arc_release() will always succeed.
3987			 */
3988			buf = hdr->b_l1hdr.b_buf;
3989			ASSERT(buf);
3990			ASSERT(buf->b_data);
3991			if (HDR_BUF_AVAILABLE(hdr)) {
3992				ASSERT(buf->b_efunc == NULL);
3993				hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
3994			} else {
3995				buf = arc_buf_clone(buf);
3996			}
3997
3998		} else if (*arc_flags & ARC_FLAG_PREFETCH &&
3999		    refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
4000			hdr->b_flags |= ARC_FLAG_PREFETCH;
4001		}
4002		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
4003		arc_access(hdr, hash_lock);
4004		if (*arc_flags & ARC_FLAG_L2CACHE)
4005			hdr->b_flags |= ARC_FLAG_L2CACHE;
4006		if (*arc_flags & ARC_FLAG_L2COMPRESS)
4007			hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4008		mutex_exit(hash_lock);
4009		ARCSTAT_BUMP(arcstat_hits);
4010		ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
4011		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
4012		    data, metadata, hits);
4013
4014		if (done)
4015			done(NULL, buf, private);
4016	} else {
4017		uint64_t size = BP_GET_LSIZE(bp);
4018		arc_callback_t *acb;
4019		vdev_t *vd = NULL;
4020		uint64_t addr = 0;
4021		boolean_t devw = B_FALSE;
4022		enum zio_compress b_compress = ZIO_COMPRESS_OFF;
4023		int32_t b_asize = 0;
4024
4025		if (hdr == NULL) {
4026			/* this block is not in the cache */
4027			arc_buf_hdr_t *exists = NULL;
4028			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
4029			buf = arc_buf_alloc(spa, size, private, type);
4030			hdr = buf->b_hdr;
4031			if (!BP_IS_EMBEDDED(bp)) {
4032				hdr->b_dva = *BP_IDENTITY(bp);
4033				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
4034				exists = buf_hash_insert(hdr, &hash_lock);
4035			}
4036			if (exists != NULL) {
4037				/* somebody beat us to the hash insert */
4038				mutex_exit(hash_lock);
4039				buf_discard_identity(hdr);
4040				(void) arc_buf_remove_ref(buf, private);
4041				goto top; /* restart the IO request */
4042			}
4043
4044			/* if this is a prefetch, we don't have a reference */
4045			if (*arc_flags & ARC_FLAG_PREFETCH) {
4046				(void) remove_reference(hdr, hash_lock,
4047				    private);
4048				hdr->b_flags |= ARC_FLAG_PREFETCH;
4049			}
4050			if (*arc_flags & ARC_FLAG_L2CACHE)
4051				hdr->b_flags |= ARC_FLAG_L2CACHE;
4052			if (*arc_flags & ARC_FLAG_L2COMPRESS)
4053				hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4054			if (BP_GET_LEVEL(bp) > 0)
4055				hdr->b_flags |= ARC_FLAG_INDIRECT;
4056		} else {
4057			/*
4058			 * This block is in the ghost cache. If it was L2-only
4059			 * (and thus didn't have an L1 hdr), we realloc the
4060			 * header to add an L1 hdr.
4061			 */
4062			if (!HDR_HAS_L1HDR(hdr)) {
4063				hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
4064				    hdr_full_cache);
4065			}
4066
4067			ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
4068			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4069			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4070			ASSERT(hdr->b_l1hdr.b_buf == NULL);
4071
4072			/* if this is a prefetch, we don't have a reference */
4073			if (*arc_flags & ARC_FLAG_PREFETCH)
4074				hdr->b_flags |= ARC_FLAG_PREFETCH;
4075			else
4076				add_reference(hdr, hash_lock, private);
4077			if (*arc_flags & ARC_FLAG_L2CACHE)
4078				hdr->b_flags |= ARC_FLAG_L2CACHE;
4079			if (*arc_flags & ARC_FLAG_L2COMPRESS)
4080				hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4081			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
4082			buf->b_hdr = hdr;
4083			buf->b_data = NULL;
4084			buf->b_efunc = NULL;
4085			buf->b_private = NULL;
4086			buf->b_next = NULL;
4087			hdr->b_l1hdr.b_buf = buf;
4088			ASSERT0(hdr->b_l1hdr.b_datacnt);
4089			hdr->b_l1hdr.b_datacnt = 1;
4090			arc_get_data_buf(buf);
4091			arc_access(hdr, hash_lock);
4092		}
4093
4094		ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
4095
4096		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
4097		acb->acb_done = done;
4098		acb->acb_private = private;
4099
4100		ASSERT(hdr->b_l1hdr.b_acb == NULL);
4101		hdr->b_l1hdr.b_acb = acb;
4102		hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
4103
4104		if (HDR_HAS_L2HDR(hdr) &&
4105		    (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
4106			devw = hdr->b_l2hdr.b_dev->l2ad_writing;
4107			addr = hdr->b_l2hdr.b_daddr;
4108			b_compress = HDR_GET_COMPRESS(hdr);
4109			b_asize = hdr->b_l2hdr.b_asize;
4110			/*
4111			 * Lock out device removal.
4112			 */
4113			if (vdev_is_dead(vd) ||
4114			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
4115				vd = NULL;
4116		}
4117
4118		if (hash_lock != NULL)
4119			mutex_exit(hash_lock);
4120
4121		/*
4122		 * At this point, we have a level 1 cache miss.  Try again in
4123		 * L2ARC if possible.
4124		 */
4125		ASSERT3U(hdr->b_size, ==, size);
4126		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
4127		    uint64_t, size, zbookmark_phys_t *, zb);
4128		ARCSTAT_BUMP(arcstat_misses);
4129		ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
4130		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
4131		    data, metadata, misses);
4132#ifdef _KERNEL
4133		curthread->td_ru.ru_inblock++;
4134#endif
4135
4136		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
4137			/*
4138			 * Read from the L2ARC if the following are true:
4139			 * 1. The L2ARC vdev was previously cached.
4140			 * 2. This buffer still has L2ARC metadata.
4141			 * 3. This buffer isn't currently writing to the L2ARC.
4142			 * 4. The L2ARC entry wasn't evicted, which may
4143			 *    also have invalidated the vdev.
4144			 * 5. This isn't prefetch and l2arc_noprefetch is set.
4145			 */
4146			if (HDR_HAS_L2HDR(hdr) &&
4147			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
4148			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
4149				l2arc_read_callback_t *cb;
4150
4151				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
4152				ARCSTAT_BUMP(arcstat_l2_hits);
4153
4154				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
4155				    KM_SLEEP);
4156				cb->l2rcb_buf = buf;
4157				cb->l2rcb_spa = spa;
4158				cb->l2rcb_bp = *bp;
4159				cb->l2rcb_zb = *zb;
4160				cb->l2rcb_flags = zio_flags;
4161				cb->l2rcb_compress = b_compress;
4162
4163				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
4164				    addr + size < vd->vdev_psize -
4165				    VDEV_LABEL_END_SIZE);
4166
4167				/*
4168				 * l2arc read.  The SCL_L2ARC lock will be
4169				 * released by l2arc_read_done().
4170				 * Issue a null zio if the underlying buffer
4171				 * was squashed to zero size by compression.
4172				 */
4173				if (b_compress == ZIO_COMPRESS_EMPTY) {
4174					rzio = zio_null(pio, spa, vd,
4175					    l2arc_read_done, cb,
4176					    zio_flags | ZIO_FLAG_DONT_CACHE |
4177					    ZIO_FLAG_CANFAIL |
4178					    ZIO_FLAG_DONT_PROPAGATE |
4179					    ZIO_FLAG_DONT_RETRY);
4180				} else {
4181					rzio = zio_read_phys(pio, vd, addr,
4182					    b_asize, buf->b_data,
4183					    ZIO_CHECKSUM_OFF,
4184					    l2arc_read_done, cb, priority,
4185					    zio_flags | ZIO_FLAG_DONT_CACHE |
4186					    ZIO_FLAG_CANFAIL |
4187					    ZIO_FLAG_DONT_PROPAGATE |
4188					    ZIO_FLAG_DONT_RETRY, B_FALSE);
4189				}
4190				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
4191				    zio_t *, rzio);
4192				ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
4193
4194				if (*arc_flags & ARC_FLAG_NOWAIT) {
4195					zio_nowait(rzio);
4196					return (0);
4197				}
4198
4199				ASSERT(*arc_flags & ARC_FLAG_WAIT);
4200				if (zio_wait(rzio) == 0)
4201					return (0);
4202
4203				/* l2arc read error; goto zio_read() */
4204			} else {
4205				DTRACE_PROBE1(l2arc__miss,
4206				    arc_buf_hdr_t *, hdr);
4207				ARCSTAT_BUMP(arcstat_l2_misses);
4208				if (HDR_L2_WRITING(hdr))
4209					ARCSTAT_BUMP(arcstat_l2_rw_clash);
4210				spa_config_exit(spa, SCL_L2ARC, vd);
4211			}
4212		} else {
4213			if (vd != NULL)
4214				spa_config_exit(spa, SCL_L2ARC, vd);
4215			if (l2arc_ndev != 0) {
4216				DTRACE_PROBE1(l2arc__miss,
4217				    arc_buf_hdr_t *, hdr);
4218				ARCSTAT_BUMP(arcstat_l2_misses);
4219			}
4220		}
4221
4222		rzio = zio_read(pio, spa, bp, buf->b_data, size,
4223		    arc_read_done, buf, priority, zio_flags, zb);
4224
4225		if (*arc_flags & ARC_FLAG_WAIT)
4226			return (zio_wait(rzio));
4227
4228		ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
4229		zio_nowait(rzio);
4230	}
4231	return (0);
4232}
4233
4234void
4235arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
4236{
4237	ASSERT(buf->b_hdr != NULL);
4238	ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon);
4239	ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) ||
4240	    func == NULL);
4241	ASSERT(buf->b_efunc == NULL);
4242	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
4243
4244	buf->b_efunc = func;
4245	buf->b_private = private;
4246}
4247
4248/*
4249 * Notify the arc that a block was freed, and thus will never be used again.
4250 */
4251void
4252arc_freed(spa_t *spa, const blkptr_t *bp)
4253{
4254	arc_buf_hdr_t *hdr;
4255	kmutex_t *hash_lock;
4256	uint64_t guid = spa_load_guid(spa);
4257
4258	ASSERT(!BP_IS_EMBEDDED(bp));
4259
4260	hdr = buf_hash_find(guid, bp, &hash_lock);
4261	if (hdr == NULL)
4262		return;
4263	if (HDR_BUF_AVAILABLE(hdr)) {
4264		arc_buf_t *buf = hdr->b_l1hdr.b_buf;
4265		add_reference(hdr, hash_lock, FTAG);
4266		hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
4267		mutex_exit(hash_lock);
4268
4269		arc_release(buf, FTAG);
4270		(void) arc_buf_remove_ref(buf, FTAG);
4271	} else {
4272		mutex_exit(hash_lock);
4273	}
4274
4275}
4276
4277/*
4278 * Clear the user eviction callback set by arc_set_callback(), first calling
4279 * it if it exists.  Because the presence of a callback keeps an arc_buf cached
4280 * clearing the callback may result in the arc_buf being destroyed.  However,
4281 * it will not result in the *last* arc_buf being destroyed, hence the data
4282 * will remain cached in the ARC. We make a copy of the arc buffer here so
4283 * that we can process the callback without holding any locks.
4284 *
4285 * It's possible that the callback is already in the process of being cleared
4286 * by another thread.  In this case we can not clear the callback.
4287 *
4288 * Returns B_TRUE if the callback was successfully called and cleared.
4289 */
4290boolean_t
4291arc_clear_callback(arc_buf_t *buf)
4292{
4293	arc_buf_hdr_t *hdr;
4294	kmutex_t *hash_lock;
4295	arc_evict_func_t *efunc = buf->b_efunc;
4296	void *private = buf->b_private;
4297	list_t *list, *evicted_list;
4298	kmutex_t *lock, *evicted_lock;
4299
4300	mutex_enter(&buf->b_evict_lock);
4301	hdr = buf->b_hdr;
4302	if (hdr == NULL) {
4303		/*
4304		 * We are in arc_do_user_evicts().
4305		 */
4306		ASSERT(buf->b_data == NULL);
4307		mutex_exit(&buf->b_evict_lock);
4308		return (B_FALSE);
4309	} else if (buf->b_data == NULL) {
4310		/*
4311		 * We are on the eviction list; process this buffer now
4312		 * but let arc_do_user_evicts() do the reaping.
4313		 */
4314		buf->b_efunc = NULL;
4315		mutex_exit(&buf->b_evict_lock);
4316		VERIFY0(efunc(private));
4317		return (B_TRUE);
4318	}
4319	hash_lock = HDR_LOCK(hdr);
4320	mutex_enter(hash_lock);
4321	hdr = buf->b_hdr;
4322	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4323
4324	ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <,
4325	    hdr->b_l1hdr.b_datacnt);
4326	ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
4327	    hdr->b_l1hdr.b_state == arc_mfu);
4328
4329	buf->b_efunc = NULL;
4330	buf->b_private = NULL;
4331
4332	if (hdr->b_l1hdr.b_datacnt > 1) {
4333		mutex_exit(&buf->b_evict_lock);
4334		arc_buf_destroy(buf, FALSE, TRUE);
4335	} else {
4336		ASSERT(buf == hdr->b_l1hdr.b_buf);
4337		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
4338		mutex_exit(&buf->b_evict_lock);
4339	}
4340
4341	mutex_exit(hash_lock);
4342	VERIFY0(efunc(private));
4343	return (B_TRUE);
4344}
4345
4346/*
4347 * Release this buffer from the cache, making it an anonymous buffer.  This
4348 * must be done after a read and prior to modifying the buffer contents.
4349 * If the buffer has more than one reference, we must make
4350 * a new hdr for the buffer.
4351 */
4352void
4353arc_release(arc_buf_t *buf, void *tag)
4354{
4355	arc_buf_hdr_t *hdr = buf->b_hdr;
4356
4357	/*
4358	 * It would be nice to assert that if it's DMU metadata (level >
4359	 * 0 || it's the dnode file), then it must be syncing context.
4360	 * But we don't know that information at this level.
4361	 */
4362
4363	mutex_enter(&buf->b_evict_lock);
4364	/*
4365	 * We don't grab the hash lock prior to this check, because if
4366	 * the buffer's header is in the arc_anon state, it won't be
4367	 * linked into the hash table.
4368	 */
4369	if (hdr->b_l1hdr.b_state == arc_anon) {
4370		mutex_exit(&buf->b_evict_lock);
4371		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4372		ASSERT(!HDR_IN_HASH_TABLE(hdr));
4373		ASSERT(!HDR_HAS_L2HDR(hdr));
4374		ASSERT(BUF_EMPTY(hdr));
4375		ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1);
4376		ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
4377		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
4378
4379		ASSERT3P(buf->b_efunc, ==, NULL);
4380		ASSERT3P(buf->b_private, ==, NULL);
4381
4382		hdr->b_l1hdr.b_arc_access = 0;
4383		arc_buf_thaw(buf);
4384
4385		return;
4386	}
4387
4388	kmutex_t *hash_lock = HDR_LOCK(hdr);
4389	mutex_enter(hash_lock);
4390
4391	/*
4392	 * This assignment is only valid as long as the hash_lock is
4393	 * held, we must be careful not to reference state or the
4394	 * b_state field after dropping the lock.
4395	 */
4396	arc_state_t *state = hdr->b_l1hdr.b_state;
4397	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4398	ASSERT3P(state, !=, arc_anon);
4399
4400	/* this buffer is not on any list */
4401	ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
4402
4403	if (HDR_HAS_L2HDR(hdr)) {
4404		mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
4405
4406		/*
4407		 * We have to recheck this conditional again now that
4408		 * we're holding the l2ad_mtx to prevent a race with
4409		 * another thread which might be concurrently calling
4410		 * l2arc_evict(). In that case, l2arc_evict() might have
4411		 * destroyed the header's L2 portion as we were waiting
4412		 * to acquire the l2ad_mtx.
4413		 */
4414		if (HDR_HAS_L2HDR(hdr)) {
4415			trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev,
4416			    hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0);
4417			arc_hdr_l2hdr_destroy(hdr);
4418		}
4419
4420		mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
4421	}
4422
4423	/*
4424	 * Do we have more than one buf?
4425	 */
4426	if (hdr->b_l1hdr.b_datacnt > 1) {
4427		arc_buf_hdr_t *nhdr;
4428		arc_buf_t **bufp;
4429		uint64_t blksz = hdr->b_size;
4430		uint64_t spa = hdr->b_spa;
4431		arc_buf_contents_t type = arc_buf_type(hdr);
4432		uint32_t flags = hdr->b_flags;
4433
4434		ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
4435		/*
4436		 * Pull the data off of this hdr and attach it to
4437		 * a new anonymous hdr.
4438		 */
4439		(void) remove_reference(hdr, hash_lock, tag);
4440		bufp = &hdr->b_l1hdr.b_buf;
4441		while (*bufp != buf)
4442			bufp = &(*bufp)->b_next;
4443		*bufp = buf->b_next;
4444		buf->b_next = NULL;
4445
4446		ASSERT3P(state, !=, arc_l2c_only);
4447		ASSERT3U(state->arcs_size, >=, hdr->b_size);
4448		atomic_add_64(&state->arcs_size, -hdr->b_size);
4449		if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
4450			ASSERT3P(state, !=, arc_l2c_only);
4451			uint64_t *size = &state->arcs_lsize[type];
4452			ASSERT3U(*size, >=, hdr->b_size);
4453			atomic_add_64(size, -hdr->b_size);
4454		}
4455
4456		/*
4457		 * We're releasing a duplicate user data buffer, update
4458		 * our statistics accordingly.
4459		 */
4460		if (HDR_ISTYPE_DATA(hdr)) {
4461			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
4462			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
4463			    -hdr->b_size);
4464		}
4465		hdr->b_l1hdr.b_datacnt -= 1;
4466		arc_cksum_verify(buf);
4467#ifdef illumos
4468		arc_buf_unwatch(buf);
4469#endif
4470
4471		mutex_exit(hash_lock);
4472
4473		nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
4474		nhdr->b_size = blksz;
4475		nhdr->b_spa = spa;
4476
4477		nhdr->b_flags = flags & ARC_FLAG_L2_WRITING;
4478		nhdr->b_flags |= arc_bufc_to_flags(type);
4479		nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
4480
4481		nhdr->b_l1hdr.b_buf = buf;
4482		nhdr->b_l1hdr.b_datacnt = 1;
4483		nhdr->b_l1hdr.b_state = arc_anon;
4484		nhdr->b_l1hdr.b_arc_access = 0;
4485		nhdr->b_freeze_cksum = NULL;
4486
4487		(void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
4488		buf->b_hdr = nhdr;
4489		mutex_exit(&buf->b_evict_lock);
4490		atomic_add_64(&arc_anon->arcs_size, blksz);
4491	} else {
4492		mutex_exit(&buf->b_evict_lock);
4493		ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
4494		/* protected by hash lock */
4495		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
4496		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4497		arc_change_state(arc_anon, hdr, hash_lock);
4498		hdr->b_l1hdr.b_arc_access = 0;
4499		mutex_exit(hash_lock);
4500
4501		buf_discard_identity(hdr);
4502		arc_buf_thaw(buf);
4503	}
4504	buf->b_efunc = NULL;
4505	buf->b_private = NULL;
4506}
4507
4508int
4509arc_released(arc_buf_t *buf)
4510{
4511	int released;
4512
4513	mutex_enter(&buf->b_evict_lock);
4514	released = (buf->b_data != NULL &&
4515	    buf->b_hdr->b_l1hdr.b_state == arc_anon);
4516	mutex_exit(&buf->b_evict_lock);
4517	return (released);
4518}
4519
4520#ifdef ZFS_DEBUG
4521int
4522arc_referenced(arc_buf_t *buf)
4523{
4524	int referenced;
4525
4526	mutex_enter(&buf->b_evict_lock);
4527	referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
4528	mutex_exit(&buf->b_evict_lock);
4529	return (referenced);
4530}
4531#endif
4532
4533static void
4534arc_write_ready(zio_t *zio)
4535{
4536	arc_write_callback_t *callback = zio->io_private;
4537	arc_buf_t *buf = callback->awcb_buf;
4538	arc_buf_hdr_t *hdr = buf->b_hdr;
4539
4540	ASSERT(HDR_HAS_L1HDR(hdr));
4541	ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
4542	ASSERT(hdr->b_l1hdr.b_datacnt > 0);
4543	callback->awcb_ready(zio, buf, callback->awcb_private);
4544
4545	/*
4546	 * If the IO is already in progress, then this is a re-write
4547	 * attempt, so we need to thaw and re-compute the cksum.
4548	 * It is the responsibility of the callback to handle the
4549	 * accounting for any re-write attempt.
4550	 */
4551	if (HDR_IO_IN_PROGRESS(hdr)) {
4552		mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
4553		if (hdr->b_freeze_cksum != NULL) {
4554			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
4555			hdr->b_freeze_cksum = NULL;
4556		}
4557		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
4558	}
4559	arc_cksum_compute(buf, B_FALSE);
4560	hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
4561}
4562
4563/*
4564 * The SPA calls this callback for each physical write that happens on behalf
4565 * of a logical write.  See the comment in dbuf_write_physdone() for details.
4566 */
4567static void
4568arc_write_physdone(zio_t *zio)
4569{
4570	arc_write_callback_t *cb = zio->io_private;
4571	if (cb->awcb_physdone != NULL)
4572		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
4573}
4574
4575static void
4576arc_write_done(zio_t *zio)
4577{
4578	arc_write_callback_t *callback = zio->io_private;
4579	arc_buf_t *buf = callback->awcb_buf;
4580	arc_buf_hdr_t *hdr = buf->b_hdr;
4581
4582	ASSERT(hdr->b_l1hdr.b_acb == NULL);
4583
4584	if (zio->io_error == 0) {
4585		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
4586			buf_discard_identity(hdr);
4587		} else {
4588			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
4589			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
4590		}
4591	} else {
4592		ASSERT(BUF_EMPTY(hdr));
4593	}
4594
4595	/*
4596	 * If the block to be written was all-zero or compressed enough to be
4597	 * embedded in the BP, no write was performed so there will be no
4598	 * dva/birth/checksum.  The buffer must therefore remain anonymous
4599	 * (and uncached).
4600	 */
4601	if (!BUF_EMPTY(hdr)) {
4602		arc_buf_hdr_t *exists;
4603		kmutex_t *hash_lock;
4604
4605		ASSERT(zio->io_error == 0);
4606
4607		arc_cksum_verify(buf);
4608
4609		exists = buf_hash_insert(hdr, &hash_lock);
4610		if (exists != NULL) {
4611			/*
4612			 * This can only happen if we overwrite for
4613			 * sync-to-convergence, because we remove
4614			 * buffers from the hash table when we arc_free().
4615			 */
4616			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
4617				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
4618					panic("bad overwrite, hdr=%p exists=%p",
4619					    (void *)hdr, (void *)exists);
4620				ASSERT(refcount_is_zero(
4621				    &exists->b_l1hdr.b_refcnt));
4622				arc_change_state(arc_anon, exists, hash_lock);
4623				mutex_exit(hash_lock);
4624				arc_hdr_destroy(exists);
4625				exists = buf_hash_insert(hdr, &hash_lock);
4626				ASSERT3P(exists, ==, NULL);
4627			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
4628				/* nopwrite */
4629				ASSERT(zio->io_prop.zp_nopwrite);
4630				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
4631					panic("bad nopwrite, hdr=%p exists=%p",
4632					    (void *)hdr, (void *)exists);
4633			} else {
4634				/* Dedup */
4635				ASSERT(hdr->b_l1hdr.b_datacnt == 1);
4636				ASSERT(hdr->b_l1hdr.b_state == arc_anon);
4637				ASSERT(BP_GET_DEDUP(zio->io_bp));
4638				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
4639			}
4640		}
4641		hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
4642		/* if it's not anon, we are doing a scrub */
4643		if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
4644			arc_access(hdr, hash_lock);
4645		mutex_exit(hash_lock);
4646	} else {
4647		hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
4648	}
4649
4650	ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4651	callback->awcb_done(zio, buf, callback->awcb_private);
4652
4653	kmem_free(callback, sizeof (arc_write_callback_t));
4654}
4655
4656zio_t *
4657arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
4658    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
4659    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
4660    arc_done_func_t *done, void *private, zio_priority_t priority,
4661    int zio_flags, const zbookmark_phys_t *zb)
4662{
4663	arc_buf_hdr_t *hdr = buf->b_hdr;
4664	arc_write_callback_t *callback;
4665	zio_t *zio;
4666
4667	ASSERT(ready != NULL);
4668	ASSERT(done != NULL);
4669	ASSERT(!HDR_IO_ERROR(hdr));
4670	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4671	ASSERT(hdr->b_l1hdr.b_acb == NULL);
4672	ASSERT(hdr->b_l1hdr.b_datacnt > 0);
4673	if (l2arc)
4674		hdr->b_flags |= ARC_FLAG_L2CACHE;
4675	if (l2arc_compress)
4676		hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4677	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
4678	callback->awcb_ready = ready;
4679	callback->awcb_physdone = physdone;
4680	callback->awcb_done = done;
4681	callback->awcb_private = private;
4682	callback->awcb_buf = buf;
4683
4684	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
4685	    arc_write_ready, arc_write_physdone, arc_write_done, callback,
4686	    priority, zio_flags, zb);
4687
4688	return (zio);
4689}
4690
4691static int
4692arc_memory_throttle(uint64_t reserve, uint64_t txg)
4693{
4694#ifdef _KERNEL
4695	uint64_t available_memory = ptob(freemem);
4696	static uint64_t page_load = 0;
4697	static uint64_t last_txg = 0;
4698
4699#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
4700	available_memory =
4701	    MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE)));
4702#endif
4703
4704	if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
4705		return (0);
4706
4707	if (txg > last_txg) {
4708		last_txg = txg;
4709		page_load = 0;
4710	}
4711	/*
4712	 * If we are in pageout, we know that memory is already tight,
4713	 * the arc is already going to be evicting, so we just want to
4714	 * continue to let page writes occur as quickly as possible.
4715	 */
4716	if (curproc == pageproc) {
4717		if (page_load > MAX(ptob(minfree), available_memory) / 4)
4718			return (SET_ERROR(ERESTART));
4719		/* Note: reserve is inflated, so we deflate */
4720		page_load += reserve / 8;
4721		return (0);
4722	} else if (page_load > 0 && arc_reclaim_needed()) {
4723		/* memory is low, delay before restarting */
4724		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
4725		return (SET_ERROR(EAGAIN));
4726	}
4727	page_load = 0;
4728#endif
4729	return (0);
4730}
4731
4732void
4733arc_tempreserve_clear(uint64_t reserve)
4734{
4735	atomic_add_64(&arc_tempreserve, -reserve);
4736	ASSERT((int64_t)arc_tempreserve >= 0);
4737}
4738
4739int
4740arc_tempreserve_space(uint64_t reserve, uint64_t txg)
4741{
4742	int error;
4743	uint64_t anon_size;
4744
4745	if (reserve > arc_c/4 && !arc_no_grow) {
4746		arc_c = MIN(arc_c_max, reserve * 4);
4747		DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
4748	}
4749	if (reserve > arc_c)
4750		return (SET_ERROR(ENOMEM));
4751
4752	/*
4753	 * Don't count loaned bufs as in flight dirty data to prevent long
4754	 * network delays from blocking transactions that are ready to be
4755	 * assigned to a txg.
4756	 */
4757	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
4758
4759	/*
4760	 * Writes will, almost always, require additional memory allocations
4761	 * in order to compress/encrypt/etc the data.  We therefore need to
4762	 * make sure that there is sufficient available memory for this.
4763	 */
4764	error = arc_memory_throttle(reserve, txg);
4765	if (error != 0)
4766		return (error);
4767
4768	/*
4769	 * Throttle writes when the amount of dirty data in the cache
4770	 * gets too large.  We try to keep the cache less than half full
4771	 * of dirty blocks so that our sync times don't grow too large.
4772	 * Note: if two requests come in concurrently, we might let them
4773	 * both succeed, when one of them should fail.  Not a huge deal.
4774	 */
4775
4776	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
4777	    anon_size > arc_c / 4) {
4778		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
4779		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
4780		    arc_tempreserve>>10,
4781		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
4782		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
4783		    reserve>>10, arc_c>>10);
4784		return (SET_ERROR(ERESTART));
4785	}
4786	atomic_add_64(&arc_tempreserve, reserve);
4787	return (0);
4788}
4789
4790static void
4791arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
4792    kstat_named_t *evict_data, kstat_named_t *evict_metadata)
4793{
4794	size->value.ui64 = state->arcs_size;
4795	evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
4796	evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
4797}
4798
4799static int
4800arc_kstat_update(kstat_t *ksp, int rw)
4801{
4802	arc_stats_t *as = ksp->ks_data;
4803
4804	if (rw == KSTAT_WRITE) {
4805		return (EACCES);
4806	} else {
4807		arc_kstat_update_state(arc_anon,
4808		    &as->arcstat_anon_size,
4809		    &as->arcstat_anon_evictable_data,
4810		    &as->arcstat_anon_evictable_metadata);
4811		arc_kstat_update_state(arc_mru,
4812		    &as->arcstat_mru_size,
4813		    &as->arcstat_mru_evictable_data,
4814		    &as->arcstat_mru_evictable_metadata);
4815		arc_kstat_update_state(arc_mru_ghost,
4816		    &as->arcstat_mru_ghost_size,
4817		    &as->arcstat_mru_ghost_evictable_data,
4818		    &as->arcstat_mru_ghost_evictable_metadata);
4819		arc_kstat_update_state(arc_mfu,
4820		    &as->arcstat_mfu_size,
4821		    &as->arcstat_mfu_evictable_data,
4822		    &as->arcstat_mfu_evictable_metadata);
4823		arc_kstat_update_state(arc_mfu_ghost,
4824		    &as->arcstat_mfu_ghost_size,
4825		    &as->arcstat_mfu_ghost_evictable_data,
4826		    &as->arcstat_mfu_ghost_evictable_metadata);
4827	}
4828
4829	return (0);
4830}
4831
4832#ifdef _KERNEL
4833static eventhandler_tag arc_event_lowmem = NULL;
4834
4835static void
4836arc_lowmem(void *arg __unused, int howto __unused)
4837{
4838
4839	mutex_enter(&arc_reclaim_thr_lock);
4840	/* XXX: Memory deficit should be passed as argument. */
4841	needfree = btoc(arc_c >> arc_shrink_shift);
4842	DTRACE_PROBE(arc__needfree);
4843	cv_signal(&arc_reclaim_thr_cv);
4844
4845	/*
4846	 * It is unsafe to block here in arbitrary threads, because we can come
4847	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
4848	 * with ARC reclaim thread.
4849	 */
4850	if (curproc == pageproc)
4851		msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
4852	mutex_exit(&arc_reclaim_thr_lock);
4853}
4854#endif
4855
4856void
4857arc_init(void)
4858{
4859	int i, prefetch_tunable_set = 0;
4860
4861	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4862	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
4863
4864	/* Convert seconds to clock ticks */
4865	arc_min_prefetch_lifespan = 1 * hz;
4866
4867	/* Start out with 1/8 of all memory */
4868	arc_c = kmem_size() / 8;
4869
4870#ifdef illumos
4871#ifdef _KERNEL
4872	/*
4873	 * On architectures where the physical memory can be larger
4874	 * than the addressable space (intel in 32-bit mode), we may
4875	 * need to limit the cache to 1/8 of VM size.
4876	 */
4877	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
4878#endif
4879#endif	/* illumos */
4880	/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
4881	arc_c_min = MAX(arc_c / 4, 16 << 20);
4882	/* set max to 1/2 of all memory, or all but 1GB, whichever is more */
4883	if (arc_c * 8 >= 1 << 30)
4884		arc_c_max = (arc_c * 8) - (1 << 30);
4885	else
4886		arc_c_max = arc_c_min;
4887	arc_c_max = MAX(arc_c * 5, arc_c_max);
4888
4889#ifdef _KERNEL
4890	/*
4891	 * Allow the tunables to override our calculations if they are
4892	 * reasonable (ie. over 16MB)
4893	 */
4894	if (zfs_arc_max > 16 << 20 && zfs_arc_max < kmem_size())
4895		arc_c_max = zfs_arc_max;
4896	if (zfs_arc_min > 16 << 20 && zfs_arc_min <= arc_c_max)
4897		arc_c_min = zfs_arc_min;
4898#endif
4899
4900	arc_c = arc_c_max;
4901	arc_p = (arc_c >> 1);
4902
4903	/* limit meta-data to 1/4 of the arc capacity */
4904	arc_meta_limit = arc_c_max / 4;
4905
4906	/* Allow the tunable to override if it is reasonable */
4907	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4908		arc_meta_limit = zfs_arc_meta_limit;
4909
4910	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
4911		arc_c_min = arc_meta_limit / 2;
4912
4913	if (zfs_arc_meta_min > 0) {
4914		arc_meta_min = zfs_arc_meta_min;
4915	} else {
4916		arc_meta_min = arc_c_min / 2;
4917	}
4918
4919	if (zfs_arc_grow_retry > 0)
4920		arc_grow_retry = zfs_arc_grow_retry;
4921
4922	if (zfs_arc_shrink_shift > 0)
4923		arc_shrink_shift = zfs_arc_shrink_shift;
4924
4925	/*
4926	 * Ensure that arc_no_grow_shift is less than arc_shrink_shift.
4927	 */
4928	if (arc_no_grow_shift >= arc_shrink_shift)
4929		arc_no_grow_shift = arc_shrink_shift - 1;
4930
4931	if (zfs_arc_p_min_shift > 0)
4932		arc_p_min_shift = zfs_arc_p_min_shift;
4933
4934	/* if kmem_flags are set, lets try to use less memory */
4935	if (kmem_debugging())
4936		arc_c = arc_c / 2;
4937	if (arc_c < arc_c_min)
4938		arc_c = arc_c_min;
4939
4940	zfs_arc_min = arc_c_min;
4941	zfs_arc_max = arc_c_max;
4942
4943	arc_anon = &ARC_anon;
4944	arc_mru = &ARC_mru;
4945	arc_mru_ghost = &ARC_mru_ghost;
4946	arc_mfu = &ARC_mfu;
4947	arc_mfu_ghost = &ARC_mfu_ghost;
4948	arc_l2c_only = &ARC_l2c_only;
4949	arc_size = 0;
4950
4951	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4952		mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
4953		    NULL, MUTEX_DEFAULT, NULL);
4954		mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
4955		    NULL, MUTEX_DEFAULT, NULL);
4956		mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
4957		    NULL, MUTEX_DEFAULT, NULL);
4958		mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
4959		    NULL, MUTEX_DEFAULT, NULL);
4960		mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
4961		    NULL, MUTEX_DEFAULT, NULL);
4962		mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
4963		    NULL, MUTEX_DEFAULT, NULL);
4964
4965		list_create(&arc_mru->arcs_lists[i],
4966		    sizeof (arc_buf_hdr_t),
4967		    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4968		list_create(&arc_mru_ghost->arcs_lists[i],
4969		    sizeof (arc_buf_hdr_t),
4970		    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4971		list_create(&arc_mfu->arcs_lists[i],
4972		    sizeof (arc_buf_hdr_t),
4973		    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4974		list_create(&arc_mfu_ghost->arcs_lists[i],
4975		    sizeof (arc_buf_hdr_t),
4976		    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4977		list_create(&arc_mfu_ghost->arcs_lists[i],
4978		    sizeof (arc_buf_hdr_t),
4979		    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4980		list_create(&arc_l2c_only->arcs_lists[i],
4981		    sizeof (arc_buf_hdr_t),
4982		    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4983	}
4984
4985	buf_init();
4986
4987	arc_thread_exit = 0;
4988	arc_eviction_list = NULL;
4989	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4990	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4991
4992	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4993	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4994
4995	if (arc_ksp != NULL) {
4996		arc_ksp->ks_data = &arc_stats;
4997		arc_ksp->ks_update = arc_kstat_update;
4998		kstat_install(arc_ksp);
4999	}
5000
5001	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
5002	    TS_RUN, minclsyspri);
5003
5004#ifdef _KERNEL
5005	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
5006	    EVENTHANDLER_PRI_FIRST);
5007#endif
5008
5009	arc_dead = FALSE;
5010	arc_warm = B_FALSE;
5011
5012	/*
5013	 * Calculate maximum amount of dirty data per pool.
5014	 *
5015	 * If it has been set by /etc/system, take that.
5016	 * Otherwise, use a percentage of physical memory defined by
5017	 * zfs_dirty_data_max_percent (default 10%) with a cap at
5018	 * zfs_dirty_data_max_max (default 4GB).
5019	 */
5020	if (zfs_dirty_data_max == 0) {
5021		zfs_dirty_data_max = ptob(physmem) *
5022		    zfs_dirty_data_max_percent / 100;
5023		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
5024		    zfs_dirty_data_max_max);
5025	}
5026
5027#ifdef _KERNEL
5028	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
5029		prefetch_tunable_set = 1;
5030
5031#ifdef __i386__
5032	if (prefetch_tunable_set == 0) {
5033		printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
5034		    "-- to enable,\n");
5035		printf("            add \"vfs.zfs.prefetch_disable=0\" "
5036		    "to /boot/loader.conf.\n");
5037		zfs_prefetch_disable = 1;
5038	}
5039#else
5040	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
5041	    prefetch_tunable_set == 0) {
5042		printf("ZFS NOTICE: Prefetch is disabled by default if less "
5043		    "than 4GB of RAM is present;\n"
5044		    "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
5045		    "to /boot/loader.conf.\n");
5046		zfs_prefetch_disable = 1;
5047	}
5048#endif
5049	/* Warn about ZFS memory and address space requirements. */
5050	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
5051		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
5052		    "expect unstable behavior.\n");
5053	}
5054	if (kmem_size() < 512 * (1 << 20)) {
5055		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
5056		    "expect unstable behavior.\n");
5057		printf("             Consider tuning vm.kmem_size and "
5058		    "vm.kmem_size_max\n");
5059		printf("             in /boot/loader.conf.\n");
5060	}
5061#endif
5062}
5063
5064void
5065arc_fini(void)
5066{
5067	int i;
5068
5069	mutex_enter(&arc_reclaim_thr_lock);
5070	arc_thread_exit = 1;
5071	cv_signal(&arc_reclaim_thr_cv);
5072	while (arc_thread_exit != 0)
5073		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
5074	mutex_exit(&arc_reclaim_thr_lock);
5075
5076	arc_flush(NULL);
5077
5078	arc_dead = TRUE;
5079
5080	if (arc_ksp != NULL) {
5081		kstat_delete(arc_ksp);
5082		arc_ksp = NULL;
5083	}
5084
5085	mutex_destroy(&arc_eviction_mtx);
5086	mutex_destroy(&arc_reclaim_thr_lock);
5087	cv_destroy(&arc_reclaim_thr_cv);
5088
5089	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
5090		list_destroy(&arc_mru->arcs_lists[i]);
5091		list_destroy(&arc_mru_ghost->arcs_lists[i]);
5092		list_destroy(&arc_mfu->arcs_lists[i]);
5093		list_destroy(&arc_mfu_ghost->arcs_lists[i]);
5094		list_destroy(&arc_l2c_only->arcs_lists[i]);
5095
5096		mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
5097		mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
5098		mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
5099		mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
5100		mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
5101		mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
5102	}
5103
5104	buf_fini();
5105
5106	ASSERT0(arc_loaned_bytes);
5107
5108#ifdef _KERNEL
5109	if (arc_event_lowmem != NULL)
5110		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
5111#endif
5112}
5113
5114/*
5115 * Level 2 ARC
5116 *
5117 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
5118 * It uses dedicated storage devices to hold cached data, which are populated
5119 * using large infrequent writes.  The main role of this cache is to boost
5120 * the performance of random read workloads.  The intended L2ARC devices
5121 * include short-stroked disks, solid state disks, and other media with
5122 * substantially faster read latency than disk.
5123 *
5124 *                 +-----------------------+
5125 *                 |         ARC           |
5126 *                 +-----------------------+
5127 *                    |         ^     ^
5128 *                    |         |     |
5129 *      l2arc_feed_thread()    arc_read()
5130 *                    |         |     |
5131 *                    |  l2arc read   |
5132 *                    V         |     |
5133 *               +---------------+    |
5134 *               |     L2ARC     |    |
5135 *               +---------------+    |
5136 *                   |    ^           |
5137 *          l2arc_write() |           |
5138 *                   |    |           |
5139 *                   V    |           |
5140 *                 +-------+      +-------+
5141 *                 | vdev  |      | vdev  |
5142 *                 | cache |      | cache |
5143 *                 +-------+      +-------+
5144 *                 +=========+     .-----.
5145 *                 :  L2ARC  :    |-_____-|
5146 *                 : devices :    | Disks |
5147 *                 +=========+    `-_____-'
5148 *
5149 * Read requests are satisfied from the following sources, in order:
5150 *
5151 *	1) ARC
5152 *	2) vdev cache of L2ARC devices
5153 *	3) L2ARC devices
5154 *	4) vdev cache of disks
5155 *	5) disks
5156 *
5157 * Some L2ARC device types exhibit extremely slow write performance.
5158 * To accommodate for this there are some significant differences between
5159 * the L2ARC and traditional cache design:
5160 *
5161 * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
5162 * the ARC behave as usual, freeing buffers and placing headers on ghost
5163 * lists.  The ARC does not send buffers to the L2ARC during eviction as
5164 * this would add inflated write latencies for all ARC memory pressure.
5165 *
5166 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
5167 * It does this by periodically scanning buffers from the eviction-end of
5168 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
5169 * not already there. It scans until a headroom of buffers is satisfied,
5170 * which itself is a buffer for ARC eviction. If a compressible buffer is
5171 * found during scanning and selected for writing to an L2ARC device, we
5172 * temporarily boost scanning headroom during the next scan cycle to make
5173 * sure we adapt to compression effects (which might significantly reduce
5174 * the data volume we write to L2ARC). The thread that does this is
5175 * l2arc_feed_thread(), illustrated below; example sizes are included to
5176 * provide a better sense of ratio than this diagram:
5177 *
5178 *	       head -->                        tail
5179 *	        +---------------------+----------+
5180 *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
5181 *	        +---------------------+----------+   |   o L2ARC eligible
5182 *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
5183 *	        +---------------------+----------+   |
5184 *	             15.9 Gbytes      ^ 32 Mbytes    |
5185 *	                           headroom          |
5186 *	                                      l2arc_feed_thread()
5187 *	                                             |
5188 *	                 l2arc write hand <--[oooo]--'
5189 *	                         |           8 Mbyte
5190 *	                         |          write max
5191 *	                         V
5192 *		  +==============================+
5193 *	L2ARC dev |####|#|###|###|    |####| ... |
5194 *	          +==============================+
5195 *	                     32 Gbytes
5196 *
5197 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
5198 * evicted, then the L2ARC has cached a buffer much sooner than it probably
5199 * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
5200 * safe to say that this is an uncommon case, since buffers at the end of
5201 * the ARC lists have moved there due to inactivity.
5202 *
5203 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
5204 * then the L2ARC simply misses copying some buffers.  This serves as a
5205 * pressure valve to prevent heavy read workloads from both stalling the ARC
5206 * with waits and clogging the L2ARC with writes.  This also helps prevent
5207 * the potential for the L2ARC to churn if it attempts to cache content too
5208 * quickly, such as during backups of the entire pool.
5209 *
5210 * 5. After system boot and before the ARC has filled main memory, there are
5211 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
5212 * lists can remain mostly static.  Instead of searching from tail of these
5213 * lists as pictured, the l2arc_feed_thread() will search from the list heads
5214 * for eligible buffers, greatly increasing its chance of finding them.
5215 *
5216 * The L2ARC device write speed is also boosted during this time so that
5217 * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
5218 * there are no L2ARC reads, and no fear of degrading read performance
5219 * through increased writes.
5220 *
5221 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
5222 * the vdev queue can aggregate them into larger and fewer writes.  Each
5223 * device is written to in a rotor fashion, sweeping writes through
5224 * available space then repeating.
5225 *
5226 * 7. The L2ARC does not store dirty content.  It never needs to flush
5227 * write buffers back to disk based storage.
5228 *
5229 * 8. If an ARC buffer is written (and dirtied) which also exists in the
5230 * L2ARC, the now stale L2ARC buffer is immediately dropped.
5231 *
5232 * The performance of the L2ARC can be tweaked by a number of tunables, which
5233 * may be necessary for different workloads:
5234 *
5235 *	l2arc_write_max		max write bytes per interval
5236 *	l2arc_write_boost	extra write bytes during device warmup
5237 *	l2arc_noprefetch	skip caching prefetched buffers
5238 *	l2arc_headroom		number of max device writes to precache
5239 *	l2arc_headroom_boost	when we find compressed buffers during ARC
5240 *				scanning, we multiply headroom by this
5241 *				percentage factor for the next scan cycle,
5242 *				since more compressed buffers are likely to
5243 *				be present
5244 *	l2arc_feed_secs		seconds between L2ARC writing
5245 *
5246 * Tunables may be removed or added as future performance improvements are
5247 * integrated, and also may become zpool properties.
5248 *
5249 * There are three key functions that control how the L2ARC warms up:
5250 *
5251 *	l2arc_write_eligible()	check if a buffer is eligible to cache
5252 *	l2arc_write_size()	calculate how much to write
5253 *	l2arc_write_interval()	calculate sleep delay between writes
5254 *
5255 * These three functions determine what to write, how much, and how quickly
5256 * to send writes.
5257 */
5258
5259static boolean_t
5260l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
5261{
5262	/*
5263	 * A buffer is *not* eligible for the L2ARC if it:
5264	 * 1. belongs to a different spa.
5265	 * 2. is already cached on the L2ARC.
5266	 * 3. has an I/O in progress (it may be an incomplete read).
5267	 * 4. is flagged not eligible (zfs property).
5268	 */
5269	if (hdr->b_spa != spa_guid) {
5270		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
5271		return (B_FALSE);
5272	}
5273	if (HDR_HAS_L2HDR(hdr)) {
5274		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
5275		return (B_FALSE);
5276	}
5277	if (HDR_IO_IN_PROGRESS(hdr)) {
5278		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
5279		return (B_FALSE);
5280	}
5281	if (!HDR_L2CACHE(hdr)) {
5282		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
5283		return (B_FALSE);
5284	}
5285
5286	return (B_TRUE);
5287}
5288
5289static uint64_t
5290l2arc_write_size(void)
5291{
5292	uint64_t size;
5293
5294	/*
5295	 * Make sure our globals have meaningful values in case the user
5296	 * altered them.
5297	 */
5298	size = l2arc_write_max;
5299	if (size == 0) {
5300		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
5301		    "be greater than zero, resetting it to the default (%d)",
5302		    L2ARC_WRITE_SIZE);
5303		size = l2arc_write_max = L2ARC_WRITE_SIZE;
5304	}
5305
5306	if (arc_warm == B_FALSE)
5307		size += l2arc_write_boost;
5308
5309	return (size);
5310
5311}
5312
5313static clock_t
5314l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
5315{
5316	clock_t interval, next, now;
5317
5318	/*
5319	 * If the ARC lists are busy, increase our write rate; if the
5320	 * lists are stale, idle back.  This is achieved by checking
5321	 * how much we previously wrote - if it was more than half of
5322	 * what we wanted, schedule the next write much sooner.
5323	 */
5324	if (l2arc_feed_again && wrote > (wanted / 2))
5325		interval = (hz * l2arc_feed_min_ms) / 1000;
5326	else
5327		interval = hz * l2arc_feed_secs;
5328
5329	now = ddi_get_lbolt();
5330	next = MAX(now, MIN(now + interval, began + interval));
5331
5332	return (next);
5333}
5334
5335/*
5336 * Cycle through L2ARC devices.  This is how L2ARC load balances.
5337 * If a device is returned, this also returns holding the spa config lock.
5338 */
5339static l2arc_dev_t *
5340l2arc_dev_get_next(void)
5341{
5342	l2arc_dev_t *first, *next = NULL;
5343
5344	/*
5345	 * Lock out the removal of spas (spa_namespace_lock), then removal
5346	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
5347	 * both locks will be dropped and a spa config lock held instead.
5348	 */
5349	mutex_enter(&spa_namespace_lock);
5350	mutex_enter(&l2arc_dev_mtx);
5351
5352	/* if there are no vdevs, there is nothing to do */
5353	if (l2arc_ndev == 0)
5354		goto out;
5355
5356	first = NULL;
5357	next = l2arc_dev_last;
5358	do {
5359		/* loop around the list looking for a non-faulted vdev */
5360		if (next == NULL) {
5361			next = list_head(l2arc_dev_list);
5362		} else {
5363			next = list_next(l2arc_dev_list, next);
5364			if (next == NULL)
5365				next = list_head(l2arc_dev_list);
5366		}
5367
5368		/* if we have come back to the start, bail out */
5369		if (first == NULL)
5370			first = next;
5371		else if (next == first)
5372			break;
5373
5374	} while (vdev_is_dead(next->l2ad_vdev));
5375
5376	/* if we were unable to find any usable vdevs, return NULL */
5377	if (vdev_is_dead(next->l2ad_vdev))
5378		next = NULL;
5379
5380	l2arc_dev_last = next;
5381
5382out:
5383	mutex_exit(&l2arc_dev_mtx);
5384
5385	/*
5386	 * Grab the config lock to prevent the 'next' device from being
5387	 * removed while we are writing to it.
5388	 */
5389	if (next != NULL)
5390		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
5391	mutex_exit(&spa_namespace_lock);
5392
5393	return (next);
5394}
5395
5396/*
5397 * Free buffers that were tagged for destruction.
5398 */
5399static void
5400l2arc_do_free_on_write()
5401{
5402	list_t *buflist;
5403	l2arc_data_free_t *df, *df_prev;
5404
5405	mutex_enter(&l2arc_free_on_write_mtx);
5406	buflist = l2arc_free_on_write;
5407
5408	for (df = list_tail(buflist); df; df = df_prev) {
5409		df_prev = list_prev(buflist, df);
5410		ASSERT(df->l2df_data != NULL);
5411		ASSERT(df->l2df_func != NULL);
5412		df->l2df_func(df->l2df_data, df->l2df_size);
5413		list_remove(buflist, df);
5414		kmem_free(df, sizeof (l2arc_data_free_t));
5415	}
5416
5417	mutex_exit(&l2arc_free_on_write_mtx);
5418}
5419
5420/*
5421 * A write to a cache device has completed.  Update all headers to allow
5422 * reads from these buffers to begin.
5423 */
5424static void
5425l2arc_write_done(zio_t *zio)
5426{
5427	l2arc_write_callback_t *cb;
5428	l2arc_dev_t *dev;
5429	list_t *buflist;
5430	arc_buf_hdr_t *head, *hdr, *hdr_prev;
5431	kmutex_t *hash_lock;
5432	int64_t bytes_dropped = 0;
5433
5434	cb = zio->io_private;
5435	ASSERT(cb != NULL);
5436	dev = cb->l2wcb_dev;
5437	ASSERT(dev != NULL);
5438	head = cb->l2wcb_head;
5439	ASSERT(head != NULL);
5440	buflist = &dev->l2ad_buflist;
5441	ASSERT(buflist != NULL);
5442	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
5443	    l2arc_write_callback_t *, cb);
5444
5445	if (zio->io_error != 0)
5446		ARCSTAT_BUMP(arcstat_l2_writes_error);
5447
5448	mutex_enter(&dev->l2ad_mtx);
5449
5450	/*
5451	 * All writes completed, or an error was hit.
5452	 */
5453	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
5454		hdr_prev = list_prev(buflist, hdr);
5455
5456		hash_lock = HDR_LOCK(hdr);
5457		if (!mutex_tryenter(hash_lock)) {
5458			/*
5459			 * This buffer misses out.  It may be in a stage
5460			 * of eviction.  Its ARC_FLAG_L2_WRITING flag will be
5461			 * left set, denying reads to this buffer.
5462			 */
5463			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
5464			continue;
5465		}
5466
5467		/*
5468		 * It's possible that this buffer got evicted from the L1 cache
5469		 * before we grabbed the vdev + hash locks, in which case
5470		 * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated.
5471		 * Only free the buffer if we still have an L1 hdr.
5472		 */
5473		if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL &&
5474		    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
5475			l2arc_release_cdata_buf(hdr);
5476
5477		if (zio->io_error != 0) {
5478			/*
5479			 * Error - drop L2ARC entry.
5480			 */
5481			trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev,
5482			    hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0);
5483			hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
5484
5485			ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
5486			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
5487
5488			bytes_dropped += hdr->b_l2hdr.b_asize;
5489			(void) refcount_remove_many(&dev->l2ad_alloc,
5490			    hdr->b_l2hdr.b_asize, hdr);
5491		}
5492
5493		/*
5494		 * Allow ARC to begin reads to this L2ARC entry.
5495		 */
5496		hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
5497
5498		mutex_exit(hash_lock);
5499	}
5500
5501	atomic_inc_64(&l2arc_writes_done);
5502	list_remove(buflist, head);
5503	ASSERT(!HDR_HAS_L1HDR(head));
5504	kmem_cache_free(hdr_l2only_cache, head);
5505	mutex_exit(&dev->l2ad_mtx);
5506
5507	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
5508
5509	l2arc_do_free_on_write();
5510
5511	kmem_free(cb, sizeof (l2arc_write_callback_t));
5512}
5513
5514/*
5515 * A read to a cache device completed.  Validate buffer contents before
5516 * handing over to the regular ARC routines.
5517 */
5518static void
5519l2arc_read_done(zio_t *zio)
5520{
5521	l2arc_read_callback_t *cb;
5522	arc_buf_hdr_t *hdr;
5523	arc_buf_t *buf;
5524	kmutex_t *hash_lock;
5525	int equal;
5526
5527	ASSERT(zio->io_vd != NULL);
5528	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
5529
5530	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
5531
5532	cb = zio->io_private;
5533	ASSERT(cb != NULL);
5534	buf = cb->l2rcb_buf;
5535	ASSERT(buf != NULL);
5536
5537	hash_lock = HDR_LOCK(buf->b_hdr);
5538	mutex_enter(hash_lock);
5539	hdr = buf->b_hdr;
5540	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
5541
5542	/*
5543	 * If the buffer was compressed, decompress it first.
5544	 */
5545	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
5546		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
5547	ASSERT(zio->io_data != NULL);
5548
5549	/*
5550	 * Check this survived the L2ARC journey.
5551	 */
5552	equal = arc_cksum_equal(buf);
5553	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
5554		mutex_exit(hash_lock);
5555		zio->io_private = buf;
5556		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
5557		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
5558		arc_read_done(zio);
5559	} else {
5560		mutex_exit(hash_lock);
5561		/*
5562		 * Buffer didn't survive caching.  Increment stats and
5563		 * reissue to the original storage device.
5564		 */
5565		if (zio->io_error != 0) {
5566			ARCSTAT_BUMP(arcstat_l2_io_error);
5567		} else {
5568			zio->io_error = SET_ERROR(EIO);
5569		}
5570		if (!equal)
5571			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
5572
5573		/*
5574		 * If there's no waiter, issue an async i/o to the primary
5575		 * storage now.  If there *is* a waiter, the caller must
5576		 * issue the i/o in a context where it's OK to block.
5577		 */
5578		if (zio->io_waiter == NULL) {
5579			zio_t *pio = zio_unique_parent(zio);
5580
5581			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
5582
5583			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
5584			    buf->b_data, zio->io_size, arc_read_done, buf,
5585			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
5586		}
5587	}
5588
5589	kmem_free(cb, sizeof (l2arc_read_callback_t));
5590}
5591
5592/*
5593 * This is the list priority from which the L2ARC will search for pages to
5594 * cache.  This is used within loops (0..3) to cycle through lists in the
5595 * desired order.  This order can have a significant effect on cache
5596 * performance.
5597 *
5598 * Currently the metadata lists are hit first, MFU then MRU, followed by
5599 * the data lists.  This function returns a locked list, and also returns
5600 * the lock pointer.
5601 */
5602static list_t *
5603l2arc_list_locked(int list_num, kmutex_t **lock)
5604{
5605	list_t *list = NULL;
5606	int idx;
5607
5608	ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
5609
5610	if (list_num < ARC_BUFC_NUMMETADATALISTS) {
5611		idx = list_num;
5612		list = &arc_mfu->arcs_lists[idx];
5613		*lock = ARCS_LOCK(arc_mfu, idx);
5614	} else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
5615		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
5616		list = &arc_mru->arcs_lists[idx];
5617		*lock = ARCS_LOCK(arc_mru, idx);
5618	} else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
5619		ARC_BUFC_NUMDATALISTS)) {
5620		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
5621		list = &arc_mfu->arcs_lists[idx];
5622		*lock = ARCS_LOCK(arc_mfu, idx);
5623	} else {
5624		idx = list_num - ARC_BUFC_NUMLISTS;
5625		list = &arc_mru->arcs_lists[idx];
5626		*lock = ARCS_LOCK(arc_mru, idx);
5627	}
5628
5629	ASSERT(!(MUTEX_HELD(*lock)));
5630	mutex_enter(*lock);
5631	return (list);
5632}
5633
5634/*
5635 * Evict buffers from the device write hand to the distance specified in
5636 * bytes.  This distance may span populated buffers, it may span nothing.
5637 * This is clearing a region on the L2ARC device ready for writing.
5638 * If the 'all' boolean is set, every buffer is evicted.
5639 */
5640static void
5641l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
5642{
5643	list_t *buflist;
5644	arc_buf_hdr_t *hdr, *hdr_prev;
5645	kmutex_t *hash_lock;
5646	uint64_t taddr;
5647
5648	buflist = &dev->l2ad_buflist;
5649
5650	if (!all && dev->l2ad_first) {
5651		/*
5652		 * This is the first sweep through the device.  There is
5653		 * nothing to evict.
5654		 */
5655		return;
5656	}
5657
5658	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
5659		/*
5660		 * When nearing the end of the device, evict to the end
5661		 * before the device write hand jumps to the start.
5662		 */
5663		taddr = dev->l2ad_end;
5664	} else {
5665		taddr = dev->l2ad_hand + distance;
5666	}
5667	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
5668	    uint64_t, taddr, boolean_t, all);
5669
5670top:
5671	mutex_enter(&dev->l2ad_mtx);
5672	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
5673		hdr_prev = list_prev(buflist, hdr);
5674
5675		hash_lock = HDR_LOCK(hdr);
5676		if (!mutex_tryenter(hash_lock)) {
5677			/*
5678			 * Missed the hash lock.  Retry.
5679			 */
5680			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
5681			mutex_exit(&dev->l2ad_mtx);
5682			mutex_enter(hash_lock);
5683			mutex_exit(hash_lock);
5684			goto top;
5685		}
5686
5687		if (HDR_L2_WRITE_HEAD(hdr)) {
5688			/*
5689			 * We hit a write head node.  Leave it for
5690			 * l2arc_write_done().
5691			 */
5692			list_remove(buflist, hdr);
5693			mutex_exit(hash_lock);
5694			continue;
5695		}
5696
5697		if (!all && HDR_HAS_L2HDR(hdr) &&
5698		    (hdr->b_l2hdr.b_daddr > taddr ||
5699		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
5700			/*
5701			 * We've evicted to the target address,
5702			 * or the end of the device.
5703			 */
5704			mutex_exit(hash_lock);
5705			break;
5706		}
5707
5708		ASSERT(HDR_HAS_L2HDR(hdr));
5709		if (!HDR_HAS_L1HDR(hdr)) {
5710			ASSERT(!HDR_L2_READING(hdr));
5711			/*
5712			 * This doesn't exist in the ARC.  Destroy.
5713			 * arc_hdr_destroy() will call list_remove()
5714			 * and decrement arcstat_l2_size.
5715			 */
5716			arc_change_state(arc_anon, hdr, hash_lock);
5717			arc_hdr_destroy(hdr);
5718		} else {
5719			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
5720			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
5721			/*
5722			 * Invalidate issued or about to be issued
5723			 * reads, since we may be about to write
5724			 * over this location.
5725			 */
5726			if (HDR_L2_READING(hdr)) {
5727				ARCSTAT_BUMP(arcstat_l2_evict_reading);
5728				hdr->b_flags |= ARC_FLAG_L2_EVICTED;
5729			}
5730
5731			arc_hdr_l2hdr_destroy(hdr);
5732		}
5733		mutex_exit(hash_lock);
5734	}
5735	mutex_exit(&dev->l2ad_mtx);
5736}
5737
5738/*
5739 * Find and write ARC buffers to the L2ARC device.
5740 *
5741 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
5742 * for reading until they have completed writing.
5743 * The headroom_boost is an in-out parameter used to maintain headroom boost
5744 * state between calls to this function.
5745 *
5746 * Returns the number of bytes actually written (which may be smaller than
5747 * the delta by which the device hand has changed due to alignment).
5748 */
5749static uint64_t
5750l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
5751    boolean_t *headroom_boost)
5752{
5753	arc_buf_hdr_t *hdr, *hdr_prev, *head;
5754	list_t *list;
5755	uint64_t write_asize, write_psize, write_sz, headroom,
5756	    buf_compress_minsz;
5757	void *buf_data;
5758	kmutex_t *list_lock;
5759	boolean_t full;
5760	l2arc_write_callback_t *cb;
5761	zio_t *pio, *wzio;
5762	uint64_t guid = spa_load_guid(spa);
5763	const boolean_t do_headroom_boost = *headroom_boost;
5764	int try;
5765
5766	ASSERT(dev->l2ad_vdev != NULL);
5767
5768	/* Lower the flag now, we might want to raise it again later. */
5769	*headroom_boost = B_FALSE;
5770
5771	pio = NULL;
5772	write_sz = write_asize = write_psize = 0;
5773	full = B_FALSE;
5774	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
5775	head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
5776	head->b_flags |= ARC_FLAG_HAS_L2HDR;
5777
5778	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
5779	/*
5780	 * We will want to try to compress buffers that are at least 2x the
5781	 * device sector size.
5782	 */
5783	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
5784
5785	/*
5786	 * Copy buffers for L2ARC writing.
5787	 */
5788	mutex_enter(&dev->l2ad_mtx);
5789	for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
5790		uint64_t passed_sz = 0;
5791
5792		list = l2arc_list_locked(try, &list_lock);
5793		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
5794
5795		/*
5796		 * L2ARC fast warmup.
5797		 *
5798		 * Until the ARC is warm and starts to evict, read from the
5799		 * head of the ARC lists rather than the tail.
5800		 */
5801		if (arc_warm == B_FALSE)
5802			hdr = list_head(list);
5803		else
5804			hdr = list_tail(list);
5805		if (hdr == NULL)
5806			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
5807
5808		headroom = target_sz * l2arc_headroom * 2 / ARC_BUFC_NUMLISTS;
5809		if (do_headroom_boost)
5810			headroom = (headroom * l2arc_headroom_boost) / 100;
5811
5812		for (; hdr; hdr = hdr_prev) {
5813			kmutex_t *hash_lock;
5814			uint64_t buf_sz;
5815
5816			if (arc_warm == B_FALSE)
5817				hdr_prev = list_next(list, hdr);
5818			else
5819				hdr_prev = list_prev(list, hdr);
5820			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size);
5821
5822			hash_lock = HDR_LOCK(hdr);
5823			if (!mutex_tryenter(hash_lock)) {
5824				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
5825				/*
5826				 * Skip this buffer rather than waiting.
5827				 */
5828				continue;
5829			}
5830
5831			passed_sz += hdr->b_size;
5832			if (passed_sz > headroom) {
5833				/*
5834				 * Searched too far.
5835				 */
5836				mutex_exit(hash_lock);
5837				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
5838				break;
5839			}
5840
5841			if (!l2arc_write_eligible(guid, hdr)) {
5842				mutex_exit(hash_lock);
5843				continue;
5844			}
5845
5846			if ((write_sz + hdr->b_size) > target_sz) {
5847				full = B_TRUE;
5848				mutex_exit(hash_lock);
5849				ARCSTAT_BUMP(arcstat_l2_write_full);
5850				break;
5851			}
5852
5853			if (pio == NULL) {
5854				/*
5855				 * Insert a dummy header on the buflist so
5856				 * l2arc_write_done() can find where the
5857				 * write buffers begin without searching.
5858				 */
5859				list_insert_head(&dev->l2ad_buflist, head);
5860
5861				cb = kmem_alloc(
5862				    sizeof (l2arc_write_callback_t), KM_SLEEP);
5863				cb->l2wcb_dev = dev;
5864				cb->l2wcb_head = head;
5865				pio = zio_root(spa, l2arc_write_done, cb,
5866				    ZIO_FLAG_CANFAIL);
5867				ARCSTAT_BUMP(arcstat_l2_write_pios);
5868			}
5869
5870			/*
5871			 * Create and add a new L2ARC header.
5872			 */
5873			hdr->b_l2hdr.b_dev = dev;
5874			hdr->b_flags |= ARC_FLAG_L2_WRITING;
5875			/*
5876			 * Temporarily stash the data buffer in b_tmp_cdata.
5877			 * The subsequent write step will pick it up from
5878			 * there. This is because can't access b_l1hdr.b_buf
5879			 * without holding the hash_lock, which we in turn
5880			 * can't access without holding the ARC list locks
5881			 * (which we want to avoid during compression/writing).
5882			 */
5883			HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
5884			hdr->b_l2hdr.b_asize = hdr->b_size;
5885			hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
5886
5887			/*
5888			 * Explicitly set the b_daddr field to a known
5889			 * value which means "invalid address". This
5890			 * enables us to differentiate which stage of
5891			 * l2arc_write_buffers() the particular header
5892			 * is in (e.g. this loop, or the one below).
5893			 * ARC_FLAG_L2_WRITING is not enough to make
5894			 * this distinction, and we need to know in
5895			 * order to do proper l2arc vdev accounting in
5896			 * arc_release() and arc_hdr_destroy().
5897			 *
5898			 * Note, we can't use a new flag to distinguish
5899			 * the two stages because we don't hold the
5900			 * header's hash_lock below, in the second stage
5901			 * of this function. Thus, we can't simply
5902			 * change the b_flags field to denote that the
5903			 * IO has been sent. We can change the b_daddr
5904			 * field of the L2 portion, though, since we'll
5905			 * be holding the l2ad_mtx; which is why we're
5906			 * using it to denote the header's state change.
5907			 */
5908			hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
5909
5910			buf_sz = hdr->b_size;
5911			hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
5912
5913			list_insert_head(&dev->l2ad_buflist, hdr);
5914
5915			/*
5916			 * Compute and store the buffer cksum before
5917			 * writing.  On debug the cksum is verified first.
5918			 */
5919			arc_cksum_verify(hdr->b_l1hdr.b_buf);
5920			arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
5921
5922			mutex_exit(hash_lock);
5923
5924			write_sz += buf_sz;
5925		}
5926
5927		mutex_exit(list_lock);
5928
5929		if (full == B_TRUE)
5930			break;
5931	}
5932
5933	/* No buffers selected for writing? */
5934	if (pio == NULL) {
5935		ASSERT0(write_sz);
5936		mutex_exit(&dev->l2ad_mtx);
5937		ASSERT(!HDR_HAS_L1HDR(head));
5938		kmem_cache_free(hdr_l2only_cache, head);
5939		return (0);
5940	}
5941
5942	/*
5943	 * Now start writing the buffers. We're starting at the write head
5944	 * and work backwards, retracing the course of the buffer selector
5945	 * loop above.
5946	 */
5947	for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
5948	    hdr = list_prev(&dev->l2ad_buflist, hdr)) {
5949		uint64_t buf_sz;
5950
5951		/*
5952		 * We shouldn't need to lock the buffer here, since we flagged
5953		 * it as ARC_FLAG_L2_WRITING in the previous step, but we must
5954		 * take care to only access its L2 cache parameters. In
5955		 * particular, hdr->l1hdr.b_buf may be invalid by now due to
5956		 * ARC eviction.
5957		 */
5958		hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
5959
5960		if ((HDR_L2COMPRESS(hdr)) &&
5961		    hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
5962			if (l2arc_compress_buf(hdr)) {
5963				/*
5964				 * If compression succeeded, enable headroom
5965				 * boost on the next scan cycle.
5966				 */
5967				*headroom_boost = B_TRUE;
5968			}
5969		}
5970
5971		/*
5972		 * Pick up the buffer data we had previously stashed away
5973		 * (and now potentially also compressed).
5974		 */
5975		buf_data = hdr->b_l1hdr.b_tmp_cdata;
5976		buf_sz = hdr->b_l2hdr.b_asize;
5977
5978		/*
5979		 * If the data has not been compressed, then clear b_tmp_cdata
5980		 * to make sure that it points only to a temporary compression
5981		 * buffer.
5982		 */
5983		if (!L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)))
5984			hdr->b_l1hdr.b_tmp_cdata = NULL;
5985
5986		/*
5987		 * We need to do this regardless if buf_sz is zero or
5988		 * not, otherwise, when this l2hdr is evicted we'll
5989		 * remove a reference that was never added.
5990		 */
5991		(void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);
5992
5993		/* Compression may have squashed the buffer to zero length. */
5994		if (buf_sz != 0) {
5995			uint64_t buf_p_sz;
5996
5997			wzio = zio_write_phys(pio, dev->l2ad_vdev,
5998			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5999			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
6000			    ZIO_FLAG_CANFAIL, B_FALSE);
6001
6002			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6003			    zio_t *, wzio);
6004			(void) zio_nowait(wzio);
6005
6006			write_asize += buf_sz;
6007
6008			/*
6009			 * Keep the clock hand suitably device-aligned.
6010			 */
6011			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
6012			write_psize += buf_p_sz;
6013			dev->l2ad_hand += buf_p_sz;
6014		}
6015	}
6016
6017	mutex_exit(&dev->l2ad_mtx);
6018
6019	ASSERT3U(write_asize, <=, target_sz);
6020	ARCSTAT_BUMP(arcstat_l2_writes_sent);
6021	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
6022	ARCSTAT_INCR(arcstat_l2_size, write_sz);
6023	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
6024	vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
6025
6026	/*
6027	 * Bump device hand to the device start if it is approaching the end.
6028	 * l2arc_evict() will already have evicted ahead for this case.
6029	 */
6030	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
6031		dev->l2ad_hand = dev->l2ad_start;
6032		dev->l2ad_first = B_FALSE;
6033	}
6034
6035	dev->l2ad_writing = B_TRUE;
6036	(void) zio_wait(pio);
6037	dev->l2ad_writing = B_FALSE;
6038
6039	return (write_asize);
6040}
6041
6042/*
6043 * Compresses an L2ARC buffer.
6044 * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its
6045 * size in l2hdr->b_asize. This routine tries to compress the data and
6046 * depending on the compression result there are three possible outcomes:
6047 * *) The buffer was incompressible. The original l2hdr contents were left
6048 *    untouched and are ready for writing to an L2 device.
6049 * *) The buffer was all-zeros, so there is no need to write it to an L2
6050 *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
6051 *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
6052 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
6053 *    data buffer which holds the compressed data to be written, and b_asize
6054 *    tells us how much data there is. b_compress is set to the appropriate
6055 *    compression algorithm. Once writing is done, invoke
6056 *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
6057 *
6058 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
6059 * buffer was incompressible).
6060 */
6061static boolean_t
6062l2arc_compress_buf(arc_buf_hdr_t *hdr)
6063{
6064	void *cdata;
6065	size_t csize, len, rounded;
6066	ASSERT(HDR_HAS_L2HDR(hdr));
6067	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
6068
6069	ASSERT(HDR_HAS_L1HDR(hdr));
6070	ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF);
6071	ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6072
6073	len = l2hdr->b_asize;
6074	cdata = zio_data_buf_alloc(len);
6075	ASSERT3P(cdata, !=, NULL);
6076	csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
6077	    cdata, l2hdr->b_asize);
6078
6079	if (csize == 0) {
6080		/* zero block, indicate that there's nothing to write */
6081		zio_data_buf_free(cdata, len);
6082		HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY);
6083		l2hdr->b_asize = 0;
6084		hdr->b_l1hdr.b_tmp_cdata = NULL;
6085		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
6086		return (B_TRUE);
6087	}
6088
6089	rounded = P2ROUNDUP(csize,
6090	    (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift);
6091	if (rounded < len) {
6092		/*
6093		 * Compression succeeded, we'll keep the cdata around for
6094		 * writing and release it afterwards.
6095		 */
6096		if (rounded > csize) {
6097			bzero((char *)cdata + csize, rounded - csize);
6098			csize = rounded;
6099		}
6100		HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4);
6101		l2hdr->b_asize = csize;
6102		hdr->b_l1hdr.b_tmp_cdata = cdata;
6103		ARCSTAT_BUMP(arcstat_l2_compress_successes);
6104		return (B_TRUE);
6105	} else {
6106		/*
6107		 * Compression failed, release the compressed buffer.
6108		 * l2hdr will be left unmodified.
6109		 */
6110		zio_data_buf_free(cdata, len);
6111		ARCSTAT_BUMP(arcstat_l2_compress_failures);
6112		return (B_FALSE);
6113	}
6114}
6115
6116/*
6117 * Decompresses a zio read back from an l2arc device. On success, the
6118 * underlying zio's io_data buffer is overwritten by the uncompressed
6119 * version. On decompression error (corrupt compressed stream), the
6120 * zio->io_error value is set to signal an I/O error.
6121 *
6122 * Please note that the compressed data stream is not checksummed, so
6123 * if the underlying device is experiencing data corruption, we may feed
6124 * corrupt data to the decompressor, so the decompressor needs to be
6125 * able to handle this situation (LZ4 does).
6126 */
6127static void
6128l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
6129{
6130	ASSERT(L2ARC_IS_VALID_COMPRESS(c));
6131
6132	if (zio->io_error != 0) {
6133		/*
6134		 * An io error has occured, just restore the original io
6135		 * size in preparation for a main pool read.
6136		 */
6137		zio->io_orig_size = zio->io_size = hdr->b_size;
6138		return;
6139	}
6140
6141	if (c == ZIO_COMPRESS_EMPTY) {
6142		/*
6143		 * An empty buffer results in a null zio, which means we
6144		 * need to fill its io_data after we're done restoring the
6145		 * buffer's contents.
6146		 */
6147		ASSERT(hdr->b_l1hdr.b_buf != NULL);
6148		bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
6149		zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
6150	} else {
6151		ASSERT(zio->io_data != NULL);
6152		/*
6153		 * We copy the compressed data from the start of the arc buffer
6154		 * (the zio_read will have pulled in only what we need, the
6155		 * rest is garbage which we will overwrite at decompression)
6156		 * and then decompress back to the ARC data buffer. This way we
6157		 * can minimize copying by simply decompressing back over the
6158		 * original compressed data (rather than decompressing to an
6159		 * aux buffer and then copying back the uncompressed buffer,
6160		 * which is likely to be much larger).
6161		 */
6162		uint64_t csize;
6163		void *cdata;
6164
6165		csize = zio->io_size;
6166		cdata = zio_data_buf_alloc(csize);
6167		bcopy(zio->io_data, cdata, csize);
6168		if (zio_decompress_data(c, cdata, zio->io_data, csize,
6169		    hdr->b_size) != 0)
6170			zio->io_error = EIO;
6171		zio_data_buf_free(cdata, csize);
6172	}
6173
6174	/* Restore the expected uncompressed IO size. */
6175	zio->io_orig_size = zio->io_size = hdr->b_size;
6176}
6177
6178/*
6179 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
6180 * This buffer serves as a temporary holder of compressed data while
6181 * the buffer entry is being written to an l2arc device. Once that is
6182 * done, we can dispose of it.
6183 */
6184static void
6185l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
6186{
6187	ASSERT(HDR_HAS_L1HDR(hdr));
6188	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) {
6189		/*
6190		 * If the data was compressed, then we've allocated a
6191		 * temporary buffer for it, so now we need to release it.
6192		 */
6193		ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6194		zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
6195		    hdr->b_size);
6196		hdr->b_l1hdr.b_tmp_cdata = NULL;
6197	} else {
6198		ASSERT(hdr->b_l1hdr.b_tmp_cdata == NULL);
6199	}
6200}
6201
6202/*
6203 * This thread feeds the L2ARC at regular intervals.  This is the beating
6204 * heart of the L2ARC.
6205 */
6206static void
6207l2arc_feed_thread(void *dummy __unused)
6208{
6209	callb_cpr_t cpr;
6210	l2arc_dev_t *dev;
6211	spa_t *spa;
6212	uint64_t size, wrote;
6213	clock_t begin, next = ddi_get_lbolt();
6214	boolean_t headroom_boost = B_FALSE;
6215
6216	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
6217
6218	mutex_enter(&l2arc_feed_thr_lock);
6219
6220	while (l2arc_thread_exit == 0) {
6221		CALLB_CPR_SAFE_BEGIN(&cpr);
6222		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
6223		    next - ddi_get_lbolt());
6224		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
6225		next = ddi_get_lbolt() + hz;
6226
6227		/*
6228		 * Quick check for L2ARC devices.
6229		 */
6230		mutex_enter(&l2arc_dev_mtx);
6231		if (l2arc_ndev == 0) {
6232			mutex_exit(&l2arc_dev_mtx);
6233			continue;
6234		}
6235		mutex_exit(&l2arc_dev_mtx);
6236		begin = ddi_get_lbolt();
6237
6238		/*
6239		 * This selects the next l2arc device to write to, and in
6240		 * doing so the next spa to feed from: dev->l2ad_spa.   This
6241		 * will return NULL if there are now no l2arc devices or if
6242		 * they are all faulted.
6243		 *
6244		 * If a device is returned, its spa's config lock is also
6245		 * held to prevent device removal.  l2arc_dev_get_next()
6246		 * will grab and release l2arc_dev_mtx.
6247		 */
6248		if ((dev = l2arc_dev_get_next()) == NULL)
6249			continue;
6250
6251		spa = dev->l2ad_spa;
6252		ASSERT(spa != NULL);
6253
6254		/*
6255		 * If the pool is read-only then force the feed thread to
6256		 * sleep a little longer.
6257		 */
6258		if (!spa_writeable(spa)) {
6259			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
6260			spa_config_exit(spa, SCL_L2ARC, dev);
6261			continue;
6262		}
6263
6264		/*
6265		 * Avoid contributing to memory pressure.
6266		 */
6267		if (arc_reclaim_needed()) {
6268			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
6269			spa_config_exit(spa, SCL_L2ARC, dev);
6270			continue;
6271		}
6272
6273		ARCSTAT_BUMP(arcstat_l2_feeds);
6274
6275		size = l2arc_write_size();
6276
6277		/*
6278		 * Evict L2ARC buffers that will be overwritten.
6279		 */
6280		l2arc_evict(dev, size, B_FALSE);
6281
6282		/*
6283		 * Write ARC buffers.
6284		 */
6285		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
6286
6287		/*
6288		 * Calculate interval between writes.
6289		 */
6290		next = l2arc_write_interval(begin, size, wrote);
6291		spa_config_exit(spa, SCL_L2ARC, dev);
6292	}
6293
6294	l2arc_thread_exit = 0;
6295	cv_broadcast(&l2arc_feed_thr_cv);
6296	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
6297	thread_exit();
6298}
6299
6300boolean_t
6301l2arc_vdev_present(vdev_t *vd)
6302{
6303	l2arc_dev_t *dev;
6304
6305	mutex_enter(&l2arc_dev_mtx);
6306	for (dev = list_head(l2arc_dev_list); dev != NULL;
6307	    dev = list_next(l2arc_dev_list, dev)) {
6308		if (dev->l2ad_vdev == vd)
6309			break;
6310	}
6311	mutex_exit(&l2arc_dev_mtx);
6312
6313	return (dev != NULL);
6314}
6315
6316/*
6317 * Add a vdev for use by the L2ARC.  By this point the spa has already
6318 * validated the vdev and opened it.
6319 */
6320void
6321l2arc_add_vdev(spa_t *spa, vdev_t *vd)
6322{
6323	l2arc_dev_t *adddev;
6324
6325	ASSERT(!l2arc_vdev_present(vd));
6326
6327	vdev_ashift_optimize(vd);
6328
6329	/*
6330	 * Create a new l2arc device entry.
6331	 */
6332	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
6333	adddev->l2ad_spa = spa;
6334	adddev->l2ad_vdev = vd;
6335	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
6336	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
6337	adddev->l2ad_hand = adddev->l2ad_start;
6338	adddev->l2ad_first = B_TRUE;
6339	adddev->l2ad_writing = B_FALSE;
6340
6341	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
6342	/*
6343	 * This is a list of all ARC buffers that are still valid on the
6344	 * device.
6345	 */
6346	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
6347	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
6348
6349	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
6350	refcount_create(&adddev->l2ad_alloc);
6351
6352	/*
6353	 * Add device to global list
6354	 */
6355	mutex_enter(&l2arc_dev_mtx);
6356	list_insert_head(l2arc_dev_list, adddev);
6357	atomic_inc_64(&l2arc_ndev);
6358	mutex_exit(&l2arc_dev_mtx);
6359}
6360
6361/*
6362 * Remove a vdev from the L2ARC.
6363 */
6364void
6365l2arc_remove_vdev(vdev_t *vd)
6366{
6367	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
6368
6369	/*
6370	 * Find the device by vdev
6371	 */
6372	mutex_enter(&l2arc_dev_mtx);
6373	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
6374		nextdev = list_next(l2arc_dev_list, dev);
6375		if (vd == dev->l2ad_vdev) {
6376			remdev = dev;
6377			break;
6378		}
6379	}
6380	ASSERT(remdev != NULL);
6381
6382	/*
6383	 * Remove device from global list
6384	 */
6385	list_remove(l2arc_dev_list, remdev);
6386	l2arc_dev_last = NULL;		/* may have been invalidated */
6387	atomic_dec_64(&l2arc_ndev);
6388	mutex_exit(&l2arc_dev_mtx);
6389
6390	/*
6391	 * Clear all buflists and ARC references.  L2ARC device flush.
6392	 */
6393	l2arc_evict(remdev, 0, B_TRUE);
6394	list_destroy(&remdev->l2ad_buflist);
6395	mutex_destroy(&remdev->l2ad_mtx);
6396	refcount_destroy(&remdev->l2ad_alloc);
6397	kmem_free(remdev, sizeof (l2arc_dev_t));
6398}
6399
6400void
6401l2arc_init(void)
6402{
6403	l2arc_thread_exit = 0;
6404	l2arc_ndev = 0;
6405	l2arc_writes_sent = 0;
6406	l2arc_writes_done = 0;
6407
6408	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
6409	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
6410	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
6411	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
6412
6413	l2arc_dev_list = &L2ARC_dev_list;
6414	l2arc_free_on_write = &L2ARC_free_on_write;
6415	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
6416	    offsetof(l2arc_dev_t, l2ad_node));
6417	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
6418	    offsetof(l2arc_data_free_t, l2df_list_node));
6419}
6420
6421void
6422l2arc_fini(void)
6423{
6424	/*
6425	 * This is called from dmu_fini(), which is called from spa_fini();
6426	 * Because of this, we can assume that all l2arc devices have
6427	 * already been removed when the pools themselves were removed.
6428	 */
6429
6430	l2arc_do_free_on_write();
6431
6432	mutex_destroy(&l2arc_feed_thr_lock);
6433	cv_destroy(&l2arc_feed_thr_cv);
6434	mutex_destroy(&l2arc_dev_mtx);
6435	mutex_destroy(&l2arc_free_on_write_mtx);
6436
6437	list_destroy(l2arc_dev_list);
6438	list_destroy(l2arc_free_on_write);
6439}
6440
6441void
6442l2arc_start(void)
6443{
6444	if (!(spa_mode_global & FWRITE))
6445		return;
6446
6447	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
6448	    TS_RUN, minclsyspri);
6449}
6450
6451void
6452l2arc_stop(void)
6453{
6454	if (!(spa_mode_global & FWRITE))
6455		return;
6456
6457	mutex_enter(&l2arc_feed_thr_lock);
6458	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
6459	l2arc_thread_exit = 1;
6460	while (l2arc_thread_exit != 0)
6461		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
6462	mutex_exit(&l2arc_feed_thr_lock);
6463}
6464