arc.c revision 277583
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
24 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
25 * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
26 */
27
28/*
29 * DVA-based Adjustable Replacement Cache
30 *
31 * While much of the theory of operation used here is
32 * based on the self-tuning, low overhead replacement cache
33 * presented by Megiddo and Modha at FAST 2003, there are some
34 * significant differences:
35 *
36 * 1. The Megiddo and Modha model assumes any page is evictable.
37 * Pages in its cache cannot be "locked" into memory.  This makes
38 * the eviction algorithm simple: evict the last page in the list.
39 * This also makes the performance characteristics easy to reason
40 * about.  Our cache is not so simple.  At any given moment, some
41 * subset of the blocks in the cache are un-evictable because we
42 * have handed out a reference to them.  Blocks are only evictable
43 * when there are no external references active.  This makes
44 * eviction far more problematic:  we choose to evict the evictable
45 * blocks that are the "lowest" in the list.
46 *
47 * There are times when it is not possible to evict the requested
48 * space.  In these circumstances we are unable to adjust the cache
49 * size.  To prevent the cache from growing unbounded at these times we
50 * implement a "cache throttle" that slows the flow of new data
51 * into the cache until we can make space available.
52 *
53 * 2. The Megiddo and Modha model assumes a fixed cache size.
54 * Pages are evicted when the cache is full and there is a cache
55 * miss.  Our model has a variable sized cache.  It grows with
56 * high use, but also tries to react to memory pressure from the
57 * operating system: decreasing its size when system memory is
58 * tight.
59 *
60 * 3. The Megiddo and Modha model assumes a fixed page size. All
61 * elements of the cache are therefore exactly the same size.  So
62 * when adjusting the cache size following a cache miss, it's simply
63 * a matter of choosing a single page to evict.  In our model, we
64 * have variable sized cache blocks (ranging from 512 bytes to
65 * 128K bytes).  We therefore choose a set of blocks to evict to make
66 * space for a cache miss that approximates as closely as possible
67 * the space used by the new block.
68 *
69 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
70 * by N. Megiddo & D. Modha, FAST 2003
71 */
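/*
 * A minimal sketch (not the actual arc_evict() code) of the eviction
 * described in point 3 above: buffers are taken from the tail of an
 * evictable list until at least the space needed by the new block has
 * been recovered.
 *
 *	int64_t freed = 0;
 *	arc_buf_hdr_t *ab, *ab_prev;
 *
 *	for (ab = list_tail(list); ab != NULL && freed < bytes;
 *	    ab = ab_prev) {
 *		ab_prev = list_prev(list, ab);
 *		if (refcount_count(&ab->b_refcnt) == 0) {
 *			freed += ab->b_size;
 *			... remove ab from the list and free its data ...
 *		}
 *	}
 */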
72
73/*
74 * The locking model:
75 *
76 * A new reference to a cache buffer can be obtained in two
77 * ways: 1) via a hash table lookup using the DVA as a key,
78 * or 2) via one of the ARC lists.  The arc_read() interface
79 * uses method 1, while the internal arc algorithms for
80 * adjusting the cache use method 2.  We therefore provide two
81 * types of locks: 1) the hash table lock array, and 2) the
82 * arc list locks.
83 *
84 * Buffers do not have their own mutexes, rather they rely on the
85 * hash table mutexes for the bulk of their protection (i.e. most
86 * fields in the arc_buf_hdr_t are protected by these mutexes).
87 *
88 * buf_hash_find() returns the appropriate mutex (held) when it
89 * locates the requested buffer in the hash table.  It returns
90 * NULL for the mutex if the buffer was not in the table.
91 *
92 * buf_hash_remove() expects the appropriate hash mutex to be
93 * already held before it is invoked.
94 *
95 * Each arc state also has a mutex which is used to protect the
96 * buffer list associated with the state.  When attempting to
97 * obtain a hash table lock while holding an arc list lock you
98 * must use mutex_tryenter() to avoid deadlock.  Also note that
99 * the active state mutex must be held before the ghost state mutex.
100 *
101 * Arc buffers may have an associated eviction callback function.
102 * This function will be invoked prior to removing the buffer (e.g.
103 * in arc_do_user_evicts()).  Note however that the data associated
104 * with the buffer may be evicted prior to the callback.  The callback
105 * must be made with *no locks held* (to prevent deadlock).  Additionally,
106 * the users of callbacks must ensure that their private data is
107 * protected from simultaneous callbacks from arc_clear_callback()
108 * and arc_do_user_evicts().
109 *
110 * Note that the majority of the performance stats are manipulated
111 * with atomic operations.
112 *
113 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
114 *
115 *	- L2ARC buflist creation
116 *	- L2ARC buflist eviction
117 *	- L2ARC write completion, which walks L2ARC buflists
118 *	- ARC header destruction, as it removes from L2ARC buflists
119 *	- ARC header release, as it removes from L2ARC buflists
120 */
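/*
 * A minimal sketch (assumed caller code, not part of this file) of the
 * lock ordering rule above: while an arc list lock is held, a hash
 * table lock may only be taken with mutex_tryenter(), so that list
 * walkers and hash table users can never deadlock against each other.
 * Buffers whose hash lock cannot be taken are simply skipped and
 * counted in arcstat_mutex_miss.
 *
 *	kmutex_t *hash_lock = HDR_LOCK(ab);
 *
 *	if (mutex_tryenter(hash_lock)) {
 *		... ab may be examined or evicted here ...
 *		mutex_exit(hash_lock);
 *	} else {
 *		ARCSTAT_BUMP(arcstat_mutex_miss);
 *	}
 */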
121
122#include <sys/spa.h>
123#include <sys/zio.h>
124#include <sys/zio_compress.h>
125#include <sys/zfs_context.h>
126#include <sys/arc.h>
127#include <sys/refcount.h>
128#include <sys/vdev.h>
129#include <sys/vdev_impl.h>
130#include <sys/dsl_pool.h>
131#ifdef _KERNEL
132#include <sys/dnlc.h>
133#endif
134#include <sys/callb.h>
135#include <sys/kstat.h>
136#include <sys/trim_map.h>
137#include <zfs_fletcher.h>
138#include <sys/sdt.h>
139
140#include <vm/vm_pageout.h>
141#include <machine/vmparam.h>
142
143#ifdef illumos
144#ifndef _KERNEL
145/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
146boolean_t arc_watch = B_FALSE;
147int arc_procfd;
148#endif
149#endif /* illumos */
150
151static kmutex_t		arc_reclaim_thr_lock;
152static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
153static uint8_t		arc_thread_exit;
154
155#define	ARC_REDUCE_DNLC_PERCENT	3
156uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
157
158typedef enum arc_reclaim_strategy {
159	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
160	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
161} arc_reclaim_strategy_t;
162
163/*
164 * The number of iterations through arc_evict_*() before we
165 * drop & reacquire the lock.
166 */
167int arc_evict_iterations = 100;
168
169/* number of seconds before growing cache again */
170static int		arc_grow_retry = 60;
171
172/* shift of arc_c for calculating both min and max arc_p */
173static int		arc_p_min_shift = 4;
174
175/* log2(fraction of arc to reclaim) */
176static int		arc_shrink_shift = 5;
177
178/*
179 * minimum lifespan of a prefetch block in clock ticks
180 * (initialized in arc_init())
181 */
182static int		arc_min_prefetch_lifespan;
183
184/*
185 * If this percent of memory is free, don't throttle.
186 */
187int arc_lotsfree_percent = 10;
188
189static int arc_dead;
190extern int zfs_prefetch_disable;
191
192/*
193 * The arc has filled available memory and has now warmed up.
194 */
195static boolean_t arc_warm;
196
197uint64_t zfs_arc_max;
198uint64_t zfs_arc_min;
199uint64_t zfs_arc_meta_limit = 0;
200int zfs_arc_grow_retry = 0;
201int zfs_arc_shrink_shift = 0;
202int zfs_arc_p_min_shift = 0;
203int zfs_disable_dup_eviction = 0;
204uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
205u_int zfs_arc_free_target = 0;
206
207static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
208static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
209
210#ifdef _KERNEL
211static void
212arc_free_target_init(void *unused __unused)
213{
214
215	zfs_arc_free_target = vm_pageout_wakeup_thresh;
216}
217SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
218    arc_free_target_init, NULL);
219
220TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
221TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
222TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
223TUNABLE_QUAD("vfs.zfs.arc_average_blocksize", &zfs_arc_average_blocksize);
224TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
225SYSCTL_DECL(_vfs_zfs);
226SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
227    "Maximum ARC size");
228SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
229    "Minimum ARC size");
230SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
231    &zfs_arc_average_blocksize, 0,
232    "ARC average blocksize");
233SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
234    &arc_shrink_shift, 0,
235    "log2(fraction of arc to reclaim)");
236
237/*
238 * We don't have a tunable for arc_free_target due to the dependency on
239 * pagedaemon initialisation.
240 */
241SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
242    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
243    sysctl_vfs_zfs_arc_free_target, "IU",
244    "Desired number of free pages below which ARC triggers reclaim");
245
246static int
247sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
248{
249	u_int val;
250	int err;
251
252	val = zfs_arc_free_target;
253	err = sysctl_handle_int(oidp, &val, 0, req);
254	if (err != 0 || req->newptr == NULL)
255		return (err);
256
257	if (val < minfree)
258		return (EINVAL);
259	if (val > cnt.v_page_count)
260		return (EINVAL);
261
262	zfs_arc_free_target = val;
263
264	return (0);
265}
266
267/*
268 * Must be declared here, before the definition of the corresponding kstat
269 * macro, because the macro uses the same name and would confuse the compiler.
270 */
271SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
272    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
273    sysctl_vfs_zfs_arc_meta_limit, "QU",
274    "ARC metadata limit");
275#endif
276
277/*
278 * Note that buffers can be in one of 6 states:
279 *	ARC_anon	- anonymous (discussed below)
280 *	ARC_mru		- recently used, currently cached
281 *	ARC_mru_ghost	- recently used, no longer in cache
282 *	ARC_mfu		- frequently used, currently cached
283 *	ARC_mfu_ghost	- frequently used, no longer in cache
284 *	ARC_l2c_only	- exists in L2ARC but not other states
285 * When there are no active references to the buffer, they are
286 * linked onto a list in one of these arc states.  These are
287 * the only buffers that can be evicted or deleted.  Within each
288 * state there are multiple lists, one for meta-data and one for
289 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
290 * etc.) is tracked separately so that it can be managed more
291 * explicitly: favored over data, limited explicitly.
292 *
293 * Anonymous buffers are buffers that are not associated with
294 * a DVA.  These are buffers that hold dirty block copies
295 * before they are written to stable storage.  By definition,
296 * they are "ref'd" and are considered part of arc_mru
297 * that cannot be freed.  Generally, they will acquire a DVA
298 * as they are written and migrate onto the arc_mru list.
299 *
300 * The ARC_l2c_only state is for buffers that are in the second
301 * level ARC but no longer in any of the ARC_m* lists.  The second
302 * level ARC itself may also contain buffers that are in any of
303 * the ARC_m* states - meaning that a buffer can exist in two
304 * places.  The reason for the ARC_l2c_only state is to keep the
305 * buffer header in the hash table, so that reads that hit the
306 * second level ARC benefit from these fast lookups.
307 */
308
309#define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
310struct arcs_lock {
311	kmutex_t	arcs_lock;
312#ifdef _KERNEL
313	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
314#endif
315};
316
317/*
318 * The number of lists must be a power of two so that a hash value
319 * can be masked to select a list (see get_buf_info()).
320 */
321#define ARC_BUFC_NUMDATALISTS		16
322#define ARC_BUFC_NUMMETADATALISTS	16
323#define ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
324
325typedef struct arc_state {
326	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
327	uint64_t arcs_size;	/* total amount of data in this state */
328	list_t	arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
329	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
330} arc_state_t;
331
332#define ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))
333
334/* The 6 states: */
335static arc_state_t ARC_anon;
336static arc_state_t ARC_mru;
337static arc_state_t ARC_mru_ghost;
338static arc_state_t ARC_mfu;
339static arc_state_t ARC_mfu_ghost;
340static arc_state_t ARC_l2c_only;
341
342typedef struct arc_stats {
343	kstat_named_t arcstat_hits;
344	kstat_named_t arcstat_misses;
345	kstat_named_t arcstat_demand_data_hits;
346	kstat_named_t arcstat_demand_data_misses;
347	kstat_named_t arcstat_demand_metadata_hits;
348	kstat_named_t arcstat_demand_metadata_misses;
349	kstat_named_t arcstat_prefetch_data_hits;
350	kstat_named_t arcstat_prefetch_data_misses;
351	kstat_named_t arcstat_prefetch_metadata_hits;
352	kstat_named_t arcstat_prefetch_metadata_misses;
353	kstat_named_t arcstat_mru_hits;
354	kstat_named_t arcstat_mru_ghost_hits;
355	kstat_named_t arcstat_mfu_hits;
356	kstat_named_t arcstat_mfu_ghost_hits;
357	kstat_named_t arcstat_allocated;
358	kstat_named_t arcstat_deleted;
359	kstat_named_t arcstat_stolen;
360	kstat_named_t arcstat_recycle_miss;
361	/*
362	 * Number of buffers that could not be evicted because the hash lock
363	 * was held by another thread.  The lock may not necessarily be held
364	 * by something using the same buffer, since hash locks are shared
365	 * by multiple buffers.
366	 */
367	kstat_named_t arcstat_mutex_miss;
368	/*
369	 * Number of buffers skipped because they have I/O in progress, are
370	 * indirect prefetch buffers that have not lived long enough, or are
371	 * not from the spa we're trying to evict from.
372	 */
373	kstat_named_t arcstat_evict_skip;
374	kstat_named_t arcstat_evict_l2_cached;
375	kstat_named_t arcstat_evict_l2_eligible;
376	kstat_named_t arcstat_evict_l2_ineligible;
377	kstat_named_t arcstat_hash_elements;
378	kstat_named_t arcstat_hash_elements_max;
379	kstat_named_t arcstat_hash_collisions;
380	kstat_named_t arcstat_hash_chains;
381	kstat_named_t arcstat_hash_chain_max;
382	kstat_named_t arcstat_p;
383	kstat_named_t arcstat_c;
384	kstat_named_t arcstat_c_min;
385	kstat_named_t arcstat_c_max;
386	kstat_named_t arcstat_size;
387	kstat_named_t arcstat_hdr_size;
388	kstat_named_t arcstat_data_size;
389	kstat_named_t arcstat_other_size;
390	kstat_named_t arcstat_l2_hits;
391	kstat_named_t arcstat_l2_misses;
392	kstat_named_t arcstat_l2_feeds;
393	kstat_named_t arcstat_l2_rw_clash;
394	kstat_named_t arcstat_l2_read_bytes;
395	kstat_named_t arcstat_l2_write_bytes;
396	kstat_named_t arcstat_l2_writes_sent;
397	kstat_named_t arcstat_l2_writes_done;
398	kstat_named_t arcstat_l2_writes_error;
399	kstat_named_t arcstat_l2_writes_hdr_miss;
400	kstat_named_t arcstat_l2_evict_lock_retry;
401	kstat_named_t arcstat_l2_evict_reading;
402	kstat_named_t arcstat_l2_free_on_write;
403	kstat_named_t arcstat_l2_cdata_free_on_write;
404	kstat_named_t arcstat_l2_abort_lowmem;
405	kstat_named_t arcstat_l2_cksum_bad;
406	kstat_named_t arcstat_l2_io_error;
407	kstat_named_t arcstat_l2_size;
408	kstat_named_t arcstat_l2_asize;
409	kstat_named_t arcstat_l2_hdr_size;
410	kstat_named_t arcstat_l2_compress_successes;
411	kstat_named_t arcstat_l2_compress_zeros;
412	kstat_named_t arcstat_l2_compress_failures;
413	kstat_named_t arcstat_l2_write_trylock_fail;
414	kstat_named_t arcstat_l2_write_passed_headroom;
415	kstat_named_t arcstat_l2_write_spa_mismatch;
416	kstat_named_t arcstat_l2_write_in_l2;
417	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
418	kstat_named_t arcstat_l2_write_not_cacheable;
419	kstat_named_t arcstat_l2_write_full;
420	kstat_named_t arcstat_l2_write_buffer_iter;
421	kstat_named_t arcstat_l2_write_pios;
422	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
423	kstat_named_t arcstat_l2_write_buffer_list_iter;
424	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
425	kstat_named_t arcstat_memory_throttle_count;
426	kstat_named_t arcstat_duplicate_buffers;
427	kstat_named_t arcstat_duplicate_buffers_size;
428	kstat_named_t arcstat_duplicate_reads;
429	kstat_named_t arcstat_meta_used;
430	kstat_named_t arcstat_meta_limit;
431	kstat_named_t arcstat_meta_max;
432} arc_stats_t;
433
434static arc_stats_t arc_stats = {
435	{ "hits",			KSTAT_DATA_UINT64 },
436	{ "misses",			KSTAT_DATA_UINT64 },
437	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
438	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
439	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
440	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
441	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
442	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
443	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
444	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
445	{ "mru_hits",			KSTAT_DATA_UINT64 },
446	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
447	{ "mfu_hits",			KSTAT_DATA_UINT64 },
448	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
449	{ "allocated",			KSTAT_DATA_UINT64 },
450	{ "deleted",			KSTAT_DATA_UINT64 },
451	{ "stolen",			KSTAT_DATA_UINT64 },
452	{ "recycle_miss",		KSTAT_DATA_UINT64 },
453	{ "mutex_miss",			KSTAT_DATA_UINT64 },
454	{ "evict_skip",			KSTAT_DATA_UINT64 },
455	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
456	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
457	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
458	{ "hash_elements",		KSTAT_DATA_UINT64 },
459	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
460	{ "hash_collisions",		KSTAT_DATA_UINT64 },
461	{ "hash_chains",		KSTAT_DATA_UINT64 },
462	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
463	{ "p",				KSTAT_DATA_UINT64 },
464	{ "c",				KSTAT_DATA_UINT64 },
465	{ "c_min",			KSTAT_DATA_UINT64 },
466	{ "c_max",			KSTAT_DATA_UINT64 },
467	{ "size",			KSTAT_DATA_UINT64 },
468	{ "hdr_size",			KSTAT_DATA_UINT64 },
469	{ "data_size",			KSTAT_DATA_UINT64 },
470	{ "other_size",			KSTAT_DATA_UINT64 },
471	{ "l2_hits",			KSTAT_DATA_UINT64 },
472	{ "l2_misses",			KSTAT_DATA_UINT64 },
473	{ "l2_feeds",			KSTAT_DATA_UINT64 },
474	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
475	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
476	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
477	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
478	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
479	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
480	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
481	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
482	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
483	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
484	{ "l2_cdata_free_on_write",	KSTAT_DATA_UINT64 },
485	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
486	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
487	{ "l2_io_error",		KSTAT_DATA_UINT64 },
488	{ "l2_size",			KSTAT_DATA_UINT64 },
489	{ "l2_asize",			KSTAT_DATA_UINT64 },
490	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
491	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
492	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
493	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
494	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
495	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
496	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
497	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
498	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
499	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
500	{ "l2_write_full",		KSTAT_DATA_UINT64 },
501	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
502	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
503	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
504	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
505	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
506	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
507	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
508	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
509	{ "duplicate_reads",		KSTAT_DATA_UINT64 },
510	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
511	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
512	{ "arc_meta_max",		KSTAT_DATA_UINT64 }
513};
514
515#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
516
517#define	ARCSTAT_INCR(stat, val) \
518	atomic_add_64(&arc_stats.stat.value.ui64, (val))
519
520#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
521#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
522
523#define	ARCSTAT_MAX(stat, val) {					\
524	uint64_t m;							\
525	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
526	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
527		continue;						\
528}
529
530#define	ARCSTAT_MAXSTAT(stat) \
531	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
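/*
 * Example usage (see buf_hash_insert() below): record the longest hash
 * chain observed so far without any locking, relying on the
 * compare-and-swap loop in ARCSTAT_MAX.
 *
 *	ARCSTAT_MAX(arcstat_hash_chain_max, i);
 *	ARCSTAT_MAXSTAT(arcstat_hash_elements);
 */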
532
533/*
534 * We define a macro to allow ARC hits/misses to be easily broken down by
535 * two separate conditions, giving a total of four different subtypes for
536 * each of hits and misses (so eight statistics total).
537 */
538#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
539	if (cond1) {							\
540		if (cond2) {						\
541			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
542		} else {						\
543			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
544		}							\
545	} else {							\
546		if (cond2) {						\
547			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
548		} else {						\
549			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
550		}							\
551	}
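/*
 * Example usage (see arc_buf_add_ref() below): classify a hit as
 * demand vs. prefetch and as data vs. metadata in one statement.  This
 * bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
 * arcstat_prefetch_metadata_hits.
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 */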
552
553kstat_t			*arc_ksp;
554static arc_state_t	*arc_anon;
555static arc_state_t	*arc_mru;
556static arc_state_t	*arc_mru_ghost;
557static arc_state_t	*arc_mfu;
558static arc_state_t	*arc_mfu_ghost;
559static arc_state_t	*arc_l2c_only;
560
561/*
562 * There are several ARC variables that are critical to export as kstats --
563 * but we don't want to have to grovel around in the kstat whenever we wish to
564 * manipulate them.  For these variables, we therefore define them to be in
565 * terms of the statistic variable.  This assures that we are not introducing
566 * the possibility of inconsistency by having shadow copies of the variables,
567 * while still allowing the code to be readable.
568 */
569#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
570#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
571#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
572#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
573#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
574#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
575#define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
576#define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */
577
578#define	L2ARC_IS_VALID_COMPRESS(_c_) \
579	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
580
581static int		arc_no_grow;	/* Don't try to grow cache size */
582static uint64_t		arc_tempreserve;
583static uint64_t		arc_loaned_bytes;
584
585typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
586
587typedef struct arc_callback arc_callback_t;
588
589struct arc_callback {
590	void			*acb_private;
591	arc_done_func_t		*acb_done;
592	arc_buf_t		*acb_buf;
593	zio_t			*acb_zio_dummy;
594	arc_callback_t		*acb_next;
595};
596
597typedef struct arc_write_callback arc_write_callback_t;
598
599struct arc_write_callback {
600	void		*awcb_private;
601	arc_done_func_t	*awcb_ready;
602	arc_done_func_t	*awcb_physdone;
603	arc_done_func_t	*awcb_done;
604	arc_buf_t	*awcb_buf;
605};
606
607struct arc_buf_hdr {
608	/* protected by hash lock */
609	dva_t			b_dva;
610	uint64_t		b_birth;
611	uint64_t		b_cksum0;
612
613	kmutex_t		b_freeze_lock;
614	zio_cksum_t		*b_freeze_cksum;
615	void			*b_thawed;
616
617	arc_buf_hdr_t		*b_hash_next;
618	arc_buf_t		*b_buf;
619	uint32_t		b_flags;
620	uint32_t		b_datacnt;
621
622	arc_callback_t		*b_acb;
623	kcondvar_t		b_cv;
624
625	/* immutable */
626	arc_buf_contents_t	b_type;
627	uint64_t		b_size;
628	uint64_t		b_spa;
629
630	/* protected by arc state mutex */
631	arc_state_t		*b_state;
632	list_node_t		b_arc_node;
633
634	/* updated atomically */
635	clock_t			b_arc_access;
636
637	/* self protecting */
638	refcount_t		b_refcnt;
639
640	l2arc_buf_hdr_t		*b_l2hdr;
641	list_node_t		b_l2node;
642};
643
644#ifdef _KERNEL
645static int
646sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
647{
648	uint64_t val;
649	int err;
650
651	val = arc_meta_limit;
652	err = sysctl_handle_64(oidp, &val, 0, req);
653	if (err != 0 || req->newptr == NULL)
654		return (err);
655
656	if (val <= 0 || val > arc_c_max)
657		return (EINVAL);
658
659	arc_meta_limit = val;
660	return (0);
661}
662#endif
663
664static arc_buf_t *arc_eviction_list;
665static kmutex_t arc_eviction_mtx;
666static arc_buf_hdr_t arc_eviction_hdr;
667static void arc_get_data_buf(arc_buf_t *buf);
668static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
669static int arc_evict_needed(arc_buf_contents_t type);
670static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
671#ifdef illumos
672static void arc_buf_watch(arc_buf_t *buf);
673#endif /* illumos */
674
675static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
676
677#define	GHOST_STATE(state)	\
678	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
679	(state) == arc_l2c_only)
680
681/*
682 * Private ARC flags.  These flags are private to the ARC and will show up
683 * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
684 * be passed in as arc_flags in things like arc_read.  However, these flags
685 * should never be passed and should only be set by ARC code.  When adding new
686 * public flags, make sure not to smash the private ones.
687 */
688
689#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
690#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
691#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
692#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
693#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
694#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
695#define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
696#define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
697#define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
698#define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */
699
700#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
701#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
702#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
703#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_PREFETCH)
704#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
705#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
706#define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
707#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_L2CACHE)
708#define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
709				    (hdr)->b_l2hdr != NULL)
710#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
711#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
712#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)
713
714/*
715 * Other sizes
716 */
717
718#define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
719#define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
720
721/*
722 * Hash table routines
723 */
724
725#define	HT_LOCK_PAD	CACHE_LINE_SIZE
726
727struct ht_lock {
728	kmutex_t	ht_lock;
729#ifdef _KERNEL
730	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
731#endif
732};
733
734#define	BUF_LOCKS 256
735typedef struct buf_hash_table {
736	uint64_t ht_mask;
737	arc_buf_hdr_t **ht_table;
738	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
739} buf_hash_table_t;
740
741static buf_hash_table_t buf_hash_table;
742
743#define	BUF_HASH_INDEX(spa, dva, birth) \
744	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
745#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
746#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
747#define	HDR_LOCK(hdr) \
748	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
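/*
 * Typical use of the hash lock macros (see e.g. arc_buf_freeze()
 * below): a header's identity selects its lock, which must be held
 * while hash-lock-protected fields are accessed.
 *
 *	kmutex_t *hash_lock = HDR_LOCK(buf->b_hdr);
 *
 *	mutex_enter(hash_lock);
 *	... fields protected by the hash lock may be used here ...
 *	mutex_exit(hash_lock);
 */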
749
750uint64_t zfs_crc64_table[256];
751
752/*
753 * Level 2 ARC
754 */
755
756#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
757#define	L2ARC_HEADROOM		2			/* num of writes */
758/*
759 * If we discover during ARC scan any buffers to be compressed, we boost
760 * our headroom for the next scanning cycle by this percentage multiple.
761 */
762#define	L2ARC_HEADROOM_BOOST	200
763#define	L2ARC_FEED_SECS		1		/* caching interval secs */
764#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
765
766#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
767#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
768
769/* L2ARC Performance Tunables */
770uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
771uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
772uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
773uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
774uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
775uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
776boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
777boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
778boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
779
780SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
781    &l2arc_write_max, 0, "max write size");
782SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
783    &l2arc_write_boost, 0, "extra write during warmup");
784SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
785    &l2arc_headroom, 0, "number of dev writes");
786SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
787    &l2arc_feed_secs, 0, "interval seconds");
788SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
789    &l2arc_feed_min_ms, 0, "min interval milliseconds");
790
791SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
792    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
793SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
794    &l2arc_feed_again, 0, "turbo warmup");
795SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
796    &l2arc_norw, 0, "no reads during writes");
797
798SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
799    &ARC_anon.arcs_size, 0, "size of anonymous state");
800SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
801    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous metadata");
802SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
803    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous data");
804
805SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
806    &ARC_mru.arcs_size, 0, "size of mru state");
807SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
808    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
809SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
810    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
811
812SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
813    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
814SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
815    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
816    "size of metadata in mru ghost state");
817SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
818    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
819    "size of data in mru ghost state");
820
821SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
822    &ARC_mfu.arcs_size, 0, "size of mfu state");
823SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
824    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
825SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
826    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
827
828SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
829    &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
830SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
831    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
832    "size of metadata in mfu ghost state");
833SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
834    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
835    "size of data in mfu ghost state");
836
837SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
838    &ARC_l2c_only.arcs_size, 0, "size of l2c_only state");
839
840/*
841 * L2ARC Internals
842 */
843typedef struct l2arc_dev {
844	vdev_t			*l2ad_vdev;	/* vdev */
845	spa_t			*l2ad_spa;	/* spa */
846	uint64_t		l2ad_hand;	/* next write location */
847	uint64_t		l2ad_start;	/* first addr on device */
848	uint64_t		l2ad_end;	/* last addr on device */
849	uint64_t		l2ad_evict;	/* last addr eviction reached */
850	boolean_t		l2ad_first;	/* first sweep through */
851	boolean_t		l2ad_writing;	/* currently writing */
852	list_t			*l2ad_buflist;	/* buffer list */
853	list_node_t		l2ad_node;	/* device list node */
854} l2arc_dev_t;
855
856static list_t L2ARC_dev_list;			/* device list */
857static list_t *l2arc_dev_list;			/* device list pointer */
858static kmutex_t l2arc_dev_mtx;			/* device list mutex */
859static l2arc_dev_t *l2arc_dev_last;		/* last device used */
860static kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
861static list_t L2ARC_free_on_write;		/* free after write buf list */
862static list_t *l2arc_free_on_write;		/* free after write list ptr */
863static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
864static uint64_t l2arc_ndev;			/* number of devices */
865
866typedef struct l2arc_read_callback {
867	arc_buf_t		*l2rcb_buf;		/* read buffer */
868	spa_t			*l2rcb_spa;		/* spa */
869	blkptr_t		l2rcb_bp;		/* original blkptr */
870	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
871	int			l2rcb_flags;		/* original flags */
872	enum zio_compress	l2rcb_compress;		/* applied compress */
873} l2arc_read_callback_t;
874
875typedef struct l2arc_write_callback {
876	l2arc_dev_t	*l2wcb_dev;		/* device info */
877	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
878} l2arc_write_callback_t;
879
880struct l2arc_buf_hdr {
881	/* protected by arc_buf_hdr  mutex */
882	l2arc_dev_t		*b_dev;		/* L2ARC device */
883	uint64_t		b_daddr;	/* disk address, offset byte */
884	/* compression applied to buffer data */
885	enum zio_compress	b_compress;
886	/* real alloc'd buffer size depending on b_compress applied */
887	int			b_asize;
888	/* temporary buffer holder for in-flight compressed data */
889	void			*b_tmp_cdata;
890};
891
892typedef struct l2arc_data_free {
893	/* protected by l2arc_free_on_write_mtx */
894	void		*l2df_data;
895	size_t		l2df_size;
896	void		(*l2df_func)(void *, size_t);
897	list_node_t	l2df_list_node;
898} l2arc_data_free_t;
899
900static kmutex_t l2arc_feed_thr_lock;
901static kcondvar_t l2arc_feed_thr_cv;
902static uint8_t l2arc_thread_exit;
903
904static void l2arc_read_done(zio_t *zio);
905static void l2arc_hdr_stat_add(void);
906static void l2arc_hdr_stat_remove(void);
907
908static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
909static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
910    enum zio_compress c);
911static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
912
913static uint64_t
914buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
915{
916	uint8_t *vdva = (uint8_t *)dva;
917	uint64_t crc = -1ULL;
918	int i;
919
920	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
921
922	for (i = 0; i < sizeof (dva_t); i++)
923		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
924
925	crc ^= (spa>>8) ^ birth;
926
927	return (crc);
928}
929
930#define	BUF_EMPTY(buf)						\
931	((buf)->b_dva.dva_word[0] == 0 &&			\
932	(buf)->b_dva.dva_word[1] == 0 &&			\
933	(buf)->b_cksum0 == 0)
934
935#define	BUF_EQUAL(spa, dva, birth, buf)				\
936	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
937	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
938	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
939
940static void
941buf_discard_identity(arc_buf_hdr_t *hdr)
942{
943	hdr->b_dva.dva_word[0] = 0;
944	hdr->b_dva.dva_word[1] = 0;
945	hdr->b_birth = 0;
946	hdr->b_cksum0 = 0;
947}
948
949static arc_buf_hdr_t *
950buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
951{
952	const dva_t *dva = BP_IDENTITY(bp);
953	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
954	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
955	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
956	arc_buf_hdr_t *buf;
957
958	mutex_enter(hash_lock);
959	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
960	    buf = buf->b_hash_next) {
961		if (BUF_EQUAL(spa, dva, birth, buf)) {
962			*lockp = hash_lock;
963			return (buf);
964		}
965	}
966	mutex_exit(hash_lock);
967	*lockp = NULL;
968	return (NULL);
969}
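/*
 * Typical caller pattern (sketch): on a hit the hash lock is returned
 * held and must be dropped by the caller; on a miss *lockp is NULL.
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *hdr = buf_hash_find(guid, bp, &hash_lock);
 *
 *	if (hdr != NULL) {
 *		... hdr may be inspected or modified here ...
 *		mutex_exit(hash_lock);
 *	}
 */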
970
971/*
972 * Insert an entry into the hash table.  If there is already an element
973 * equal to elem in the hash table, then the already existing element
974 * will be returned and the new element will not be inserted.
975 * Otherwise returns NULL.
976 */
977static arc_buf_hdr_t *
978buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
979{
980	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
981	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
982	arc_buf_hdr_t *fbuf;
983	uint32_t i;
984
985	ASSERT(!DVA_IS_EMPTY(&buf->b_dva));
986	ASSERT(buf->b_birth != 0);
987	ASSERT(!HDR_IN_HASH_TABLE(buf));
988	*lockp = hash_lock;
989	mutex_enter(hash_lock);
990	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
991	    fbuf = fbuf->b_hash_next, i++) {
992		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
993			return (fbuf);
994	}
995
996	buf->b_hash_next = buf_hash_table.ht_table[idx];
997	buf_hash_table.ht_table[idx] = buf;
998	buf->b_flags |= ARC_IN_HASH_TABLE;
999
1000	/* collect some hash table performance data */
1001	if (i > 0) {
1002		ARCSTAT_BUMP(arcstat_hash_collisions);
1003		if (i == 1)
1004			ARCSTAT_BUMP(arcstat_hash_chains);
1005
1006		ARCSTAT_MAX(arcstat_hash_chain_max, i);
1007	}
1008
1009	ARCSTAT_BUMP(arcstat_hash_elements);
1010	ARCSTAT_MAXSTAT(arcstat_hash_elements);
1011
1012	return (NULL);
1013}
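/*
 * Typical caller pattern (sketch): the hash lock is returned held in
 * either case, so the caller must drop it and decide how to handle an
 * already existing header (e.g. a block inserted by a concurrent read).
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *exists = buf_hash_insert(hdr, &hash_lock);
 *
 *	if (exists != NULL) {
 *		... somebody else beat us to it; use "exists" instead ...
 *	}
 *	mutex_exit(hash_lock);
 */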
1014
1015static void
1016buf_hash_remove(arc_buf_hdr_t *buf)
1017{
1018	arc_buf_hdr_t *fbuf, **bufp;
1019	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
1020
1021	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
1022	ASSERT(HDR_IN_HASH_TABLE(buf));
1023
1024	bufp = &buf_hash_table.ht_table[idx];
1025	while ((fbuf = *bufp) != buf) {
1026		ASSERT(fbuf != NULL);
1027		bufp = &fbuf->b_hash_next;
1028	}
1029	*bufp = buf->b_hash_next;
1030	buf->b_hash_next = NULL;
1031	buf->b_flags &= ~ARC_IN_HASH_TABLE;
1032
1033	/* collect some hash table performance data */
1034	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1035
1036	if (buf_hash_table.ht_table[idx] &&
1037	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1038		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1039}
1040
1041/*
1042 * Global data structures and functions for the buf kmem cache.
1043 */
1044static kmem_cache_t *hdr_cache;
1045static kmem_cache_t *buf_cache;
1046
1047static void
1048buf_fini(void)
1049{
1050	int i;
1051
1052	kmem_free(buf_hash_table.ht_table,
1053	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
1054	for (i = 0; i < BUF_LOCKS; i++)
1055		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1056	kmem_cache_destroy(hdr_cache);
1057	kmem_cache_destroy(buf_cache);
1058}
1059
1060/*
1061 * Constructor callback - called when the cache is empty
1062 * and a new buf is requested.
1063 */
1064/* ARGSUSED */
1065static int
1066hdr_cons(void *vbuf, void *unused, int kmflag)
1067{
1068	arc_buf_hdr_t *buf = vbuf;
1069
1070	bzero(buf, sizeof (arc_buf_hdr_t));
1071	refcount_create(&buf->b_refcnt);
1072	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
1073	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1074	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1075
1076	return (0);
1077}
1078
1079/* ARGSUSED */
1080static int
1081buf_cons(void *vbuf, void *unused, int kmflag)
1082{
1083	arc_buf_t *buf = vbuf;
1084
1085	bzero(buf, sizeof (arc_buf_t));
1086	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1087	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1088
1089	return (0);
1090}
1091
1092/*
1093 * Destructor callback - called when a cached buf is
1094 * no longer required.
1095 */
1096/* ARGSUSED */
1097static void
1098hdr_dest(void *vbuf, void *unused)
1099{
1100	arc_buf_hdr_t *buf = vbuf;
1101
1102	ASSERT(BUF_EMPTY(buf));
1103	refcount_destroy(&buf->b_refcnt);
1104	cv_destroy(&buf->b_cv);
1105	mutex_destroy(&buf->b_freeze_lock);
1106	arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1107}
1108
1109/* ARGSUSED */
1110static void
1111buf_dest(void *vbuf, void *unused)
1112{
1113	arc_buf_t *buf = vbuf;
1114
1115	mutex_destroy(&buf->b_evict_lock);
1116	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1117}
1118
1119/*
1120 * Reclaim callback -- invoked when memory is low.
1121 */
1122/* ARGSUSED */
1123static void
1124hdr_recl(void *unused)
1125{
1126	dprintf("hdr_recl called\n");
1127	/*
1128	 * umem calls the reclaim func when we destroy the buf cache,
1129	 * which is after we do arc_fini().
1130	 */
1131	if (!arc_dead)
1132		cv_signal(&arc_reclaim_thr_cv);
1133}
1134
1135static void
1136buf_init(void)
1137{
1138	uint64_t *ct;
1139	uint64_t hsize = 1ULL << 12;
1140	int i, j;
1141
1142	/*
1143	 * The hash table is big enough to fill all of physical memory
1144	 * with an average block size of zfs_arc_average_blocksize (default 8K).
1145	 * By default, the table will take up
1146	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1147	 */
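	/*
	 * Worked example of the sizing above (assuming 8-byte pointers
	 * and the default 8K average block size): with 16 GB of
	 * physical memory, 16 GB / 8 KB = 2^21, so hsize settles at
	 * 2^21 entries and the table occupies 2^21 * 8 bytes = 16 MB,
	 * i.e. the "1MB per GB" noted above.
	 */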
1148	while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
1149		hsize <<= 1;
1150retry:
1151	buf_hash_table.ht_mask = hsize - 1;
1152	buf_hash_table.ht_table =
1153	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1154	if (buf_hash_table.ht_table == NULL) {
1155		ASSERT(hsize > (1ULL << 8));
1156		hsize >>= 1;
1157		goto retry;
1158	}
1159
1160	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
1161	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
1162	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1163	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1164
1165	for (i = 0; i < 256; i++)
1166		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1167			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1168
1169	for (i = 0; i < BUF_LOCKS; i++) {
1170		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1171		    NULL, MUTEX_DEFAULT, NULL);
1172	}
1173}
1174
1175#define	ARC_MINTIME	(hz>>4) /* 62 ms */
1176
1177static void
1178arc_cksum_verify(arc_buf_t *buf)
1179{
1180	zio_cksum_t zc;
1181
1182	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1183		return;
1184
1185	mutex_enter(&buf->b_hdr->b_freeze_lock);
1186	if (buf->b_hdr->b_freeze_cksum == NULL ||
1187	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
1188		mutex_exit(&buf->b_hdr->b_freeze_lock);
1189		return;
1190	}
1191	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1192	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1193		panic("buffer modified while frozen!");
1194	mutex_exit(&buf->b_hdr->b_freeze_lock);
1195}
1196
1197static int
1198arc_cksum_equal(arc_buf_t *buf)
1199{
1200	zio_cksum_t zc;
1201	int equal;
1202
1203	mutex_enter(&buf->b_hdr->b_freeze_lock);
1204	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1205	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1206	mutex_exit(&buf->b_hdr->b_freeze_lock);
1207
1208	return (equal);
1209}
1210
1211static void
1212arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1213{
1214	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1215		return;
1216
1217	mutex_enter(&buf->b_hdr->b_freeze_lock);
1218	if (buf->b_hdr->b_freeze_cksum != NULL) {
1219		mutex_exit(&buf->b_hdr->b_freeze_lock);
1220		return;
1221	}
1222	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1223	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1224	    buf->b_hdr->b_freeze_cksum);
1225	mutex_exit(&buf->b_hdr->b_freeze_lock);
1226#ifdef illumos
1227	arc_buf_watch(buf);
1228#endif /* illumos */
1229}
1230
1231#ifdef illumos
1232#ifndef _KERNEL
1233typedef struct procctl {
1234	long cmd;
1235	prwatch_t prwatch;
1236} procctl_t;
1237#endif
1238
1239/* ARGSUSED */
1240static void
1241arc_buf_unwatch(arc_buf_t *buf)
1242{
1243#ifndef _KERNEL
1244	if (arc_watch) {
1245		int result;
1246		procctl_t ctl;
1247		ctl.cmd = PCWATCH;
1248		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1249		ctl.prwatch.pr_size = 0;
1250		ctl.prwatch.pr_wflags = 0;
1251		result = write(arc_procfd, &ctl, sizeof (ctl));
1252		ASSERT3U(result, ==, sizeof (ctl));
1253	}
1254#endif
1255}
1256
1257/* ARGSUSED */
1258static void
1259arc_buf_watch(arc_buf_t *buf)
1260{
1261#ifndef _KERNEL
1262	if (arc_watch) {
1263		int result;
1264		procctl_t ctl;
1265		ctl.cmd = PCWATCH;
1266		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1267		ctl.prwatch.pr_size = buf->b_hdr->b_size;
1268		ctl.prwatch.pr_wflags = WA_WRITE;
1269		result = write(arc_procfd, &ctl, sizeof (ctl));
1270		ASSERT3U(result, ==, sizeof (ctl));
1271	}
1272#endif
1273}
1274#endif /* illumos */
1275
1276void
1277arc_buf_thaw(arc_buf_t *buf)
1278{
1279	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1280		if (buf->b_hdr->b_state != arc_anon)
1281			panic("modifying non-anon buffer!");
1282		if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1283			panic("modifying buffer while i/o in progress!");
1284		arc_cksum_verify(buf);
1285	}
1286
1287	mutex_enter(&buf->b_hdr->b_freeze_lock);
1288	if (buf->b_hdr->b_freeze_cksum != NULL) {
1289		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1290		buf->b_hdr->b_freeze_cksum = NULL;
1291	}
1292
1293	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1294		if (buf->b_hdr->b_thawed)
1295			kmem_free(buf->b_hdr->b_thawed, 1);
1296		buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1297	}
1298
1299	mutex_exit(&buf->b_hdr->b_freeze_lock);
1300
1301#ifdef illumos
1302	arc_buf_unwatch(buf);
1303#endif /* illumos */
1304}
1305
1306void
1307arc_buf_freeze(arc_buf_t *buf)
1308{
1309	kmutex_t *hash_lock;
1310
1311	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1312		return;
1313
1314	hash_lock = HDR_LOCK(buf->b_hdr);
1315	mutex_enter(hash_lock);
1316
1317	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1318	    buf->b_hdr->b_state == arc_anon);
1319	arc_cksum_compute(buf, B_FALSE);
1320	mutex_exit(hash_lock);
1321
1322}
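/*
 * Sketch of the freeze/thaw protocol when ZFS_DEBUG_MODIFY is set
 * (assumed caller code): a buffer must be thawed before its contents
 * are modified and frozen again once the modification is complete, so
 * that arc_cksum_verify() can catch writes to "frozen" buffers.
 *
 *	arc_buf_thaw(buf);
 *	... modify buf->b_data ...
 *	arc_buf_freeze(buf);
 */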
1323
1324static void
1325get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock)
1326{
1327	uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth);
1328
1329	if (ab->b_type == ARC_BUFC_METADATA)
1330		buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
1331	else {
1332		buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
1333		buf_hashid += ARC_BUFC_NUMMETADATALISTS;
1334	}
1335
1336	*list = &state->arcs_lists[buf_hashid];
1337	*lock = ARCS_LOCK(state, buf_hashid);
1338}
1339
1340
1341static void
1342add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1343{
1344	ASSERT(MUTEX_HELD(hash_lock));
1345
1346	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1347	    (ab->b_state != arc_anon)) {
1348		uint64_t delta = ab->b_size * ab->b_datacnt;
1349		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1350		list_t *list;
1351		kmutex_t *lock;
1352
1353		get_buf_info(ab, ab->b_state, &list, &lock);
1354		ASSERT(!MUTEX_HELD(lock));
1355		mutex_enter(lock);
1356		ASSERT(list_link_active(&ab->b_arc_node));
1357		list_remove(list, ab);
1358		if (GHOST_STATE(ab->b_state)) {
1359			ASSERT0(ab->b_datacnt);
1360			ASSERT3P(ab->b_buf, ==, NULL);
1361			delta = ab->b_size;
1362		}
1363		ASSERT(delta > 0);
1364		ASSERT3U(*size, >=, delta);
1365		atomic_add_64(size, -delta);
1366		mutex_exit(lock);
1367		/* remove the prefetch flag if we get a reference */
1368		if (ab->b_flags & ARC_PREFETCH)
1369			ab->b_flags &= ~ARC_PREFETCH;
1370	}
1371}
1372
1373static int
1374remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1375{
1376	int cnt;
1377	arc_state_t *state = ab->b_state;
1378
1379	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1380	ASSERT(!GHOST_STATE(state));
1381
1382	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1383	    (state != arc_anon)) {
1384		uint64_t *size = &state->arcs_lsize[ab->b_type];
1385		list_t *list;
1386		kmutex_t *lock;
1387
1388		get_buf_info(ab, state, &list, &lock);
1389		ASSERT(!MUTEX_HELD(lock));
1390		mutex_enter(lock);
1391		ASSERT(!list_link_active(&ab->b_arc_node));
1392		list_insert_head(list, ab);
1393		ASSERT(ab->b_datacnt > 0);
1394		atomic_add_64(size, ab->b_size * ab->b_datacnt);
1395		mutex_exit(lock);
1396	}
1397	return (cnt);
1398}
1399
1400/*
1401 * Move the supplied buffer to the indicated state.  The mutex
1402 * for the buffer must be held by the caller.
1403 */
1404static void
1405arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1406{
1407	arc_state_t *old_state = ab->b_state;
1408	int64_t refcnt = refcount_count(&ab->b_refcnt);
1409	uint64_t from_delta, to_delta;
1410	list_t *list;
1411	kmutex_t *lock;
1412
1413	ASSERT(MUTEX_HELD(hash_lock));
1414	ASSERT3P(new_state, !=, old_state);
1415	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1416	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1417	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1418
1419	from_delta = to_delta = ab->b_datacnt * ab->b_size;
1420
1421	/*
1422	 * If this buffer is evictable, transfer it from the
1423	 * old state list to the new state list.
1424	 */
1425	if (refcnt == 0) {
1426		if (old_state != arc_anon) {
1427			int use_mutex;
1428			uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1429
1430			get_buf_info(ab, old_state, &list, &lock);
1431			use_mutex = !MUTEX_HELD(lock);
1432			if (use_mutex)
1433				mutex_enter(lock);
1434
1435			ASSERT(list_link_active(&ab->b_arc_node));
1436			list_remove(list, ab);
1437
1438			/*
1439			 * If prefetching out of the ghost cache,
1440			 * we will have a non-zero datacnt.
1441			 */
1442			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1443				/* ghost elements have a ghost size */
1444				ASSERT(ab->b_buf == NULL);
1445				from_delta = ab->b_size;
1446			}
1447			ASSERT3U(*size, >=, from_delta);
1448			atomic_add_64(size, -from_delta);
1449
1450			if (use_mutex)
1451				mutex_exit(lock);
1452		}
1453		if (new_state != arc_anon) {
1454			int use_mutex;
1455			uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1456
1457			get_buf_info(ab, new_state, &list, &lock);
1458			use_mutex = !MUTEX_HELD(lock);
1459			if (use_mutex)
1460				mutex_enter(lock);
1461
1462			list_insert_head(list, ab);
1463
1464			/* ghost elements have a ghost size */
1465			if (GHOST_STATE(new_state)) {
1466				ASSERT(ab->b_datacnt == 0);
1467				ASSERT(ab->b_buf == NULL);
1468				to_delta = ab->b_size;
1469			}
1470			atomic_add_64(size, to_delta);
1471
1472			if (use_mutex)
1473				mutex_exit(lock);
1474		}
1475	}
1476
1477	ASSERT(!BUF_EMPTY(ab));
1478	if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1479		buf_hash_remove(ab);
1480
1481	/* adjust state sizes */
1482	if (to_delta)
1483		atomic_add_64(&new_state->arcs_size, to_delta);
1484	if (from_delta) {
1485		ASSERT3U(old_state->arcs_size, >=, from_delta);
1486		atomic_add_64(&old_state->arcs_size, -from_delta);
1487	}
1488	ab->b_state = new_state;
1489
1490	/* adjust l2arc hdr stats */
1491	if (new_state == arc_l2c_only)
1492		l2arc_hdr_stat_add();
1493	else if (old_state == arc_l2c_only)
1494		l2arc_hdr_stat_remove();
1495}
1496
1497void
1498arc_space_consume(uint64_t space, arc_space_type_t type)
1499{
1500	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1501
1502	switch (type) {
1503	case ARC_SPACE_DATA:
1504		ARCSTAT_INCR(arcstat_data_size, space);
1505		break;
1506	case ARC_SPACE_OTHER:
1507		ARCSTAT_INCR(arcstat_other_size, space);
1508		break;
1509	case ARC_SPACE_HDRS:
1510		ARCSTAT_INCR(arcstat_hdr_size, space);
1511		break;
1512	case ARC_SPACE_L2HDRS:
1513		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1514		break;
1515	}
1516
1517	ARCSTAT_INCR(arcstat_meta_used, space);
1518	atomic_add_64(&arc_size, space);
1519}
1520
1521void
1522arc_space_return(uint64_t space, arc_space_type_t type)
1523{
1524	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1525
1526	switch (type) {
1527	case ARC_SPACE_DATA:
1528		ARCSTAT_INCR(arcstat_data_size, -space);
1529		break;
1530	case ARC_SPACE_OTHER:
1531		ARCSTAT_INCR(arcstat_other_size, -space);
1532		break;
1533	case ARC_SPACE_HDRS:
1534		ARCSTAT_INCR(arcstat_hdr_size, -space);
1535		break;
1536	case ARC_SPACE_L2HDRS:
1537		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1538		break;
1539	}
1540
1541	ASSERT(arc_meta_used >= space);
1542	if (arc_meta_max < arc_meta_used)
1543		arc_meta_max = arc_meta_used;
1544	ARCSTAT_INCR(arcstat_meta_used, -space);
1545	ASSERT(arc_size >= space);
1546	atomic_add_64(&arc_size, -space);
1547}
1548
1549arc_buf_t *
1550arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1551{
1552	arc_buf_hdr_t *hdr;
1553	arc_buf_t *buf;
1554
1555	ASSERT3U(size, >, 0);
1556	hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1557	ASSERT(BUF_EMPTY(hdr));
1558	hdr->b_size = size;
1559	hdr->b_type = type;
1560	hdr->b_spa = spa_load_guid(spa);
1561	hdr->b_state = arc_anon;
1562	hdr->b_arc_access = 0;
1563	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1564	buf->b_hdr = hdr;
1565	buf->b_data = NULL;
1566	buf->b_efunc = NULL;
1567	buf->b_private = NULL;
1568	buf->b_next = NULL;
1569	hdr->b_buf = buf;
1570	arc_get_data_buf(buf);
1571	hdr->b_datacnt = 1;
1572	hdr->b_flags = 0;
1573	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1574	(void) refcount_add(&hdr->b_refcnt, tag);
1575
1576	return (buf);
1577}
1578
1579static char *arc_onloan_tag = "onloan";
1580
1581/*
1582 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1583 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1584 * buffers must be returned to the arc before they can be used by the DMU or
1585 * freed.
1586 */
1587arc_buf_t *
1588arc_loan_buf(spa_t *spa, int size)
1589{
1590	arc_buf_t *buf;
1591
1592	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1593
1594	atomic_add_64(&arc_loaned_bytes, size);
1595	return (buf);
1596}
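/*
 * Sketch of the loan lifecycle (assumed caller code): a loaned buffer
 * is excluded from arc_tempreserve_space() accounting until it is
 * either returned with arc_return_buf() or attached to a new owner
 * with arc_loan_inuse_buf().
 *
 *	arc_buf_t *buf = arc_loan_buf(spa, size);
 *
 *	... fill buf->b_data ...
 *	arc_return_buf(buf, tag);
 */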
1597
1598/*
1599 * Return a loaned arc buffer to the arc.
1600 */
1601void
1602arc_return_buf(arc_buf_t *buf, void *tag)
1603{
1604	arc_buf_hdr_t *hdr = buf->b_hdr;
1605
1606	ASSERT(buf->b_data != NULL);
1607	(void) refcount_add(&hdr->b_refcnt, tag);
1608	(void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1609
1610	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1611}
1612
1613/* Detach an arc_buf from a dbuf (tag) */
1614void
1615arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1616{
1617	arc_buf_hdr_t *hdr;
1618
1619	ASSERT(buf->b_data != NULL);
1620	hdr = buf->b_hdr;
1621	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1622	(void) refcount_remove(&hdr->b_refcnt, tag);
1623	buf->b_efunc = NULL;
1624	buf->b_private = NULL;
1625
1626	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1627}
1628
1629static arc_buf_t *
1630arc_buf_clone(arc_buf_t *from)
1631{
1632	arc_buf_t *buf;
1633	arc_buf_hdr_t *hdr = from->b_hdr;
1634	uint64_t size = hdr->b_size;
1635
1636	ASSERT(hdr->b_state != arc_anon);
1637
1638	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1639	buf->b_hdr = hdr;
1640	buf->b_data = NULL;
1641	buf->b_efunc = NULL;
1642	buf->b_private = NULL;
1643	buf->b_next = hdr->b_buf;
1644	hdr->b_buf = buf;
1645	arc_get_data_buf(buf);
1646	bcopy(from->b_data, buf->b_data, size);
1647
1648	/*
1649	 * This buffer already exists in the arc so create a duplicate
1650	 * copy for the caller.  If the buffer is associated with user data
1651	 * then track the size and number of duplicates.  These stats will be
1652	 * updated as duplicate buffers are created and destroyed.
1653	 */
1654	if (hdr->b_type == ARC_BUFC_DATA) {
1655		ARCSTAT_BUMP(arcstat_duplicate_buffers);
1656		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1657	}
1658	hdr->b_datacnt += 1;
1659	return (buf);
1660}
1661
1662void
1663arc_buf_add_ref(arc_buf_t *buf, void* tag)
1664{
1665	arc_buf_hdr_t *hdr;
1666	kmutex_t *hash_lock;
1667
1668	/*
1669	 * Check to see if this buffer is evicted.  Callers
1670	 * must verify b_data != NULL to know if the add_ref
1671	 * was successful.
1672	 */
1673	mutex_enter(&buf->b_evict_lock);
1674	if (buf->b_data == NULL) {
1675		mutex_exit(&buf->b_evict_lock);
1676		return;
1677	}
1678	hash_lock = HDR_LOCK(buf->b_hdr);
1679	mutex_enter(hash_lock);
1680	hdr = buf->b_hdr;
1681	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1682	mutex_exit(&buf->b_evict_lock);
1683
1684	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1685	add_reference(hdr, hash_lock, tag);
1686	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1687	arc_access(hdr, hash_lock);
1688	mutex_exit(hash_lock);
1689	ARCSTAT_BUMP(arcstat_hits);
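	/*
	 * The conditional stat below splits the hit four ways: the first
	 * predicate selects demand vs. prefetch and the second selects data
	 * vs. metadata, so e.g. a demand read of a data buffer is expected
	 * to bump arcstat_demand_data_hits.
	 */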
1690	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1691	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1692	    data, metadata, hits);
1693}
1694
1695static void
1696arc_buf_free_on_write(void *data, size_t size,
1697    void (*free_func)(void *, size_t))
1698{
1699	l2arc_data_free_t *df;
1700
1701	df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1702	df->l2df_data = data;
1703	df->l2df_size = size;
1704	df->l2df_func = free_func;
1705	mutex_enter(&l2arc_free_on_write_mtx);
1706	list_insert_head(l2arc_free_on_write, df);
1707	mutex_exit(&l2arc_free_on_write_mtx);
1708}
1709
1710/*
1711 * Free the arc data buffer.  If it is an l2arc write in progress,
1712 * the buffer is placed on l2arc_free_on_write to be freed later.
1713 */
1714static void
1715arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1716{
1717	arc_buf_hdr_t *hdr = buf->b_hdr;
1718
1719	if (HDR_L2_WRITING(hdr)) {
1720		arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
1721		ARCSTAT_BUMP(arcstat_l2_free_on_write);
1722	} else {
1723		free_func(buf->b_data, hdr->b_size);
1724	}
1725}
1726
1727	/*
1728	 * Free the temporary compressed copy (b_tmp_cdata) created for an in-flight
1729	 * L2ARC write; it is queued on l2arc_free_on_write until that write completes.
1730	 */
1731static void
1732arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
1733{
1734	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1735
1736	ASSERT(MUTEX_HELD(&l2arc_buflist_mtx));
1737
1738	if (l2hdr->b_tmp_cdata == NULL)
1739		return;
1740
1741	ASSERT(HDR_L2_WRITING(hdr));
1742	arc_buf_free_on_write(l2hdr->b_tmp_cdata, hdr->b_size,
1743	    zio_data_buf_free);
1744	ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
1745	l2hdr->b_tmp_cdata = NULL;
1746}
1747
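/*
 * Free up buf->b_data and, if 'remove' is set, pull the arc_buf_t off of
 * the arc_buf_hdr_t's list and free it.
 */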
1748static void
1749arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
1750{
1751	arc_buf_t **bufp;
1752
1753	/* free up data associated with the buf */
1754	if (buf->b_data) {
1755		arc_state_t *state = buf->b_hdr->b_state;
1756		uint64_t size = buf->b_hdr->b_size;
1757		arc_buf_contents_t type = buf->b_hdr->b_type;
1758
1759		arc_cksum_verify(buf);
1760#ifdef illumos
1761		arc_buf_unwatch(buf);
1762#endif /* illumos */
1763
1764		if (!recycle) {
1765			if (type == ARC_BUFC_METADATA) {
1766				arc_buf_data_free(buf, zio_buf_free);
1767				arc_space_return(size, ARC_SPACE_DATA);
1768			} else {
1769				ASSERT(type == ARC_BUFC_DATA);
1770				arc_buf_data_free(buf, zio_data_buf_free);
1771				ARCSTAT_INCR(arcstat_data_size, -size);
1772				atomic_add_64(&arc_size, -size);
1773			}
1774		}
1775		if (list_link_active(&buf->b_hdr->b_arc_node)) {
1776			uint64_t *cnt = &state->arcs_lsize[type];
1777
1778			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1779			ASSERT(state != arc_anon);
1780
1781			ASSERT3U(*cnt, >=, size);
1782			atomic_add_64(cnt, -size);
1783		}
1784		ASSERT3U(state->arcs_size, >=, size);
1785		atomic_add_64(&state->arcs_size, -size);
1786		buf->b_data = NULL;
1787
1788		/*
1789		 * If we're destroying a duplicate buffer make sure
1790		 * that the appropriate statistics are updated.
1791		 */
1792		if (buf->b_hdr->b_datacnt > 1 &&
1793		    buf->b_hdr->b_type == ARC_BUFC_DATA) {
1794			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1795			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1796		}
1797		ASSERT(buf->b_hdr->b_datacnt > 0);
1798		buf->b_hdr->b_datacnt -= 1;
1799	}
1800
1801	/* only remove the buf if requested */
1802	if (!remove)
1803		return;
1804
1805	/* remove the buf from the hdr list */
1806	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1807		continue;
1808	*bufp = buf->b_next;
1809	buf->b_next = NULL;
1810
1811	ASSERT(buf->b_efunc == NULL);
1812
1813	/* clean up the buf */
1814	buf->b_hdr = NULL;
1815	kmem_cache_free(buf_cache, buf);
1816}
1817
1818static void
1819arc_hdr_destroy(arc_buf_hdr_t *hdr)
1820{
1821	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1822	ASSERT3P(hdr->b_state, ==, arc_anon);
1823	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1824	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1825
1826	if (l2hdr != NULL) {
1827		boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1828		/*
1829		 * To prevent arc_free() and l2arc_evict() from
1830		 * attempting to free the same buffer at the same time,
1831		 * a FREE_IN_PROGRESS flag is given to arc_free() to
1832		 * give it priority.  l2arc_evict() can't destroy this
1833		 * header while we are waiting on l2arc_buflist_mtx.
1834		 *
1835		 * The hdr may be removed from l2ad_buflist before we
1836		 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1837		 */
1838		if (!buflist_held) {
1839			mutex_enter(&l2arc_buflist_mtx);
1840			l2hdr = hdr->b_l2hdr;
1841		}
1842
1843		if (l2hdr != NULL) {
1844			trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
1845			    hdr->b_size, 0);
1846			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1847			arc_buf_l2_cdata_free(hdr);
1848			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1849			ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1850			vdev_space_update(l2hdr->b_dev->l2ad_vdev,
1851			    -l2hdr->b_asize, 0, 0);
1852			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1853			if (hdr->b_state == arc_l2c_only)
1854				l2arc_hdr_stat_remove();
1855			hdr->b_l2hdr = NULL;
1856		}
1857
1858		if (!buflist_held)
1859			mutex_exit(&l2arc_buflist_mtx);
1860	}
1861
1862	if (!BUF_EMPTY(hdr)) {
1863		ASSERT(!HDR_IN_HASH_TABLE(hdr));
1864		buf_discard_identity(hdr);
1865	}
1866	while (hdr->b_buf) {
1867		arc_buf_t *buf = hdr->b_buf;
1868
1869		if (buf->b_efunc) {
1870			mutex_enter(&arc_eviction_mtx);
1871			mutex_enter(&buf->b_evict_lock);
1872			ASSERT(buf->b_hdr != NULL);
1873			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1874			hdr->b_buf = buf->b_next;
1875			buf->b_hdr = &arc_eviction_hdr;
1876			buf->b_next = arc_eviction_list;
1877			arc_eviction_list = buf;
1878			mutex_exit(&buf->b_evict_lock);
1879			mutex_exit(&arc_eviction_mtx);
1880		} else {
1881			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1882		}
1883	}
1884	if (hdr->b_freeze_cksum != NULL) {
1885		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1886		hdr->b_freeze_cksum = NULL;
1887	}
1888	if (hdr->b_thawed) {
1889		kmem_free(hdr->b_thawed, 1);
1890		hdr->b_thawed = NULL;
1891	}
1892
1893	ASSERT(!list_link_active(&hdr->b_arc_node));
1894	ASSERT3P(hdr->b_hash_next, ==, NULL);
1895	ASSERT3P(hdr->b_acb, ==, NULL);
1896	kmem_cache_free(hdr_cache, hdr);
1897}
1898
1899void
1900arc_buf_free(arc_buf_t *buf, void *tag)
1901{
1902	arc_buf_hdr_t *hdr = buf->b_hdr;
1903	int hashed = hdr->b_state != arc_anon;
1904
1905	ASSERT(buf->b_efunc == NULL);
1906	ASSERT(buf->b_data != NULL);
1907
1908	if (hashed) {
1909		kmutex_t *hash_lock = HDR_LOCK(hdr);
1910
1911		mutex_enter(hash_lock);
1912		hdr = buf->b_hdr;
1913		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1914
1915		(void) remove_reference(hdr, hash_lock, tag);
1916		if (hdr->b_datacnt > 1) {
1917			arc_buf_destroy(buf, FALSE, TRUE);
1918		} else {
1919			ASSERT(buf == hdr->b_buf);
1920			ASSERT(buf->b_efunc == NULL);
1921			hdr->b_flags |= ARC_BUF_AVAILABLE;
1922		}
1923		mutex_exit(hash_lock);
1924	} else if (HDR_IO_IN_PROGRESS(hdr)) {
1925		int destroy_hdr;
1926		/*
1927		 * We are in the middle of an async write.  Don't destroy
1928		 * this buffer unless the write completes before we finish
1929		 * decrementing the reference count.
1930		 */
1931		mutex_enter(&arc_eviction_mtx);
1932		(void) remove_reference(hdr, NULL, tag);
1933		ASSERT(refcount_is_zero(&hdr->b_refcnt));
1934		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1935		mutex_exit(&arc_eviction_mtx);
1936		if (destroy_hdr)
1937			arc_hdr_destroy(hdr);
1938	} else {
1939		if (remove_reference(hdr, NULL, tag) > 0)
1940			arc_buf_destroy(buf, FALSE, TRUE);
1941		else
1942			arc_hdr_destroy(hdr);
1943	}
1944}
1945
1946boolean_t
1947arc_buf_remove_ref(arc_buf_t *buf, void* tag)
1948{
1949	arc_buf_hdr_t *hdr = buf->b_hdr;
1950	kmutex_t *hash_lock = HDR_LOCK(hdr);
1951	boolean_t no_callback = (buf->b_efunc == NULL);
1952
1953	if (hdr->b_state == arc_anon) {
1954		ASSERT(hdr->b_datacnt == 1);
1955		arc_buf_free(buf, tag);
1956		return (no_callback);
1957	}
1958
1959	mutex_enter(hash_lock);
1960	hdr = buf->b_hdr;
1961	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1962	ASSERT(hdr->b_state != arc_anon);
1963	ASSERT(buf->b_data != NULL);
1964
1965	(void) remove_reference(hdr, hash_lock, tag);
1966	if (hdr->b_datacnt > 1) {
1967		if (no_callback)
1968			arc_buf_destroy(buf, FALSE, TRUE);
1969	} else if (no_callback) {
1970		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1971		ASSERT(buf->b_efunc == NULL);
1972		hdr->b_flags |= ARC_BUF_AVAILABLE;
1973	}
1974	ASSERT(no_callback || hdr->b_datacnt > 1 ||
1975	    refcount_is_zero(&hdr->b_refcnt));
1976	mutex_exit(hash_lock);
1977	return (no_callback);
1978}
1979
1980int
1981arc_buf_size(arc_buf_t *buf)
1982{
1983	return (buf->b_hdr->b_size);
1984}
1985
1986/*
1987 * Called from the DMU to determine if the current buffer should be
1988 * evicted. In order to ensure proper locking, the eviction must be initiated
1989 * from the DMU. Return true if the buffer is associated with user data and
1990 * duplicate buffers still exist.
1991 */
1992boolean_t
1993arc_buf_eviction_needed(arc_buf_t *buf)
1994{
1995	arc_buf_hdr_t *hdr;
1996	boolean_t evict_needed = B_FALSE;
1997
1998	if (zfs_disable_dup_eviction)
1999		return (B_FALSE);
2000
2001	mutex_enter(&buf->b_evict_lock);
2002	hdr = buf->b_hdr;
2003	if (hdr == NULL) {
2004		/*
2005		 * We are in arc_do_user_evicts(); let that function
2006		 * perform the eviction.
2007		 */
2008		ASSERT(buf->b_data == NULL);
2009		mutex_exit(&buf->b_evict_lock);
2010		return (B_FALSE);
2011	} else if (buf->b_data == NULL) {
2012		/*
2013		 * We have already been added to the arc eviction list;
2014		 * recommend eviction.
2015		 */
2016		ASSERT3P(hdr, ==, &arc_eviction_hdr);
2017		mutex_exit(&buf->b_evict_lock);
2018		return (B_TRUE);
2019	}
2020
2021	if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
2022		evict_needed = B_TRUE;
2023
2024	mutex_exit(&buf->b_evict_lock);
2025	return (evict_needed);
2026}
2027
2028/*
2029 * Evict buffers from list until we've removed the specified number of
2030 * bytes.  Move the removed buffers to the appropriate evict state.
2031 * If the recycle flag is set, then attempt to "recycle" a buffer:
2032 * - look for a buffer to evict that is `bytes' long.
2033 * - return the data block from this buffer rather than freeing it.
2034 * This flag is used by callers that are trying to make space for a
2035 * new buffer in a full arc cache.
2036 *
2037 * This function makes a "best effort".  It skips over any buffers
2038 * it can't get a hash_lock on, and so may not catch all candidates.
2039 * It may also return without evicting as much space as requested.
2040 */
2041static void *
2042arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
2043    arc_buf_contents_t type)
2044{
2045	arc_state_t *evicted_state;
2046	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
2047	int64_t bytes_remaining;
2048	arc_buf_hdr_t *ab, *ab_prev = NULL;
2049	list_t *evicted_list, *list, *evicted_list_start, *list_start;
2050	kmutex_t *lock, *evicted_lock;
2051	kmutex_t *hash_lock;
2052	boolean_t have_lock;
2053	void *stolen = NULL;
2054	arc_buf_hdr_t marker = { 0 };
2055	int count = 0;
2056	static int evict_metadata_offset, evict_data_offset;
2057	int i, idx, offset, list_count, lists;
2058
2059	ASSERT(state == arc_mru || state == arc_mfu);
2060
2061	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2062
2063	if (type == ARC_BUFC_METADATA) {
2064		offset = 0;
2065		list_count = ARC_BUFC_NUMMETADATALISTS;
2066		list_start = &state->arcs_lists[0];
2067		evicted_list_start = &evicted_state->arcs_lists[0];
2068		idx = evict_metadata_offset;
2069	} else {
2070		offset = ARC_BUFC_NUMMETADATALISTS;
2071		list_start = &state->arcs_lists[offset];
2072		evicted_list_start = &evicted_state->arcs_lists[offset];
2073		list_count = ARC_BUFC_NUMDATALISTS;
2074		idx = evict_data_offset;
2075	}
2076	bytes_remaining = evicted_state->arcs_lsize[type];
2077	lists = 0;
2078
2079evict_start:
2080	list = &list_start[idx];
2081	evicted_list = &evicted_list_start[idx];
2082	lock = ARCS_LOCK(state, (offset + idx));
2083	evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
2084
2085	mutex_enter(lock);
2086	mutex_enter(evicted_lock);
2087
2088	for (ab = list_tail(list); ab; ab = ab_prev) {
2089		ab_prev = list_prev(list, ab);
2090		bytes_remaining -= (ab->b_size * ab->b_datacnt);
2091		/* prefetch buffers have a minimum lifespan */
2092		if (HDR_IO_IN_PROGRESS(ab) ||
2093		    (spa && ab->b_spa != spa) ||
2094		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
2095		    ddi_get_lbolt() - ab->b_arc_access <
2096		    arc_min_prefetch_lifespan)) {
2097			skipped++;
2098			continue;
2099		}
2100		/* "lookahead" for better eviction candidate */
2101		if (recycle && ab->b_size != bytes &&
2102		    ab_prev && ab_prev->b_size == bytes)
2103			continue;
2104
2105		/* ignore markers */
2106		if (ab->b_spa == 0)
2107			continue;
2108
2109		/*
2110		 * It may take a long time to evict all the bufs requested.
2111		 * To avoid blocking all arc activity, periodically drop
2112		 * the arcs_mtx and give other threads a chance to run
2113		 * before reacquiring the lock.
2114		 *
2115		 * If we are looking for a buffer to recycle, we are in
2116		 * the hot code path, so don't sleep.
2117		 */
2118		if (!recycle && count++ > arc_evict_iterations) {
2119			list_insert_after(list, ab, &marker);
2120			mutex_exit(evicted_lock);
2121			mutex_exit(lock);
2122			kpreempt(KPREEMPT_SYNC);
2123			mutex_enter(lock);
2124			mutex_enter(evicted_lock);
2125			ab_prev = list_prev(list, &marker);
2126			list_remove(list, &marker);
2127			count = 0;
2128			continue;
2129		}
2130
2131		hash_lock = HDR_LOCK(ab);
2132		have_lock = MUTEX_HELD(hash_lock);
2133		if (have_lock || mutex_tryenter(hash_lock)) {
2134			ASSERT0(refcount_count(&ab->b_refcnt));
2135			ASSERT(ab->b_datacnt > 0);
2136			while (ab->b_buf) {
2137				arc_buf_t *buf = ab->b_buf;
2138				if (!mutex_tryenter(&buf->b_evict_lock)) {
2139					missed += 1;
2140					break;
2141				}
2142				if (buf->b_data) {
2143					bytes_evicted += ab->b_size;
2144					if (recycle && ab->b_type == type &&
2145					    ab->b_size == bytes &&
2146					    !HDR_L2_WRITING(ab)) {
2147						stolen = buf->b_data;
2148						recycle = FALSE;
2149					}
2150				}
2151				if (buf->b_efunc) {
2152					mutex_enter(&arc_eviction_mtx);
2153					arc_buf_destroy(buf,
2154					    buf->b_data == stolen, FALSE);
2155					ab->b_buf = buf->b_next;
2156					buf->b_hdr = &arc_eviction_hdr;
2157					buf->b_next = arc_eviction_list;
2158					arc_eviction_list = buf;
2159					mutex_exit(&arc_eviction_mtx);
2160					mutex_exit(&buf->b_evict_lock);
2161				} else {
2162					mutex_exit(&buf->b_evict_lock);
2163					arc_buf_destroy(buf,
2164					    buf->b_data == stolen, TRUE);
2165				}
2166			}
2167
2168			if (ab->b_l2hdr) {
2169				ARCSTAT_INCR(arcstat_evict_l2_cached,
2170				    ab->b_size);
2171			} else {
2172				if (l2arc_write_eligible(ab->b_spa, ab)) {
2173					ARCSTAT_INCR(arcstat_evict_l2_eligible,
2174					    ab->b_size);
2175				} else {
2176					ARCSTAT_INCR(
2177					    arcstat_evict_l2_ineligible,
2178					    ab->b_size);
2179				}
2180			}
2181
2182			if (ab->b_datacnt == 0) {
2183				arc_change_state(evicted_state, ab, hash_lock);
2184				ASSERT(HDR_IN_HASH_TABLE(ab));
2185				ab->b_flags |= ARC_IN_HASH_TABLE;
2186				ab->b_flags &= ~ARC_BUF_AVAILABLE;
2187				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
2188			}
2189			if (!have_lock)
2190				mutex_exit(hash_lock);
2191			if (bytes >= 0 && bytes_evicted >= bytes)
2192				break;
2193			if (bytes_remaining > 0) {
2194				mutex_exit(evicted_lock);
2195				mutex_exit(lock);
2196				idx  = ((idx + 1) & (list_count - 1));
2197				lists++;
2198				goto evict_start;
2199			}
2200		} else {
2201			missed += 1;
2202		}
2203	}
2204
2205	mutex_exit(evicted_lock);
2206	mutex_exit(lock);
2207
2208	idx  = ((idx + 1) & (list_count - 1));
2209	lists++;
2210
2211	if (bytes_evicted < bytes) {
2212		if (lists < list_count)
2213			goto evict_start;
2214		else
2215			dprintf("only evicted %lld bytes from %p",
2216			    (longlong_t)bytes_evicted, state);
2217	}
2218	if (type == ARC_BUFC_METADATA)
2219		evict_metadata_offset = idx;
2220	else
2221		evict_data_offset = idx;
2222
2223	if (skipped)
2224		ARCSTAT_INCR(arcstat_evict_skip, skipped);
2225
2226	if (missed)
2227		ARCSTAT_INCR(arcstat_mutex_miss, missed);
2228
2229	/*
2230	 * Note: we have just evicted some data into the ghost state,
2231	 * potentially putting the ghost size over the desired size.  Rather
2232	 * than evicting from the ghost list in this hot code path, leave
2233	 * this chore to the arc_reclaim_thread().
2234	 */
2235
2236	if (stolen)
2237		ARCSTAT_BUMP(arcstat_stolen);
2238	return (stolen);
2239}
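
/*
 * A minimal sketch of the recycle path described above, assuming a caller
 * that needs a block of a known size (as arc_get_data_buf() does below):
 *
 *	void *data = arc_evict(state, 0, size, TRUE, type);
 *	if (data == NULL)
 *		data = zio_data_buf_alloc(size);
 *
 * On success the evicted buffer's data block is handed straight to the new
 * consumer instead of being freed and reallocated.
 */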
2240
2241/*
2242 * Remove buffers from list until we've removed the specified number of
2243 * bytes.  Destroy the buffers that are removed.
2244 */
2245static void
2246arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
2247{
2248	arc_buf_hdr_t *ab, *ab_prev;
2249	arc_buf_hdr_t marker = { 0 };
2250	list_t *list, *list_start;
2251	kmutex_t *hash_lock, *lock;
2252	uint64_t bytes_deleted = 0;
2253	uint64_t bufs_skipped = 0;
2254	int count = 0;
2255	static int evict_offset;
2256	int list_count, idx = evict_offset;
2257	int offset, lists = 0;
2258
2259	ASSERT(GHOST_STATE(state));
2260
2261	/*
2262	 * data lists come after metadata lists
2263	 */
2264	list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
2265	list_count = ARC_BUFC_NUMDATALISTS;
2266	offset = ARC_BUFC_NUMMETADATALISTS;
2267
2268evict_start:
2269	list = &list_start[idx];
2270	lock = ARCS_LOCK(state, idx + offset);
2271
2272	mutex_enter(lock);
2273	for (ab = list_tail(list); ab; ab = ab_prev) {
2274		ab_prev = list_prev(list, ab);
2275		if (ab->b_type > ARC_BUFC_NUMTYPES)
2276			panic("invalid ab=%p", (void *)ab);
2277		if (spa && ab->b_spa != spa)
2278			continue;
2279
2280		/* ignore markers */
2281		if (ab->b_spa == 0)
2282			continue;
2283
2284		hash_lock = HDR_LOCK(ab);
2285		/* caller may be trying to modify this buffer, skip it */
2286		if (MUTEX_HELD(hash_lock))
2287			continue;
2288
2289		/*
2290		 * It may take a long time to evict all the bufs requested.
2291		 * To avoid blocking all arc activity, periodically drop
2292		 * the arcs_mtx and give other threads a chance to run
2293		 * before reacquiring the lock.
2294		 */
2295		if (count++ > arc_evict_iterations) {
2296			list_insert_after(list, ab, &marker);
2297			mutex_exit(lock);
2298			kpreempt(KPREEMPT_SYNC);
2299			mutex_enter(lock);
2300			ab_prev = list_prev(list, &marker);
2301			list_remove(list, &marker);
2302			count = 0;
2303			continue;
2304		}
2305		if (mutex_tryenter(hash_lock)) {
2306			ASSERT(!HDR_IO_IN_PROGRESS(ab));
2307			ASSERT(ab->b_buf == NULL);
2308			ARCSTAT_BUMP(arcstat_deleted);
2309			bytes_deleted += ab->b_size;
2310
2311			if (ab->b_l2hdr != NULL) {
2312				/*
2313				 * This buffer is cached on the 2nd Level ARC;
2314				 * don't destroy the header.
2315				 */
2316				arc_change_state(arc_l2c_only, ab, hash_lock);
2317				mutex_exit(hash_lock);
2318			} else {
2319				arc_change_state(arc_anon, ab, hash_lock);
2320				mutex_exit(hash_lock);
2321				arc_hdr_destroy(ab);
2322			}
2323
2324			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2325			if (bytes >= 0 && bytes_deleted >= bytes)
2326				break;
2327		} else if (bytes < 0) {
2328			/*
2329			 * Insert a list marker and then wait for the
2330			 * hash lock to become available. Once it's
2331			 * available, restart from where we left off.
2332			 */
2333			list_insert_after(list, ab, &marker);
2334			mutex_exit(lock);
2335			mutex_enter(hash_lock);
2336			mutex_exit(hash_lock);
2337			mutex_enter(lock);
2338			ab_prev = list_prev(list, &marker);
2339			list_remove(list, &marker);
2340		} else {
2341			bufs_skipped += 1;
2342		}
2343
2344	}
2345	mutex_exit(lock);
2346	idx  = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
2347	lists++;
2348
2349	if (lists < list_count)
2350		goto evict_start;
2351
2352	evict_offset = idx;
2353	if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
2354	    (bytes < 0 || bytes_deleted < bytes)) {
2355		list_start = &state->arcs_lists[0];
2356		list_count = ARC_BUFC_NUMMETADATALISTS;
2357		offset = lists = 0;
2358		goto evict_start;
2359	}
2360
2361	if (bufs_skipped) {
2362		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2363		ASSERT(bytes >= 0);
2364	}
2365
2366	if (bytes_deleted < bytes)
2367		dprintf("only deleted %lld bytes from %p",
2368		    (longlong_t)bytes_deleted, state);
2369}
2370
2371static void
2372arc_adjust(void)
2373{
2374	int64_t adjustment, delta;
2375
2376	/*
2377	 * Adjust MRU size
2378	 */
2379
2380	adjustment = MIN((int64_t)(arc_size - arc_c),
2381	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2382	    arc_p));
2383
2384	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2385		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2386		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
2387		adjustment -= delta;
2388	}
2389
2390	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2391		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2392		(void) arc_evict(arc_mru, 0, delta, FALSE,
2393		    ARC_BUFC_METADATA);
2394	}
2395
2396	/*
2397	 * Adjust MFU size
2398	 */
2399
2400	adjustment = arc_size - arc_c;
2401
2402	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2403		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2404		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2405		adjustment -= delta;
2406	}
2407
2408	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2409		int64_t delta = MIN(adjustment,
2410		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2411		(void) arc_evict(arc_mfu, 0, delta, FALSE,
2412		    ARC_BUFC_METADATA);
2413	}
2414
2415	/*
2416	 * Adjust ghost lists
2417	 */
2418
2419	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2420
2421	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2422		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2423		arc_evict_ghost(arc_mru_ghost, 0, delta);
2424	}
2425
2426	adjustment =
2427	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2428
2429	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2430		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2431		arc_evict_ghost(arc_mfu_ghost, 0, delta);
2432	}
2433}
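
/*
 * A worked example of the adjustment above, with hypothetical sizes:
 * if arc_size = 10GB, arc_c = 8GB, arc_p = 4GB and
 * arc_anon + arc_mru + arc_meta_used = 5GB, the MRU pass targets
 * MIN(10GB - 8GB, 5GB - 4GB) = 1GB, evicting data before metadata.  The MFU
 * pass then works on whatever portion of (arc_size - arc_c) remains, and the
 * ghost lists are trimmed back toward arc_c last.
 */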
2434
2435static void
2436arc_do_user_evicts(void)
2437{
2438	static arc_buf_t *tmp_arc_eviction_list;
2439
2440	/*
2441	 * Move list over to avoid LOR
2442	 */
2443restart:
2444	mutex_enter(&arc_eviction_mtx);
2445	tmp_arc_eviction_list = arc_eviction_list;
2446	arc_eviction_list = NULL;
2447	mutex_exit(&arc_eviction_mtx);
2448
2449	while (tmp_arc_eviction_list != NULL) {
2450		arc_buf_t *buf = tmp_arc_eviction_list;
2451		tmp_arc_eviction_list = buf->b_next;
2452		mutex_enter(&buf->b_evict_lock);
2453		buf->b_hdr = NULL;
2454		mutex_exit(&buf->b_evict_lock);
2455
2456		if (buf->b_efunc != NULL)
2457			VERIFY0(buf->b_efunc(buf->b_private));
2458
2459		buf->b_efunc = NULL;
2460		buf->b_private = NULL;
2461		kmem_cache_free(buf_cache, buf);
2462	}
2463
2464	if (arc_eviction_list != NULL)
2465		goto restart;
2466}
2467
2468/*
2469 * Flush all *evictable* data from the cache for the given spa.
2470 * NOTE: this will not touch "active" (i.e. referenced) data.
2471 */
2472void
2473arc_flush(spa_t *spa)
2474{
2475	uint64_t guid = 0;
2476
2477	if (spa)
2478		guid = spa_load_guid(spa);
2479
2480	while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
2481		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2482		if (spa)
2483			break;
2484	}
2485	while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
2486		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2487		if (spa)
2488			break;
2489	}
2490	while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
2491		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2492		if (spa)
2493			break;
2494	}
2495	while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
2496		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2497		if (spa)
2498			break;
2499	}
2500
2501	arc_evict_ghost(arc_mru_ghost, guid, -1);
2502	arc_evict_ghost(arc_mfu_ghost, guid, -1);
2503
2504	mutex_enter(&arc_reclaim_thr_lock);
2505	arc_do_user_evicts();
2506	mutex_exit(&arc_reclaim_thr_lock);
2507	ASSERT(spa || arc_eviction_list == NULL);
2508}
2509
2510void
2511arc_shrink(void)
2512{
2513
2514	if (arc_c > arc_c_min) {
2515		uint64_t to_free = arc_c >> arc_shrink_shift;
2516
2517		DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
2518			arc_c_min, uint64_t, arc_p, uint64_t, to_free);
2524		if (arc_c > arc_c_min + to_free)
2525			atomic_add_64(&arc_c, -to_free);
2526		else
2527			arc_c = arc_c_min;
2528
2529		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2530		if (arc_c > arc_size)
2531			arc_c = MAX(arc_size, arc_c_min);
2532		if (arc_p > arc_c)
2533			arc_p = (arc_c >> 1);
2534
2535		DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
2536			arc_p);
2537
2538		ASSERT(arc_c >= arc_c_min);
2539		ASSERT((int64_t)arc_p >= 0);
2540	}
2541
2542	if (arc_size > arc_c) {
2543		DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
2544			uint64_t, arc_c);
2545		arc_adjust();
2546	}
2547}
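
/*
 * Example of the shrink step above, assuming a hypothetical
 * arc_shrink_shift of 5: a 32GB arc_c loses arc_c >> 5 = 1GB per call and
 * arc_p is reduced by the same fraction of its own value, so repeated
 * invocations walk the target size down geometrically toward arc_c_min.
 */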
2548
2549static int needfree = 0;
2550
2551static int
2552arc_reclaim_needed(void)
2553{
2554
2555#ifdef _KERNEL
2556
2557	if (needfree) {
2558		DTRACE_PROBE(arc__reclaim_needfree);
2559		return (1);
2560	}
2561
2562	/*
2563	 * Cooperate with pagedaemon when it's time for it to scan
2564	 * and reclaim some pages.
2565	 */
2566	if (freemem < zfs_arc_free_target) {
2567		DTRACE_PROBE2(arc__reclaim_freemem, uint64_t,
2568		    freemem, uint64_t, zfs_arc_free_target);
2569		return (1);
2570	}
2571
2572#ifdef sun
2573	/*
2574	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2575	 */
2576	uint64_t extra = desfree;
2577
2578	/*
2579	 * check that we're out of range of the pageout scanner.  It starts to
2580	 * schedule paging if freemem is less than lotsfree and needfree.
2581	 * lotsfree is the high-water mark for pageout, and needfree is the
2582	 * number of needed free pages.  We add extra pages here to make sure
2583	 * the scanner doesn't start up while we're freeing memory.
2584	 */
2585	if (freemem < lotsfree + needfree + extra)
2586		return (1);
2587
2588	/*
2589	 * check to make sure that swapfs has enough space so that anon
2590	 * reservations can still succeed. anon_resvmem() checks that the
2591	 * availrmem is greater than swapfs_minfree, and the number of reserved
2592	 * swap pages.  We also add a bit of extra here just to prevent
2593	 * circumstances from getting really dire.
2594	 */
2595	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2596		return (1);
2597
2598	/*
2599	 * Check that we have enough availrmem that memory locking (e.g., via
2600	 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
2601	 * stores the number of pages that cannot be locked; when availrmem
2602	 * drops below pages_pp_maximum, page locking mechanisms such as
2603	 * page_pp_lock() will fail.)
2604	 */
2605	if (availrmem <= pages_pp_maximum)
2606		return (1);
2607
2608#endif	/* sun */
2609#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
2610	/*
2611	 * If we're on an i386 platform, it's possible that we'll exhaust the
2612	 * kernel heap space before we ever run out of available physical
2613	 * memory.  Most checks of the size of the heap_area compare against
2614	 * tune.t_minarmem, which is the minimum available real memory that we
2615	 * can have in the system.  However, this is generally fixed at 25 pages
2616	 * which is so low that it's useless.  In this comparison, we seek to
2617	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2618	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
2619	 * free)
2620	 */
2621	if (vmem_size(heap_arena, VMEM_FREE) <
2622	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) {
2623		DTRACE_PROBE2(arc__reclaim_used, uint64_t,
2624		    vmem_size(heap_arena, VMEM_FREE), uint64_t,
2625		    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2);
2626		return (1);
2627	}
2628#endif
2629#ifdef sun
2630	/*
2631	 * If zio data pages are being allocated out of a separate heap segment,
2632	 * then enforce that the size of available vmem for this arena remains
2633	 * above about 1/16th free.
2634	 *
2635	 * Note: The 1/16th arena free requirement was put in place
2636	 * to aggressively evict memory from the arc in order to avoid
2637	 * memory fragmentation issues.
2638	 */
2639	if (zio_arena != NULL &&
2640	    vmem_size(zio_arena, VMEM_FREE) <
2641	    (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2642		return (1);
2643#endif	/* sun */
2644#else	/* _KERNEL */
2645	if (spa_get_random(100) == 0)
2646		return (1);
2647#endif	/* _KERNEL */
2648	DTRACE_PROBE(arc__reclaim_no);
2649
2650	return (0);
2651}
2652
2653extern kmem_cache_t	*zio_buf_cache[];
2654extern kmem_cache_t	*zio_data_buf_cache[];
2655extern kmem_cache_t	*range_seg_cache;
2656
2657static void __noinline
2658arc_kmem_reap_now(arc_reclaim_strategy_t strat)
2659{
2660	size_t			i;
2661	kmem_cache_t		*prev_cache = NULL;
2662	kmem_cache_t		*prev_data_cache = NULL;
2663
2664	DTRACE_PROBE(arc__kmem_reap_start);
2665#ifdef _KERNEL
2666	if (arc_meta_used >= arc_meta_limit) {
2667		/*
2668		 * We are exceeding our meta-data cache limit.
2669		 * Purge some DNLC entries to release holds on meta-data.
2670		 */
2671		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2672	}
2673#if defined(__i386)
2674	/*
2675	 * Reclaim unused memory from all kmem caches.
2676	 */
2677	kmem_reap();
2678#endif
2679#endif
2680
2681	/*
2682	 * An aggressive reclamation will shrink the cache size as well as
2683	 * reap free buffers from the arc kmem caches.
2684	 */
2685	if (strat == ARC_RECLAIM_AGGR)
2686		arc_shrink();
2687
2688	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2689		if (zio_buf_cache[i] != prev_cache) {
2690			prev_cache = zio_buf_cache[i];
2691			kmem_cache_reap_now(zio_buf_cache[i]);
2692		}
2693		if (zio_data_buf_cache[i] != prev_data_cache) {
2694			prev_data_cache = zio_data_buf_cache[i];
2695			kmem_cache_reap_now(zio_data_buf_cache[i]);
2696		}
2697	}
2698	kmem_cache_reap_now(buf_cache);
2699	kmem_cache_reap_now(hdr_cache);
2700	kmem_cache_reap_now(range_seg_cache);
2701
2702#ifdef sun
2703	/*
2704	 * Ask the vmem arena to reclaim unused memory from its
2705	 * quantum caches.
2706	 */
2707	if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
2708		vmem_qcache_reap(zio_arena);
2709#endif
2710	DTRACE_PROBE(arc__kmem_reap_end);
2711}
2712
2713static void
2714arc_reclaim_thread(void *dummy __unused)
2715{
2716	clock_t			growtime = 0;
2717	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
2718	callb_cpr_t		cpr;
2719
2720	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2721
2722	mutex_enter(&arc_reclaim_thr_lock);
2723	while (arc_thread_exit == 0) {
2724		if (arc_reclaim_needed()) {
2725
2726			if (arc_no_grow) {
2727				if (last_reclaim == ARC_RECLAIM_CONS) {
2728					DTRACE_PROBE(arc__reclaim_aggr_no_grow);
2729					last_reclaim = ARC_RECLAIM_AGGR;
2730				} else {
2731					last_reclaim = ARC_RECLAIM_CONS;
2732				}
2733			} else {
2734				arc_no_grow = TRUE;
2735				last_reclaim = ARC_RECLAIM_AGGR;
2736				DTRACE_PROBE(arc__reclaim_aggr);
2737				membar_producer();
2738			}
2739
2740			/* reset the growth delay for every reclaim */
2741			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2742
2743			if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
2744				/*
2745				 * If needfree is TRUE, our vm_lowmem hook was
2746				 * called; we must free some memory, so switch
2747				 * to aggressive mode.
2748				 */
2749				arc_no_grow = TRUE;
2750				last_reclaim = ARC_RECLAIM_AGGR;
2751			}
2752			arc_kmem_reap_now(last_reclaim);
2753			arc_warm = B_TRUE;
2754
2755		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2756			arc_no_grow = FALSE;
2757		}
2758
2759		arc_adjust();
2760
2761		if (arc_eviction_list != NULL)
2762			arc_do_user_evicts();
2763
2764#ifdef _KERNEL
2765		if (needfree) {
2766			needfree = 0;
2767			wakeup(&needfree);
2768		}
2769#endif
2770
2771		/* block until needed, or one second, whichever is shorter */
2772		CALLB_CPR_SAFE_BEGIN(&cpr);
2773		(void) cv_timedwait(&arc_reclaim_thr_cv,
2774		    &arc_reclaim_thr_lock, hz);
2775		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2776	}
2777
2778	arc_thread_exit = 0;
2779	cv_broadcast(&arc_reclaim_thr_cv);
2780	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
2781	thread_exit();
2782}
2783
2784/*
2785 * Adapt arc info given the number of bytes we are trying to add and
2786 * the state that we are coming from.  This function is only called
2787 * when we are adding new content to the cache.
2788 */
2789static void
2790arc_adapt(int bytes, arc_state_t *state)
2791{
2792	int mult;
2793	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2794
2795	if (state == arc_l2c_only)
2796		return;
2797
2798	ASSERT(bytes > 0);
2799	/*
2800	 * Adapt the target size of the MRU list:
2801	 *	- if we just hit in the MRU ghost list, then increase
2802	 *	  the target size of the MRU list.
2803	 *	- if we just hit in the MFU ghost list, then increase
2804	 *	  the target size of the MFU list by decreasing the
2805	 *	  target size of the MRU list.
2806	 */
2807	if (state == arc_mru_ghost) {
2808		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2809		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2810		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2811
2812		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2813	} else if (state == arc_mfu_ghost) {
2814		uint64_t delta;
2815
2816		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2817		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2818		mult = MIN(mult, 10);
2819
2820		delta = MIN(bytes * mult, arc_p);
2821		arc_p = MAX(arc_p_min, arc_p - delta);
2822	}
2823	ASSERT((int64_t)arc_p >= 0);
2824
2825	if (arc_reclaim_needed()) {
2826		cv_signal(&arc_reclaim_thr_cv);
2827		return;
2828	}
2829
2830	if (arc_no_grow)
2831		return;
2832
2833	if (arc_c >= arc_c_max)
2834		return;
2835
2836	/*
2837	 * If we're within (2 * maxblocksize) bytes of the target
2838	 * cache size, increment the target cache size
2839	 */
2840	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2841		DTRACE_PROBE1(arc__inc_adapt, int, bytes);
2842		atomic_add_64(&arc_c, (int64_t)bytes);
2843		if (arc_c > arc_c_max)
2844			arc_c = arc_c_max;
2845		else if (state == arc_anon)
2846			atomic_add_64(&arc_p, (int64_t)bytes);
2847		if (arc_p > arc_c)
2848			arc_p = arc_c;
2849	}
2850	ASSERT((int64_t)arc_p >= 0);
2851}
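
/*
 * A worked example of the adaptation above, with hypothetical sizes: on a
 * hit in the MRU ghost list while arc_mfu_ghost is four times the size of
 * arc_mru_ghost, mult is 4 (capped at 10), so a 128K access grows arc_p by
 * 512K, up to at most arc_c - arc_p_min.  A hit in the MFU ghost list
 * shrinks arc_p symmetrically, never below arc_p_min.
 */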
2852
2853/*
2854 * Check if the cache has reached its limits and eviction is required
2855 * prior to insert.
2856 */
2857static int
2858arc_evict_needed(arc_buf_contents_t type)
2859{
2860	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2861		return (1);
2862
2863	if (arc_reclaim_needed())
2864		return (1);
2865
2866	return (arc_size > arc_c);
2867}
2868
2869/*
2870 * The buffer, supplied as the first argument, needs a data block.
2871 * So, if we are at cache max, determine which cache should be victimized.
2872 * We have the following cases:
2873 *
2874 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2875 * In this situation if we're out of space, but the resident size of the MFU is
2876 * under the limit, victimize the MFU cache to satisfy this insertion request.
2877 *
2878 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2879 * Here, we've used up all of the available space for the MRU, so we need to
2880 * evict from our own cache instead.  Evict from the set of resident MRU
2881 * entries.
2882 *
2883 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2884 * c minus p represents the MFU space in the cache, since p is the size of the
2885 * cache that is dedicated to the MRU.  In this situation there's still space on
2886 * the MFU side, so the MRU side needs to be victimized.
2887 *
2888 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2889 * MFU's resident set is consuming more space than it has been allotted.  In
2890 * this situation, we must victimize our own cache, the MFU, for this insertion.
2891 */
2892static void
2893arc_get_data_buf(arc_buf_t *buf)
2894{
2895	arc_state_t		*state = buf->b_hdr->b_state;
2896	uint64_t		size = buf->b_hdr->b_size;
2897	arc_buf_contents_t	type = buf->b_hdr->b_type;
2898
2899	arc_adapt(size, state);
2900
2901	/*
2902	 * If we have not yet reached the cache maximum size,
2903	 * just allocate a new buffer.
2904	 */
2905	if (!arc_evict_needed(type)) {
2906		if (type == ARC_BUFC_METADATA) {
2907			buf->b_data = zio_buf_alloc(size);
2908			arc_space_consume(size, ARC_SPACE_DATA);
2909		} else {
2910			ASSERT(type == ARC_BUFC_DATA);
2911			buf->b_data = zio_data_buf_alloc(size);
2912			ARCSTAT_INCR(arcstat_data_size, size);
2913			atomic_add_64(&arc_size, size);
2914		}
2915		goto out;
2916	}
2917
2918	/*
2919	 * If we are prefetching from the mfu ghost list, this buffer
2920	 * will end up on the mru list; so steal space from there.
2921	 */
2922	if (state == arc_mfu_ghost)
2923		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2924	else if (state == arc_mru_ghost)
2925		state = arc_mru;
2926
2927	if (state == arc_mru || state == arc_anon) {
2928		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2929		state = (arc_mfu->arcs_lsize[type] >= size &&
2930		    arc_p > mru_used) ? arc_mfu : arc_mru;
2931	} else {
2932		/* MFU cases */
2933		uint64_t mfu_space = arc_c - arc_p;
2934		state =  (arc_mru->arcs_lsize[type] >= size &&
2935		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2936	}
2937	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
2938		if (type == ARC_BUFC_METADATA) {
2939			buf->b_data = zio_buf_alloc(size);
2940			arc_space_consume(size, ARC_SPACE_DATA);
2941		} else {
2942			ASSERT(type == ARC_BUFC_DATA);
2943			buf->b_data = zio_data_buf_alloc(size);
2944			ARCSTAT_INCR(arcstat_data_size, size);
2945			atomic_add_64(&arc_size, size);
2946		}
2947		ARCSTAT_BUMP(arcstat_recycle_miss);
2948	}
2949	ASSERT(buf->b_data != NULL);
2950out:
2951	/*
2952	 * Update the state size.  Note that ghost states have a
2953	 * "ghost size" and so don't need to be updated.
2954	 */
2955	if (!GHOST_STATE(buf->b_hdr->b_state)) {
2956		arc_buf_hdr_t *hdr = buf->b_hdr;
2957
2958		atomic_add_64(&hdr->b_state->arcs_size, size);
2959		if (list_link_active(&hdr->b_arc_node)) {
2960			ASSERT(refcount_is_zero(&hdr->b_refcnt));
2961			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2962		}
2963		/*
2964		 * If we are growing the cache, and we are adding anonymous
2965		 * data, and we have outgrown arc_p, update arc_p
2966		 */
2967		if (arc_size < arc_c && hdr->b_state == arc_anon &&
2968		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2969			arc_p = MIN(arc_c, arc_p + size);
2970	}
2971	ARCSTAT_BUMP(arcstat_allocated);
2972}
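
/*
 * Example of the victim selection above, with hypothetical sizes: for an
 * MRU insert with arc_p = 4GB and arc_anon + arc_mru = 3GB (case 1), the
 * MFU is evicted, provided it has at least 'size' bytes of evictable data
 * of the right type; otherwise (case 2) the MRU evicts from itself.  The
 * MFU cases (3 and 4) mirror this using (arc_c - arc_p) as the MFU budget.
 */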
2973
2974/*
2975 * This routine is called whenever a buffer is accessed.
2976 * NOTE: the hash lock must be held by the caller; it is not dropped here.
2977 */
2978static void
2979arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2980{
2981	clock_t now;
2982
2983	ASSERT(MUTEX_HELD(hash_lock));
2984
2985	if (buf->b_state == arc_anon) {
2986		/*
2987		 * This buffer is not in the cache, and does not
2988		 * appear in our "ghost" list.  Add the new buffer
2989		 * to the MRU state.
2990		 */
2991
2992		ASSERT(buf->b_arc_access == 0);
2993		buf->b_arc_access = ddi_get_lbolt();
2994		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2995		arc_change_state(arc_mru, buf, hash_lock);
2996
2997	} else if (buf->b_state == arc_mru) {
2998		now = ddi_get_lbolt();
2999
3000		/*
3001		 * If this buffer is here because of a prefetch, then either:
3002		 * - clear the flag if this is a "referencing" read
3003		 *   (any subsequent access will bump this into the MFU state).
3004		 * or
3005		 * - move the buffer to the head of the list if this is
3006		 *   another prefetch (to make it less likely to be evicted).
3007		 */
3008		if ((buf->b_flags & ARC_PREFETCH) != 0) {
3009			if (refcount_count(&buf->b_refcnt) == 0) {
3010				ASSERT(list_link_active(&buf->b_arc_node));
3011			} else {
3012				buf->b_flags &= ~ARC_PREFETCH;
3013				ARCSTAT_BUMP(arcstat_mru_hits);
3014			}
3015			buf->b_arc_access = now;
3016			return;
3017		}
3018
3019		/*
3020		 * This buffer has been "accessed" only once so far,
3021		 * but it is still in the cache. Move it to the MFU
3022		 * state.
3023		 */
3024		if (now > buf->b_arc_access + ARC_MINTIME) {
3025			/*
3026			 * More than 125ms have passed since we
3027			 * instantiated this buffer.  Move it to the
3028			 * most frequently used state.
3029			 */
3030			buf->b_arc_access = now;
3031			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
3032			arc_change_state(arc_mfu, buf, hash_lock);
3033		}
3034		ARCSTAT_BUMP(arcstat_mru_hits);
3035	} else if (buf->b_state == arc_mru_ghost) {
3036		arc_state_t	*new_state;
3037		/*
3038		 * This buffer has been "accessed" recently, but
3039		 * was evicted from the cache.  Move it to the
3040		 * MFU state.
3041		 */
3042
3043		if (buf->b_flags & ARC_PREFETCH) {
3044			new_state = arc_mru;
3045			if (refcount_count(&buf->b_refcnt) > 0)
3046				buf->b_flags &= ~ARC_PREFETCH;
3047			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
3048		} else {
3049			new_state = arc_mfu;
3050			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
3051		}
3052
3053		buf->b_arc_access = ddi_get_lbolt();
3054		arc_change_state(new_state, buf, hash_lock);
3055
3056		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
3057	} else if (buf->b_state == arc_mfu) {
3058		/*
3059		 * This buffer has been accessed more than once and is
3060		 * still in the cache.  Keep it in the MFU state.
3061		 *
3062		 * NOTE: an add_reference() that occurred when we did
3063		 * the arc_read() will have kicked this off the list.
3064		 * If it was a prefetch, we will explicitly move it to
3065		 * the head of the list now.
3066		 */
3067		if ((buf->b_flags & ARC_PREFETCH) != 0) {
3068			ASSERT(refcount_count(&buf->b_refcnt) == 0);
3069			ASSERT(list_link_active(&buf->b_arc_node));
3070		}
3071		ARCSTAT_BUMP(arcstat_mfu_hits);
3072		buf->b_arc_access = ddi_get_lbolt();
3073	} else if (buf->b_state == arc_mfu_ghost) {
3074		arc_state_t	*new_state = arc_mfu;
3075		/*
3076		 * This buffer has been accessed more than once but has
3077		 * been evicted from the cache.  Move it back to the
3078		 * MFU state.
3079		 */
3080
3081		if (buf->b_flags & ARC_PREFETCH) {
3082			/*
3083			 * This is a prefetch access...
3084			 * move this block back to the MRU state.
3085			 */
3086			ASSERT0(refcount_count(&buf->b_refcnt));
3087			new_state = arc_mru;
3088		}
3089
3090		buf->b_arc_access = ddi_get_lbolt();
3091		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
3092		arc_change_state(new_state, buf, hash_lock);
3093
3094		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
3095	} else if (buf->b_state == arc_l2c_only) {
3096		/*
3097		 * This buffer is on the 2nd Level ARC.
3098		 */
3099
3100		buf->b_arc_access = ddi_get_lbolt();
3101		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
3102		arc_change_state(arc_mfu, buf, hash_lock);
3103	} else {
3104		ASSERT(!"invalid arc state");
3105	}
3106}
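
/*
 * Summary of the state transitions implemented above:
 *
 *	anon      -> mru	(first access)
 *	mru       -> mfu	(second access, more than ARC_MINTIME later)
 *	mru_ghost -> mfu	(or mru, if the access is a prefetch)
 *	mfu_ghost -> mfu	(or mru, if the access is a prefetch)
 *	l2c_only  -> mfu
 *
 * Prefetched buffers stay in mru until a demand access clears ARC_PREFETCH.
 */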
3107
3108/* a generic arc_done_func_t which you can use */
3109/* ARGSUSED */
3110void
3111arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
3112{
3113	if (zio == NULL || zio->io_error == 0)
3114		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
3115	VERIFY(arc_buf_remove_ref(buf, arg));
3116}
3117
3118/* a generic arc_done_func_t */
3119void
3120arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
3121{
3122	arc_buf_t **bufp = arg;
3123	if (zio && zio->io_error) {
3124		VERIFY(arc_buf_remove_ref(buf, arg));
3125		*bufp = NULL;
3126	} else {
3127		*bufp = buf;
3128		ASSERT(buf->b_data);
3129	}
3130}
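
/*
 * A minimal sketch of how a caller might use the generic callbacks above to
 * read a block synchronously (all names other than the functions and flags
 * defined in this file are hypothetical):
 *
 *	arc_buf_t *abuf = NULL;
 *	uint32_t aflags = ARC_WAIT;
 *	int error;
 *
 *	error = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 *	if (error == 0 && abuf != NULL) {
 *		... consume abuf->b_data, arc_buf_size(abuf) bytes ...
 *		VERIFY(arc_buf_remove_ref(abuf, &abuf));
 *	}
 *
 * With ARC_WAIT the call returns only after the I/O completes; with
 * ARC_NOWAIT the done function fires later from the zio pipeline.
 */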
3131
3132static void
3133arc_read_done(zio_t *zio)
3134{
3135	arc_buf_hdr_t	*hdr;
3136	arc_buf_t	*buf;
3137	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
3138	kmutex_t	*hash_lock = NULL;
3139	arc_callback_t	*callback_list, *acb;
3140	int		freeable = FALSE;
3141
3142	buf = zio->io_private;
3143	hdr = buf->b_hdr;
3144
3145	/*
3146	 * The hdr was inserted into hash-table and removed from lists
3147	 * prior to starting I/O.  We should find this header, since
3148	 * it's in the hash table, and it should be legit since it's
3149	 * not possible to evict it during the I/O.  The only possible
3150	 * reason for it not to be found is if we were freed during the
3151	 * read.
3152	 */
3153	if (HDR_IN_HASH_TABLE(hdr)) {
3154		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
3155		ASSERT3U(hdr->b_dva.dva_word[0], ==,
3156		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
3157		ASSERT3U(hdr->b_dva.dva_word[1], ==,
3158		    BP_IDENTITY(zio->io_bp)->dva_word[1]);
3159
3160		arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
3161		    &hash_lock);
3162
3163		ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
3164		    hash_lock == NULL) ||
3165		    (found == hdr &&
3166		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
3167		    (found == hdr && HDR_L2_READING(hdr)));
3168	}
3169
3170	hdr->b_flags &= ~ARC_L2_EVICTED;
3171	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
3172		hdr->b_flags &= ~ARC_L2CACHE;
3173
3174	/* byteswap if necessary */
3175	callback_list = hdr->b_acb;
3176	ASSERT(callback_list != NULL);
3177	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
3178		dmu_object_byteswap_t bswap =
3179		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
3180		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
3181		    byteswap_uint64_array :
3182		    dmu_ot_byteswap[bswap].ob_func;
3183		func(buf->b_data, hdr->b_size);
3184	}
3185
3186	arc_cksum_compute(buf, B_FALSE);
3187#ifdef illumos
3188	arc_buf_watch(buf);
3189#endif /* illumos */
3190
3191	if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
3192		/*
3193		 * Only call arc_access on anonymous buffers.  This is because
3194		 * if we've issued an I/O for an evicted buffer, we've already
3195		 * called arc_access (to prevent any simultaneous readers from
3196		 * getting confused).
3197		 */
3198		arc_access(hdr, hash_lock);
3199	}
3200
3201	/* create copies of the data buffer for the callers */
3202	abuf = buf;
3203	for (acb = callback_list; acb; acb = acb->acb_next) {
3204		if (acb->acb_done) {
3205			if (abuf == NULL) {
3206				ARCSTAT_BUMP(arcstat_duplicate_reads);
3207				abuf = arc_buf_clone(buf);
3208			}
3209			acb->acb_buf = abuf;
3210			abuf = NULL;
3211		}
3212	}
3213	hdr->b_acb = NULL;
3214	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3215	ASSERT(!HDR_BUF_AVAILABLE(hdr));
3216	if (abuf == buf) {
3217		ASSERT(buf->b_efunc == NULL);
3218		ASSERT(hdr->b_datacnt == 1);
3219		hdr->b_flags |= ARC_BUF_AVAILABLE;
3220	}
3221
3222	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
3223
3224	if (zio->io_error != 0) {
3225		hdr->b_flags |= ARC_IO_ERROR;
3226		if (hdr->b_state != arc_anon)
3227			arc_change_state(arc_anon, hdr, hash_lock);
3228		if (HDR_IN_HASH_TABLE(hdr))
3229			buf_hash_remove(hdr);
3230		freeable = refcount_is_zero(&hdr->b_refcnt);
3231	}
3232
3233	/*
3234	 * Broadcast before we drop the hash_lock to avoid the possibility
3235	 * that the hdr (and hence the cv) might be freed before we get to
3236	 * the cv_broadcast().
3237	 */
3238	cv_broadcast(&hdr->b_cv);
3239
3240	if (hash_lock) {
3241		mutex_exit(hash_lock);
3242	} else {
3243		/*
3244		 * This block was freed while we waited for the read to
3245		 * complete.  It has been removed from the hash table and
3246		 * moved to the anonymous state (so that it won't show up
3247		 * in the cache).
3248		 */
3249		ASSERT3P(hdr->b_state, ==, arc_anon);
3250		freeable = refcount_is_zero(&hdr->b_refcnt);
3251	}
3252
3253	/* execute each callback and free its structure */
3254	while ((acb = callback_list) != NULL) {
3255		if (acb->acb_done)
3256			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3257
3258		if (acb->acb_zio_dummy != NULL) {
3259			acb->acb_zio_dummy->io_error = zio->io_error;
3260			zio_nowait(acb->acb_zio_dummy);
3261		}
3262
3263		callback_list = acb->acb_next;
3264		kmem_free(acb, sizeof (arc_callback_t));
3265	}
3266
3267	if (freeable)
3268		arc_hdr_destroy(hdr);
3269}
3270
3271/*
3272 * "Read" the block at the specified DVA (in bp) via the
3273 * cache.  If the block is found in the cache, invoke the provided
3274 * callback immediately and return.  Note that the `zio' parameter
3275 * in the callback will be NULL in this case, since no IO was
3276 * required.  If the block is not in the cache, pass the read request
3277 * on to the spa with a substitute callback function, so that the
3278 * requested block will be added to the cache.
3279 *
3280 * If a read request arrives for a block that has a read in-progress,
3281 * either wait for the in-progress read to complete (and return the
3282 * results); or, if this is a read with a "done" func, add a record
3283 * to the read to invoke the "done" func when the read completes,
3284 * and return; or just return.
3285 *
3286 * arc_read_done() will invoke all the requested "done" functions
3287 * for readers of this block.
3288 */
3289int
3290arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3291    void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
3292    const zbookmark_phys_t *zb)
3293{
3294	arc_buf_hdr_t *hdr = NULL;
3295	arc_buf_t *buf = NULL;
3296	kmutex_t *hash_lock = NULL;
3297	zio_t *rzio;
3298	uint64_t guid = spa_load_guid(spa);
3299
3300	ASSERT(!BP_IS_EMBEDDED(bp) ||
3301	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
3302
3303top:
3304	if (!BP_IS_EMBEDDED(bp)) {
3305		/*
3306		 * Embedded BPs have no DVA and require no I/O to "read",
3307		 * so skip the hash lookup; an anonymous arc buf backs them below.
3308		 */
3309		hdr = buf_hash_find(guid, bp, &hash_lock);
3310	}
3311
3312	if (hdr != NULL && hdr->b_datacnt > 0) {
3313
3314		*arc_flags |= ARC_CACHED;
3315
3316		if (HDR_IO_IN_PROGRESS(hdr)) {
3317
3318			if (*arc_flags & ARC_WAIT) {
3319				cv_wait(&hdr->b_cv, hash_lock);
3320				mutex_exit(hash_lock);
3321				goto top;
3322			}
3323			ASSERT(*arc_flags & ARC_NOWAIT);
3324
3325			if (done) {
3326				arc_callback_t	*acb = NULL;
3327
3328				acb = kmem_zalloc(sizeof (arc_callback_t),
3329				    KM_SLEEP);
3330				acb->acb_done = done;
3331				acb->acb_private = private;
3332				if (pio != NULL)
3333					acb->acb_zio_dummy = zio_null(pio,
3334					    spa, NULL, NULL, NULL, zio_flags);
3335
3336				ASSERT(acb->acb_done != NULL);
3337				acb->acb_next = hdr->b_acb;
3338				hdr->b_acb = acb;
3339				add_reference(hdr, hash_lock, private);
3340				mutex_exit(hash_lock);
3341				return (0);
3342			}
3343			mutex_exit(hash_lock);
3344			return (0);
3345		}
3346
3347		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3348
3349		if (done) {
3350			add_reference(hdr, hash_lock, private);
3351			/*
3352			 * If this block is already in use, create a new
3353			 * copy of the data so that we will be guaranteed
3354			 * that arc_release() will always succeed.
3355			 */
3356			buf = hdr->b_buf;
3357			ASSERT(buf);
3358			ASSERT(buf->b_data);
3359			if (HDR_BUF_AVAILABLE(hdr)) {
3360				ASSERT(buf->b_efunc == NULL);
3361				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3362			} else {
3363				buf = arc_buf_clone(buf);
3364			}
3365
3366		} else if (*arc_flags & ARC_PREFETCH &&
3367		    refcount_count(&hdr->b_refcnt) == 0) {
3368			hdr->b_flags |= ARC_PREFETCH;
3369		}
3370		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3371		arc_access(hdr, hash_lock);
3372		if (*arc_flags & ARC_L2CACHE)
3373			hdr->b_flags |= ARC_L2CACHE;
3374		if (*arc_flags & ARC_L2COMPRESS)
3375			hdr->b_flags |= ARC_L2COMPRESS;
3376		mutex_exit(hash_lock);
3377		ARCSTAT_BUMP(arcstat_hits);
3378		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3379		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3380		    data, metadata, hits);
3381
3382		if (done)
3383			done(NULL, buf, private);
3384	} else {
3385		uint64_t size = BP_GET_LSIZE(bp);
3386		arc_callback_t *acb;
3387		vdev_t *vd = NULL;
3388		uint64_t addr = 0;
3389		boolean_t devw = B_FALSE;
3390		enum zio_compress b_compress = ZIO_COMPRESS_OFF;
3391		uint64_t b_asize = 0;
3392
3393		if (hdr == NULL) {
3394			/* this block is not in the cache */
3395			arc_buf_hdr_t *exists = NULL;
3396			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3397			buf = arc_buf_alloc(spa, size, private, type);
3398			hdr = buf->b_hdr;
3399			if (!BP_IS_EMBEDDED(bp)) {
3400				hdr->b_dva = *BP_IDENTITY(bp);
3401				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3402				hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3403				exists = buf_hash_insert(hdr, &hash_lock);
3404			}
3405			if (exists != NULL) {
3406				/* somebody beat us to the hash insert */
3407				mutex_exit(hash_lock);
3408				buf_discard_identity(hdr);
3409				(void) arc_buf_remove_ref(buf, private);
3410				goto top; /* restart the IO request */
3411			}
3412			/* if this is a prefetch, we don't have a reference */
3413			if (*arc_flags & ARC_PREFETCH) {
3414				(void) remove_reference(hdr, hash_lock,
3415				    private);
3416				hdr->b_flags |= ARC_PREFETCH;
3417			}
3418			if (*arc_flags & ARC_L2CACHE)
3419				hdr->b_flags |= ARC_L2CACHE;
3420			if (*arc_flags & ARC_L2COMPRESS)
3421				hdr->b_flags |= ARC_L2COMPRESS;
3422			if (BP_GET_LEVEL(bp) > 0)
3423				hdr->b_flags |= ARC_INDIRECT;
3424		} else {
3425			/* this block is in the ghost cache */
3426			ASSERT(GHOST_STATE(hdr->b_state));
3427			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3428			ASSERT0(refcount_count(&hdr->b_refcnt));
3429			ASSERT(hdr->b_buf == NULL);
3430
3431			/* if this is a prefetch, we don't have a reference */
3432			if (*arc_flags & ARC_PREFETCH)
3433				hdr->b_flags |= ARC_PREFETCH;
3434			else
3435				add_reference(hdr, hash_lock, private);
3436			if (*arc_flags & ARC_L2CACHE)
3437				hdr->b_flags |= ARC_L2CACHE;
3438			if (*arc_flags & ARC_L2COMPRESS)
3439				hdr->b_flags |= ARC_L2COMPRESS;
3440			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3441			buf->b_hdr = hdr;
3442			buf->b_data = NULL;
3443			buf->b_efunc = NULL;
3444			buf->b_private = NULL;
3445			buf->b_next = NULL;
3446			hdr->b_buf = buf;
3447			ASSERT(hdr->b_datacnt == 0);
3448			hdr->b_datacnt = 1;
3449			arc_get_data_buf(buf);
3450			arc_access(hdr, hash_lock);
3451		}
3452
3453		ASSERT(!GHOST_STATE(hdr->b_state));
3454
3455		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3456		acb->acb_done = done;
3457		acb->acb_private = private;
3458
3459		ASSERT(hdr->b_acb == NULL);
3460		hdr->b_acb = acb;
3461		hdr->b_flags |= ARC_IO_IN_PROGRESS;
3462
3463		if (hdr->b_l2hdr != NULL &&
3464		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3465			devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3466			addr = hdr->b_l2hdr->b_daddr;
3467			b_compress = hdr->b_l2hdr->b_compress;
3468			b_asize = hdr->b_l2hdr->b_asize;
3469			/*
3470			 * Lock out device removal.
3471			 */
3472			if (vdev_is_dead(vd) ||
3473			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3474				vd = NULL;
3475		}
3476
3477		if (hash_lock != NULL)
3478			mutex_exit(hash_lock);
3479
3480		/*
3481		 * At this point, we have a level 1 cache miss.  Try again in
3482		 * L2ARC if possible.
3483		 */
3484		ASSERT3U(hdr->b_size, ==, size);
3485		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3486		    uint64_t, size, zbookmark_phys_t *, zb);
3487		ARCSTAT_BUMP(arcstat_misses);
3488		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3489		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3490		    data, metadata, misses);
3491#ifdef _KERNEL
3492		curthread->td_ru.ru_inblock++;
3493#endif
3494
3495		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3496			/*
3497			 * Read from the L2ARC if the following are true:
3498			 * 1. The L2ARC vdev was previously cached.
3499			 * 2. This buffer still has L2ARC metadata.
3500			 * 3. This buffer isn't currently writing to the L2ARC.
3501			 * 4. The L2ARC entry wasn't evicted, which may
3502			 *    also have invalidated the vdev.
3503			 * 5. This isn't a prefetch while l2arc_noprefetch is set.
3504			 */
3505			if (hdr->b_l2hdr != NULL &&
3506			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3507			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3508				l2arc_read_callback_t *cb;
3509
3510				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3511				ARCSTAT_BUMP(arcstat_l2_hits);
3512
3513				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3514				    KM_SLEEP);
3515				cb->l2rcb_buf = buf;
3516				cb->l2rcb_spa = spa;
3517				cb->l2rcb_bp = *bp;
3518				cb->l2rcb_zb = *zb;
3519				cb->l2rcb_flags = zio_flags;
3520				cb->l2rcb_compress = b_compress;
3521
3522				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3523				    addr + size < vd->vdev_psize -
3524				    VDEV_LABEL_END_SIZE);
3525
3526				/*
3527				 * l2arc read.  The SCL_L2ARC lock will be
3528				 * released by l2arc_read_done().
3529				 * Issue a null zio if the underlying buffer
3530				 * was squashed to zero size by compression.
3531				 */
3532				if (b_compress == ZIO_COMPRESS_EMPTY) {
3533					rzio = zio_null(pio, spa, vd,
3534					    l2arc_read_done, cb,
3535					    zio_flags | ZIO_FLAG_DONT_CACHE |
3536					    ZIO_FLAG_CANFAIL |
3537					    ZIO_FLAG_DONT_PROPAGATE |
3538					    ZIO_FLAG_DONT_RETRY);
3539				} else {
3540					rzio = zio_read_phys(pio, vd, addr,
3541					    b_asize, buf->b_data,
3542					    ZIO_CHECKSUM_OFF,
3543					    l2arc_read_done, cb, priority,
3544					    zio_flags | ZIO_FLAG_DONT_CACHE |
3545					    ZIO_FLAG_CANFAIL |
3546					    ZIO_FLAG_DONT_PROPAGATE |
3547					    ZIO_FLAG_DONT_RETRY, B_FALSE);
3548				}
3549				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3550				    zio_t *, rzio);
3551				ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
3552
3553				if (*arc_flags & ARC_NOWAIT) {
3554					zio_nowait(rzio);
3555					return (0);
3556				}
3557
3558				ASSERT(*arc_flags & ARC_WAIT);
3559				if (zio_wait(rzio) == 0)
3560					return (0);
3561
3562				/* l2arc read error; goto zio_read() */
3563			} else {
3564				DTRACE_PROBE1(l2arc__miss,
3565				    arc_buf_hdr_t *, hdr);
3566				ARCSTAT_BUMP(arcstat_l2_misses);
3567				if (HDR_L2_WRITING(hdr))
3568					ARCSTAT_BUMP(arcstat_l2_rw_clash);
3569				spa_config_exit(spa, SCL_L2ARC, vd);
3570			}
3571		} else {
3572			if (vd != NULL)
3573				spa_config_exit(spa, SCL_L2ARC, vd);
3574			if (l2arc_ndev != 0) {
3575				DTRACE_PROBE1(l2arc__miss,
3576				    arc_buf_hdr_t *, hdr);
3577				ARCSTAT_BUMP(arcstat_l2_misses);
3578			}
3579		}
3580
3581		rzio = zio_read(pio, spa, bp, buf->b_data, size,
3582		    arc_read_done, buf, priority, zio_flags, zb);
3583
3584		if (*arc_flags & ARC_WAIT)
3585			return (zio_wait(rzio));
3586
3587		ASSERT(*arc_flags & ARC_NOWAIT);
3588		zio_nowait(rzio);
3589	}
3590	return (0);
3591}
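/*
 * Illustrative sketch only: how a caller might drive the ARC_WAIT and
 * ARC_NOWAIT paths checked above.  The callback, argument and bookmark
 * names are hypothetical, and the arc_read() argument order is assumed
 * here rather than quoted from an actual caller.
 *
 *	uint32_t aflags = ARC_WAIT;
 *	error = arc_read(NULL, spa, bp, my_done, my_arg,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 *	(blocks in zio_wait() until my_done has been called)
 *
 *	aflags = ARC_NOWAIT;
 *	(void) arc_read(NULL, spa, bp, my_done, my_arg,
 *	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 *	(returns immediately; my_done fires from zio completion context)
 */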
3592
3593void
3594arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3595{
3596	ASSERT(buf->b_hdr != NULL);
3597	ASSERT(buf->b_hdr->b_state != arc_anon);
3598	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3599	ASSERT(buf->b_efunc == NULL);
3600	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3601
3602	buf->b_efunc = func;
3603	buf->b_private = private;
3604}
3605
3606/*
3607 * Notify the ARC that a block was freed and will therefore never be used again.
3608 */
3609void
3610arc_freed(spa_t *spa, const blkptr_t *bp)
3611{
3612	arc_buf_hdr_t *hdr;
3613	kmutex_t *hash_lock;
3614	uint64_t guid = spa_load_guid(spa);
3615
3616	ASSERT(!BP_IS_EMBEDDED(bp));
3617
3618	hdr = buf_hash_find(guid, bp, &hash_lock);
3619	if (hdr == NULL)
3620		return;
3621	if (HDR_BUF_AVAILABLE(hdr)) {
3622		arc_buf_t *buf = hdr->b_buf;
3623		add_reference(hdr, hash_lock, FTAG);
3624		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3625		mutex_exit(hash_lock);
3626
3627		arc_release(buf, FTAG);
3628		(void) arc_buf_remove_ref(buf, FTAG);
3629	} else {
3630		mutex_exit(hash_lock);
3631	}
3632
3633}
3634
3635/*
3636 * Clear the user eviction callback set by arc_set_callback(), first calling
3637 * it if it exists.  Because the presence of a callback keeps an arc_buf cached,
3638 * clearing the callback may result in the arc_buf being destroyed.  However,
3639 * it will not result in the *last* arc_buf being destroyed, hence the data
3640 * will remain cached in the ARC. We make a copy of the arc buffer here so
3641 * that we can process the callback without holding any locks.
3642 *
3643 * It's possible that the callback is already in the process of being cleared
3644 * by another thread.  In this case we can not clear the callback.
3645 *
3646 * Returns B_TRUE if the callback was successfully called and cleared.
3647 */
3648boolean_t
3649arc_clear_callback(arc_buf_t *buf)
3650{
3651	arc_buf_hdr_t *hdr;
3652	kmutex_t *hash_lock;
3653	arc_evict_func_t *efunc = buf->b_efunc;
3654	void *private = buf->b_private;
3655	list_t *list, *evicted_list;
3656	kmutex_t *lock, *evicted_lock;
3657
3658	mutex_enter(&buf->b_evict_lock);
3659	hdr = buf->b_hdr;
3660	if (hdr == NULL) {
3661		/*
3662		 * We are in arc_do_user_evicts().
3663		 */
3664		ASSERT(buf->b_data == NULL);
3665		mutex_exit(&buf->b_evict_lock);
3666		return (B_FALSE);
3667	} else if (buf->b_data == NULL) {
3668		/*
3669		 * We are on the eviction list; process this buffer now
3670		 * but let arc_do_user_evicts() do the reaping.
3671		 */
3672		buf->b_efunc = NULL;
3673		mutex_exit(&buf->b_evict_lock);
3674		VERIFY0(efunc(private));
3675		return (B_TRUE);
3676	}
3677	hash_lock = HDR_LOCK(hdr);
3678	mutex_enter(hash_lock);
3679	hdr = buf->b_hdr;
3680	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3681
3682	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3683	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3684
3685	buf->b_efunc = NULL;
3686	buf->b_private = NULL;
3687
3688	if (hdr->b_datacnt > 1) {
3689		mutex_exit(&buf->b_evict_lock);
3690		arc_buf_destroy(buf, FALSE, TRUE);
3691	} else {
3692		ASSERT(buf == hdr->b_buf);
3693		hdr->b_flags |= ARC_BUF_AVAILABLE;
3694		mutex_exit(&buf->b_evict_lock);
3695	}
3696
3697	mutex_exit(hash_lock);
3698	VERIFY0(efunc(private));
3699	return (B_TRUE);
3700}
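/*
 * Illustrative sketch only: the intended pairing of arc_set_callback() and
 * arc_clear_callback().  The callback and argument names are hypothetical;
 * in this tree the real consumer of the interface is the dbuf layer.
 *
 *	static int
 *	my_evict(void *arg)
 *	{
 *		(drop whatever references 'arg' kept on the buffer)
 *		return (0);
 *	}
 *
 *	arc_set_callback(buf, my_evict, my_arg);
 *	...
 *	if (!arc_clear_callback(buf)) {
 *		(another thread is already clearing/evicting; do not
 *		touch 'buf' again here)
 *	}
 */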
3701
3702/*
3703 * Release this buffer from the cache, making it an anonymous buffer.  This
3704 * must be done after a read and prior to modifying the buffer contents.
3705 * If the buffer has more than one reference, we must make
3706 * a new hdr for the buffer.
3707 */
3708void
3709arc_release(arc_buf_t *buf, void *tag)
3710{
3711	arc_buf_hdr_t *hdr;
3712	kmutex_t *hash_lock = NULL;
3713	l2arc_buf_hdr_t *l2hdr;
3714	uint64_t buf_size;
3715
3716	/*
3717	 * It would be nice to assert that if it's DMU metadata (level >
3718	 * 0 || it's the dnode file), then it must be syncing context.
3719	 * But we don't know that information at this level.
3720	 */
3721
3722	mutex_enter(&buf->b_evict_lock);
3723	hdr = buf->b_hdr;
3724
3725	/* this buffer is not on any list */
3726	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3727
3728	if (hdr->b_state == arc_anon) {
3729		/* this buffer is already released */
3730		ASSERT(buf->b_efunc == NULL);
3731	} else {
3732		hash_lock = HDR_LOCK(hdr);
3733		mutex_enter(hash_lock);
3734		hdr = buf->b_hdr;
3735		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3736	}
3737
3738	l2hdr = hdr->b_l2hdr;
3739	if (l2hdr) {
3740		mutex_enter(&l2arc_buflist_mtx);
3741		arc_buf_l2_cdata_free(hdr);
3742		hdr->b_l2hdr = NULL;
3743		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3744	}
3745	buf_size = hdr->b_size;
3746
3747	/*
3748	 * Do we have more than one buf?
3749	 */
3750	if (hdr->b_datacnt > 1) {
3751		arc_buf_hdr_t *nhdr;
3752		arc_buf_t **bufp;
3753		uint64_t blksz = hdr->b_size;
3754		uint64_t spa = hdr->b_spa;
3755		arc_buf_contents_t type = hdr->b_type;
3756		uint32_t flags = hdr->b_flags;
3757
3758		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3759		/*
3760		 * Pull the data off of this hdr and attach it to
3761		 * a new anonymous hdr.
3762		 */
3763		(void) remove_reference(hdr, hash_lock, tag);
3764		bufp = &hdr->b_buf;
3765		while (*bufp != buf)
3766			bufp = &(*bufp)->b_next;
3767		*bufp = buf->b_next;
3768		buf->b_next = NULL;
3769
3770		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3771		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3772		if (refcount_is_zero(&hdr->b_refcnt)) {
3773			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3774			ASSERT3U(*size, >=, hdr->b_size);
3775			atomic_add_64(size, -hdr->b_size);
3776		}
3777
3778		/*
3779		 * We're releasing a duplicate user data buffer, update
3780		 * our statistics accordingly.
3781		 */
3782		if (hdr->b_type == ARC_BUFC_DATA) {
3783			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3784			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3785			    -hdr->b_size);
3786		}
3787		hdr->b_datacnt -= 1;
3788		arc_cksum_verify(buf);
3789#ifdef illumos
3790		arc_buf_unwatch(buf);
3791#endif /* illumos */
3792
3793		mutex_exit(hash_lock);
3794
3795		nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3796		nhdr->b_size = blksz;
3797		nhdr->b_spa = spa;
3798		nhdr->b_type = type;
3799		nhdr->b_buf = buf;
3800		nhdr->b_state = arc_anon;
3801		nhdr->b_arc_access = 0;
3802		nhdr->b_flags = flags & ARC_L2_WRITING;
3803		nhdr->b_l2hdr = NULL;
3804		nhdr->b_datacnt = 1;
3805		nhdr->b_freeze_cksum = NULL;
3806		(void) refcount_add(&nhdr->b_refcnt, tag);
3807		buf->b_hdr = nhdr;
3808		mutex_exit(&buf->b_evict_lock);
3809		atomic_add_64(&arc_anon->arcs_size, blksz);
3810	} else {
3811		mutex_exit(&buf->b_evict_lock);
3812		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3813		ASSERT(!list_link_active(&hdr->b_arc_node));
3814		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3815		if (hdr->b_state != arc_anon)
3816			arc_change_state(arc_anon, hdr, hash_lock);
3817		hdr->b_arc_access = 0;
3818		if (hash_lock)
3819			mutex_exit(hash_lock);
3820
3821		buf_discard_identity(hdr);
3822		arc_buf_thaw(buf);
3823	}
3824	buf->b_efunc = NULL;
3825	buf->b_private = NULL;
3826
3827	if (l2hdr) {
3828		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3829		vdev_space_update(l2hdr->b_dev->l2ad_vdev,
3830		    -l2hdr->b_asize, 0, 0);
3831		trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
3832		    hdr->b_size, 0);
3833		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3834		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3835		mutex_exit(&l2arc_buflist_mtx);
3836	}
3837}
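/*
 * Illustrative sketch only: the release-before-modify protocol described
 * above, as a caller might follow it.  Everything other than arc_release()
 * and arc_released() is hypothetical.
 *
 *	(buf was filled in by a prior read and is about to be dirtied)
 *	if (!arc_released(buf))
 *		arc_release(buf, my_tag);
 *	(buf is now anonymous: its DVA/birth identity has been discarded and
 *	it is no longer hashed, so its contents may be modified and later
 *	written out under a new block pointer, e.g. via arc_write())
 */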
3838
3839int
3840arc_released(arc_buf_t *buf)
3841{
3842	int released;
3843
3844	mutex_enter(&buf->b_evict_lock);
3845	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3846	mutex_exit(&buf->b_evict_lock);
3847	return (released);
3848}
3849
3850#ifdef ZFS_DEBUG
3851int
3852arc_referenced(arc_buf_t *buf)
3853{
3854	int referenced;
3855
3856	mutex_enter(&buf->b_evict_lock);
3857	referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3858	mutex_exit(&buf->b_evict_lock);
3859	return (referenced);
3860}
3861#endif
3862
3863static void
3864arc_write_ready(zio_t *zio)
3865{
3866	arc_write_callback_t *callback = zio->io_private;
3867	arc_buf_t *buf = callback->awcb_buf;
3868	arc_buf_hdr_t *hdr = buf->b_hdr;
3869
3870	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3871	callback->awcb_ready(zio, buf, callback->awcb_private);
3872
3873	/*
3874	 * If the IO is already in progress, then this is a re-write
3875	 * attempt, so we need to thaw and re-compute the cksum.
3876	 * It is the responsibility of the callback to handle the
3877	 * accounting for any re-write attempt.
3878	 */
3879	if (HDR_IO_IN_PROGRESS(hdr)) {
3880		mutex_enter(&hdr->b_freeze_lock);
3881		if (hdr->b_freeze_cksum != NULL) {
3882			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3883			hdr->b_freeze_cksum = NULL;
3884		}
3885		mutex_exit(&hdr->b_freeze_lock);
3886	}
3887	arc_cksum_compute(buf, B_FALSE);
3888	hdr->b_flags |= ARC_IO_IN_PROGRESS;
3889}
3890
3891/*
3892 * The SPA calls this callback for each physical write that happens on behalf
3893 * of a logical write.  See the comment in dbuf_write_physdone() for details.
3894 */
3895static void
3896arc_write_physdone(zio_t *zio)
3897{
3898	arc_write_callback_t *cb = zio->io_private;
3899	if (cb->awcb_physdone != NULL)
3900		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
3901}
3902
3903static void
3904arc_write_done(zio_t *zio)
3905{
3906	arc_write_callback_t *callback = zio->io_private;
3907	arc_buf_t *buf = callback->awcb_buf;
3908	arc_buf_hdr_t *hdr = buf->b_hdr;
3909
3910	ASSERT(hdr->b_acb == NULL);
3911
3912	if (zio->io_error == 0) {
3913		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
3914			buf_discard_identity(hdr);
3915		} else {
3916			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3917			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3918			hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3919		}
3920	} else {
3921		ASSERT(BUF_EMPTY(hdr));
3922	}
3923
3924	/*
3925	 * If the block to be written was all-zero or compressed enough to be
3926	 * embedded in the BP, no write was performed so there will be no
3927	 * dva/birth/checksum.  The buffer must therefore remain anonymous
3928	 * (and uncached).
3929	 */
3930	if (!BUF_EMPTY(hdr)) {
3931		arc_buf_hdr_t *exists;
3932		kmutex_t *hash_lock;
3933
3934		ASSERT(zio->io_error == 0);
3935
3936		arc_cksum_verify(buf);
3937
3938		exists = buf_hash_insert(hdr, &hash_lock);
3939		if (exists) {
3940			/*
3941			 * This can only happen if we overwrite for
3942			 * sync-to-convergence, because we remove
3943			 * buffers from the hash table when we arc_free().
3944			 */
3945			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3946				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3947					panic("bad overwrite, hdr=%p exists=%p",
3948					    (void *)hdr, (void *)exists);
3949				ASSERT(refcount_is_zero(&exists->b_refcnt));
3950				arc_change_state(arc_anon, exists, hash_lock);
3951				mutex_exit(hash_lock);
3952				arc_hdr_destroy(exists);
3953				exists = buf_hash_insert(hdr, &hash_lock);
3954				ASSERT3P(exists, ==, NULL);
3955			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3956				/* nopwrite */
3957				ASSERT(zio->io_prop.zp_nopwrite);
3958				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3959					panic("bad nopwrite, hdr=%p exists=%p",
3960					    (void *)hdr, (void *)exists);
3961			} else {
3962				/* Dedup */
3963				ASSERT(hdr->b_datacnt == 1);
3964				ASSERT(hdr->b_state == arc_anon);
3965				ASSERT(BP_GET_DEDUP(zio->io_bp));
3966				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3967			}
3968		}
3969		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3970		/* if it's not anon, we are doing a scrub */
3971		if (!exists && hdr->b_state == arc_anon)
3972			arc_access(hdr, hash_lock);
3973		mutex_exit(hash_lock);
3974	} else {
3975		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3976	}
3977
3978	ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3979	callback->awcb_done(zio, buf, callback->awcb_private);
3980
3981	kmem_free(callback, sizeof (arc_write_callback_t));
3982}
3983
3984zio_t *
3985arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3986    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3987    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
3988    arc_done_func_t *done, void *private, zio_priority_t priority,
3989    int zio_flags, const zbookmark_phys_t *zb)
3990{
3991	arc_buf_hdr_t *hdr = buf->b_hdr;
3992	arc_write_callback_t *callback;
3993	zio_t *zio;
3994
3995	ASSERT(ready != NULL);
3996	ASSERT(done != NULL);
3997	ASSERT(!HDR_IO_ERROR(hdr));
3998	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3999	ASSERT(hdr->b_acb == NULL);
4000	if (l2arc)
4001		hdr->b_flags |= ARC_L2CACHE;
4002	if (l2arc_compress)
4003		hdr->b_flags |= ARC_L2COMPRESS;
4004	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
4005	callback->awcb_ready = ready;
4006	callback->awcb_physdone = physdone;
4007	callback->awcb_done = done;
4008	callback->awcb_private = private;
4009	callback->awcb_buf = buf;
4010
4011	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
4012	    arc_write_ready, arc_write_physdone, arc_write_done, callback,
4013	    priority, zio_flags, zb);
4014
4015	return (zio);
4016}
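/*
 * Illustrative sketch only: a typical arc_write() call and the order in
 * which the callbacks above run.  The callback and argument names are
 * hypothetical.
 *
 *	zio = arc_write(pio, spa, txg, bp, buf, B_TRUE, B_FALSE, &zp,
 *	    my_ready, NULL, my_done, my_arg,
 *	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, &zb);
 *	zio_nowait(zio);
 *
 * arc_write_ready() (and thus my_ready) runs once the block pointer has
 * been filled in, arc_write_physdone() runs for each physical write issued
 * on behalf of the logical write, and arc_write_done() (and thus my_done)
 * runs when the logical write has completed (and, for normal blocks, after
 * the header has been re-inserted into the hash table).
 */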
4017
4018static int
4019arc_memory_throttle(uint64_t reserve, uint64_t txg)
4020{
4021#ifdef _KERNEL
4022	uint64_t available_memory = ptob(freemem);
4023	static uint64_t page_load = 0;
4024	static uint64_t last_txg = 0;
4025
4026#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
4027	available_memory =
4028	    MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE)));
4029#endif
4030
4031	if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
4032		return (0);
4033
4034	if (txg > last_txg) {
4035		last_txg = txg;
4036		page_load = 0;
4037	}
4038	/*
4039	 * If we are in pageout, we know that memory is already tight and
4040	 * the ARC is already going to be evicting, so we just want to
4041	 * continue to let page writes occur as quickly as possible.
4042	 */
4043	if (curproc == pageproc) {
4044		if (page_load > MAX(ptob(minfree), available_memory) / 4)
4045			return (SET_ERROR(ERESTART));
4046		/* Note: reserve is inflated, so we deflate */
4047		page_load += reserve / 8;
4048		return (0);
4049	} else if (page_load > 0 && arc_reclaim_needed()) {
4050		/* memory is low, delay before restarting */
4051		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
4052		return (SET_ERROR(EAGAIN));
4053	}
4054	page_load = 0;
4055#endif
4056	return (0);
4057}
4058
4059void
4060arc_tempreserve_clear(uint64_t reserve)
4061{
4062	atomic_add_64(&arc_tempreserve, -reserve);
4063	ASSERT((int64_t)arc_tempreserve >= 0);
4064}
4065
4066int
4067arc_tempreserve_space(uint64_t reserve, uint64_t txg)
4068{
4069	int error;
4070	uint64_t anon_size;
4071
4072	if (reserve > arc_c/4 && !arc_no_grow) {
4073		arc_c = MIN(arc_c_max, reserve * 4);
4074		DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
4075	}
4076	if (reserve > arc_c)
4077		return (SET_ERROR(ENOMEM));
4078
4079	/*
4080	 * Don't count loaned bufs as in flight dirty data to prevent long
4081	 * network delays from blocking transactions that are ready to be
4082	 * assigned to a txg.
4083	 */
4084	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
4085
4086	/*
4087	 * Writes will, almost always, require additional memory allocations
4088	 * in order to compress/encrypt/etc the data.  We therefore need to
4089	 * make sure that there is sufficient available memory for this.
4090	 */
4091	error = arc_memory_throttle(reserve, txg);
4092	if (error != 0)
4093		return (error);
4094
4095	/*
4096	 * Throttle writes when the amount of dirty data in the cache
4097	 * gets too large.  We try to keep the cache less than half full
4098	 * of dirty blocks so that our sync times don't grow too large.
4099	 * Note: if two requests come in concurrently, we might let them
4100	 * both succeed, when one of them should fail.  Not a huge deal.
4101	 */
4102
4103	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
4104	    anon_size > arc_c / 4) {
4105		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
4106		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
4107		    arc_tempreserve>>10,
4108		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
4109		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
4110		    reserve>>10, arc_c>>10);
4111		return (SET_ERROR(ERESTART));
4112	}
4113	atomic_add_64(&arc_tempreserve, reserve);
4114	return (0);
4115}
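/*
 * For example, assuming arc_c == 4GB: the check above refuses a reservation
 * with ERESTART only when the in-flight total (reserve + arc_tempreserve +
 * anon_size) would exceed arc_c / 2 == 2GB *and* the anonymous (dirty) data
 * by itself already exceeds arc_c / 4 == 1GB.  A large reservation arriving
 * while little dirty data is outstanding is still admitted.
 */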
4116
4117static kmutex_t arc_lowmem_lock;
4118#ifdef _KERNEL
4119static eventhandler_tag arc_event_lowmem = NULL;
4120
4121static void
4122arc_lowmem(void *arg __unused, int howto __unused)
4123{
4124
4125	/* Serialize access via arc_lowmem_lock. */
4126	mutex_enter(&arc_lowmem_lock);
4127	mutex_enter(&arc_reclaim_thr_lock);
4128	needfree = 1;
4129	DTRACE_PROBE(arc__needfree);
4130	cv_signal(&arc_reclaim_thr_cv);
4131
4132	/*
4133	 * It is unsafe to block here in arbitrary threads, because we can come
4134	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
4135	 * with the ARC reclaim thread.
4136	 */
4137	if (curproc == pageproc) {
4138		while (needfree)
4139			msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
4140	}
4141	mutex_exit(&arc_reclaim_thr_lock);
4142	mutex_exit(&arc_lowmem_lock);
4143}
4144#endif
4145
4146void
4147arc_init(void)
4148{
4149	int i, prefetch_tunable_set = 0;
4150
4151	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4152	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
4153	mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
4154
4155	/* Convert seconds to clock ticks */
4156	arc_min_prefetch_lifespan = 1 * hz;
4157
4158	/* Start out with 1/8 of all memory */
4159	arc_c = kmem_size() / 8;
4160
4161#ifdef sun
4162#ifdef _KERNEL
4163	/*
4164	 * On architectures where the physical memory can be larger
4165	 * than the addressable space (intel in 32-bit mode), we may
4166	 * need to limit the cache to 1/8 of VM size.
4167	 */
4168	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
4169#endif
4170#endif	/* sun */
4171	/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
4172	arc_c_min = MAX(arc_c / 4, 64<<18);
4173	/* set max to 5/8 of all memory, or all but 1GB, whichever is more */
4174	if (arc_c * 8 >= 1<<30)
4175		arc_c_max = (arc_c * 8) - (1<<30);
4176	else
4177		arc_c_max = arc_c_min;
4178	arc_c_max = MAX(arc_c * 5, arc_c_max);
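	/*
	 * For example, with kmem_size() == 16GB: arc_c starts at 2GB, so
	 * arc_c_min = MAX(512MB, 16MB) = 512MB and
	 * arc_c_max = MAX(arc_c * 5, 16GB - 1GB) = MAX(10GB, 15GB) = 15GB,
	 * before the tunable overrides below are applied.
	 */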
4179
4180#ifdef _KERNEL
4181	/*
4182	 * Allow the tunables to override our calculations if they are
4183	 * reasonable (i.e. over 16MB)
4184	 */
4185	if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size())
4186		arc_c_max = zfs_arc_max;
4187	if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max)
4188		arc_c_min = zfs_arc_min;
4189#endif
4190
4191	arc_c = arc_c_max;
4192	arc_p = (arc_c >> 1);
4193
4194	/* limit meta-data to 1/4 of the arc capacity */
4195	arc_meta_limit = arc_c_max / 4;
4196
4197	/* Allow the tunable to override if it is reasonable */
4198	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4199		arc_meta_limit = zfs_arc_meta_limit;
4200
4201	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
4202		arc_c_min = arc_meta_limit / 2;
4203
4204	if (zfs_arc_grow_retry > 0)
4205		arc_grow_retry = zfs_arc_grow_retry;
4206
4207	if (zfs_arc_shrink_shift > 0)
4208		arc_shrink_shift = zfs_arc_shrink_shift;
4209
4210	if (zfs_arc_p_min_shift > 0)
4211		arc_p_min_shift = zfs_arc_p_min_shift;
4212
4213	/* if kmem_flags are set, let's try to use less memory */
4214	if (kmem_debugging())
4215		arc_c = arc_c / 2;
4216	if (arc_c < arc_c_min)
4217		arc_c = arc_c_min;
4218
4219	zfs_arc_min = arc_c_min;
4220	zfs_arc_max = arc_c_max;
4221
4222	arc_anon = &ARC_anon;
4223	arc_mru = &ARC_mru;
4224	arc_mru_ghost = &ARC_mru_ghost;
4225	arc_mfu = &ARC_mfu;
4226	arc_mfu_ghost = &ARC_mfu_ghost;
4227	arc_l2c_only = &ARC_l2c_only;
4228	arc_size = 0;
4229
4230	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4231		mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
4232		    NULL, MUTEX_DEFAULT, NULL);
4233		mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
4234		    NULL, MUTEX_DEFAULT, NULL);
4235		mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
4236		    NULL, MUTEX_DEFAULT, NULL);
4237		mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
4238		    NULL, MUTEX_DEFAULT, NULL);
4239		mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
4240		    NULL, MUTEX_DEFAULT, NULL);
4241		mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
4242		    NULL, MUTEX_DEFAULT, NULL);
4243
4244		list_create(&arc_mru->arcs_lists[i],
4245		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4246		list_create(&arc_mru_ghost->arcs_lists[i],
4247		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4248		list_create(&arc_mfu->arcs_lists[i],
4249		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4250		list_create(&arc_mfu_ghost->arcs_lists[i],
4251		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4254		list_create(&arc_l2c_only->arcs_lists[i],
4255		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4256	}
4257
4258	buf_init();
4259
4260	arc_thread_exit = 0;
4261	arc_eviction_list = NULL;
4262	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4263	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4264
4265	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4266	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4267
4268	if (arc_ksp != NULL) {
4269		arc_ksp->ks_data = &arc_stats;
4270		kstat_install(arc_ksp);
4271	}
4272
4273	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4274	    TS_RUN, minclsyspri);
4275
4276#ifdef _KERNEL
4277	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
4278	    EVENTHANDLER_PRI_FIRST);
4279#endif
4280
4281	arc_dead = FALSE;
4282	arc_warm = B_FALSE;
4283
4284	/*
4285	 * Calculate maximum amount of dirty data per pool.
4286	 *
4287	 * If it has been set by /etc/system, take that.
4288	 * Otherwise, use a percentage of physical memory defined by
4289	 * zfs_dirty_data_max_percent (default 10%) with a cap at
4290	 * zfs_dirty_data_max_max (default 4GB).
4291	 */
4292	if (zfs_dirty_data_max == 0) {
4293		zfs_dirty_data_max = ptob(physmem) *
4294		    zfs_dirty_data_max_percent / 100;
4295		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
4296		    zfs_dirty_data_max_max);
4297	}
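	/*
	 * For example, with 32GB of physical memory and the default
	 * zfs_dirty_data_max_percent of 10, this computes roughly 3.2GB,
	 * which is below the 4GB zfs_dirty_data_max_max cap and is used
	 * as-is.
	 */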
4298
4299#ifdef _KERNEL
4300	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
4301		prefetch_tunable_set = 1;
4302
4303#ifdef __i386__
4304	if (prefetch_tunable_set == 0) {
4305		printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
4306		    "-- to enable,\n");
4307		printf("            add \"vfs.zfs.prefetch_disable=0\" "
4308		    "to /boot/loader.conf.\n");
4309		zfs_prefetch_disable = 1;
4310	}
4311#else
4312	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
4313	    prefetch_tunable_set == 0) {
4314		printf("ZFS NOTICE: Prefetch is disabled by default if less "
4315		    "than 4GB of RAM is present;\n"
4316		    "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
4317		    "to /boot/loader.conf.\n");
4318		zfs_prefetch_disable = 1;
4319	}
4320#endif
4321	/* Warn about ZFS memory and address space requirements. */
4322	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
4323		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
4324		    "expect unstable behavior.\n");
4325	}
4326	if (kmem_size() < 512 * (1 << 20)) {
4327		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
4328		    "expect unstable behavior.\n");
4329		printf("             Consider tuning vm.kmem_size and "
4330		    "vm.kmem_size_max\n");
4331		printf("             in /boot/loader.conf.\n");
4332	}
4333#endif
4334}
4335
4336void
4337arc_fini(void)
4338{
4339	int i;
4340
4341	mutex_enter(&arc_reclaim_thr_lock);
4342	arc_thread_exit = 1;
4343	cv_signal(&arc_reclaim_thr_cv);
4344	while (arc_thread_exit != 0)
4345		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4346	mutex_exit(&arc_reclaim_thr_lock);
4347
4348	arc_flush(NULL);
4349
4350	arc_dead = TRUE;
4351
4352	if (arc_ksp != NULL) {
4353		kstat_delete(arc_ksp);
4354		arc_ksp = NULL;
4355	}
4356
4357	mutex_destroy(&arc_eviction_mtx);
4358	mutex_destroy(&arc_reclaim_thr_lock);
4359	cv_destroy(&arc_reclaim_thr_cv);
4360
4361	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4362		list_destroy(&arc_mru->arcs_lists[i]);
4363		list_destroy(&arc_mru_ghost->arcs_lists[i]);
4364		list_destroy(&arc_mfu->arcs_lists[i]);
4365		list_destroy(&arc_mfu_ghost->arcs_lists[i]);
4366		list_destroy(&arc_l2c_only->arcs_lists[i]);
4367
4368		mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
4369		mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
4370		mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
4371		mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
4372		mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
4373		mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
4374	}
4375
4376	buf_fini();
4377
4378	ASSERT(arc_loaned_bytes == 0);
4379
4380	mutex_destroy(&arc_lowmem_lock);
4381#ifdef _KERNEL
4382	if (arc_event_lowmem != NULL)
4383		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
4384#endif
4385}
4386
4387/*
4388 * Level 2 ARC
4389 *
4390 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
4391 * It uses dedicated storage devices to hold cached data, which are populated
4392 * using large infrequent writes.  The main role of this cache is to boost
4393 * the performance of random read workloads.  The intended L2ARC devices
4394 * include short-stroked disks, solid state disks, and other media with
4395 * substantially faster read latency than disk.
4396 *
4397 *                 +-----------------------+
4398 *                 |         ARC           |
4399 *                 +-----------------------+
4400 *                    |         ^     ^
4401 *                    |         |     |
4402 *      l2arc_feed_thread()    arc_read()
4403 *                    |         |     |
4404 *                    |  l2arc read   |
4405 *                    V         |     |
4406 *               +---------------+    |
4407 *               |     L2ARC     |    |
4408 *               +---------------+    |
4409 *                   |    ^           |
4410 *          l2arc_write() |           |
4411 *                   |    |           |
4412 *                   V    |           |
4413 *                 +-------+      +-------+
4414 *                 | vdev  |      | vdev  |
4415 *                 | cache |      | cache |
4416 *                 +-------+      +-------+
4417 *                 +=========+     .-----.
4418 *                 :  L2ARC  :    |-_____-|
4419 *                 : devices :    | Disks |
4420 *                 +=========+    `-_____-'
4421 *
4422 * Read requests are satisfied from the following sources, in order:
4423 *
4424 *	1) ARC
4425 *	2) vdev cache of L2ARC devices
4426 *	3) L2ARC devices
4427 *	4) vdev cache of disks
4428 *	5) disks
4429 *
4430 * Some L2ARC device types exhibit extremely slow write performance.
4431 * To accommodate this, there are some significant differences between
4432 * the L2ARC and traditional cache design:
4433 *
4434 * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
4435 * the ARC behave as usual, freeing buffers and placing headers on ghost
4436 * lists.  The ARC does not send buffers to the L2ARC during eviction as
4437 * this would add inflated write latencies for all ARC memory pressure.
4438 *
4439 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4440 * It does this by periodically scanning buffers from the eviction-end of
4441 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4442 * not already there. It scans until a headroom of buffers is satisfied,
4443 * which itself is a buffer for ARC eviction. If a compressible buffer is
4444 * found during scanning and selected for writing to an L2ARC device, we
4445 * temporarily boost scanning headroom during the next scan cycle to make
4446 * sure we adapt to compression effects (which might significantly reduce
4447 * the data volume we write to L2ARC). The thread that does this is
4448 * l2arc_feed_thread(), illustrated below; example sizes are included to
4449 * provide a better sense of ratio than this diagram:
4450 *
4451 *	       head -->                        tail
4452 *	        +---------------------+----------+
4453 *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
4454 *	        +---------------------+----------+   |   o L2ARC eligible
4455 *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
4456 *	        +---------------------+----------+   |
4457 *	             15.9 Gbytes      ^ 32 Mbytes    |
4458 *	                           headroom          |
4459 *	                                      l2arc_feed_thread()
4460 *	                                             |
4461 *	                 l2arc write hand <--[oooo]--'
4462 *	                         |           8 Mbyte
4463 *	                         |          write max
4464 *	                         V
4465 *		  +==============================+
4466 *	L2ARC dev |####|#|###|###|    |####| ... |
4467 *	          +==============================+
4468 *	                     32 Gbytes
4469 *
4470 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4471 * evicted, then the L2ARC has cached a buffer much sooner than it probably
4472 * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
4473 * safe to say that this is an uncommon case, since buffers at the end of
4474 * the ARC lists have moved there due to inactivity.
4475 *
4476 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4477 * then the L2ARC simply misses copying some buffers.  This serves as a
4478 * pressure valve to prevent heavy read workloads from both stalling the ARC
4479 * with waits and clogging the L2ARC with writes.  This also helps prevent
4480 * the potential for the L2ARC to churn if it attempts to cache content too
4481 * quickly, such as during backups of the entire pool.
4482 *
4483 * 5. After system boot and before the ARC has filled main memory, there are
4484 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4485 * lists can remain mostly static.  Instead of searching from the tail of these
4486 * lists as pictured, the l2arc_feed_thread() will search from the list heads
4487 * for eligible buffers, greatly increasing its chance of finding them.
4488 *
4489 * The L2ARC device write speed is also boosted during this time so that
4490 * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
4491 * there are no L2ARC reads, and no fear of degrading read performance
4492 * through increased writes.
4493 *
4494 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4495 * the vdev queue can aggregate them into larger and fewer writes.  Each
4496 * device is written to in a rotor fashion, sweeping writes through
4497 * available space then repeating.
4498 *
4499 * 7. The L2ARC does not store dirty content.  It never needs to flush
4500 * write buffers back to disk based storage.
4501 *
4502 * 8. If an ARC buffer is written (and dirtied) which also exists in the
4503 * L2ARC, the now stale L2ARC buffer is immediately dropped.
4504 *
4505 * The performance of the L2ARC can be tweaked by a number of tunables, which
4506 * may be necessary for different workloads:
4507 *
4508 *	l2arc_write_max		max write bytes per interval
4509 *	l2arc_write_boost	extra write bytes during device warmup
4510 *	l2arc_noprefetch	skip caching prefetched buffers
4511 *	l2arc_headroom		number of max device writes to precache
4512 *	l2arc_headroom_boost	when we find compressed buffers during ARC
4513 *				scanning, we multiply headroom by this
4514 *				percentage factor for the next scan cycle,
4515 *				since more compressed buffers are likely to
4516 *				be present
4517 *	l2arc_feed_secs		seconds between L2ARC writing
4518 *
4519 * Tunables may be removed or added as future performance improvements are
4520 * integrated, and also may become zpool properties.
4521 *
4522 * There are three key functions that control how the L2ARC warms up:
4523 *
4524 *	l2arc_write_eligible()	check if a buffer is eligible to cache
4525 *	l2arc_write_size()	calculate how much to write
4526 *	l2arc_write_interval()	calculate sleep delay between writes
4527 *
4528 * These three functions determine what to write, how much, and how quickly
4529 * to send writes.
4530 */
4531
4532static boolean_t
4533l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4534{
4535	/*
4536	 * A buffer is *not* eligible for the L2ARC if it:
4537	 * 1. belongs to a different spa.
4538	 * 2. is already cached on the L2ARC.
4539	 * 3. has an I/O in progress (it may be an incomplete read).
4540	 * 4. is flagged not eligible (zfs property).
4541	 */
4542	if (ab->b_spa != spa_guid) {
4543		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
4544		return (B_FALSE);
4545	}
4546	if (ab->b_l2hdr != NULL) {
4547		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
4548		return (B_FALSE);
4549	}
4550	if (HDR_IO_IN_PROGRESS(ab)) {
4551		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
4552		return (B_FALSE);
4553	}
4554	if (!HDR_L2CACHE(ab)) {
4555		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
4556		return (B_FALSE);
4557	}
4558
4559	return (B_TRUE);
4560}
4561
4562static uint64_t
4563l2arc_write_size(void)
4564{
4565	uint64_t size;
4566
4567	/*
4568	 * Make sure our globals have meaningful values in case the user
4569	 * altered them.
4570	 */
4571	size = l2arc_write_max;
4572	if (size == 0) {
4573		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4574		    "be greater than zero, resetting it to the default (%d)",
4575		    L2ARC_WRITE_SIZE);
4576		size = l2arc_write_max = L2ARC_WRITE_SIZE;
4577	}
4578
4579	if (arc_warm == B_FALSE)
4580		size += l2arc_write_boost;
4581
4582	return (size);
4583
4584}
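/*
 * For example, assuming the historical 8MB defaults for l2arc_write_max and
 * l2arc_write_boost, this returns 8MB per feed interval once the ARC is
 * warm, and 16MB while arc_warm is still B_FALSE (i.e. before the first
 * ARC eviction).
 */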
4585
4586static clock_t
4587l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4588{
4589	clock_t interval, next, now;
4590
4591	/*
4592	 * If the ARC lists are busy, increase our write rate; if the
4593	 * lists are stale, idle back.  This is achieved by checking
4594	 * how much we previously wrote - if it was more than half of
4595	 * what we wanted, schedule the next write much sooner.
4596	 */
4597	if (l2arc_feed_again && wrote > (wanted / 2))
4598		interval = (hz * l2arc_feed_min_ms) / 1000;
4599	else
4600		interval = hz * l2arc_feed_secs;
4601
4602	now = ddi_get_lbolt();
4603	next = MAX(now, MIN(now + interval, began + interval));
4604
4605	return (next);
4606}
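/*
 * For example, if the previous pass wanted 8MB but wrote only 3MB (less
 * than half), the next wakeup is scheduled a full l2arc_feed_secs ahead;
 * had it written 6MB (more than half), the shorter
 * hz * l2arc_feed_min_ms / 1000 interval would be used instead, assuming
 * l2arc_feed_again is enabled.
 */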
4607
4608static void
4609l2arc_hdr_stat_add(void)
4610{
4611	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4612	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4613}
4614
4615static void
4616l2arc_hdr_stat_remove(void)
4617{
4618	ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4619	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4620}
4621
4622/*
4623 * Cycle through L2ARC devices.  This is how L2ARC load balances.
4624 * If a device is returned, this also returns holding the spa config lock.
4625 */
4626static l2arc_dev_t *
4627l2arc_dev_get_next(void)
4628{
4629	l2arc_dev_t *first, *next = NULL;
4630
4631	/*
4632	 * Lock out the removal of spas (spa_namespace_lock), then removal
4633	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4634	 * both locks will be dropped and a spa config lock held instead.
4635	 */
4636	mutex_enter(&spa_namespace_lock);
4637	mutex_enter(&l2arc_dev_mtx);
4638
4639	/* if there are no vdevs, there is nothing to do */
4640	if (l2arc_ndev == 0)
4641		goto out;
4642
4643	first = NULL;
4644	next = l2arc_dev_last;
4645	do {
4646		/* loop around the list looking for a non-faulted vdev */
4647		if (next == NULL) {
4648			next = list_head(l2arc_dev_list);
4649		} else {
4650			next = list_next(l2arc_dev_list, next);
4651			if (next == NULL)
4652				next = list_head(l2arc_dev_list);
4653		}
4654
4655		/* if we have come back to the start, bail out */
4656		if (first == NULL)
4657			first = next;
4658		else if (next == first)
4659			break;
4660
4661	} while (vdev_is_dead(next->l2ad_vdev));
4662
4663	/* if we were unable to find any usable vdevs, return NULL */
4664	if (vdev_is_dead(next->l2ad_vdev))
4665		next = NULL;
4666
4667	l2arc_dev_last = next;
4668
4669out:
4670	mutex_exit(&l2arc_dev_mtx);
4671
4672	/*
4673	 * Grab the config lock to prevent the 'next' device from being
4674	 * removed while we are writing to it.
4675	 */
4676	if (next != NULL)
4677		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4678	mutex_exit(&spa_namespace_lock);
4679
4680	return (next);
4681}
4682
4683/*
4684 * Free buffers that were tagged for destruction.
4685 */
4686static void
4687l2arc_do_free_on_write()
4688{
4689	list_t *buflist;
4690	l2arc_data_free_t *df, *df_prev;
4691
4692	mutex_enter(&l2arc_free_on_write_mtx);
4693	buflist = l2arc_free_on_write;
4694
4695	for (df = list_tail(buflist); df; df = df_prev) {
4696		df_prev = list_prev(buflist, df);
4697		ASSERT(df->l2df_data != NULL);
4698		ASSERT(df->l2df_func != NULL);
4699		df->l2df_func(df->l2df_data, df->l2df_size);
4700		list_remove(buflist, df);
4701		kmem_free(df, sizeof (l2arc_data_free_t));
4702	}
4703
4704	mutex_exit(&l2arc_free_on_write_mtx);
4705}
4706
4707/*
4708 * A write to a cache device has completed.  Update all headers to allow
4709 * reads from these buffers to begin.
4710 */
4711static void
4712l2arc_write_done(zio_t *zio)
4713{
4714	l2arc_write_callback_t *cb;
4715	l2arc_dev_t *dev;
4716	list_t *buflist;
4717	arc_buf_hdr_t *head, *ab, *ab_prev;
4718	l2arc_buf_hdr_t *abl2;
4719	kmutex_t *hash_lock;
4720	int64_t bytes_dropped = 0;
4721
4722	cb = zio->io_private;
4723	ASSERT(cb != NULL);
4724	dev = cb->l2wcb_dev;
4725	ASSERT(dev != NULL);
4726	head = cb->l2wcb_head;
4727	ASSERT(head != NULL);
4728	buflist = dev->l2ad_buflist;
4729	ASSERT(buflist != NULL);
4730	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4731	    l2arc_write_callback_t *, cb);
4732
4733	if (zio->io_error != 0)
4734		ARCSTAT_BUMP(arcstat_l2_writes_error);
4735
4736	mutex_enter(&l2arc_buflist_mtx);
4737
4738	/*
4739	 * All writes completed, or an error was hit.
4740	 */
4741	for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4742		ab_prev = list_prev(buflist, ab);
4743		abl2 = ab->b_l2hdr;
4744
4745		/*
4746		 * Release the temporary compressed buffer as soon as possible.
4747		 */
4748		if (abl2->b_compress != ZIO_COMPRESS_OFF)
4749			l2arc_release_cdata_buf(ab);
4750
4751		hash_lock = HDR_LOCK(ab);
4752		if (!mutex_tryenter(hash_lock)) {
4753			/*
4754			 * This buffer misses out.  It may be in a stage
4755			 * of eviction.  Its ARC_L2_WRITING flag will be
4756			 * left set, denying reads to this buffer.
4757			 */
4758			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4759			continue;
4760		}
4761
4762		if (zio->io_error != 0) {
4763			/*
4764			 * Error - drop L2ARC entry.
4765			 */
4766			list_remove(buflist, ab);
4767			ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4768			bytes_dropped += abl2->b_asize;
4769			ab->b_l2hdr = NULL;
4770			trim_map_free(abl2->b_dev->l2ad_vdev, abl2->b_daddr,
4771			    ab->b_size, 0);
4772			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4773			ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4774		}
4775
4776		/*
4777		 * Allow ARC to begin reads to this L2ARC entry.
4778		 */
4779		ab->b_flags &= ~ARC_L2_WRITING;
4780
4781		mutex_exit(hash_lock);
4782	}
4783
4784	atomic_inc_64(&l2arc_writes_done);
4785	list_remove(buflist, head);
4786	kmem_cache_free(hdr_cache, head);
4787	mutex_exit(&l2arc_buflist_mtx);
4788
4789	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
4790
4791	l2arc_do_free_on_write();
4792
4793	kmem_free(cb, sizeof (l2arc_write_callback_t));
4794}
4795
4796/*
4797 * A read to a cache device completed.  Validate buffer contents before
4798 * handing over to the regular ARC routines.
4799 */
4800static void
4801l2arc_read_done(zio_t *zio)
4802{
4803	l2arc_read_callback_t *cb;
4804	arc_buf_hdr_t *hdr;
4805	arc_buf_t *buf;
4806	kmutex_t *hash_lock;
4807	int equal;
4808
4809	ASSERT(zio->io_vd != NULL);
4810	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4811
4812	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4813
4814	cb = zio->io_private;
4815	ASSERT(cb != NULL);
4816	buf = cb->l2rcb_buf;
4817	ASSERT(buf != NULL);
4818
4819	hash_lock = HDR_LOCK(buf->b_hdr);
4820	mutex_enter(hash_lock);
4821	hdr = buf->b_hdr;
4822	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4823
4824	/*
4825	 * If the buffer was compressed, decompress it first.
4826	 */
4827	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4828		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4829	ASSERT(zio->io_data != NULL);
4830
4831	/*
4832	 * Check this survived the L2ARC journey.
4833	 */
4834	equal = arc_cksum_equal(buf);
4835	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4836		mutex_exit(hash_lock);
4837		zio->io_private = buf;
4838		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
4839		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
4840		arc_read_done(zio);
4841	} else {
4842		mutex_exit(hash_lock);
4843		/*
4844		 * Buffer didn't survive caching.  Increment stats and
4845		 * reissue to the original storage device.
4846		 */
4847		if (zio->io_error != 0) {
4848			ARCSTAT_BUMP(arcstat_l2_io_error);
4849		} else {
4850			zio->io_error = SET_ERROR(EIO);
4851		}
4852		if (!equal)
4853			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4854
4855		/*
4856		 * If there's no waiter, issue an async i/o to the primary
4857		 * storage now.  If there *is* a waiter, the caller must
4858		 * issue the i/o in a context where it's OK to block.
4859		 */
4860		if (zio->io_waiter == NULL) {
4861			zio_t *pio = zio_unique_parent(zio);
4862
4863			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4864
4865			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4866			    buf->b_data, zio->io_size, arc_read_done, buf,
4867			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4868		}
4869	}
4870
4871	kmem_free(cb, sizeof (l2arc_read_callback_t));
4872}
4873
4874/*
4875 * This is the list priority from which the L2ARC will search for pages to
4876 * cache.  This is used within loops (0..3) to cycle through lists in the
4877 * desired order.  This order can have a significant effect on cache
4878 * performance.
4879 *
4880 * Currently the metadata lists are hit first, MFU then MRU, followed by
4881 * the data lists.  This function returns a locked list, and also returns
4882 * the lock pointer.
4883 */
4884static list_t *
4885l2arc_list_locked(int list_num, kmutex_t **lock)
4886{
4887	list_t *list = NULL;
4888	int idx;
4889
4890	ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
4891
4892	if (list_num < ARC_BUFC_NUMMETADATALISTS) {
4893		idx = list_num;
4894		list = &arc_mfu->arcs_lists[idx];
4895		*lock = ARCS_LOCK(arc_mfu, idx);
4896	} else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
4897		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4898		list = &arc_mru->arcs_lists[idx];
4899		*lock = ARCS_LOCK(arc_mru, idx);
4900	} else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
4901		ARC_BUFC_NUMDATALISTS)) {
4902		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4903		list = &arc_mfu->arcs_lists[idx];
4904		*lock = ARCS_LOCK(arc_mfu, idx);
4905	} else {
4906		idx = list_num - ARC_BUFC_NUMLISTS;
4907		list = &arc_mru->arcs_lists[idx];
4908		*lock = ARCS_LOCK(arc_mru, idx);
4909	}
4910
4911	ASSERT(!(MUTEX_HELD(*lock)));
4912	mutex_enter(*lock);
4913	return (list);
4914}
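/*
 * Assuming the first ARC_BUFC_NUMMETADATALISTS entries of arcs_lists are
 * the metadata sublists (N of them) and the remaining ARC_BUFC_NUMDATALISTS
 * entries (M of them) are the data sublists, the mapping implemented above
 * is:
 *
 *	list_num 0      .. N-1       -> arc_mfu metadata sublists
 *	list_num N      .. 2N-1      -> arc_mru metadata sublists
 *	list_num 2N     .. 2N+M-1    -> arc_mfu data sublists
 *	list_num 2N+M   .. 2N+2M-1   -> arc_mru data sublists
 */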
4915
4916/*
4917 * Evict buffers from the device write hand to the distance specified in
4918 * bytes.  This distance may span populated buffers, or it may span nothing.
4919 * This is clearing a region on the L2ARC device ready for writing.
4920 * If the 'all' boolean is set, every buffer is evicted.
4921 */
4922static void
4923l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4924{
4925	list_t *buflist;
4926	l2arc_buf_hdr_t *abl2;
4927	arc_buf_hdr_t *ab, *ab_prev;
4928	kmutex_t *hash_lock;
4929	uint64_t taddr;
4930	int64_t bytes_evicted = 0;
4931
4932	buflist = dev->l2ad_buflist;
4933
4934	if (buflist == NULL)
4935		return;
4936
4937	if (!all && dev->l2ad_first) {
4938		/*
4939		 * This is the first sweep through the device.  There is
4940		 * nothing to evict.
4941		 */
4942		return;
4943	}
4944
4945	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4946		/*
4947		 * When nearing the end of the device, evict to the end
4948		 * before the device write hand jumps to the start.
4949		 */
4950		taddr = dev->l2ad_end;
4951	} else {
4952		taddr = dev->l2ad_hand + distance;
4953	}
4954	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4955	    uint64_t, taddr, boolean_t, all);
4956
4957top:
4958	mutex_enter(&l2arc_buflist_mtx);
4959	for (ab = list_tail(buflist); ab; ab = ab_prev) {
4960		ab_prev = list_prev(buflist, ab);
4961
4962		hash_lock = HDR_LOCK(ab);
4963		if (!mutex_tryenter(hash_lock)) {
4964			/*
4965			 * Missed the hash lock.  Retry.
4966			 */
4967			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4968			mutex_exit(&l2arc_buflist_mtx);
4969			mutex_enter(hash_lock);
4970			mutex_exit(hash_lock);
4971			goto top;
4972		}
4973
4974		if (HDR_L2_WRITE_HEAD(ab)) {
4975			/*
4976			 * We hit a write head node.  Leave it for
4977			 * l2arc_write_done().
4978			 */
4979			list_remove(buflist, ab);
4980			mutex_exit(hash_lock);
4981			continue;
4982		}
4983
4984		if (!all && ab->b_l2hdr != NULL &&
4985		    (ab->b_l2hdr->b_daddr > taddr ||
4986		    ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4987			/*
4988			 * We've evicted to the target address,
4989			 * or the end of the device.
4990			 */
4991			mutex_exit(hash_lock);
4992			break;
4993		}
4994
4995		if (HDR_FREE_IN_PROGRESS(ab)) {
4996			/*
4997			 * Already on the path to destruction.
4998			 */
4999			mutex_exit(hash_lock);
5000			continue;
5001		}
5002
5003		if (ab->b_state == arc_l2c_only) {
5004			ASSERT(!HDR_L2_READING(ab));
5005			/*
5006			 * This doesn't exist in the ARC.  Destroy.
5007			 * arc_hdr_destroy() will call list_remove()
5008			 * and decrement arcstat_l2_size.
5009			 */
5010			arc_change_state(arc_anon, ab, hash_lock);
5011			arc_hdr_destroy(ab);
5012		} else {
5013			/*
5014			 * Invalidate issued or about to be issued
5015			 * reads, since we may be about to write
5016			 * over this location.
5017			 */
5018			if (HDR_L2_READING(ab)) {
5019				ARCSTAT_BUMP(arcstat_l2_evict_reading);
5020				ab->b_flags |= ARC_L2_EVICTED;
5021			}
5022
5023			/*
5024			 * Tell ARC this no longer exists in L2ARC.
5025			 */
5026			if (ab->b_l2hdr != NULL) {
5027				abl2 = ab->b_l2hdr;
5028				ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
5029				bytes_evicted += abl2->b_asize;
5030				ab->b_l2hdr = NULL;
5031				/*
5032				 * We are destroying l2hdr, so ensure that
5033				 * its compressed buffer, if any, is not leaked.
5034				 */
5035				ASSERT(abl2->b_tmp_cdata == NULL);
5036				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
5037				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
5038			}
5039			list_remove(buflist, ab);
5040
5041			/*
5042			 * This may have been leftover after a
5043			 * failed write.
5044			 */
5045			ab->b_flags &= ~ARC_L2_WRITING;
5046		}
5047		mutex_exit(hash_lock);
5048	}
5049	mutex_exit(&l2arc_buflist_mtx);
5050
5051	vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0);
5052	dev->l2ad_evict = taddr;
5053}
5054
5055/*
5056 * Find and write ARC buffers to the L2ARC device.
5057 *
5058 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
5059 * for reading until they have completed writing.
5060 * The headroom_boost is an in-out parameter used to maintain headroom boost
5061 * state between calls to this function.
5062 *
5063 * Returns the number of bytes actually written (which may be smaller than
5064 * the delta by which the device hand has changed due to alignment).
5065 */
5066static uint64_t
5067l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
5068    boolean_t *headroom_boost)
5069{
5070	arc_buf_hdr_t *ab, *ab_prev, *head;
5071	list_t *list;
5072	uint64_t write_asize, write_psize, write_sz, headroom,
5073	    buf_compress_minsz;
5074	void *buf_data;
5075	kmutex_t *list_lock;
5076	boolean_t full;
5077	l2arc_write_callback_t *cb;
5078	zio_t *pio, *wzio;
5079	uint64_t guid = spa_load_guid(spa);
5080	const boolean_t do_headroom_boost = *headroom_boost;
5081	int try;
5082
5083	ASSERT(dev->l2ad_vdev != NULL);
5084
5085	/* Lower the flag now, we might want to raise it again later. */
5086	*headroom_boost = B_FALSE;
5087
5088	pio = NULL;
5089	write_sz = write_asize = write_psize = 0;
5090	full = B_FALSE;
5091	head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
5092	head->b_flags |= ARC_L2_WRITE_HEAD;
5093
5094	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
5095	/*
5096	 * We will want to try to compress buffers that are at least 2x the
5097	 * device sector size.
5098	 */
5099	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
5100
5101	/*
5102	 * Copy buffers for L2ARC writing.
5103	 */
5104	mutex_enter(&l2arc_buflist_mtx);
5105	for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
5106		uint64_t passed_sz = 0;
5107
5108		list = l2arc_list_locked(try, &list_lock);
5109		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
5110
5111		/*
5112		 * L2ARC fast warmup.
5113		 *
5114		 * Until the ARC is warm and starts to evict, read from the
5115		 * head of the ARC lists rather than the tail.
5116		 */
5117		if (arc_warm == B_FALSE)
5118			ab = list_head(list);
5119		else
5120			ab = list_tail(list);
5121		if (ab == NULL)
5122			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
5123
5124		headroom = target_sz * l2arc_headroom * 2 / ARC_BUFC_NUMLISTS;
5125		if (do_headroom_boost)
5126			headroom = (headroom * l2arc_headroom_boost) / 100;
5127
5128		for (; ab; ab = ab_prev) {
5129			l2arc_buf_hdr_t *l2hdr;
5130			kmutex_t *hash_lock;
5131			uint64_t buf_sz;
5132
5133			if (arc_warm == B_FALSE)
5134				ab_prev = list_next(list, ab);
5135			else
5136				ab_prev = list_prev(list, ab);
5137			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size);
5138
5139			hash_lock = HDR_LOCK(ab);
5140			if (!mutex_tryenter(hash_lock)) {
5141				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
5142				/*
5143				 * Skip this buffer rather than waiting.
5144				 */
5145				continue;
5146			}
5147
5148			passed_sz += ab->b_size;
5149			if (passed_sz > headroom) {
5150				/*
5151				 * Searched too far.
5152				 */
5153				mutex_exit(hash_lock);
5154				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
5155				break;
5156			}
5157
5158			if (!l2arc_write_eligible(guid, ab)) {
5159				mutex_exit(hash_lock);
5160				continue;
5161			}
5162
5163			if ((write_sz + ab->b_size) > target_sz) {
5164				full = B_TRUE;
5165				mutex_exit(hash_lock);
5166				ARCSTAT_BUMP(arcstat_l2_write_full);
5167				break;
5168			}
5169
5170			if (pio == NULL) {
5171				/*
5172				 * Insert a dummy header on the buflist so
5173				 * l2arc_write_done() can find where the
5174				 * write buffers begin without searching.
5175				 */
5176				list_insert_head(dev->l2ad_buflist, head);
5177
5178				cb = kmem_alloc(
5179				    sizeof (l2arc_write_callback_t), KM_SLEEP);
5180				cb->l2wcb_dev = dev;
5181				cb->l2wcb_head = head;
5182				pio = zio_root(spa, l2arc_write_done, cb,
5183				    ZIO_FLAG_CANFAIL);
5184				ARCSTAT_BUMP(arcstat_l2_write_pios);
5185			}
5186
5187			/*
5188			 * Create and add a new L2ARC header.
5189			 */
5190			l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
5191			l2hdr->b_dev = dev;
5192			ab->b_flags |= ARC_L2_WRITING;
5193
5194			/*
5195			 * Temporarily stash the data buffer in b_tmp_cdata.
5196			 * The subsequent write step will pick it up from
5197			 * there. This is because we can't access ab->b_buf
5198			 * without holding the hash_lock, which we in turn
5199			 * can't access without holding the ARC list locks
5200			 * (which we want to avoid during compression/writing).
5201			 */
5202			l2hdr->b_compress = ZIO_COMPRESS_OFF;
5203			l2hdr->b_asize = ab->b_size;
5204			l2hdr->b_tmp_cdata = ab->b_buf->b_data;
5205
5206			buf_sz = ab->b_size;
5207			ab->b_l2hdr = l2hdr;
5208
5209			list_insert_head(dev->l2ad_buflist, ab);
5210
5211			/*
5212			 * Compute and store the buffer cksum before
5213			 * writing.  On debug the cksum is verified first.
5214			 */
5215			arc_cksum_verify(ab->b_buf);
5216			arc_cksum_compute(ab->b_buf, B_TRUE);
5217
5218			mutex_exit(hash_lock);
5219
5220			write_sz += buf_sz;
5221		}
5222
5223		mutex_exit(list_lock);
5224
5225		if (full == B_TRUE)
5226			break;
5227	}
5228
5229	/* No buffers selected for writing? */
5230	if (pio == NULL) {
5231		ASSERT0(write_sz);
5232		mutex_exit(&l2arc_buflist_mtx);
5233		kmem_cache_free(hdr_cache, head);
5234		return (0);
5235	}
5236
5237	/*
5238	 * Now start writing the buffers.  We start at the write head
5239	 * and work backwards, retracing the course of the buffer selector
5240	 * loop above.
5241	 */
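	/*
	 * Each buffer selected above was inserted at the head of
	 * l2ad_buflist, in front of the dummy head marker, so walking
	 * list_prev() from the marker visits the buffers in the same
	 * order in which they were selected.
	 */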
5242	for (ab = list_prev(dev->l2ad_buflist, head); ab;
5243	    ab = list_prev(dev->l2ad_buflist, ab)) {
5244		l2arc_buf_hdr_t *l2hdr;
5245		uint64_t buf_sz;
5246
5247		/*
5248		 * We shouldn't need to lock the buffer here, since we flagged
5249		 * it as ARC_L2_WRITING in the previous step, but we must take
5250		 * care to only access its L2 cache parameters. In particular,
5251		 * ab->b_buf may be invalid by now due to ARC eviction.
5252		 */
5253		l2hdr = ab->b_l2hdr;
5254		l2hdr->b_daddr = dev->l2ad_hand;
5255
5256		if ((ab->b_flags & ARC_L2COMPRESS) &&
5257		    l2hdr->b_asize >= buf_compress_minsz) {
5258			if (l2arc_compress_buf(l2hdr)) {
5259				/*
5260				 * If compression succeeded, enable headroom
5261				 * boost on the next scan cycle.
5262				 */
5263				*headroom_boost = B_TRUE;
5264			}
5265		}
5266
5267		/*
5268		 * Pick up the buffer data we had previously stashed away
5269		 * (and now potentially also compressed).
5270		 */
5271		buf_data = l2hdr->b_tmp_cdata;
5272		buf_sz = l2hdr->b_asize;
5273
5274		/*
5275		 * If the data has not been compressed, then clear b_tmp_cdata
5276		 * to make sure that it points only to a temporary compression
5277		 * buffer.
5278		 */
5279		if (!L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress))
5280			l2hdr->b_tmp_cdata = NULL;
5281
5282		/* Compression may have squashed the buffer to zero length. */
5283		if (buf_sz != 0) {
5284			uint64_t buf_p_sz;
5285
5286			wzio = zio_write_phys(pio, dev->l2ad_vdev,
5287			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5288			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5289			    ZIO_FLAG_CANFAIL, B_FALSE);
5290
5291			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5292			    zio_t *, wzio);
5293			(void) zio_nowait(wzio);
5294
5295			write_asize += buf_sz;
5296			/*
5297			 * Keep the clock hand suitably device-aligned.
5298			 */
5299			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5300			write_psize += buf_p_sz;
5301			dev->l2ad_hand += buf_p_sz;
5302		}
5303	}
5304
5305	mutex_exit(&l2arc_buflist_mtx);
5306
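	/*
	 * Accounting note: write_sz is the uncompressed ARC footprint of
	 * the buffers written, write_asize is the (possibly compressed)
	 * byte count actually issued to the device, and write_psize is
	 * the device-aligned space charged to the vdev below.
	 */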
5307	ASSERT3U(write_asize, <=, target_sz);
5308	ARCSTAT_BUMP(arcstat_l2_writes_sent);
5309	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5310	ARCSTAT_INCR(arcstat_l2_size, write_sz);
5311	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
5312	vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
5313
5314	/*
5315	 * Bump device hand to the device start if it is approaching the end.
5316	 * l2arc_evict() will already have evicted ahead for this case.
5317	 */
5318	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
5319		dev->l2ad_hand = dev->l2ad_start;
5320		dev->l2ad_evict = dev->l2ad_start;
5321		dev->l2ad_first = B_FALSE;
5322	}
5323
5324	dev->l2ad_writing = B_TRUE;
5325	(void) zio_wait(pio);
5326	dev->l2ad_writing = B_FALSE;
5327
5328	return (write_asize);
5329}
5330
5331/*
5332 * Compresses an L2ARC buffer.
5333 * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
5334 * size in l2hdr->b_asize. This routine tries to compress the data and
5335 * depending on the compression result there are three possible outcomes:
5336 * *) The buffer was incompressible. The original l2hdr contents were left
5337 *    untouched and are ready for writing to an L2 device.
5338 * *) The buffer was all-zeros, so there is no need to write it to an L2
5339 *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
5340 *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
5341 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
5342 *    data buffer which holds the compressed data to be written, and b_asize
5343 *    tells us how much data there is. b_compress is set to the appropriate
5344 *    compression algorithm. Once writing is done, invoke
5345 *    l2arc_release_cdata_buf() on this l2hdr to free this temporary buffer.
5346 *
5347 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
5348 * buffer was incompressible).
5349 */
5350static boolean_t
5351l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
5352{
5353	void *cdata;
5354	size_t csize, len, rounded;
5355
5356	ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
5357	ASSERT(l2hdr->b_tmp_cdata != NULL);
5358
5359	len = l2hdr->b_asize;
5360	cdata = zio_data_buf_alloc(len);
5361	csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
5362	    cdata, l2hdr->b_asize);
5363
5364	if (csize == 0) {
5365		/* zero block, indicate that there's nothing to write */
5366		zio_data_buf_free(cdata, len);
5367		l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
5368		l2hdr->b_asize = 0;
5369		l2hdr->b_tmp_cdata = NULL;
5370		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
5371		return (B_TRUE);
5372	}
5373
5374	rounded = P2ROUNDUP(csize,
5375	    (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift);
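	/*
	 * For example, on a 512-byte sector device (vdev_ashift == 9) a
	 * 3000-byte compressed result is rounded up to 3072 bytes; the
	 * 72-byte tail is zero-filled below before being written out.
	 */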
5376	if (rounded < len) {
5377		/*
5378		 * Compression succeeded, we'll keep the cdata around for
5379		 * writing and release it afterwards.
5380		 */
5381		if (rounded > csize) {
5382			bzero((char *)cdata + csize, rounded - csize);
5383			csize = rounded;
5384		}
5385		l2hdr->b_compress = ZIO_COMPRESS_LZ4;
5386		l2hdr->b_asize = csize;
5387		l2hdr->b_tmp_cdata = cdata;
5388		ARCSTAT_BUMP(arcstat_l2_compress_successes);
5389		return (B_TRUE);
5390	} else {
5391		/*
5392		 * Compression was ineffective (no space saved on the device);
5393		 * release the compressed buffer.  l2hdr is left unmodified.
5394		 */
5395		zio_data_buf_free(cdata, len);
5396		ARCSTAT_BUMP(arcstat_l2_compress_failures);
5397		return (B_FALSE);
5398	}
5399}
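
/*
 * Illustrative sketch (not the actual write path above): the three outcomes
 * of l2arc_compress_buf() can be told apart by inspecting b_compress after
 * the call returns, e.g.
 *
 *	if (!l2arc_compress_buf(l2hdr)) {
 *		(incompressible: b_compress is still ZIO_COMPRESS_OFF and
 *		 the original data in b_tmp_cdata is written unchanged)
 *	} else if (l2hdr->b_compress == ZIO_COMPRESS_EMPTY) {
 *		(all zeros: b_asize == 0, so the write is skipped entirely)
 *	} else {
 *		(LZ4: write b_asize bytes of b_tmp_cdata and free it with
 *		 l2arc_release_cdata_buf() once the write has completed)
 *	}
 */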
5400
5401/*
5402 * Decompresses a zio read back from an l2arc device. On success, the
5403 * underlying zio's io_data buffer is overwritten by the uncompressed
5404 * version. On decompression error (corrupt compressed stream), the
5405 * zio->io_error value is set to signal an I/O error.
5406 *
5407 * Please note that the compressed data stream is not checksummed, so
5408 * if the underlying device is experiencing data corruption we may feed
5409 * corrupt data to the decompressor; the decompressor therefore needs to
5410 * be able to handle this situation (LZ4 does).
5411 */
5412static void
5413l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
5414{
5415	ASSERT(L2ARC_IS_VALID_COMPRESS(c));
5416
5417	if (zio->io_error != 0) {
5418		/*
5419		 * An I/O error has occurred; just restore the original I/O
5420		 * size in preparation for a main pool read.
5421		 */
5422		zio->io_orig_size = zio->io_size = hdr->b_size;
5423		return;
5424	}
5425
5426	if (c == ZIO_COMPRESS_EMPTY) {
5427		/*
5428		 * An empty buffer results in a null zio, which means we
5429		 * need to fill its io_data after we're done restoring the
5430		 * buffer's contents.
5431		 */
5432		ASSERT(hdr->b_buf != NULL);
5433		bzero(hdr->b_buf->b_data, hdr->b_size);
5434		zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
5435	} else {
5436		ASSERT(zio->io_data != NULL);
5437		/*
5438		 * We copy the compressed data from the start of the arc buffer
5439		 * (the zio_read will have pulled in only what we need, the
5440		 * rest is garbage which we will overwrite at decompression)
5441		 * and then decompress back to the ARC data buffer. This way we
5442		 * can minimize copying by simply decompressing back over the
5443		 * original compressed data (rather than decompressing to an
5444		 * aux buffer and then copying back the uncompressed buffer,
5445		 * which is likely to be much larger).
5446		 */
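		/*
		 * For example, a 128K ARC buffer stored as 4K of LZ4
		 * data: zio->io_size is 4K on entry, those 4K are copied
		 * aside into cdata, decompressed over io_data back to the
		 * full 128K (hdr->b_size), and io_size is restored below.
		 */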
5447		uint64_t csize;
5448		void *cdata;
5449
5450		csize = zio->io_size;
5451		cdata = zio_data_buf_alloc(csize);
5452		bcopy(zio->io_data, cdata, csize);
5453		if (zio_decompress_data(c, cdata, zio->io_data, csize,
5454		    hdr->b_size) != 0)
5455			zio->io_error = EIO;
5456		zio_data_buf_free(cdata, csize);
5457	}
5458
5459	/* Restore the expected uncompressed IO size. */
5460	zio->io_orig_size = zio->io_size = hdr->b_size;
5461}
5462
5463/*
5464 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5465 * This buffer serves as a temporary holder of compressed data while
5466 * the buffer entry is being written to an l2arc device. Once that is
5467 * done, we can dispose of it.
5468 */
5469static void
5470l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
5471{
5472	l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
5473
5474	ASSERT(L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress));
5475	if (l2hdr->b_compress != ZIO_COMPRESS_EMPTY) {
5476		/*
5477		 * If the data was compressed, then we've allocated a
5478		 * temporary buffer for it, so now we need to release it.
5479		 */
5480		ASSERT(l2hdr->b_tmp_cdata != NULL);
5481		zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
5482		l2hdr->b_tmp_cdata = NULL;
5483	} else {
5484		ASSERT(l2hdr->b_tmp_cdata == NULL);
5485	}
5486}
5487
5488/*
5489 * This thread feeds the L2ARC at regular intervals.  This is the beating
5490 * heart of the L2ARC.
5491 */
5492static void
5493l2arc_feed_thread(void *dummy __unused)
5494{
5495	callb_cpr_t cpr;
5496	l2arc_dev_t *dev;
5497	spa_t *spa;
5498	uint64_t size, wrote;
5499	clock_t begin, next = ddi_get_lbolt();
5500	boolean_t headroom_boost = B_FALSE;
5501
5502	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5503
5504	mutex_enter(&l2arc_feed_thr_lock);
5505
5506	while (l2arc_thread_exit == 0) {
5507		CALLB_CPR_SAFE_BEGIN(&cpr);
5508		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
5509		    next - ddi_get_lbolt());
5510		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
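		/*
		 * Default cadence: wake up again in roughly one second
		 * (hz ticks) unless a later branch overrides it; read-only
		 * pools sleep longer and a successful pass recomputes the
		 * wakeup via l2arc_write_interval().
		 */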
5511		next = ddi_get_lbolt() + hz;
5512
5513		/*
5514		 * Quick check for L2ARC devices.
5515		 */
5516		mutex_enter(&l2arc_dev_mtx);
5517		if (l2arc_ndev == 0) {
5518			mutex_exit(&l2arc_dev_mtx);
5519			continue;
5520		}
5521		mutex_exit(&l2arc_dev_mtx);
5522		begin = ddi_get_lbolt();
5523
5524		/*
5525		 * This selects the next l2arc device to write to, and in
5526		 * doing so the next spa to feed from: dev->l2ad_spa.   This
5527		 * will return NULL if there are now no l2arc devices or if
5528		 * they are all faulted.
5529		 *
5530		 * If a device is returned, its spa's config lock is also
5531		 * held to prevent device removal.  l2arc_dev_get_next()
5532		 * will grab and release l2arc_dev_mtx.
5533		 */
5534		if ((dev = l2arc_dev_get_next()) == NULL)
5535			continue;
5536
5537		spa = dev->l2ad_spa;
5538		ASSERT(spa != NULL);
5539
5540		/*
5541		 * If the pool is read-only then force the feed thread to
5542		 * sleep a little longer.
5543		 */
5544		if (!spa_writeable(spa)) {
5545			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5546			spa_config_exit(spa, SCL_L2ARC, dev);
5547			continue;
5548		}
5549
5550		/*
5551		 * Avoid contributing to memory pressure.
5552		 */
5553		if (arc_reclaim_needed()) {
5554			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5555			spa_config_exit(spa, SCL_L2ARC, dev);
5556			continue;
5557		}
5558
5559		ARCSTAT_BUMP(arcstat_l2_feeds);
5560
5561		size = l2arc_write_size();
5562
5563		/*
5564		 * Evict L2ARC buffers that will be overwritten.
5565		 */
5566		l2arc_evict(dev, size, B_FALSE);
5567
5568		/*
5569		 * Write ARC buffers.
5570		 */
5571		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5572
5573		/*
5574		 * Calculate interval between writes.
5575		 */
5576		next = l2arc_write_interval(begin, size, wrote);
5577		spa_config_exit(spa, SCL_L2ARC, dev);
5578	}
5579
5580	l2arc_thread_exit = 0;
5581	cv_broadcast(&l2arc_feed_thr_cv);
5582	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
5583	thread_exit();
5584}
5585
5586boolean_t
5587l2arc_vdev_present(vdev_t *vd)
5588{
5589	l2arc_dev_t *dev;
5590
5591	mutex_enter(&l2arc_dev_mtx);
5592	for (dev = list_head(l2arc_dev_list); dev != NULL;
5593	    dev = list_next(l2arc_dev_list, dev)) {
5594		if (dev->l2ad_vdev == vd)
5595			break;
5596	}
5597	mutex_exit(&l2arc_dev_mtx);
5598
5599	return (dev != NULL);
5600}
5601
5602/*
5603 * Add a vdev for use by the L2ARC.  By this point the spa has already
5604 * validated the vdev and opened it.
5605 */
5606void
5607l2arc_add_vdev(spa_t *spa, vdev_t *vd)
5608{
5609	l2arc_dev_t *adddev;
5610
5611	ASSERT(!l2arc_vdev_present(vd));
5612
5613	vdev_ashift_optimize(vd);
5614
5615	/*
5616	 * Create a new l2arc device entry.
5617	 */
5618	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5619	adddev->l2ad_spa = spa;
5620	adddev->l2ad_vdev = vd;
5621	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5622	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5623	adddev->l2ad_hand = adddev->l2ad_start;
5624	adddev->l2ad_evict = adddev->l2ad_start;
5625	adddev->l2ad_first = B_TRUE;
5626	adddev->l2ad_writing = B_FALSE;
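	/*
	 * The usable region of the device runs from l2ad_start (just past
	 * the front vdev labels) to l2ad_end.  l2ad_hand is the clock hand
	 * that advances through this region as buffers are written and
	 * wraps back to l2ad_start in l2arc_write_buffers(); l2arc_evict()
	 * clears buffers ahead of the hand, tracking progress in l2ad_evict.
	 */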
5627
5628	/*
5629	 * This is a list of all ARC buffers that are still valid on the
5630	 * device.
5631	 */
5632	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5633	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5634	    offsetof(arc_buf_hdr_t, b_l2node));
5635
5636	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5637
5638	/*
5639	 * Add device to global list
5640	 */
5641	mutex_enter(&l2arc_dev_mtx);
5642	list_insert_head(l2arc_dev_list, adddev);
5643	atomic_inc_64(&l2arc_ndev);
5644	mutex_exit(&l2arc_dev_mtx);
5645}
5646
5647/*
5648 * Remove a vdev from the L2ARC.
5649 */
5650void
5651l2arc_remove_vdev(vdev_t *vd)
5652{
5653	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5654
5655	/*
5656	 * Find the device by vdev
5657	 */
5658	mutex_enter(&l2arc_dev_mtx);
5659	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5660		nextdev = list_next(l2arc_dev_list, dev);
5661		if (vd == dev->l2ad_vdev) {
5662			remdev = dev;
5663			break;
5664		}
5665	}
5666	ASSERT(remdev != NULL);
5667
5668	/*
5669	 * Remove device from global list
5670	 */
5671	list_remove(l2arc_dev_list, remdev);
5672	l2arc_dev_last = NULL;		/* may have been invalidated */
5673	atomic_dec_64(&l2arc_ndev);
5674	mutex_exit(&l2arc_dev_mtx);
5675
5676	/*
5677	 * Clear all buflists and ARC references.  L2ARC device flush.
5678	 */
5679	l2arc_evict(remdev, 0, B_TRUE);
5680	list_destroy(remdev->l2ad_buflist);
5681	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5682	kmem_free(remdev, sizeof (l2arc_dev_t));
5683}
5684
5685void
5686l2arc_init(void)
5687{
5688	l2arc_thread_exit = 0;
5689	l2arc_ndev = 0;
5690	l2arc_writes_sent = 0;
5691	l2arc_writes_done = 0;
5692
5693	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5694	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5695	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5696	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5697	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5698
5699	l2arc_dev_list = &L2ARC_dev_list;
5700	l2arc_free_on_write = &L2ARC_free_on_write;
5701	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5702	    offsetof(l2arc_dev_t, l2ad_node));
5703	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5704	    offsetof(l2arc_data_free_t, l2df_list_node));
5705}
5706
5707void
5708l2arc_fini(void)
5709{
5710	/*
5711	 * This is called from dmu_fini(), which is called from spa_fini();
5712	 * because of this, we can assume that all l2arc devices have
5713	 * already been removed when the pools themselves were removed.
5714	 */
5715
5716	l2arc_do_free_on_write();
5717
5718	mutex_destroy(&l2arc_feed_thr_lock);
5719	cv_destroy(&l2arc_feed_thr_cv);
5720	mutex_destroy(&l2arc_dev_mtx);
5721	mutex_destroy(&l2arc_buflist_mtx);
5722	mutex_destroy(&l2arc_free_on_write_mtx);
5723
5724	list_destroy(l2arc_dev_list);
5725	list_destroy(l2arc_free_on_write);
5726}
5727
5728void
5729l2arc_start(void)
5730{
5731	if (!(spa_mode_global & FWRITE))
5732		return;
5733
5734	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5735	    TS_RUN, minclsyspri);
5736}
5737
5738void
5739l2arc_stop(void)
5740{
5741	if (!(spa_mode_global & FWRITE))
5742		return;
5743
5744	mutex_enter(&l2arc_feed_thr_lock);
5745	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
5746	l2arc_thread_exit = 1;
5747	while (l2arc_thread_exit != 0)
5748		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5749	mutex_exit(&l2arc_feed_thr_lock);
5750}
5751