arc.c revision 258388
1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23228103Smm * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
24249195Smm * Copyright (c) 2013 by Delphix. All rights reserved.
25251478Sdelphij * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26168404Spjd */
27168404Spjd
28168404Spjd/*
29168404Spjd * DVA-based Adjustable Replacement Cache
30168404Spjd *
31168404Spjd * While much of the theory of operation used here is
32168404Spjd * based on the self-tuning, low overhead replacement cache
33168404Spjd * presented by Megiddo and Modha at FAST 2003, there are some
34168404Spjd * significant differences:
35168404Spjd *
36168404Spjd * 1. The Megiddo and Modha model assumes any page is evictable.
37168404Spjd * Pages in its cache cannot be "locked" into memory.  This makes
38168404Spjd * the eviction algorithm simple: evict the last page in the list.
39168404Spjd * This also makes the performance characteristics easy to reason
40168404Spjd * about.  Our cache is not so simple.  At any given moment, some
41168404Spjd * subset of the blocks in the cache are un-evictable because we
42168404Spjd * have handed out a reference to them.  Blocks are only evictable
43168404Spjd * when there are no external references active.  This makes
44168404Spjd * eviction far more problematic:  we choose to evict the evictable
45168404Spjd * blocks that are the "lowest" in the list.
46168404Spjd *
47168404Spjd * There are times when it is not possible to evict the requested
48168404Spjd * space.  In these circumstances we are unable to adjust the cache
49168404Spjd * size.  To prevent the cache from growing unbounded at these times, we
50185029Spjd * implement a "cache throttle" that slows the flow of new data
51185029Spjd * into the cache until we can make space available.
52168404Spjd *
53168404Spjd * 2. The Megiddo and Modha model assumes a fixed cache size.
54168404Spjd * Pages are evicted when the cache is full and there is a cache
55168404Spjd * miss.  Our model has a variable sized cache.  It grows with
56185029Spjd * high use, but also tries to react to memory pressure from the
57168404Spjd * operating system: decreasing its size when system memory is
58168404Spjd * tight.
59168404Spjd *
60168404Spjd * 3. The Megiddo and Modha model assumes a fixed page size. All
61251631Sdelphij * elements of the cache are therefore exactly the same size.  So
62168404Spjd * when adjusting the cache size following a cache miss, it's simply
63168404Spjd * a matter of choosing a single page to evict.  In our model, we
64168404Spjd * have variable sized cache blocks (ranging from 512 bytes to
65251631Sdelphij * 128K bytes).  We therefore choose a set of blocks to evict to make
66168404Spjd * space for a cache miss that approximates as closely as possible
67168404Spjd * the space used by the new block.
68168404Spjd *
69168404Spjd * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
70168404Spjd * by N. Megiddo & D. Modha, FAST 2003
71168404Spjd */
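
/*
 * Illustrative sketch of point 3 above (hypothetical pseudo-code, not part
 * of the implementation): with variable sized blocks, making room for a
 * new block means walking evictable buffers until roughly enough space
 * has been recovered, rather than dropping a single fixed-size page:
 *
 *	uint64_t evicted = 0;
 *	while (evicted < bytes_needed) {
 *		ab = <evictable buffer at the tail of a list>;
 *		if (ab == NULL)
 *			break;
 *		evicted += ab->b_size;
 *		<free ab's data>;
 *	}
 *
 * The real eviction code later in this file must additionally honor
 * references, buffer types (data vs. metadata) and the L2ARC.
 */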
72168404Spjd
73168404Spjd/*
74168404Spjd * The locking model:
75168404Spjd *
76168404Spjd * A new reference to a cache buffer can be obtained in two
77168404Spjd * ways: 1) via a hash table lookup using the DVA as a key,
78185029Spjd * or 2) via one of the ARC lists.  The arc_read() interface
79168404Spjd * uses method 1, while the internal arc algorithms for
80251631Sdelphij * adjusting the cache use method 2.  We therefore provide two
81168404Spjd * types of locks: 1) the hash table lock array, and 2) the
82168404Spjd * arc list locks.
83168404Spjd *
84168404Spjd * Buffers do not have their own mutexes; rather, they rely on the
85168404Spjd * hash table mutexes for the bulk of their protection (i.e. most
86168404Spjd * fields in the arc_buf_hdr_t are protected by these mutexes).
87168404Spjd *
88168404Spjd * buf_hash_find() returns the appropriate mutex (held) when it
89168404Spjd * locates the requested buffer in the hash table.  It returns
90168404Spjd * NULL for the mutex if the buffer was not in the table.
91168404Spjd *
92168404Spjd * buf_hash_remove() expects the appropriate hash mutex to be
93168404Spjd * already held before it is invoked.
94168404Spjd *
95168404Spjd * Each arc state also has a mutex which is used to protect the
96168404Spjd * buffer list associated with the state.  When attempting to
97168404Spjd * obtain a hash table lock while holding an arc list lock, you
98168404Spjd * must use mutex_tryenter() to avoid deadlock.  Also note that
99168404Spjd * the active state mutex must be held before the ghost state mutex.
100168404Spjd *
101168404Spjd * Arc buffers may have an associated eviction callback function.
102168404Spjd * This function will be invoked prior to removing the buffer (e.g.
103168404Spjd * in arc_do_user_evicts()).  Note however that the data associated
104168404Spjd * with the buffer may be evicted prior to the callback.  The callback
105168404Spjd * must be made with *no locks held* (to prevent deadlock).  Additionally,
106168404Spjd * the users of callbacks must ensure that their private data is
107168404Spjd * protected from simultaneous callbacks from arc_buf_evict()
108168404Spjd * and arc_do_user_evicts().
109168404Spjd *
110168404Spjd * Note that the majority of the performance stats are manipulated
111168404Spjd * with atomic operations.
112185029Spjd *
113185029Spjd * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
114185029Spjd *
115185029Spjd *	- L2ARC buflist creation
116185029Spjd *	- L2ARC buflist eviction
117185029Spjd *	- L2ARC write completion, which walks L2ARC buflists
118185029Spjd *	- ARC header destruction, as it removes from L2ARC buflists
119185029Spjd *	- ARC header release, as it removes from L2ARC buflists
120168404Spjd */
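
/*
 * Illustrative sketch of the lock ordering rule above (not part of the
 * implementation).  While an arc list lock is held, a hash lock may only
 * be taken with mutex_tryenter(); a failed attempt is skipped and counted
 * in arcstat_mutex_miss rather than risking a deadlock:
 *
 *	hash_lock = HDR_LOCK(ab);
 *	if (!mutex_tryenter(hash_lock)) {
 *		ARCSTAT_BUMP(arcstat_mutex_miss);
 *		continue;
 *	}
 *	... operate on the buffer ...
 *	mutex_exit(hash_lock);
 */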
121168404Spjd
122168404Spjd#include <sys/spa.h>
123168404Spjd#include <sys/zio.h>
124251478Sdelphij#include <sys/zio_compress.h>
125168404Spjd#include <sys/zfs_context.h>
126168404Spjd#include <sys/arc.h>
127168404Spjd#include <sys/refcount.h>
128185029Spjd#include <sys/vdev.h>
129219089Spjd#include <sys/vdev_impl.h>
130168404Spjd#ifdef _KERNEL
131168404Spjd#include <sys/dnlc.h>
132168404Spjd#endif
133168404Spjd#include <sys/callb.h>
134168404Spjd#include <sys/kstat.h>
135248572Ssmh#include <sys/trim_map.h>
136219089Spjd#include <zfs_fletcher.h>
137168404Spjd#include <sys/sdt.h>
138168404Spjd
139191902Skmacy#include <vm/vm_pageout.h>
140191902Skmacy
141240133Smm#ifdef illumos
142240133Smm#ifndef _KERNEL
143240133Smm/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
144240133Smmboolean_t arc_watch = B_FALSE;
145240133Smmint arc_procfd;
146240133Smm#endif
147240133Smm#endif /* illumos */
148240133Smm
149168404Spjdstatic kmutex_t		arc_reclaim_thr_lock;
150168404Spjdstatic kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
151168404Spjdstatic uint8_t		arc_thread_exit;
152168404Spjd
153185029Spjdextern int zfs_write_limit_shift;
154185029Spjdextern uint64_t zfs_write_limit_max;
155185029Spjdextern kmutex_t zfs_write_limit_lock;
156185029Spjd
157168404Spjd#define	ARC_REDUCE_DNLC_PERCENT	3
158168404Spjduint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
159168404Spjd
160168404Spjdtypedef enum arc_reclaim_strategy {
161168404Spjd	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
162168404Spjd	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
163168404Spjd} arc_reclaim_strategy_t;
164168404Spjd
165168404Spjd/* number of seconds before growing cache again */
166168404Spjdstatic int		arc_grow_retry = 60;
167168404Spjd
168208373Smm/* shift of arc_c for calculating both min and max arc_p */
169208373Smmstatic int		arc_p_min_shift = 4;
170208373Smm
171208373Smm/* log2(fraction of arc to reclaim) */
172208373Smmstatic int		arc_shrink_shift = 5;
173208373Smm
174168404Spjd/*
175168404Spjd * minimum lifespan of a prefetch block in clock ticks
176168404Spjd * (initialized in arc_init())
177168404Spjd */
178168404Spjdstatic int		arc_min_prefetch_lifespan;
179168404Spjd
180208373Smmstatic int arc_dead;
181194043Skmacyextern int zfs_prefetch_disable;
182168404Spjd
183168404Spjd/*
184185029Spjd * The arc has filled available memory and has now warmed up.
185185029Spjd */
186185029Spjdstatic boolean_t arc_warm;
187185029Spjd
188185029Spjd/*
189168404Spjd * These tunables are for performance analysis.
190168404Spjd */
191185029Spjduint64_t zfs_arc_max;
192185029Spjduint64_t zfs_arc_min;
193185029Spjduint64_t zfs_arc_meta_limit = 0;
194208373Smmint zfs_arc_grow_retry = 0;
195208373Smmint zfs_arc_shrink_shift = 0;
196208373Smmint zfs_arc_p_min_shift = 0;
197242845Sdelphijint zfs_disable_dup_eviction = 0;
198185029Spjd
199185029SpjdTUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
200185029SpjdTUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
201185029SpjdTUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
202168473SpjdSYSCTL_DECL(_vfs_zfs);
203217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
204168473Spjd    "Maximum ARC size");
205217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
206168473Spjd    "Minimum ARC size");
207168404Spjd
208168404Spjd/*
209185029Spjd * Note that buffers can be in one of 6 states:
210168404Spjd *	ARC_anon	- anonymous (discussed below)
211168404Spjd *	ARC_mru		- recently used, currently cached
212168404Spjd *	ARC_mru_ghost	- recently used, no longer in cache
213168404Spjd *	ARC_mfu		- frequently used, currently cached
214168404Spjd *	ARC_mfu_ghost	- frequently used, no longer in cache
215185029Spjd *	ARC_l2c_only	- exists in L2ARC but not other states
216185029Spjd * When a buffer has no active references, it is linked onto a list
217185029Spjd * in one of these arc states.  These are
218185029Spjd * the only buffers that can be evicted or deleted.  Within each
219185029Spjd * state there are multiple lists, one for meta-data and one for
220185029Spjd * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
221185029Spjd * etc.) is tracked separately so that it can be managed more
222185029Spjd * explicitly: favored over data, limited explicitly.
223168404Spjd *
224168404Spjd * Anonymous buffers are buffers that are not associated with
225168404Spjd * a DVA.  These are buffers that hold dirty block copies
226168404Spjd * before they are written to stable storage.  By definition,
227168404Spjd * they are "ref'd" and are considered part of arc_mru
228168404Spjd * that cannot be freed.  Generally, they will acquire a DVA
229168404Spjd * as they are written and migrate onto the arc_mru list.
230185029Spjd *
231185029Spjd * The ARC_l2c_only state is for buffers that are in the second
232185029Spjd * level ARC but no longer in any of the ARC_m* lists.  The second
233185029Spjd * level ARC itself may also contain buffers that are in any of
234185029Spjd * the ARC_m* states - meaning that a buffer can exist in two
235185029Spjd * places.  The reason for the ARC_l2c_only state is to keep the
236185029Spjd * buffer header in the hash table, so that reads that hit the
237185029Spjd * second level ARC benefit from these fast lookups.
238168404Spjd */
239168404Spjd
240205264Skmacy#define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
241205231Skmacystruct arcs_lock {
242205231Skmacy	kmutex_t	arcs_lock;
243205231Skmacy#ifdef _KERNEL
244205231Skmacy	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
245205231Skmacy#endif
246205231Skmacy};
247205231Skmacy
248205231Skmacy/*
249205231Skmacy * Must be a power of two for the mask arithmetic in get_buf_info()
250205231Skmacy * to work.
251205231Skmacy */
252205231Skmacy#define ARC_BUFC_NUMDATALISTS		16
253205231Skmacy#define ARC_BUFC_NUMMETADATALISTS	16
254206796Spjd#define ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
255205231Skmacy
256168404Spjdtypedef struct arc_state {
257185029Spjd	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
258185029Spjd	uint64_t arcs_size;	/* total amount of data in this state */
259205231Skmacy	list_t	arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
260205264Skmacy	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
261168404Spjd} arc_state_t;
262168404Spjd
263206796Spjd#define ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))
264205231Skmacy
265185029Spjd/* The 6 states: */
266168404Spjdstatic arc_state_t ARC_anon;
267168404Spjdstatic arc_state_t ARC_mru;
268168404Spjdstatic arc_state_t ARC_mru_ghost;
269168404Spjdstatic arc_state_t ARC_mfu;
270168404Spjdstatic arc_state_t ARC_mfu_ghost;
271185029Spjdstatic arc_state_t ARC_l2c_only;
272168404Spjd
273168404Spjdtypedef struct arc_stats {
274168404Spjd	kstat_named_t arcstat_hits;
275168404Spjd	kstat_named_t arcstat_misses;
276168404Spjd	kstat_named_t arcstat_demand_data_hits;
277168404Spjd	kstat_named_t arcstat_demand_data_misses;
278168404Spjd	kstat_named_t arcstat_demand_metadata_hits;
279168404Spjd	kstat_named_t arcstat_demand_metadata_misses;
280168404Spjd	kstat_named_t arcstat_prefetch_data_hits;
281168404Spjd	kstat_named_t arcstat_prefetch_data_misses;
282168404Spjd	kstat_named_t arcstat_prefetch_metadata_hits;
283168404Spjd	kstat_named_t arcstat_prefetch_metadata_misses;
284168404Spjd	kstat_named_t arcstat_mru_hits;
285168404Spjd	kstat_named_t arcstat_mru_ghost_hits;
286168404Spjd	kstat_named_t arcstat_mfu_hits;
287168404Spjd	kstat_named_t arcstat_mfu_ghost_hits;
288205231Skmacy	kstat_named_t arcstat_allocated;
289168404Spjd	kstat_named_t arcstat_deleted;
290205231Skmacy	kstat_named_t arcstat_stolen;
291168404Spjd	kstat_named_t arcstat_recycle_miss;
292251629Sdelphij	/*
293251629Sdelphij	 * Number of buffers that could not be evicted because the hash lock
294251629Sdelphij	 * was held by another thread.  The lock may not necessarily be held
295251629Sdelphij	 * by something using the same buffer, since hash locks are shared
296251629Sdelphij	 * by multiple buffers.
297251629Sdelphij	 */
298168404Spjd	kstat_named_t arcstat_mutex_miss;
299251629Sdelphij	/*
300251629Sdelphij	 * Number of buffers skipped because they have I/O in progress, are
301251629Sdelphij	 * indirect prefetch buffers that have not lived long enough, or are
302251629Sdelphij	 * not from the spa we're trying to evict from.
303251629Sdelphij	 */
304168404Spjd	kstat_named_t arcstat_evict_skip;
305208373Smm	kstat_named_t arcstat_evict_l2_cached;
306208373Smm	kstat_named_t arcstat_evict_l2_eligible;
307208373Smm	kstat_named_t arcstat_evict_l2_ineligible;
308168404Spjd	kstat_named_t arcstat_hash_elements;
309168404Spjd	kstat_named_t arcstat_hash_elements_max;
310168404Spjd	kstat_named_t arcstat_hash_collisions;
311168404Spjd	kstat_named_t arcstat_hash_chains;
312168404Spjd	kstat_named_t arcstat_hash_chain_max;
313168404Spjd	kstat_named_t arcstat_p;
314168404Spjd	kstat_named_t arcstat_c;
315168404Spjd	kstat_named_t arcstat_c_min;
316168404Spjd	kstat_named_t arcstat_c_max;
317168404Spjd	kstat_named_t arcstat_size;
318185029Spjd	kstat_named_t arcstat_hdr_size;
319208373Smm	kstat_named_t arcstat_data_size;
320208373Smm	kstat_named_t arcstat_other_size;
321185029Spjd	kstat_named_t arcstat_l2_hits;
322185029Spjd	kstat_named_t arcstat_l2_misses;
323185029Spjd	kstat_named_t arcstat_l2_feeds;
324185029Spjd	kstat_named_t arcstat_l2_rw_clash;
325208373Smm	kstat_named_t arcstat_l2_read_bytes;
326208373Smm	kstat_named_t arcstat_l2_write_bytes;
327185029Spjd	kstat_named_t arcstat_l2_writes_sent;
328185029Spjd	kstat_named_t arcstat_l2_writes_done;
329185029Spjd	kstat_named_t arcstat_l2_writes_error;
330185029Spjd	kstat_named_t arcstat_l2_writes_hdr_miss;
331185029Spjd	kstat_named_t arcstat_l2_evict_lock_retry;
332185029Spjd	kstat_named_t arcstat_l2_evict_reading;
333185029Spjd	kstat_named_t arcstat_l2_free_on_write;
334185029Spjd	kstat_named_t arcstat_l2_abort_lowmem;
335185029Spjd	kstat_named_t arcstat_l2_cksum_bad;
336185029Spjd	kstat_named_t arcstat_l2_io_error;
337185029Spjd	kstat_named_t arcstat_l2_size;
338251478Sdelphij	kstat_named_t arcstat_l2_asize;
339185029Spjd	kstat_named_t arcstat_l2_hdr_size;
340251478Sdelphij	kstat_named_t arcstat_l2_compress_successes;
341251478Sdelphij	kstat_named_t arcstat_l2_compress_zeros;
342251478Sdelphij	kstat_named_t arcstat_l2_compress_failures;
343205231Skmacy	kstat_named_t arcstat_l2_write_trylock_fail;
344205231Skmacy	kstat_named_t arcstat_l2_write_passed_headroom;
345205231Skmacy	kstat_named_t arcstat_l2_write_spa_mismatch;
346206796Spjd	kstat_named_t arcstat_l2_write_in_l2;
347205231Skmacy	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
348205231Skmacy	kstat_named_t arcstat_l2_write_not_cacheable;
349205231Skmacy	kstat_named_t arcstat_l2_write_full;
350205231Skmacy	kstat_named_t arcstat_l2_write_buffer_iter;
351205231Skmacy	kstat_named_t arcstat_l2_write_pios;
352205231Skmacy	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
353205231Skmacy	kstat_named_t arcstat_l2_write_buffer_list_iter;
354205231Skmacy	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
355242845Sdelphij	kstat_named_t arcstat_memory_throttle_count;
356242845Sdelphij	kstat_named_t arcstat_duplicate_buffers;
357242845Sdelphij	kstat_named_t arcstat_duplicate_buffers_size;
358242845Sdelphij	kstat_named_t arcstat_duplicate_reads;
359168404Spjd} arc_stats_t;
360168404Spjd
361168404Spjdstatic arc_stats_t arc_stats = {
362168404Spjd	{ "hits",			KSTAT_DATA_UINT64 },
363168404Spjd	{ "misses",			KSTAT_DATA_UINT64 },
364168404Spjd	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
365168404Spjd	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
366168404Spjd	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
367168404Spjd	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
368168404Spjd	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
369168404Spjd	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
370168404Spjd	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
371168404Spjd	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
372168404Spjd	{ "mru_hits",			KSTAT_DATA_UINT64 },
373168404Spjd	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
374168404Spjd	{ "mfu_hits",			KSTAT_DATA_UINT64 },
375168404Spjd	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
376205231Skmacy	{ "allocated",			KSTAT_DATA_UINT64 },
377168404Spjd	{ "deleted",			KSTAT_DATA_UINT64 },
378205231Skmacy	{ "stolen",			KSTAT_DATA_UINT64 },
379168404Spjd	{ "recycle_miss",		KSTAT_DATA_UINT64 },
380168404Spjd	{ "mutex_miss",			KSTAT_DATA_UINT64 },
381168404Spjd	{ "evict_skip",			KSTAT_DATA_UINT64 },
382208373Smm	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
383208373Smm	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
384208373Smm	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
385168404Spjd	{ "hash_elements",		KSTAT_DATA_UINT64 },
386168404Spjd	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
387168404Spjd	{ "hash_collisions",		KSTAT_DATA_UINT64 },
388168404Spjd	{ "hash_chains",		KSTAT_DATA_UINT64 },
389168404Spjd	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
390168404Spjd	{ "p",				KSTAT_DATA_UINT64 },
391168404Spjd	{ "c",				KSTAT_DATA_UINT64 },
392168404Spjd	{ "c_min",			KSTAT_DATA_UINT64 },
393168404Spjd	{ "c_max",			KSTAT_DATA_UINT64 },
394185029Spjd	{ "size",			KSTAT_DATA_UINT64 },
395185029Spjd	{ "hdr_size",			KSTAT_DATA_UINT64 },
396208373Smm	{ "data_size",			KSTAT_DATA_UINT64 },
397208373Smm	{ "other_size",			KSTAT_DATA_UINT64 },
398185029Spjd	{ "l2_hits",			KSTAT_DATA_UINT64 },
399185029Spjd	{ "l2_misses",			KSTAT_DATA_UINT64 },
400185029Spjd	{ "l2_feeds",			KSTAT_DATA_UINT64 },
401185029Spjd	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
402208373Smm	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
403208373Smm	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
404185029Spjd	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
405185029Spjd	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
406185029Spjd	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
407185029Spjd	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
408185029Spjd	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
409185029Spjd	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
410185029Spjd	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
411185029Spjd	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
412185029Spjd	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
413185029Spjd	{ "l2_io_error",		KSTAT_DATA_UINT64 },
414185029Spjd	{ "l2_size",			KSTAT_DATA_UINT64 },
415251478Sdelphij	{ "l2_asize",			KSTAT_DATA_UINT64 },
416185029Spjd	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
417251478Sdelphij	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
418251478Sdelphij	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
419251478Sdelphij	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
420206796Spjd	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
421206796Spjd	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
422206796Spjd	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
423206796Spjd	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
424206796Spjd	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
425206796Spjd	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
426206796Spjd	{ "l2_write_full",		KSTAT_DATA_UINT64 },
427206796Spjd	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
428206796Spjd	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
429206796Spjd	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
430206796Spjd	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
431242845Sdelphij	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
432242845Sdelphij	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
433242845Sdelphij	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
434242845Sdelphij	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
435242845Sdelphij	{ "duplicate_reads",		KSTAT_DATA_UINT64 }
436168404Spjd};
437168404Spjd
438168404Spjd#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
439168404Spjd
440168404Spjd#define	ARCSTAT_INCR(stat, val) \
441251631Sdelphij	atomic_add_64(&arc_stats.stat.value.ui64, (val))
442168404Spjd
443206796Spjd#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
444168404Spjd#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
445168404Spjd
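/*
 * ARCSTAT_MAX() maintains a "maximum value seen" statistic without taking
 * a lock: it re-reads the current maximum and retries the compare-and-swap
 * until either the stored value is already >= val or the swap succeeds.
 */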
446168404Spjd#define	ARCSTAT_MAX(stat, val) {					\
447168404Spjd	uint64_t m;							\
448168404Spjd	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
449168404Spjd	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
450168404Spjd		continue;						\
451168404Spjd}
452168404Spjd
453168404Spjd#define	ARCSTAT_MAXSTAT(stat) \
454168404Spjd	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
455168404Spjd
456168404Spjd/*
457168404Spjd * We define a macro to allow ARC hits/misses to be easily broken down by
458168404Spjd * two separate conditions, giving a total of four different subtypes for
459168404Spjd * each of hits and misses (so eight statistics total).
460168404Spjd */
461168404Spjd#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
462168404Spjd	if (cond1) {							\
463168404Spjd		if (cond2) {						\
464168404Spjd			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
465168404Spjd		} else {						\
466168404Spjd			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
467168404Spjd		}							\
468168404Spjd	} else {							\
469168404Spjd		if (cond2) {						\
470168404Spjd			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
471168404Spjd		} else {						\
472168404Spjd			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
473168404Spjd		}							\
474168404Spjd	}
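
/*
 * For example, a (hypothetical) call of the form
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), demand, prefetch,
 *	    hdr->b_type != ARC_BUFC_METADATA, data, metadata, hits);
 *
 * bumps exactly one of arcstat_demand_data_hits,
 * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
 * arcstat_prefetch_metadata_hits, depending on the two conditions.
 */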
475168404Spjd
476168404Spjdkstat_t			*arc_ksp;
477206796Spjdstatic arc_state_t	*arc_anon;
478168404Spjdstatic arc_state_t	*arc_mru;
479168404Spjdstatic arc_state_t	*arc_mru_ghost;
480168404Spjdstatic arc_state_t	*arc_mfu;
481168404Spjdstatic arc_state_t	*arc_mfu_ghost;
482185029Spjdstatic arc_state_t	*arc_l2c_only;
483168404Spjd
484168404Spjd/*
485168404Spjd * There are several ARC variables that are critical to export as kstats --
486168404Spjd * but we don't want to have to grovel around in the kstat whenever we wish to
487168404Spjd * manipulate them.  For these variables, we therefore define them to be in
488168404Spjd * terms of the statistic variable.  This assures that we are not introducing
489168404Spjd * the possibility of inconsistency by having shadow copies of the variables,
490168404Spjd * while still allowing the code to be readable.
491168404Spjd */
492168404Spjd#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
493168404Spjd#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
494168404Spjd#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
495168404Spjd#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
496168404Spjd#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
497168404Spjd
498251478Sdelphij#define	L2ARC_IS_VALID_COMPRESS(_c_) \
499251478Sdelphij	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
500251478Sdelphij
501168404Spjdstatic int		arc_no_grow;	/* Don't try to grow cache size */
502168404Spjdstatic uint64_t		arc_tempreserve;
503209962Smmstatic uint64_t		arc_loaned_bytes;
504185029Spjdstatic uint64_t		arc_meta_used;
505185029Spjdstatic uint64_t		arc_meta_limit;
506185029Spjdstatic uint64_t		arc_meta_max = 0;
507229663SpjdSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RD, &arc_meta_used, 0,
508229663Spjd    "ARC metadata used");
509229663SpjdSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RW, &arc_meta_limit, 0,
510229663Spjd    "ARC metadata limit");
511168404Spjd
512185029Spjdtypedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
513185029Spjd
514168404Spjdtypedef struct arc_callback arc_callback_t;
515168404Spjd
516168404Spjdstruct arc_callback {
517168404Spjd	void			*acb_private;
518168404Spjd	arc_done_func_t		*acb_done;
519168404Spjd	arc_buf_t		*acb_buf;
520168404Spjd	zio_t			*acb_zio_dummy;
521168404Spjd	arc_callback_t		*acb_next;
522168404Spjd};
523168404Spjd
524168404Spjdtypedef struct arc_write_callback arc_write_callback_t;
525168404Spjd
526168404Spjdstruct arc_write_callback {
527168404Spjd	void		*awcb_private;
528168404Spjd	arc_done_func_t	*awcb_ready;
529168404Spjd	arc_done_func_t	*awcb_done;
530168404Spjd	arc_buf_t	*awcb_buf;
531168404Spjd};
532168404Spjd
533168404Spjdstruct arc_buf_hdr {
534168404Spjd	/* protected by hash lock */
535168404Spjd	dva_t			b_dva;
536168404Spjd	uint64_t		b_birth;
537168404Spjd	uint64_t		b_cksum0;
538168404Spjd
539168404Spjd	kmutex_t		b_freeze_lock;
540168404Spjd	zio_cksum_t		*b_freeze_cksum;
541219089Spjd	void			*b_thawed;
542168404Spjd
543168404Spjd	arc_buf_hdr_t		*b_hash_next;
544168404Spjd	arc_buf_t		*b_buf;
545168404Spjd	uint32_t		b_flags;
546168404Spjd	uint32_t		b_datacnt;
547168404Spjd
548168404Spjd	arc_callback_t		*b_acb;
549168404Spjd	kcondvar_t		b_cv;
550168404Spjd
551168404Spjd	/* immutable */
552168404Spjd	arc_buf_contents_t	b_type;
553168404Spjd	uint64_t		b_size;
554209962Smm	uint64_t		b_spa;
555168404Spjd
556168404Spjd	/* protected by arc state mutex */
557168404Spjd	arc_state_t		*b_state;
558168404Spjd	list_node_t		b_arc_node;
559168404Spjd
560168404Spjd	/* updated atomically */
561168404Spjd	clock_t			b_arc_access;
562168404Spjd
563168404Spjd	/* self protecting */
564168404Spjd	refcount_t		b_refcnt;
565185029Spjd
566185029Spjd	l2arc_buf_hdr_t		*b_l2hdr;
567185029Spjd	list_node_t		b_l2node;
568168404Spjd};
569168404Spjd
570168404Spjdstatic arc_buf_t *arc_eviction_list;
571168404Spjdstatic kmutex_t arc_eviction_mtx;
572168404Spjdstatic arc_buf_hdr_t arc_eviction_hdr;
573168404Spjdstatic void arc_get_data_buf(arc_buf_t *buf);
574168404Spjdstatic void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
575185029Spjdstatic int arc_evict_needed(arc_buf_contents_t type);
576209962Smmstatic void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
577240133Smm#ifdef illumos
578240133Smmstatic void arc_buf_watch(arc_buf_t *buf);
579240133Smm#endif /* illumos */
580168404Spjd
581209962Smmstatic boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
582208373Smm
583168404Spjd#define	GHOST_STATE(state)	\
584185029Spjd	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
585185029Spjd	(state) == arc_l2c_only)
586168404Spjd
587168404Spjd/*
588168404Spjd * Private ARC flags.  These flags are private ARC only flags that will show up
589168404Spjd * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
590168404Spjd * be passed in as arc_flags in things like arc_read.  However, the private flags
591168404Spjd * should never be passed and should only be set by ARC code.  When adding new
592168404Spjd * public flags, make sure not to smash the private ones.
593168404Spjd */
594168404Spjd
595168404Spjd#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
596168404Spjd#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
597168404Spjd#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
598168404Spjd#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
599168404Spjd#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
600168404Spjd#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */
601185029Spjd#define	ARC_FREE_IN_PROGRESS	(1 << 15)	/* hdr about to be freed */
602185029Spjd#define	ARC_L2_WRITING		(1 << 16)	/* L2ARC write in progress */
603185029Spjd#define	ARC_L2_EVICTED		(1 << 17)	/* evicted during I/O */
604185029Spjd#define	ARC_L2_WRITE_HEAD	(1 << 18)	/* head of write list */
605168404Spjd
606168404Spjd#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
607168404Spjd#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
608168404Spjd#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
609208373Smm#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_PREFETCH)
610168404Spjd#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
611168404Spjd#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)
612185029Spjd#define	HDR_FREE_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
613185029Spjd#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_L2CACHE)
614185029Spjd#define	HDR_L2_READING(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS &&	\
615185029Spjd				    (hdr)->b_l2hdr != NULL)
616185029Spjd#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_L2_WRITING)
617185029Spjd#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_L2_EVICTED)
618185029Spjd#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_L2_WRITE_HEAD)
619168404Spjd
620168404Spjd/*
621185029Spjd * Other sizes
622185029Spjd */
623185029Spjd
624185029Spjd#define	HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
625185029Spjd#define	L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
626185029Spjd
627185029Spjd/*
628168404Spjd * Hash table routines
629168404Spjd */
630168404Spjd
631205253Skmacy#define	HT_LOCK_PAD	CACHE_LINE_SIZE
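
/*
 * Each hash lock below is padded out to HT_LOCK_PAD (one cache line) so
 * that heavily contended locks do not false-share a cache line with their
 * neighbors in the ht_locks array.
 */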
632168404Spjd
633168404Spjdstruct ht_lock {
634168404Spjd	kmutex_t	ht_lock;
635168404Spjd#ifdef _KERNEL
636168404Spjd	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
637168404Spjd#endif
638168404Spjd};
639168404Spjd
640168404Spjd#define	BUF_LOCKS 256
641168404Spjdtypedef struct buf_hash_table {
642168404Spjd	uint64_t ht_mask;
643168404Spjd	arc_buf_hdr_t **ht_table;
644205264Skmacy	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
645168404Spjd} buf_hash_table_t;
646168404Spjd
647168404Spjdstatic buf_hash_table_t buf_hash_table;
648168404Spjd
649168404Spjd#define	BUF_HASH_INDEX(spa, dva, birth) \
650168404Spjd	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
651168404Spjd#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
652168404Spjd#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
653219089Spjd#define	HDR_LOCK(hdr) \
654219089Spjd	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
655168404Spjd
656168404Spjduint64_t zfs_crc64_table[256];
657168404Spjd
658185029Spjd/*
659185029Spjd * Level 2 ARC
660185029Spjd */
661185029Spjd
662208373Smm#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
663251478Sdelphij#define	L2ARC_HEADROOM		2			/* num of writes */
664251478Sdelphij/*
665251478Sdelphij * If we discover during ARC scan any buffers to be compressed, we boost
666251478Sdelphij * our headroom for the next scanning cycle by this percentage multiple.
667251478Sdelphij */
668251478Sdelphij#define	L2ARC_HEADROOM_BOOST	200
669208373Smm#define	L2ARC_FEED_SECS		1		/* caching interval secs */
670208373Smm#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
671185029Spjd
672185029Spjd#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
673185029Spjd#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
674185029Spjd
675251631Sdelphij/* L2ARC Performance Tunables */
676185029Spjduint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
677185029Spjduint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
678185029Spjduint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
679251478Sdelphijuint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
680185029Spjduint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
681208373Smmuint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
682219089Spjdboolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
683208373Smmboolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
684208373Smmboolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
685185029Spjd
686217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
687205231Skmacy    &l2arc_write_max, 0, "max write size");
688217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
689205231Skmacy    &l2arc_write_boost, 0, "extra write during warmup");
690217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
691205231Skmacy    &l2arc_headroom, 0, "number of dev writes");
692217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
693205231Skmacy    &l2arc_feed_secs, 0, "interval seconds");
694217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
695208373Smm    &l2arc_feed_min_ms, 0, "min interval milliseconds");
696205231Skmacy
697205231SkmacySYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
698205231Skmacy    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
699208373SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
700208373Smm    &l2arc_feed_again, 0, "turbo warmup");
701208373SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
702208373Smm    &l2arc_norw, 0, "no reads during writes");
703205231Skmacy
704217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
705205231Skmacy    &ARC_anon.arcs_size, 0, "size of anonymous state");
706217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
707205231Skmacy    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in anonymous state");
708217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
709205231Skmacy    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in anonymous state");
710205231Skmacy
711217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
712205231Skmacy    &ARC_mru.arcs_size, 0, "size of mru state");
713217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
714205231Skmacy    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
715217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
716205231Skmacy    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
717205231Skmacy
718217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
719205231Skmacy    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
720217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
721205231Skmacy    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
722205231Skmacy    "size of metadata in mru ghost state");
723217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
724205231Skmacy    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
725205231Skmacy    "size of data in mru ghost state");
726205231Skmacy
727217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
728205231Skmacy    &ARC_mfu.arcs_size, 0, "size of mfu state");
729217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
730205231Skmacy    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
731217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
732205231Skmacy    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
733205231Skmacy
734217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
735205231Skmacy    &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
736217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
737205231Skmacy    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
738205231Skmacy    "size of metadata in mfu ghost state");
739217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
740205231Skmacy    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
741205231Skmacy    "size of data in mfu ghost state");
742205231Skmacy
743217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
744205231Skmacy    &ARC_l2c_only.arcs_size, 0, "size of l2c_only state");
745205231Skmacy
746185029Spjd/*
747185029Spjd * L2ARC Internals
748185029Spjd */
749185029Spjdtypedef struct l2arc_dev {
750185029Spjd	vdev_t			*l2ad_vdev;	/* vdev */
751185029Spjd	spa_t			*l2ad_spa;	/* spa */
752185029Spjd	uint64_t		l2ad_hand;	/* next write location */
753185029Spjd	uint64_t		l2ad_start;	/* first addr on device */
754185029Spjd	uint64_t		l2ad_end;	/* last addr on device */
755185029Spjd	uint64_t		l2ad_evict;	/* last addr eviction reached */
756185029Spjd	boolean_t		l2ad_first;	/* first sweep through */
757208373Smm	boolean_t		l2ad_writing;	/* currently writing */
758185029Spjd	list_t			*l2ad_buflist;	/* buffer list */
759185029Spjd	list_node_t		l2ad_node;	/* device list node */
760185029Spjd} l2arc_dev_t;
761185029Spjd
762185029Spjdstatic list_t L2ARC_dev_list;			/* device list */
763185029Spjdstatic list_t *l2arc_dev_list;			/* device list pointer */
764185029Spjdstatic kmutex_t l2arc_dev_mtx;			/* device list mutex */
765185029Spjdstatic l2arc_dev_t *l2arc_dev_last;		/* last device used */
766185029Spjdstatic kmutex_t l2arc_buflist_mtx;		/* mutex for all buflists */
767185029Spjdstatic list_t L2ARC_free_on_write;		/* free after write buf list */
768185029Spjdstatic list_t *l2arc_free_on_write;		/* free after write list ptr */
769185029Spjdstatic kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
770185029Spjdstatic uint64_t l2arc_ndev;			/* number of devices */
771185029Spjd
772185029Spjdtypedef struct l2arc_read_callback {
773251478Sdelphij	arc_buf_t		*l2rcb_buf;		/* read buffer */
774251478Sdelphij	spa_t			*l2rcb_spa;		/* spa */
775251478Sdelphij	blkptr_t		l2rcb_bp;		/* original blkptr */
776251478Sdelphij	zbookmark_t		l2rcb_zb;		/* original bookmark */
777251478Sdelphij	int			l2rcb_flags;		/* original flags */
778251478Sdelphij	enum zio_compress	l2rcb_compress;		/* applied compress */
779185029Spjd} l2arc_read_callback_t;
780185029Spjd
781185029Spjdtypedef struct l2arc_write_callback {
782185029Spjd	l2arc_dev_t	*l2wcb_dev;		/* device info */
783185029Spjd	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
784185029Spjd} l2arc_write_callback_t;
785185029Spjd
786185029Spjdstruct l2arc_buf_hdr {
787185029Spjd	/* protected by arc_buf_hdr mutex */
788251478Sdelphij	l2arc_dev_t		*b_dev;		/* L2ARC device */
789251478Sdelphij	uint64_t		b_daddr;	/* disk address, offset byte */
790251478Sdelphij	/* compression applied to buffer data */
791251478Sdelphij	enum zio_compress	b_compress;
792251478Sdelphij	/* real alloc'd buffer size depending on b_compress applied */
793251478Sdelphij	int			b_asize;
794251478Sdelphij	/* temporary buffer holder for in-flight compressed data */
795251478Sdelphij	void			*b_tmp_cdata;
796185029Spjd};
797185029Spjd
798185029Spjdtypedef struct l2arc_data_free {
799185029Spjd	/* protected by l2arc_free_on_write_mtx */
800185029Spjd	void		*l2df_data;
801185029Spjd	size_t		l2df_size;
802185029Spjd	void		(*l2df_func)(void *, size_t);
803185029Spjd	list_node_t	l2df_list_node;
804185029Spjd} l2arc_data_free_t;
805185029Spjd
806185029Spjdstatic kmutex_t l2arc_feed_thr_lock;
807185029Spjdstatic kcondvar_t l2arc_feed_thr_cv;
808185029Spjdstatic uint8_t l2arc_thread_exit;
809185029Spjd
810185029Spjdstatic void l2arc_read_done(zio_t *zio);
811185029Spjdstatic void l2arc_hdr_stat_add(void);
812185029Spjdstatic void l2arc_hdr_stat_remove(void);
813185029Spjd
814251478Sdelphijstatic boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
815251478Sdelphijstatic void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
816251478Sdelphij    enum zio_compress c);
817251478Sdelphijstatic void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
818251478Sdelphij
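/*
 * Hash a buffer's identity (spa, DVA, birth txg) by folding the DVA bytes
 * through the ZFS CRC-64 table and mixing in the spa and birth values;
 * BUF_HASH_INDEX() masks the result down to a bucket index.
 */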
819168404Spjdstatic uint64_t
820209962Smmbuf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
821168404Spjd{
822168404Spjd	uint8_t *vdva = (uint8_t *)dva;
823168404Spjd	uint64_t crc = -1ULL;
824168404Spjd	int i;
825168404Spjd
826168404Spjd	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
827168404Spjd
828168404Spjd	for (i = 0; i < sizeof (dva_t); i++)
829168404Spjd		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
830168404Spjd
831209962Smm	crc ^= (spa>>8) ^ birth;
832168404Spjd
833168404Spjd	return (crc);
834168404Spjd}
835168404Spjd
836168404Spjd#define	BUF_EMPTY(buf)						\
837168404Spjd	((buf)->b_dva.dva_word[0] == 0 &&			\
838168404Spjd	(buf)->b_dva.dva_word[1] == 0 &&			\
839168404Spjd	(buf)->b_birth == 0)
840168404Spjd
841168404Spjd#define	BUF_EQUAL(spa, dva, birth, buf)				\
842168404Spjd	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
843168404Spjd	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
844168404Spjd	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
845168404Spjd
846219089Spjdstatic void
847219089Spjdbuf_discard_identity(arc_buf_hdr_t *hdr)
848219089Spjd{
849219089Spjd	hdr->b_dva.dva_word[0] = 0;
850219089Spjd	hdr->b_dva.dva_word[1] = 0;
851219089Spjd	hdr->b_birth = 0;
852219089Spjd	hdr->b_cksum0 = 0;
853219089Spjd}
854219089Spjd
855168404Spjdstatic arc_buf_hdr_t *
856209962Smmbuf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
857168404Spjd{
858168404Spjd	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
859168404Spjd	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
860168404Spjd	arc_buf_hdr_t *buf;
861168404Spjd
862168404Spjd	mutex_enter(hash_lock);
863168404Spjd	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
864168404Spjd	    buf = buf->b_hash_next) {
865168404Spjd		if (BUF_EQUAL(spa, dva, birth, buf)) {
866168404Spjd			*lockp = hash_lock;
867168404Spjd			return (buf);
868168404Spjd		}
869168404Spjd	}
870168404Spjd	mutex_exit(hash_lock);
871168404Spjd	*lockp = NULL;
872168404Spjd	return (NULL);
873168404Spjd}
874168404Spjd
875168404Spjd/*
876168404Spjd * Insert an entry into the hash table.  If there is already an element
877168404Spjd * equal to it in the hash table, then the already existing element
878168404Spjd * will be returned and the new element will not be inserted.
879168404Spjd * Otherwise returns NULL.
880168404Spjd */
881168404Spjdstatic arc_buf_hdr_t *
882168404Spjdbuf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
883168404Spjd{
884168404Spjd	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
885168404Spjd	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
886168404Spjd	arc_buf_hdr_t *fbuf;
887168404Spjd	uint32_t i;
888168404Spjd
889168404Spjd	ASSERT(!HDR_IN_HASH_TABLE(buf));
890168404Spjd	*lockp = hash_lock;
891168404Spjd	mutex_enter(hash_lock);
892168404Spjd	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
893168404Spjd	    fbuf = fbuf->b_hash_next, i++) {
894168404Spjd		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
895168404Spjd			return (fbuf);
896168404Spjd	}
897168404Spjd
898168404Spjd	buf->b_hash_next = buf_hash_table.ht_table[idx];
899168404Spjd	buf_hash_table.ht_table[idx] = buf;
900168404Spjd	buf->b_flags |= ARC_IN_HASH_TABLE;
901168404Spjd
902168404Spjd	/* collect some hash table performance data */
903168404Spjd	if (i > 0) {
904168404Spjd		ARCSTAT_BUMP(arcstat_hash_collisions);
905168404Spjd		if (i == 1)
906168404Spjd			ARCSTAT_BUMP(arcstat_hash_chains);
907168404Spjd
908168404Spjd		ARCSTAT_MAX(arcstat_hash_chain_max, i);
909168404Spjd	}
910168404Spjd
911168404Spjd	ARCSTAT_BUMP(arcstat_hash_elements);
912168404Spjd	ARCSTAT_MAXSTAT(arcstat_hash_elements);
913168404Spjd
914168404Spjd	return (NULL);
915168404Spjd}
916168404Spjd
917168404Spjdstatic void
918168404Spjdbuf_hash_remove(arc_buf_hdr_t *buf)
919168404Spjd{
920168404Spjd	arc_buf_hdr_t *fbuf, **bufp;
921168404Spjd	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
922168404Spjd
923168404Spjd	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
924168404Spjd	ASSERT(HDR_IN_HASH_TABLE(buf));
925168404Spjd
926168404Spjd	bufp = &buf_hash_table.ht_table[idx];
927168404Spjd	while ((fbuf = *bufp) != buf) {
928168404Spjd		ASSERT(fbuf != NULL);
929168404Spjd		bufp = &fbuf->b_hash_next;
930168404Spjd	}
931168404Spjd	*bufp = buf->b_hash_next;
932168404Spjd	buf->b_hash_next = NULL;
933168404Spjd	buf->b_flags &= ~ARC_IN_HASH_TABLE;
934168404Spjd
935168404Spjd	/* collect some hash table performance data */
936168404Spjd	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
937168404Spjd
938168404Spjd	if (buf_hash_table.ht_table[idx] &&
939168404Spjd	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
940168404Spjd		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
941168404Spjd}
942168404Spjd
943168404Spjd/*
944168404Spjd * Global data structures and functions for the buf kmem cache.
945168404Spjd */
946168404Spjdstatic kmem_cache_t *hdr_cache;
947168404Spjdstatic kmem_cache_t *buf_cache;
948168404Spjd
949168404Spjdstatic void
950168404Spjdbuf_fini(void)
951168404Spjd{
952168404Spjd	int i;
953168404Spjd
954168404Spjd	kmem_free(buf_hash_table.ht_table,
955168404Spjd	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
956168404Spjd	for (i = 0; i < BUF_LOCKS; i++)
957168404Spjd		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
958168404Spjd	kmem_cache_destroy(hdr_cache);
959168404Spjd	kmem_cache_destroy(buf_cache);
960168404Spjd}
961168404Spjd
962168404Spjd/*
963168404Spjd * Constructor callback - called when the cache is empty
964168404Spjd * and a new buf is requested.
965168404Spjd */
966168404Spjd/* ARGSUSED */
967168404Spjdstatic int
968168404Spjdhdr_cons(void *vbuf, void *unused, int kmflag)
969168404Spjd{
970168404Spjd	arc_buf_hdr_t *buf = vbuf;
971168404Spjd
972168404Spjd	bzero(buf, sizeof (arc_buf_hdr_t));
973168404Spjd	refcount_create(&buf->b_refcnt);
974168404Spjd	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
975185029Spjd	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
976208373Smm	arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
977185029Spjd
978168404Spjd	return (0);
979168404Spjd}
980168404Spjd
981185029Spjd/* ARGSUSED */
982185029Spjdstatic int
983185029Spjdbuf_cons(void *vbuf, void *unused, int kmflag)
984185029Spjd{
985185029Spjd	arc_buf_t *buf = vbuf;
986185029Spjd
987185029Spjd	bzero(buf, sizeof (arc_buf_t));
988219089Spjd	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
989208373Smm	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
990208373Smm
991185029Spjd	return (0);
992185029Spjd}
993185029Spjd
994168404Spjd/*
995168404Spjd * Destructor callback - called when a cached buf is
996168404Spjd * no longer required.
997168404Spjd */
998168404Spjd/* ARGSUSED */
999168404Spjdstatic void
1000168404Spjdhdr_dest(void *vbuf, void *unused)
1001168404Spjd{
1002168404Spjd	arc_buf_hdr_t *buf = vbuf;
1003168404Spjd
1004219089Spjd	ASSERT(BUF_EMPTY(buf));
1005168404Spjd	refcount_destroy(&buf->b_refcnt);
1006168404Spjd	cv_destroy(&buf->b_cv);
1007185029Spjd	mutex_destroy(&buf->b_freeze_lock);
1008208373Smm	arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
1009168404Spjd}
1010168404Spjd
1011185029Spjd/* ARGSUSED */
1012185029Spjdstatic void
1013185029Spjdbuf_dest(void *vbuf, void *unused)
1014185029Spjd{
1015185029Spjd	arc_buf_t *buf = vbuf;
1016185029Spjd
1017219089Spjd	mutex_destroy(&buf->b_evict_lock);
1018208373Smm	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1019185029Spjd}
1020185029Spjd
1021168404Spjd/*
1022168404Spjd * Reclaim callback -- invoked when memory is low.
1023168404Spjd */
1024168404Spjd/* ARGSUSED */
1025168404Spjdstatic void
1026168404Spjdhdr_recl(void *unused)
1027168404Spjd{
1028168404Spjd	dprintf("hdr_recl called\n");
1029168404Spjd	/*
1030168404Spjd	 * umem calls the reclaim func when we destroy the buf cache,
1031168404Spjd	 * which is after we do arc_fini().
1032168404Spjd	 */
1033168404Spjd	if (!arc_dead)
1034168404Spjd		cv_signal(&arc_reclaim_thr_cv);
1035168404Spjd}
1036168404Spjd
1037168404Spjdstatic void
1038168404Spjdbuf_init(void)
1039168404Spjd{
1040168404Spjd	uint64_t *ct;
1041168404Spjd	uint64_t hsize = 1ULL << 12;
1042168404Spjd	int i, j;
1043168404Spjd
1044168404Spjd	/*
1045168404Spjd	 * The hash table is big enough to fill all of physical memory
1046168404Spjd	 * with an average 64K block size.  The table will take up
1047168404Spjd	 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
1048168404Spjd	 */
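	/*
	 * For example (hypothetical machine): with 8 GB of physical memory
	 * the loop below settles on hsize = 8 GB / 64 KB = 131072 buckets,
	 * i.e. a 1 MB table of 8-byte pointers.
	 */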
1049168696Spjd	while (hsize * 65536 < (uint64_t)physmem * PAGESIZE)
1050168404Spjd		hsize <<= 1;
1051168404Spjdretry:
1052168404Spjd	buf_hash_table.ht_mask = hsize - 1;
1053168404Spjd	buf_hash_table.ht_table =
1054168404Spjd	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1055168404Spjd	if (buf_hash_table.ht_table == NULL) {
1056168404Spjd		ASSERT(hsize > (1ULL << 8));
1057168404Spjd		hsize >>= 1;
1058168404Spjd		goto retry;
1059168404Spjd	}
1060168404Spjd
1061168404Spjd	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
1062168404Spjd	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
1063168404Spjd	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1064185029Spjd	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1065168404Spjd
1066168404Spjd	for (i = 0; i < 256; i++)
1067168404Spjd		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1068168404Spjd			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1069168404Spjd
1070168404Spjd	for (i = 0; i < BUF_LOCKS; i++) {
1071168404Spjd		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1072168404Spjd		    NULL, MUTEX_DEFAULT, NULL);
1073168404Spjd	}
1074168404Spjd}
1075168404Spjd
1076168404Spjd#define	ARC_MINTIME	(hz>>4) /* 62 ms */
1077168404Spjd
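/*
 * When ZFS_DEBUG_MODIFY is set in zfs_flags, a fletcher-2 checksum of each
 * buffer's contents is recorded when the buffer is frozen (see
 * arc_cksum_compute() and arc_buf_freeze()) and verified again later;
 * a mismatch means a supposedly immutable buffer was modified and causes
 * a panic in arc_cksum_verify().
 */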
1078168404Spjdstatic void
1079168404Spjdarc_cksum_verify(arc_buf_t *buf)
1080168404Spjd{
1081168404Spjd	zio_cksum_t zc;
1082168404Spjd
1083168404Spjd	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1084168404Spjd		return;
1085168404Spjd
1086168404Spjd	mutex_enter(&buf->b_hdr->b_freeze_lock);
1087168404Spjd	if (buf->b_hdr->b_freeze_cksum == NULL ||
1088168404Spjd	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
1089168404Spjd		mutex_exit(&buf->b_hdr->b_freeze_lock);
1090168404Spjd		return;
1091168404Spjd	}
1092168404Spjd	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1093168404Spjd	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1094168404Spjd		panic("buffer modified while frozen!");
1095168404Spjd	mutex_exit(&buf->b_hdr->b_freeze_lock);
1096168404Spjd}
1097168404Spjd
1098185029Spjdstatic int
1099185029Spjdarc_cksum_equal(arc_buf_t *buf)
1100185029Spjd{
1101185029Spjd	zio_cksum_t zc;
1102185029Spjd	int equal;
1103185029Spjd
1104185029Spjd	mutex_enter(&buf->b_hdr->b_freeze_lock);
1105185029Spjd	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1106185029Spjd	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1107185029Spjd	mutex_exit(&buf->b_hdr->b_freeze_lock);
1108185029Spjd
1109185029Spjd	return (equal);
1110185029Spjd}
1111185029Spjd
1112168404Spjdstatic void
1113185029Spjdarc_cksum_compute(arc_buf_t *buf, boolean_t force)
1114168404Spjd{
1115185029Spjd	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1116168404Spjd		return;
1117168404Spjd
1118168404Spjd	mutex_enter(&buf->b_hdr->b_freeze_lock);
1119168404Spjd	if (buf->b_hdr->b_freeze_cksum != NULL) {
1120168404Spjd		mutex_exit(&buf->b_hdr->b_freeze_lock);
1121168404Spjd		return;
1122168404Spjd	}
1123168404Spjd	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1124168404Spjd	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1125168404Spjd	    buf->b_hdr->b_freeze_cksum);
1126168404Spjd	mutex_exit(&buf->b_hdr->b_freeze_lock);
1127240133Smm#ifdef illumos
1128240133Smm	arc_buf_watch(buf);
1129240133Smm#endif /* illumos */
1130168404Spjd}
1131168404Spjd
1132240133Smm#ifdef illumos
1133240133Smm#ifndef _KERNEL
1134240133Smmtypedef struct procctl {
1135240133Smm	long cmd;
1136240133Smm	prwatch_t prwatch;
1137240133Smm} procctl_t;
1138240133Smm#endif
1139240133Smm
1140240133Smm/* ARGSUSED */
1141240133Smmstatic void
1142240133Smmarc_buf_unwatch(arc_buf_t *buf)
1143240133Smm{
1144240133Smm#ifndef _KERNEL
1145240133Smm	if (arc_watch) {
1146240133Smm		int result;
1147240133Smm		procctl_t ctl;
1148240133Smm		ctl.cmd = PCWATCH;
1149240133Smm		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1150240133Smm		ctl.prwatch.pr_size = 0;
1151240133Smm		ctl.prwatch.pr_wflags = 0;
1152240133Smm		result = write(arc_procfd, &ctl, sizeof (ctl));
1153240133Smm		ASSERT3U(result, ==, sizeof (ctl));
1154240133Smm	}
1155240133Smm#endif
1156240133Smm}
1157240133Smm
1158240133Smm/* ARGSUSED */
1159240133Smmstatic void
1160240133Smmarc_buf_watch(arc_buf_t *buf)
1161240133Smm{
1162240133Smm#ifndef _KERNEL
1163240133Smm	if (arc_watch) {
1164240133Smm		int result;
1165240133Smm		procctl_t ctl;
1166240133Smm		ctl.cmd = PCWATCH;
1167240133Smm		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1168240133Smm		ctl.prwatch.pr_size = buf->b_hdr->b_size;
1169240133Smm		ctl.prwatch.pr_wflags = WA_WRITE;
1170240133Smm		result = write(arc_procfd, &ctl, sizeof (ctl));
1171240133Smm		ASSERT3U(result, ==, sizeof (ctl));
1172240133Smm	}
1173240133Smm#endif
1174240133Smm}
1175240133Smm#endif /* illumos */
1176240133Smm
1177168404Spjdvoid
1178168404Spjdarc_buf_thaw(arc_buf_t *buf)
1179168404Spjd{
1180185029Spjd	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1181185029Spjd		if (buf->b_hdr->b_state != arc_anon)
1182185029Spjd			panic("modifying non-anon buffer!");
1183185029Spjd		if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
1184185029Spjd			panic("modifying buffer while i/o in progress!");
1185185029Spjd		arc_cksum_verify(buf);
1186185029Spjd	}
1187168404Spjd
1188168404Spjd	mutex_enter(&buf->b_hdr->b_freeze_lock);
1189168404Spjd	if (buf->b_hdr->b_freeze_cksum != NULL) {
1190168404Spjd		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1191168404Spjd		buf->b_hdr->b_freeze_cksum = NULL;
1192168404Spjd	}
1193219089Spjd
1194219089Spjd	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1195219089Spjd		if (buf->b_hdr->b_thawed)
1196219089Spjd			kmem_free(buf->b_hdr->b_thawed, 1);
1197219089Spjd		buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
1198219089Spjd	}
1199219089Spjd
1200168404Spjd	mutex_exit(&buf->b_hdr->b_freeze_lock);
1201240133Smm
1202240133Smm#ifdef illumos
1203240133Smm	arc_buf_unwatch(buf);
1204240133Smm#endif /* illumos */
1205168404Spjd}
1206168404Spjd
1207168404Spjdvoid
1208168404Spjdarc_buf_freeze(arc_buf_t *buf)
1209168404Spjd{
1210219089Spjd	kmutex_t *hash_lock;
1211219089Spjd
1212168404Spjd	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1213168404Spjd		return;
1214168404Spjd
1215219089Spjd	hash_lock = HDR_LOCK(buf->b_hdr);
1216219089Spjd	mutex_enter(hash_lock);
1217219089Spjd
1218168404Spjd	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1219168404Spjd	    buf->b_hdr->b_state == arc_anon);
1220185029Spjd	arc_cksum_compute(buf, B_FALSE);
1221219089Spjd	mutex_exit(hash_lock);
1222240133Smm
1223168404Spjd}
1224168404Spjd
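/*
 * Map a buffer header to the sub-list and lock it belongs to within the
 * given state: the header's identity is hashed, metadata headers land in
 * the first ARC_BUFC_NUMMETADATALISTS lists and data headers in the
 * remaining ARC_BUFC_NUMDATALISTS lists, spreading list lock contention
 * across ARC_BUFC_NUMLISTS locks.
 */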
1225168404Spjdstatic void
1226205231Skmacyget_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock)
1227205231Skmacy{
1228205231Skmacy	uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth);
1229205231Skmacy
1230206796Spjd	if (ab->b_type == ARC_BUFC_METADATA)
1231206796Spjd		buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
1232205231Skmacy	else {
1233206796Spjd		buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
1234205231Skmacy		buf_hashid += ARC_BUFC_NUMMETADATALISTS;
1235205231Skmacy	}
1236205231Skmacy
1237205231Skmacy	*list = &state->arcs_lists[buf_hashid];
1238205231Skmacy	*lock = ARCS_LOCK(state, buf_hashid);
1239205231Skmacy}
1240205231Skmacy
1241205231Skmacy
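/*
 * Add a reference to a hashed header.  The first reference makes the
 * header un-evictable, so pull it off its state's evictable sublist and
 * subtract its bytes from that state's arcs_lsize accounting.  Taking a
 * reference also clears any pending prefetch flag.
 */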
1242205231Skmacystatic void
1243168404Spjdadd_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1244168404Spjd{
1245168404Spjd	ASSERT(MUTEX_HELD(hash_lock));
1246168404Spjd
1247168404Spjd	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
1248168404Spjd	    (ab->b_state != arc_anon)) {
1249206796Spjd		uint64_t delta = ab->b_size * ab->b_datacnt;
1250206796Spjd		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
1251205231Skmacy		list_t *list;
1252205231Skmacy		kmutex_t *lock;
1253168404Spjd
1254205231Skmacy		get_buf_info(ab, ab->b_state, &list, &lock);
1255205231Skmacy		ASSERT(!MUTEX_HELD(lock));
1256205231Skmacy		mutex_enter(lock);
1257168404Spjd		ASSERT(list_link_active(&ab->b_arc_node));
1258185029Spjd		list_remove(list, ab);
1259168404Spjd		if (GHOST_STATE(ab->b_state)) {
1260240415Smm			ASSERT0(ab->b_datacnt);
1261168404Spjd			ASSERT3P(ab->b_buf, ==, NULL);
1262168404Spjd			delta = ab->b_size;
1263168404Spjd		}
1264168404Spjd		ASSERT(delta > 0);
1265185029Spjd		ASSERT3U(*size, >=, delta);
1266185029Spjd		atomic_add_64(size, -delta);
1267206794Spjd		mutex_exit(lock);
1268185029Spjd		/* remove the prefetch flag if we get a reference */
1269168404Spjd		if (ab->b_flags & ARC_PREFETCH)
1270168404Spjd			ab->b_flags &= ~ARC_PREFETCH;
1271168404Spjd	}
1272168404Spjd}
1273168404Spjd
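/*
 * Drop a reference on a hashed header.  When the last reference goes away
 * the header becomes evictable again, so put it back on its state's
 * sublist and add its bytes back into the arcs_lsize accounting.  Returns
 * the remaining reference count.
 */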
1274168404Spjdstatic int
1275168404Spjdremove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
1276168404Spjd{
1277168404Spjd	int cnt;
1278168404Spjd	arc_state_t *state = ab->b_state;
1279168404Spjd
1280168404Spjd	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1281168404Spjd	ASSERT(!GHOST_STATE(state));
1282168404Spjd
1283168404Spjd	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
1284168404Spjd	    (state != arc_anon)) {
1285185029Spjd		uint64_t *size = &state->arcs_lsize[ab->b_type];
1286205231Skmacy		list_t *list;
1287205231Skmacy		kmutex_t *lock;
1288185029Spjd
1289205231Skmacy		get_buf_info(ab, state, &list, &lock);
1290205231Skmacy		ASSERT(!MUTEX_HELD(lock));
1291205231Skmacy		mutex_enter(lock);
1292168404Spjd		ASSERT(!list_link_active(&ab->b_arc_node));
1293205231Skmacy		list_insert_head(list, ab);
1294168404Spjd		ASSERT(ab->b_datacnt > 0);
1295185029Spjd		atomic_add_64(size, ab->b_size * ab->b_datacnt);
1296206794Spjd		mutex_exit(lock);
1297168404Spjd	}
1298168404Spjd	return (cnt);
1299168404Spjd}
1300168404Spjd
1301168404Spjd/*
1302168404Spjd * Move the supplied buffer to the indicated state.  The mutex
1303168404Spjd * for the buffer must be held by the caller.
1304168404Spjd */
1305168404Spjdstatic void
1306168404Spjdarc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
1307168404Spjd{
1308168404Spjd	arc_state_t *old_state = ab->b_state;
1309168404Spjd	int64_t refcnt = refcount_count(&ab->b_refcnt);
1310168404Spjd	uint64_t from_delta, to_delta;
1311205231Skmacy	list_t *list;
1312205231Skmacy	kmutex_t *lock;
1313168404Spjd
1314168404Spjd	ASSERT(MUTEX_HELD(hash_lock));
1315168404Spjd	ASSERT(new_state != old_state);
1316168404Spjd	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
1317168404Spjd	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
1318219089Spjd	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
1319168404Spjd
1320168404Spjd	from_delta = to_delta = ab->b_datacnt * ab->b_size;
1321168404Spjd
1322168404Spjd	/*
1323168404Spjd	 * If this buffer is evictable, transfer it from the
1324168404Spjd	 * old state list to the new state list.
1325168404Spjd	 */
1326168404Spjd	if (refcnt == 0) {
1327168404Spjd		if (old_state != arc_anon) {
1328205231Skmacy			int use_mutex;
1329185029Spjd			uint64_t *size = &old_state->arcs_lsize[ab->b_type];
1330168404Spjd
1331205231Skmacy			get_buf_info(ab, old_state, &list, &lock);
1332205231Skmacy			use_mutex = !MUTEX_HELD(lock);
1333168404Spjd			if (use_mutex)
1334205231Skmacy				mutex_enter(lock);
1335168404Spjd
1336168404Spjd			ASSERT(list_link_active(&ab->b_arc_node));
1337205231Skmacy			list_remove(list, ab);
1338168404Spjd
1339168404Spjd			/*
1340168404Spjd			 * If prefetching out of the ghost cache,
1341219089Spjd			 * we will have a non-zero datacnt.
1342168404Spjd			 */
1343168404Spjd			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
1344168404Spjd				/* ghost elements have a ghost size */
1345168404Spjd				ASSERT(ab->b_buf == NULL);
1346168404Spjd				from_delta = ab->b_size;
1347168404Spjd			}
1348185029Spjd			ASSERT3U(*size, >=, from_delta);
1349185029Spjd			atomic_add_64(size, -from_delta);
1350168404Spjd
1351168404Spjd			if (use_mutex)
1352205231Skmacy				mutex_exit(lock);
1353168404Spjd		}
1354168404Spjd		if (new_state != arc_anon) {
1355206796Spjd			int use_mutex;
1356185029Spjd			uint64_t *size = &new_state->arcs_lsize[ab->b_type];
1357168404Spjd
1358205231Skmacy			get_buf_info(ab, new_state, &list, &lock);
1359205231Skmacy			use_mutex = !MUTEX_HELD(lock);
1360168404Spjd			if (use_mutex)
1361205231Skmacy				mutex_enter(lock);
1362168404Spjd
1363205231Skmacy			list_insert_head(list, ab);
1364168404Spjd
1365168404Spjd			/* ghost elements have a ghost size */
1366168404Spjd			if (GHOST_STATE(new_state)) {
1367168404Spjd				ASSERT(ab->b_datacnt == 0);
1368168404Spjd				ASSERT(ab->b_buf == NULL);
1369168404Spjd				to_delta = ab->b_size;
1370168404Spjd			}
1371185029Spjd			atomic_add_64(size, to_delta);
1372168404Spjd
1373168404Spjd			if (use_mutex)
1374205231Skmacy				mutex_exit(lock);
1375168404Spjd		}
1376168404Spjd	}
1377168404Spjd
1378168404Spjd	ASSERT(!BUF_EMPTY(ab));
1379219089Spjd	if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
1380168404Spjd		buf_hash_remove(ab);
1381168404Spjd
1382168404Spjd	/* adjust state sizes */
1383168404Spjd	if (to_delta)
1384168404Spjd		atomic_add_64(&new_state->arcs_size, to_delta);
1385168404Spjd	if (from_delta) {
1386168404Spjd		ASSERT3U(old_state->arcs_size, >=, from_delta);
1387168404Spjd		atomic_add_64(&old_state->arcs_size, -from_delta);
1388168404Spjd	}
1389168404Spjd	ab->b_state = new_state;
1390185029Spjd
1391185029Spjd	/* adjust l2arc hdr stats */
1392185029Spjd	if (new_state == arc_l2c_only)
1393185029Spjd		l2arc_hdr_stat_add();
1394185029Spjd	else if (old_state == arc_l2c_only)
1395185029Spjd		l2arc_hdr_stat_remove();
1396168404Spjd}
1397168404Spjd
1398185029Spjdvoid
1399208373Smmarc_space_consume(uint64_t space, arc_space_type_t type)
1400185029Spjd{
1401208373Smm	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1402208373Smm
1403208373Smm	switch (type) {
1404208373Smm	case ARC_SPACE_DATA:
1405208373Smm		ARCSTAT_INCR(arcstat_data_size, space);
1406208373Smm		break;
1407208373Smm	case ARC_SPACE_OTHER:
1408208373Smm		ARCSTAT_INCR(arcstat_other_size, space);
1409208373Smm		break;
1410208373Smm	case ARC_SPACE_HDRS:
1411208373Smm		ARCSTAT_INCR(arcstat_hdr_size, space);
1412208373Smm		break;
1413208373Smm	case ARC_SPACE_L2HDRS:
1414208373Smm		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1415208373Smm		break;
1416208373Smm	}
1417208373Smm
1418185029Spjd	atomic_add_64(&arc_meta_used, space);
1419185029Spjd	atomic_add_64(&arc_size, space);
1420185029Spjd}
1421185029Spjd
1422185029Spjdvoid
1423208373Smmarc_space_return(uint64_t space, arc_space_type_t type)
1424185029Spjd{
1425208373Smm	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1426208373Smm
1427208373Smm	switch (type) {
1428208373Smm	case ARC_SPACE_DATA:
1429208373Smm		ARCSTAT_INCR(arcstat_data_size, -space);
1430208373Smm		break;
1431208373Smm	case ARC_SPACE_OTHER:
1432208373Smm		ARCSTAT_INCR(arcstat_other_size, -space);
1433208373Smm		break;
1434208373Smm	case ARC_SPACE_HDRS:
1435208373Smm		ARCSTAT_INCR(arcstat_hdr_size, -space);
1436208373Smm		break;
1437208373Smm	case ARC_SPACE_L2HDRS:
1438208373Smm		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1439208373Smm		break;
1440208373Smm	}
1441208373Smm
1442185029Spjd	ASSERT(arc_meta_used >= space);
1443185029Spjd	if (arc_meta_max < arc_meta_used)
1444185029Spjd		arc_meta_max = arc_meta_used;
1445185029Spjd	atomic_add_64(&arc_meta_used, -space);
1446185029Spjd	ASSERT(arc_size >= space);
1447185029Spjd	atomic_add_64(&arc_size, -space);
1448185029Spjd}
1449185029Spjd
1450185029Spjdvoid *
1451185029Spjdarc_data_buf_alloc(uint64_t size)
1452185029Spjd{
1453185029Spjd	if (arc_evict_needed(ARC_BUFC_DATA))
1454185029Spjd		cv_signal(&arc_reclaim_thr_cv);
1455185029Spjd	atomic_add_64(&arc_size, size);
1456185029Spjd	return (zio_data_buf_alloc(size));
1457185029Spjd}
1458185029Spjd
1459185029Spjdvoid
1460185029Spjdarc_data_buf_free(void *buf, uint64_t size)
1461185029Spjd{
1462185029Spjd	zio_data_buf_free(buf, size);
1463185029Spjd	ASSERT(arc_size >= size);
1464185029Spjd	atomic_add_64(&arc_size, -size);
1465185029Spjd}
1466185029Spjd
1467168404Spjdarc_buf_t *
1468168404Spjdarc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
1469168404Spjd{
1470168404Spjd	arc_buf_hdr_t *hdr;
1471168404Spjd	arc_buf_t *buf;
1472168404Spjd
1473168404Spjd	ASSERT3U(size, >, 0);
1474185029Spjd	hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
1475168404Spjd	ASSERT(BUF_EMPTY(hdr));
1476168404Spjd	hdr->b_size = size;
1477168404Spjd	hdr->b_type = type;
1478228103Smm	hdr->b_spa = spa_load_guid(spa);
1479168404Spjd	hdr->b_state = arc_anon;
1480168404Spjd	hdr->b_arc_access = 0;
1481185029Spjd	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1482168404Spjd	buf->b_hdr = hdr;
1483168404Spjd	buf->b_data = NULL;
1484168404Spjd	buf->b_efunc = NULL;
1485168404Spjd	buf->b_private = NULL;
1486168404Spjd	buf->b_next = NULL;
1487168404Spjd	hdr->b_buf = buf;
1488168404Spjd	arc_get_data_buf(buf);
1489168404Spjd	hdr->b_datacnt = 1;
1490168404Spjd	hdr->b_flags = 0;
1491168404Spjd	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1492168404Spjd	(void) refcount_add(&hdr->b_refcnt, tag);
1493168404Spjd
1494168404Spjd	return (buf);
1495168404Spjd}
1496168404Spjd
1497209962Smmstatic char *arc_onloan_tag = "onloan";
1498209962Smm
1499209962Smm/*
1500209962Smm * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1501209962Smm * flight data by arc_tempreserve_space() until they are "returned". Loaned
1502209962Smm * buffers must be returned to the arc before they can be used by the DMU or
1503209962Smm * freed.
1504209962Smm */
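/*
 * A rough sketch of the intended life cycle, based on the comment above
 * ("db_tag" is a hypothetical tag used only for illustration):
 *
 *	buf = arc_loan_buf(spa, size);		held by arc_onloan_tag
 *	... fill buf->b_data ...
 *	arc_return_buf(buf, db_tag);		ownership handed back
 *
 * arc_loan_inuse_buf() goes the other direction, detaching an in-use
 * buffer from its tag and placing it back on loan.
 */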
1505209962Smmarc_buf_t *
1506209962Smmarc_loan_buf(spa_t *spa, int size)
1507209962Smm{
1508209962Smm	arc_buf_t *buf;
1509209962Smm
1510209962Smm	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1511209962Smm
1512209962Smm	atomic_add_64(&arc_loaned_bytes, size);
1513209962Smm	return (buf);
1514209962Smm}
1515209962Smm
1516209962Smm/*
1517209962Smm * Return a loaned arc buffer to the arc.
1518209962Smm */
1519209962Smmvoid
1520209962Smmarc_return_buf(arc_buf_t *buf, void *tag)
1521209962Smm{
1522209962Smm	arc_buf_hdr_t *hdr = buf->b_hdr;
1523209962Smm
1524209962Smm	ASSERT(buf->b_data != NULL);
1525219089Spjd	(void) refcount_add(&hdr->b_refcnt, tag);
1526219089Spjd	(void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
1527209962Smm
1528209962Smm	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1529209962Smm}
1530209962Smm
1531219089Spjd/* Detach an arc_buf from a dbuf (tag) */
1532219089Spjdvoid
1533219089Spjdarc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1534219089Spjd{
1535219089Spjd	arc_buf_hdr_t *hdr;
1536219089Spjd
1537219089Spjd	ASSERT(buf->b_data != NULL);
1538219089Spjd	hdr = buf->b_hdr;
1539219089Spjd	(void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
1540219089Spjd	(void) refcount_remove(&hdr->b_refcnt, tag);
1541219089Spjd	buf->b_efunc = NULL;
1542219089Spjd	buf->b_private = NULL;
1543219089Spjd
1544219089Spjd	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1545219089Spjd}
1546219089Spjd
1547168404Spjdstatic arc_buf_t *
1548168404Spjdarc_buf_clone(arc_buf_t *from)
1549168404Spjd{
1550168404Spjd	arc_buf_t *buf;
1551168404Spjd	arc_buf_hdr_t *hdr = from->b_hdr;
1552168404Spjd	uint64_t size = hdr->b_size;
1553168404Spjd
1554219089Spjd	ASSERT(hdr->b_state != arc_anon);
1555219089Spjd
1556185029Spjd	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1557168404Spjd	buf->b_hdr = hdr;
1558168404Spjd	buf->b_data = NULL;
1559168404Spjd	buf->b_efunc = NULL;
1560168404Spjd	buf->b_private = NULL;
1561168404Spjd	buf->b_next = hdr->b_buf;
1562168404Spjd	hdr->b_buf = buf;
1563168404Spjd	arc_get_data_buf(buf);
1564168404Spjd	bcopy(from->b_data, buf->b_data, size);
1565242845Sdelphij
1566242845Sdelphij	/*
1567242845Sdelphij	 * This buffer already exists in the arc so create a duplicate
1568242845Sdelphij	 * copy for the caller.  If the buffer is associated with user data
1569242845Sdelphij	 * then track the size and number of duplicates.  These stats will be
1570242845Sdelphij	 * updated as duplicate buffers are created and destroyed.
1571242845Sdelphij	 */
1572242845Sdelphij	if (hdr->b_type == ARC_BUFC_DATA) {
1573242845Sdelphij		ARCSTAT_BUMP(arcstat_duplicate_buffers);
1574242845Sdelphij		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1575242845Sdelphij	}
1576168404Spjd	hdr->b_datacnt += 1;
1577168404Spjd	return (buf);
1578168404Spjd}
1579168404Spjd
1580168404Spjdvoid
1581168404Spjdarc_buf_add_ref(arc_buf_t *buf, void* tag)
1582168404Spjd{
1583168404Spjd	arc_buf_hdr_t *hdr;
1584168404Spjd	kmutex_t *hash_lock;
1585168404Spjd
1586168404Spjd	/*
1587185029Spjd	 * Check to see if this buffer is evicted.  Callers
1588185029Spjd	 * must verify b_data != NULL to know if the add_ref
1589185029Spjd	 * was successful.
1590168404Spjd	 */
1591219089Spjd	mutex_enter(&buf->b_evict_lock);
1592185029Spjd	if (buf->b_data == NULL) {
1593219089Spjd		mutex_exit(&buf->b_evict_lock);
1594168404Spjd		return;
1595168404Spjd	}
1596219089Spjd	hash_lock = HDR_LOCK(buf->b_hdr);
1597219089Spjd	mutex_enter(hash_lock);
1598185029Spjd	hdr = buf->b_hdr;
1599219089Spjd	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1600219089Spjd	mutex_exit(&buf->b_evict_lock);
1601168404Spjd
1602168404Spjd	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
1603168404Spjd	add_reference(hdr, hash_lock, tag);
1604208373Smm	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1605168404Spjd	arc_access(hdr, hash_lock);
1606168404Spjd	mutex_exit(hash_lock);
1607168404Spjd	ARCSTAT_BUMP(arcstat_hits);
1608168404Spjd	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
1609168404Spjd	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
1610168404Spjd	    data, metadata, hits);
1611168404Spjd}
1612168404Spjd
1613185029Spjd/*
1614185029Spjd * Free the arc data buffer.  If it is an l2arc write in progress,
1615185029Spjd * the buffer is placed on l2arc_free_on_write to be freed later.
1616185029Spjd */
1617168404Spjdstatic void
1618240133Smmarc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1619185029Spjd{
1620240133Smm	arc_buf_hdr_t *hdr = buf->b_hdr;
1621240133Smm
1622185029Spjd	if (HDR_L2_WRITING(hdr)) {
1623185029Spjd		l2arc_data_free_t *df;
1624185029Spjd		df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1625240133Smm		df->l2df_data = buf->b_data;
1626240133Smm		df->l2df_size = hdr->b_size;
1627185029Spjd		df->l2df_func = free_func;
1628185029Spjd		mutex_enter(&l2arc_free_on_write_mtx);
1629185029Spjd		list_insert_head(l2arc_free_on_write, df);
1630185029Spjd		mutex_exit(&l2arc_free_on_write_mtx);
1631185029Spjd		ARCSTAT_BUMP(arcstat_l2_free_on_write);
1632185029Spjd	} else {
1633240133Smm		free_func(buf->b_data, hdr->b_size);
1634185029Spjd	}
1635185029Spjd}
1636185029Spjd
1637185029Spjdstatic void
1638168404Spjdarc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
1639168404Spjd{
1640168404Spjd	arc_buf_t **bufp;
1641168404Spjd
1642168404Spjd	/* free up data associated with the buf */
1643168404Spjd	if (buf->b_data) {
1644168404Spjd		arc_state_t *state = buf->b_hdr->b_state;
1645168404Spjd		uint64_t size = buf->b_hdr->b_size;
1646168404Spjd		arc_buf_contents_t type = buf->b_hdr->b_type;
1647168404Spjd
1648168404Spjd		arc_cksum_verify(buf);
1649240133Smm#ifdef illumos
1650240133Smm		arc_buf_unwatch(buf);
1651240133Smm#endif /* illumos */
1652219089Spjd
1653168404Spjd		if (!recycle) {
1654168404Spjd			if (type == ARC_BUFC_METADATA) {
1655240133Smm				arc_buf_data_free(buf, zio_buf_free);
1656208373Smm				arc_space_return(size, ARC_SPACE_DATA);
1657168404Spjd			} else {
1658168404Spjd				ASSERT(type == ARC_BUFC_DATA);
1659240133Smm				arc_buf_data_free(buf, zio_data_buf_free);
1660208373Smm				ARCSTAT_INCR(arcstat_data_size, -size);
1661185029Spjd				atomic_add_64(&arc_size, -size);
1662168404Spjd			}
1663168404Spjd		}
1664168404Spjd		if (list_link_active(&buf->b_hdr->b_arc_node)) {
1665185029Spjd			uint64_t *cnt = &state->arcs_lsize[type];
1666185029Spjd
1667168404Spjd			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
1668168404Spjd			ASSERT(state != arc_anon);
1669185029Spjd
1670185029Spjd			ASSERT3U(*cnt, >=, size);
1671185029Spjd			atomic_add_64(cnt, -size);
1672168404Spjd		}
1673168404Spjd		ASSERT3U(state->arcs_size, >=, size);
1674168404Spjd		atomic_add_64(&state->arcs_size, -size);
1675168404Spjd		buf->b_data = NULL;
1676242845Sdelphij
1677242845Sdelphij		/*
1678242845Sdelphij		 * If we're destroying a duplicate buffer make sure
1679242845Sdelphij		 * that the appropriate statistics are updated.
1680242845Sdelphij		 */
1681242845Sdelphij		if (buf->b_hdr->b_datacnt > 1 &&
1682242845Sdelphij		    buf->b_hdr->b_type == ARC_BUFC_DATA) {
1683242845Sdelphij			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
1684242845Sdelphij			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
1685242845Sdelphij		}
1686168404Spjd		ASSERT(buf->b_hdr->b_datacnt > 0);
1687168404Spjd		buf->b_hdr->b_datacnt -= 1;
1688168404Spjd	}
1689168404Spjd
1690168404Spjd	/* only remove the buf if requested */
1691168404Spjd	if (!all)
1692168404Spjd		return;
1693168404Spjd
1694168404Spjd	/* remove the buf from the hdr list */
1695168404Spjd	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
1696168404Spjd		continue;
1697168404Spjd	*bufp = buf->b_next;
1698219089Spjd	buf->b_next = NULL;
1699168404Spjd
1700168404Spjd	ASSERT(buf->b_efunc == NULL);
1701168404Spjd
1702168404Spjd	/* clean up the buf */
1703168404Spjd	buf->b_hdr = NULL;
1704168404Spjd	kmem_cache_free(buf_cache, buf);
1705168404Spjd}
1706168404Spjd
1707168404Spjdstatic void
1708168404Spjdarc_hdr_destroy(arc_buf_hdr_t *hdr)
1709168404Spjd{
1710168404Spjd	ASSERT(refcount_is_zero(&hdr->b_refcnt));
1711168404Spjd	ASSERT3P(hdr->b_state, ==, arc_anon);
1712168404Spjd	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1713219089Spjd	l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
1714168404Spjd
1715219089Spjd	if (l2hdr != NULL) {
1716219089Spjd		boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
1717219089Spjd		/*
1718219089Spjd		 * To prevent arc_free() and l2arc_evict() from
1719219089Spjd		 * attempting to free the same buffer at the same time,
1720219089Spjd		 * a FREE_IN_PROGRESS flag is given to arc_free() to
1721219089Spjd		 * give it priority.  l2arc_evict() can't destroy this
1722219089Spjd		 * header while we are waiting on l2arc_buflist_mtx.
1723219089Spjd		 *
1724219089Spjd		 * The hdr may be removed from l2ad_buflist before we
1725219089Spjd		 * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
1726219089Spjd		 */
1727219089Spjd		if (!buflist_held) {
1728185029Spjd			mutex_enter(&l2arc_buflist_mtx);
1729219089Spjd			l2hdr = hdr->b_l2hdr;
1730219089Spjd		}
1731219089Spjd
1732219089Spjd		if (l2hdr != NULL) {
1733248572Ssmh			trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
1734248574Ssmh			    hdr->b_size, 0);
1735219089Spjd			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
1736219089Spjd			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
1737251478Sdelphij			ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
1738219089Spjd			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
1739219089Spjd			if (hdr->b_state == arc_l2c_only)
1740219089Spjd				l2arc_hdr_stat_remove();
1741219089Spjd			hdr->b_l2hdr = NULL;
1742219089Spjd		}
1743219089Spjd
1744219089Spjd		if (!buflist_held)
1745185029Spjd			mutex_exit(&l2arc_buflist_mtx);
1746185029Spjd	}
1747185029Spjd
1748168404Spjd	if (!BUF_EMPTY(hdr)) {
1749168404Spjd		ASSERT(!HDR_IN_HASH_TABLE(hdr));
1750219089Spjd		buf_discard_identity(hdr);
1751168404Spjd	}
1752168404Spjd	while (hdr->b_buf) {
1753168404Spjd		arc_buf_t *buf = hdr->b_buf;
1754168404Spjd
1755168404Spjd		if (buf->b_efunc) {
1756168404Spjd			mutex_enter(&arc_eviction_mtx);
1757219089Spjd			mutex_enter(&buf->b_evict_lock);
1758168404Spjd			ASSERT(buf->b_hdr != NULL);
1759168404Spjd			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
1760168404Spjd			hdr->b_buf = buf->b_next;
1761168404Spjd			buf->b_hdr = &arc_eviction_hdr;
1762168404Spjd			buf->b_next = arc_eviction_list;
1763168404Spjd			arc_eviction_list = buf;
1764219089Spjd			mutex_exit(&buf->b_evict_lock);
1765168404Spjd			mutex_exit(&arc_eviction_mtx);
1766168404Spjd		} else {
1767168404Spjd			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
1768168404Spjd		}
1769168404Spjd	}
1770168404Spjd	if (hdr->b_freeze_cksum != NULL) {
1771168404Spjd		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1772168404Spjd		hdr->b_freeze_cksum = NULL;
1773168404Spjd	}
1774219089Spjd	if (hdr->b_thawed) {
1775219089Spjd		kmem_free(hdr->b_thawed, 1);
1776219089Spjd		hdr->b_thawed = NULL;
1777219089Spjd	}
1778168404Spjd
1779168404Spjd	ASSERT(!list_link_active(&hdr->b_arc_node));
1780168404Spjd	ASSERT3P(hdr->b_hash_next, ==, NULL);
1781168404Spjd	ASSERT3P(hdr->b_acb, ==, NULL);
1782168404Spjd	kmem_cache_free(hdr_cache, hdr);
1783168404Spjd}
1784168404Spjd
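/*
 * Free a buffer that has no eviction callback.  For a hashed header only
 * this buf is destroyed when other bufs remain; otherwise the data stays
 * cached and is marked ARC_BUF_AVAILABLE.  An anonymous header is torn
 * down completely once its last reference is dropped and any in-progress
 * async write has completed.
 */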
1785168404Spjdvoid
1786168404Spjdarc_buf_free(arc_buf_t *buf, void *tag)
1787168404Spjd{
1788168404Spjd	arc_buf_hdr_t *hdr = buf->b_hdr;
1789168404Spjd	int hashed = hdr->b_state != arc_anon;
1790168404Spjd
1791168404Spjd	ASSERT(buf->b_efunc == NULL);
1792168404Spjd	ASSERT(buf->b_data != NULL);
1793168404Spjd
1794168404Spjd	if (hashed) {
1795168404Spjd		kmutex_t *hash_lock = HDR_LOCK(hdr);
1796168404Spjd
1797168404Spjd		mutex_enter(hash_lock);
1798219089Spjd		hdr = buf->b_hdr;
1799219089Spjd		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1800219089Spjd
1801168404Spjd		(void) remove_reference(hdr, hash_lock, tag);
1802219089Spjd		if (hdr->b_datacnt > 1) {
1803168404Spjd			arc_buf_destroy(buf, FALSE, TRUE);
1804219089Spjd		} else {
1805219089Spjd			ASSERT(buf == hdr->b_buf);
1806219089Spjd			ASSERT(buf->b_efunc == NULL);
1807168404Spjd			hdr->b_flags |= ARC_BUF_AVAILABLE;
1808219089Spjd		}
1809168404Spjd		mutex_exit(hash_lock);
1810168404Spjd	} else if (HDR_IO_IN_PROGRESS(hdr)) {
1811168404Spjd		int destroy_hdr;
1812168404Spjd		/*
1813168404Spjd		 * We are in the middle of an async write.  Don't destroy
1814168404Spjd		 * this buffer unless the write completes before we finish
1815168404Spjd		 * decrementing the reference count.
1816168404Spjd		 */
1817168404Spjd		mutex_enter(&arc_eviction_mtx);
1818168404Spjd		(void) remove_reference(hdr, NULL, tag);
1819168404Spjd		ASSERT(refcount_is_zero(&hdr->b_refcnt));
1820168404Spjd		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
1821168404Spjd		mutex_exit(&arc_eviction_mtx);
1822168404Spjd		if (destroy_hdr)
1823168404Spjd			arc_hdr_destroy(hdr);
1824168404Spjd	} else {
1825219089Spjd		if (remove_reference(hdr, NULL, tag) > 0)
1826168404Spjd			arc_buf_destroy(buf, FALSE, TRUE);
1827219089Spjd		else
1828168404Spjd			arc_hdr_destroy(hdr);
1829168404Spjd	}
1830168404Spjd}
1831168404Spjd
1832248571Smmboolean_t
1833168404Spjdarc_buf_remove_ref(arc_buf_t *buf, void* tag)
1834168404Spjd{
1835168404Spjd	arc_buf_hdr_t *hdr = buf->b_hdr;
1836168404Spjd	kmutex_t *hash_lock = HDR_LOCK(hdr);
1837248571Smm	boolean_t no_callback = (buf->b_efunc == NULL);
1838168404Spjd
1839168404Spjd	if (hdr->b_state == arc_anon) {
1840219089Spjd		ASSERT(hdr->b_datacnt == 1);
1841168404Spjd		arc_buf_free(buf, tag);
1842168404Spjd		return (no_callback);
1843168404Spjd	}
1844168404Spjd
1845168404Spjd	mutex_enter(hash_lock);
1846219089Spjd	hdr = buf->b_hdr;
1847219089Spjd	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1848168404Spjd	ASSERT(hdr->b_state != arc_anon);
1849168404Spjd	ASSERT(buf->b_data != NULL);
1850168404Spjd
1851168404Spjd	(void) remove_reference(hdr, hash_lock, tag);
1852168404Spjd	if (hdr->b_datacnt > 1) {
1853168404Spjd		if (no_callback)
1854168404Spjd			arc_buf_destroy(buf, FALSE, TRUE);
1855168404Spjd	} else if (no_callback) {
1856168404Spjd		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
1857219089Spjd		ASSERT(buf->b_efunc == NULL);
1858168404Spjd		hdr->b_flags |= ARC_BUF_AVAILABLE;
1859168404Spjd	}
1860168404Spjd	ASSERT(no_callback || hdr->b_datacnt > 1 ||
1861168404Spjd	    refcount_is_zero(&hdr->b_refcnt));
1862168404Spjd	mutex_exit(hash_lock);
1863168404Spjd	return (no_callback);
1864168404Spjd}
1865168404Spjd
1866168404Spjdint
1867168404Spjdarc_buf_size(arc_buf_t *buf)
1868168404Spjd{
1869168404Spjd	return (buf->b_hdr->b_size);
1870168404Spjd}
1871168404Spjd
1872168404Spjd/*
1873242845Sdelphij * Called from the DMU to determine if the current buffer should be
1874242845Sdelphij * evicted. In order to ensure proper locking, the eviction must be initiated
1875242845Sdelphij * from the DMU. Return true if the buffer is associated with user data and
1876242845Sdelphij * duplicate buffers still exist.
1877242845Sdelphij */
1878242845Sdelphijboolean_t
1879242845Sdelphijarc_buf_eviction_needed(arc_buf_t *buf)
1880242845Sdelphij{
1881242845Sdelphij	arc_buf_hdr_t *hdr;
1882242845Sdelphij	boolean_t evict_needed = B_FALSE;
1883242845Sdelphij
1884242845Sdelphij	if (zfs_disable_dup_eviction)
1885242845Sdelphij		return (B_FALSE);
1886242845Sdelphij
1887242845Sdelphij	mutex_enter(&buf->b_evict_lock);
1888242845Sdelphij	hdr = buf->b_hdr;
1889242845Sdelphij	if (hdr == NULL) {
1890242845Sdelphij		/*
1891242845Sdelphij		 * We are in arc_do_user_evicts(); let that function
1892242845Sdelphij		 * perform the eviction.
1893242845Sdelphij		 */
1894242845Sdelphij		ASSERT(buf->b_data == NULL);
1895242845Sdelphij		mutex_exit(&buf->b_evict_lock);
1896242845Sdelphij		return (B_FALSE);
1897242845Sdelphij	} else if (buf->b_data == NULL) {
1898242845Sdelphij		/*
1899242845Sdelphij		 * We have already been added to the arc eviction list;
1900242845Sdelphij		 * recommend eviction.
1901242845Sdelphij		 */
1902242845Sdelphij		ASSERT3P(hdr, ==, &arc_eviction_hdr);
1903242845Sdelphij		mutex_exit(&buf->b_evict_lock);
1904242845Sdelphij		return (B_TRUE);
1905242845Sdelphij	}
1906242845Sdelphij
1907242845Sdelphij	if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
1908242845Sdelphij		evict_needed = B_TRUE;
1909242845Sdelphij
1910242845Sdelphij	mutex_exit(&buf->b_evict_lock);
1911242845Sdelphij	return (evict_needed);
1912242845Sdelphij}
1913242845Sdelphij
1914242845Sdelphij/*
1915168404Spjd * Evict buffers from list until we've removed the specified number of
1916168404Spjd * bytes.  Move the removed buffers to the appropriate evict state.
1917168404Spjd * If the recycle flag is set, then attempt to "recycle" a buffer:
1918168404Spjd * - look for a buffer to evict that is `bytes' long.
1919168404Spjd * - return the data block from this buffer rather than freeing it.
1920168404Spjd * This flag is used by callers that are trying to make space for a
1921168404Spjd * new buffer in a full arc cache.
1922185029Spjd *
1923185029Spjd * This function makes a "best effort".  It skips over any buffers
1924185029Spjd * it can't get a hash_lock on, and so may not catch all candidates.
1925185029Spjd * It may also return without evicting as much space as requested.
1926168404Spjd */
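/*
 * FreeBSD note: the state's buffers are spread across several sublists,
 * so the loop below visits those sublists round-robin, remembering where
 * it stopped (evict_metadata_offset/evict_data_offset) so that successive
 * calls spread the eviction work across all of the lists.
 */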
1927168404Spjdstatic void *
1928209962Smmarc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
1929168404Spjd    arc_buf_contents_t type)
1930168404Spjd{
1931168404Spjd	arc_state_t *evicted_state;
1932168404Spjd	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
1933205231Skmacy	int64_t bytes_remaining;
1934168404Spjd	arc_buf_hdr_t *ab, *ab_prev = NULL;
1935205231Skmacy	list_t *evicted_list, *list, *evicted_list_start, *list_start;
1936205231Skmacy	kmutex_t *lock, *evicted_lock;
1937168404Spjd	kmutex_t *hash_lock;
1938168404Spjd	boolean_t have_lock;
1939168404Spjd	void *stolen = NULL;
1940205231Skmacy	static int evict_metadata_offset, evict_data_offset;
1941205231Skmacy	int i, idx, offset, list_count, count;
1942168404Spjd
1943168404Spjd	ASSERT(state == arc_mru || state == arc_mfu);
1944168404Spjd
1945168404Spjd	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
1946206796Spjd
1947205231Skmacy	if (type == ARC_BUFC_METADATA) {
1948205231Skmacy		offset = 0;
1949205231Skmacy		list_count = ARC_BUFC_NUMMETADATALISTS;
1950205231Skmacy		list_start = &state->arcs_lists[0];
1951205231Skmacy		evicted_list_start = &evicted_state->arcs_lists[0];
1952205231Skmacy		idx = evict_metadata_offset;
1953205231Skmacy	} else {
1954205231Skmacy		offset = ARC_BUFC_NUMMETADATALISTS;
1955205231Skmacy		list_start = &state->arcs_lists[offset];
1956205231Skmacy		evicted_list_start = &evicted_state->arcs_lists[offset];
1957205231Skmacy		list_count = ARC_BUFC_NUMDATALISTS;
1958205231Skmacy		idx = evict_data_offset;
1959205231Skmacy	}
1960205231Skmacy	bytes_remaining = evicted_state->arcs_lsize[type];
1961205231Skmacy	count = 0;
1962206796Spjd
1963205231Skmacyevict_start:
1964205231Skmacy	list = &list_start[idx];
1965205231Skmacy	evicted_list = &evicted_list_start[idx];
1966205231Skmacy	lock = ARCS_LOCK(state, (offset + idx));
1967206796Spjd	evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
1968168404Spjd
1969205231Skmacy	mutex_enter(lock);
1970205231Skmacy	mutex_enter(evicted_lock);
1971205231Skmacy
1972185029Spjd	for (ab = list_tail(list); ab; ab = ab_prev) {
1973185029Spjd		ab_prev = list_prev(list, ab);
1974205231Skmacy		bytes_remaining -= (ab->b_size * ab->b_datacnt);
1975168404Spjd		/* prefetch buffers have a minimum lifespan */
1976168404Spjd		if (HDR_IO_IN_PROGRESS(ab) ||
1977185029Spjd		    (spa && ab->b_spa != spa) ||
1978168404Spjd		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
1979219089Spjd		    ddi_get_lbolt() - ab->b_arc_access <
1980219089Spjd		    arc_min_prefetch_lifespan)) {
1981168404Spjd			skipped++;
1982168404Spjd			continue;
1983168404Spjd		}
1984168404Spjd		/* "lookahead" for better eviction candidate */
1985168404Spjd		if (recycle && ab->b_size != bytes &&
1986168404Spjd		    ab_prev && ab_prev->b_size == bytes)
1987168404Spjd			continue;
1988168404Spjd		hash_lock = HDR_LOCK(ab);
1989168404Spjd		have_lock = MUTEX_HELD(hash_lock);
1990168404Spjd		if (have_lock || mutex_tryenter(hash_lock)) {
1991240415Smm			ASSERT0(refcount_count(&ab->b_refcnt));
1992168404Spjd			ASSERT(ab->b_datacnt > 0);
1993168404Spjd			while (ab->b_buf) {
1994168404Spjd				arc_buf_t *buf = ab->b_buf;
1995219089Spjd				if (!mutex_tryenter(&buf->b_evict_lock)) {
1996185029Spjd					missed += 1;
1997185029Spjd					break;
1998185029Spjd				}
1999168404Spjd				if (buf->b_data) {
2000168404Spjd					bytes_evicted += ab->b_size;
2001168404Spjd					if (recycle && ab->b_type == type &&
2002185029Spjd					    ab->b_size == bytes &&
2003185029Spjd					    !HDR_L2_WRITING(ab)) {
2004168404Spjd						stolen = buf->b_data;
2005168404Spjd						recycle = FALSE;
2006168404Spjd					}
2007168404Spjd				}
2008168404Spjd				if (buf->b_efunc) {
2009168404Spjd					mutex_enter(&arc_eviction_mtx);
2010168404Spjd					arc_buf_destroy(buf,
2011168404Spjd					    buf->b_data == stolen, FALSE);
2012168404Spjd					ab->b_buf = buf->b_next;
2013168404Spjd					buf->b_hdr = &arc_eviction_hdr;
2014168404Spjd					buf->b_next = arc_eviction_list;
2015168404Spjd					arc_eviction_list = buf;
2016168404Spjd					mutex_exit(&arc_eviction_mtx);
2017219089Spjd					mutex_exit(&buf->b_evict_lock);
2018168404Spjd				} else {
2019219089Spjd					mutex_exit(&buf->b_evict_lock);
2020168404Spjd					arc_buf_destroy(buf,
2021168404Spjd					    buf->b_data == stolen, TRUE);
2022168404Spjd				}
2023168404Spjd			}
2024208373Smm
2025208373Smm			if (ab->b_l2hdr) {
2026208373Smm				ARCSTAT_INCR(arcstat_evict_l2_cached,
2027208373Smm				    ab->b_size);
2028208373Smm			} else {
2029208373Smm				if (l2arc_write_eligible(ab->b_spa, ab)) {
2030208373Smm					ARCSTAT_INCR(arcstat_evict_l2_eligible,
2031208373Smm					    ab->b_size);
2032208373Smm				} else {
2033208373Smm					ARCSTAT_INCR(
2034208373Smm					    arcstat_evict_l2_ineligible,
2035208373Smm					    ab->b_size);
2036208373Smm				}
2037208373Smm			}
2038208373Smm
2039185029Spjd			if (ab->b_datacnt == 0) {
2040185029Spjd				arc_change_state(evicted_state, ab, hash_lock);
2041185029Spjd				ASSERT(HDR_IN_HASH_TABLE(ab));
2042185029Spjd				ab->b_flags |= ARC_IN_HASH_TABLE;
2043185029Spjd				ab->b_flags &= ~ARC_BUF_AVAILABLE;
2044185029Spjd				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
2045185029Spjd			}
2046168404Spjd			if (!have_lock)
2047168404Spjd				mutex_exit(hash_lock);
2048168404Spjd			if (bytes >= 0 && bytes_evicted >= bytes)
2049168404Spjd				break;
2050205231Skmacy			if (bytes_remaining > 0) {
2051205231Skmacy				mutex_exit(evicted_lock);
2052205231Skmacy				mutex_exit(lock);
2053206796Spjd				idx  = ((idx + 1) & (list_count - 1));
2054205231Skmacy				count++;
2055205231Skmacy				goto evict_start;
2056205231Skmacy			}
2057168404Spjd		} else {
2058168404Spjd			missed += 1;
2059168404Spjd		}
2060168404Spjd	}
2061168404Spjd
2062205231Skmacy	mutex_exit(evicted_lock);
2063205231Skmacy	mutex_exit(lock);
2064206796Spjd
2065206796Spjd	idx  = ((idx + 1) & (list_count - 1));
2066205231Skmacy	count++;
2067168404Spjd
2068205231Skmacy	if (bytes_evicted < bytes) {
2069205231Skmacy		if (count < list_count)
2070205231Skmacy			goto evict_start;
2071205231Skmacy		else
2072205231Skmacy			dprintf("only evicted %lld bytes from %p",
2073205231Skmacy			    (longlong_t)bytes_evicted, state);
2074205231Skmacy	}
2075206796Spjd	if (type == ARC_BUFC_METADATA)
2076205231Skmacy		evict_metadata_offset = idx;
2077205231Skmacy	else
2078205231Skmacy		evict_data_offset = idx;
2079206796Spjd
2080168404Spjd	if (skipped)
2081168404Spjd		ARCSTAT_INCR(arcstat_evict_skip, skipped);
2082168404Spjd
2083168404Spjd	if (missed)
2084168404Spjd		ARCSTAT_INCR(arcstat_mutex_miss, missed);
2085168404Spjd
2086185029Spjd	/*
2087248571Smm	 * We have just evicted some data into the ghost state; make
2088185029Spjd	 * sure we also adjust the ghost state size if necessary.
2089185029Spjd	 */
2090185029Spjd	if (arc_no_grow &&
2091185029Spjd	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
2092185029Spjd		int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
2093185029Spjd		    arc_mru_ghost->arcs_size - arc_c;
2094185029Spjd
2095185029Spjd		if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
2096185029Spjd			int64_t todelete =
2097185029Spjd			    MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
2098209962Smm			arc_evict_ghost(arc_mru_ghost, 0, todelete);
2099185029Spjd		} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
2100185029Spjd			int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
2101185029Spjd			    arc_mru_ghost->arcs_size +
2102185029Spjd			    arc_mfu_ghost->arcs_size - arc_c);
2103209962Smm			arc_evict_ghost(arc_mfu_ghost, 0, todelete);
2104185029Spjd		}
2105185029Spjd	}
2106205231Skmacy	if (stolen)
2107205231Skmacy		ARCSTAT_BUMP(arcstat_stolen);
2108185029Spjd
2109168404Spjd	return (stolen);
2110168404Spjd}
2111168404Spjd
2112168404Spjd/*
2113168404Spjd * Remove buffers from list until we've removed the specified number of
2114168404Spjd * bytes.  Destroy the buffers that are removed.
2115168404Spjd */
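/*
 * Ghost headers carry only identity and size, no data, so "removing" one
 * here just adjusts the accounting and frees the header, unless the block
 * is still cached in the L2ARC, in which case the header is kept and
 * moved to the arc_l2c_only state.
 */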
2116168404Spjdstatic void
2117209962Smmarc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
2118168404Spjd{
2119168404Spjd	arc_buf_hdr_t *ab, *ab_prev;
2120219089Spjd	arc_buf_hdr_t marker = { 0 };
2121205231Skmacy	list_t *list, *list_start;
2122205231Skmacy	kmutex_t *hash_lock, *lock;
2123168404Spjd	uint64_t bytes_deleted = 0;
2124168404Spjd	uint64_t bufs_skipped = 0;
2125205231Skmacy	static int evict_offset;
2126205231Skmacy	int list_count, idx = evict_offset;
2127205231Skmacy	int offset, count = 0;
2128168404Spjd
2129168404Spjd	ASSERT(GHOST_STATE(state));
2130205231Skmacy
2131205231Skmacy	/*
2132205231Skmacy	 * data lists come after metadata lists
2133205231Skmacy	 */
2134205231Skmacy	list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
2135205231Skmacy	list_count = ARC_BUFC_NUMDATALISTS;
2136205231Skmacy	offset = ARC_BUFC_NUMMETADATALISTS;
2137206796Spjd
2138205231Skmacyevict_start:
2139205231Skmacy	list = &list_start[idx];
2140205231Skmacy	lock = ARCS_LOCK(state, idx + offset);
2141205231Skmacy
2142205231Skmacy	mutex_enter(lock);
2143185029Spjd	for (ab = list_tail(list); ab; ab = ab_prev) {
2144185029Spjd		ab_prev = list_prev(list, ab);
2145185029Spjd		if (spa && ab->b_spa != spa)
2146185029Spjd			continue;
2147219089Spjd
2148219089Spjd		/* ignore markers */
2149219089Spjd		if (ab->b_spa == 0)
2150219089Spjd			continue;
2151219089Spjd
2152168404Spjd		hash_lock = HDR_LOCK(ab);
2153219089Spjd		/* caller may be trying to modify this buffer, skip it */
2154219089Spjd		if (MUTEX_HELD(hash_lock))
2155219089Spjd			continue;
2156168404Spjd		if (mutex_tryenter(hash_lock)) {
2157168404Spjd			ASSERT(!HDR_IO_IN_PROGRESS(ab));
2158168404Spjd			ASSERT(ab->b_buf == NULL);
2159168404Spjd			ARCSTAT_BUMP(arcstat_deleted);
2160168404Spjd			bytes_deleted += ab->b_size;
2161185029Spjd
2162185029Spjd			if (ab->b_l2hdr != NULL) {
2163185029Spjd				/*
2164185029Spjd				 * This buffer is cached on the 2nd Level ARC;
2165185029Spjd				 * don't destroy the header.
2166185029Spjd				 */
2167185029Spjd				arc_change_state(arc_l2c_only, ab, hash_lock);
2168185029Spjd				mutex_exit(hash_lock);
2169185029Spjd			} else {
2170185029Spjd				arc_change_state(arc_anon, ab, hash_lock);
2171185029Spjd				mutex_exit(hash_lock);
2172185029Spjd				arc_hdr_destroy(ab);
2173185029Spjd			}
2174185029Spjd
2175168404Spjd			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
2176168404Spjd			if (bytes >= 0 && bytes_deleted >= bytes)
2177168404Spjd				break;
2178219089Spjd		} else if (bytes < 0) {
2179219089Spjd			/*
2180219089Spjd			 * Insert a list marker and then wait for the
2181219089Spjd			 * hash lock to become available. Once it's
2182219089Spjd			 * available, restart from where we left off.
2183219089Spjd			 */
2184219089Spjd			list_insert_after(list, ab, &marker);
2185219089Spjd			mutex_exit(lock);
2186219089Spjd			mutex_enter(hash_lock);
2187219089Spjd			mutex_exit(hash_lock);
2188219089Spjd			mutex_enter(lock);
2189219089Spjd			ab_prev = list_prev(list, &marker);
2190219089Spjd			list_remove(list, &marker);
2191219089Spjd		} else
2192168404Spjd			bufs_skipped += 1;
2193168404Spjd	}
2194205231Skmacy	mutex_exit(lock);
2195206796Spjd	idx  = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
2196205231Skmacy	count++;
2197206796Spjd
2198205231Skmacy	if (count < list_count)
2199205231Skmacy		goto evict_start;
2200206796Spjd
2201205231Skmacy	evict_offset = idx;
2202205231Skmacy	if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
2203185029Spjd	    (bytes < 0 || bytes_deleted < bytes)) {
2204205231Skmacy		list_start = &state->arcs_lists[0];
2205205231Skmacy		list_count = ARC_BUFC_NUMMETADATALISTS;
2206205231Skmacy		offset = count = 0;
2207205231Skmacy		goto evict_start;
2208185029Spjd	}
2209185029Spjd
2210168404Spjd	if (bufs_skipped) {
2211168404Spjd		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2212168404Spjd		ASSERT(bytes >= 0);
2213168404Spjd	}
2214168404Spjd
2215168404Spjd	if (bytes_deleted < bytes)
2216168404Spjd		dprintf("only deleted %lld bytes from %p",
2217168404Spjd		    (longlong_t)bytes_deleted, state);
2218168404Spjd}
2219168404Spjd
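/*
 * Re-balance the lists toward their targets, roughly: evict from the MRU
 * list while anon + MRU (including metadata) exceeds arc_p, then from the
 * MFU list while the total arc_size exceeds arc_c, and finally trim the
 * two ghost lists so that MRU + MRU-ghost and MRU-ghost + MFU-ghost each
 * stay within arc_c.
 */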
2220168404Spjdstatic void
2221168404Spjdarc_adjust(void)
2222168404Spjd{
2223208373Smm	int64_t adjustment, delta;
2224168404Spjd
2225208373Smm	/*
2226208373Smm	 * Adjust MRU size
2227208373Smm	 */
2228168404Spjd
2229209275Smm	adjustment = MIN((int64_t)(arc_size - arc_c),
2230209275Smm	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2231209275Smm	    arc_p));
2232208373Smm
2233208373Smm	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2234208373Smm		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2235209962Smm		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
2236208373Smm		adjustment -= delta;
2237168404Spjd	}
2238168404Spjd
2239208373Smm	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2240208373Smm		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2241209962Smm		(void) arc_evict(arc_mru, 0, delta, FALSE,
2242185029Spjd		    ARC_BUFC_METADATA);
2243185029Spjd	}
2244185029Spjd
2245208373Smm	/*
2246208373Smm	 * Adjust MFU size
2247208373Smm	 */
2248168404Spjd
2249208373Smm	adjustment = arc_size - arc_c;
2250208373Smm
2251208373Smm	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2252208373Smm		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2253209962Smm		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2254208373Smm		adjustment -= delta;
2255168404Spjd	}
2256168404Spjd
2257208373Smm	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2258208373Smm		int64_t delta = MIN(adjustment,
2259208373Smm		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2260209962Smm		(void) arc_evict(arc_mfu, 0, delta, FALSE,
2261208373Smm		    ARC_BUFC_METADATA);
2262208373Smm	}
2263168404Spjd
2264208373Smm	/*
2265208373Smm	 * Adjust ghost lists
2266208373Smm	 */
2267168404Spjd
2268208373Smm	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2269168404Spjd
2270208373Smm	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2271208373Smm		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2272209962Smm		arc_evict_ghost(arc_mru_ghost, 0, delta);
2273208373Smm	}
2274185029Spjd
2275208373Smm	adjustment =
2276208373Smm	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2277208373Smm
2278208373Smm	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2279208373Smm		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2280209962Smm		arc_evict_ghost(arc_mfu_ghost, 0, delta);
2281168404Spjd	}
2282168404Spjd}
2283168404Spjd
2284168404Spjdstatic void
2285168404Spjdarc_do_user_evicts(void)
2286168404Spjd{
2287191903Skmacy	static arc_buf_t *tmp_arc_eviction_list;
2288191903Skmacy
2289191903Skmacy	/*
2290191903Skmacy	 * Move the list over first to avoid a lock order reversal (LOR).
2291191903Skmacy	 */
2292206796Spjdrestart:
2293168404Spjd	mutex_enter(&arc_eviction_mtx);
2294191903Skmacy	tmp_arc_eviction_list = arc_eviction_list;
2295191903Skmacy	arc_eviction_list = NULL;
2296191903Skmacy	mutex_exit(&arc_eviction_mtx);
2297191903Skmacy
2298191903Skmacy	while (tmp_arc_eviction_list != NULL) {
2299191903Skmacy		arc_buf_t *buf = tmp_arc_eviction_list;
2300191903Skmacy		tmp_arc_eviction_list = buf->b_next;
2301219089Spjd		mutex_enter(&buf->b_evict_lock);
2302168404Spjd		buf->b_hdr = NULL;
2303219089Spjd		mutex_exit(&buf->b_evict_lock);
2304168404Spjd
2305168404Spjd		if (buf->b_efunc != NULL)
2306168404Spjd			VERIFY(buf->b_efunc(buf) == 0);
2307168404Spjd
2308168404Spjd		buf->b_efunc = NULL;
2309168404Spjd		buf->b_private = NULL;
2310168404Spjd		kmem_cache_free(buf_cache, buf);
2311168404Spjd	}
2312191903Skmacy
2313191903Skmacy	if (arc_eviction_list != NULL)
2314191903Skmacy		goto restart;
2315168404Spjd}
2316168404Spjd
2317168404Spjd/*
2318185029Spjd * Flush all *evictable* data from the cache for the given spa.
2319168404Spjd * NOTE: this will not touch "active" (i.e. referenced) data.
2320168404Spjd */
2321168404Spjdvoid
2322185029Spjdarc_flush(spa_t *spa)
2323168404Spjd{
2324209962Smm	uint64_t guid = 0;
2325209962Smm
2326209962Smm	if (spa)
2327228103Smm		guid = spa_load_guid(spa);
2328209962Smm
2329205231Skmacy	while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
2330209962Smm		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2331185029Spjd		if (spa)
2332185029Spjd			break;
2333185029Spjd	}
2334205231Skmacy	while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
2335209962Smm		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2336185029Spjd		if (spa)
2337185029Spjd			break;
2338185029Spjd	}
2339205231Skmacy	while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
2340209962Smm		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2341185029Spjd		if (spa)
2342185029Spjd			break;
2343185029Spjd	}
2344205231Skmacy	while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
2345209962Smm		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2346185029Spjd		if (spa)
2347185029Spjd			break;
2348185029Spjd	}
2349168404Spjd
2350209962Smm	arc_evict_ghost(arc_mru_ghost, guid, -1);
2351209962Smm	arc_evict_ghost(arc_mfu_ghost, guid, -1);
2352168404Spjd
2353168404Spjd	mutex_enter(&arc_reclaim_thr_lock);
2354168404Spjd	arc_do_user_evicts();
2355168404Spjd	mutex_exit(&arc_reclaim_thr_lock);
2356185029Spjd	ASSERT(spa || arc_eviction_list == NULL);
2357168404Spjd}
2358168404Spjd
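/*
 * Shrink the target cache size (arc_c) by 1/2^arc_shrink_shift, clamped
 * at arc_c_min, reduce arc_p by the same shift, and then run arc_adjust()
 * if the cache is now larger than its reduced target.
 */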
2359168404Spjdvoid
2360168404Spjdarc_shrink(void)
2361168404Spjd{
2362168404Spjd	if (arc_c > arc_c_min) {
2363168404Spjd		uint64_t to_free;
2364168404Spjd
2366168404Spjd		to_free = arc_c >> arc_shrink_shift;
2370168404Spjd		if (arc_c > arc_c_min + to_free)
2371168404Spjd			atomic_add_64(&arc_c, -to_free);
2372168404Spjd		else
2373168404Spjd			arc_c = arc_c_min;
2374168404Spjd
2375168404Spjd		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2376168404Spjd		if (arc_c > arc_size)
2377168404Spjd			arc_c = MAX(arc_size, arc_c_min);
2378168404Spjd		if (arc_p > arc_c)
2379168404Spjd			arc_p = (arc_c >> 1);
2380168404Spjd		ASSERT(arc_c >= arc_c_min);
2381168404Spjd		ASSERT((int64_t)arc_p >= 0);
2382168404Spjd	}
2383168404Spjd
2384168404Spjd	if (arc_size > arc_c)
2385168404Spjd		arc_adjust();
2386168404Spjd}
2387168404Spjd
2388185029Spjdstatic int needfree = 0;
2389168404Spjd
2390168404Spjdstatic int
2391168404Spjdarc_reclaim_needed(void)
2392168404Spjd{
2393168404Spjd
2394168404Spjd#ifdef _KERNEL
2395219089Spjd
2396197816Skmacy	if (needfree)
2397197816Skmacy		return (1);
2398168404Spjd
2399191902Skmacy	/*
2400212780Savg	 * Cooperate with pagedaemon when it's time for it to scan
2401212780Savg	 * and reclaim some pages.
2402191902Skmacy	 */
2403212783Savg	if (vm_paging_needed())
2404191902Skmacy		return (1);
2405191902Skmacy
2406219089Spjd#ifdef sun
2407168404Spjd	/*
2408185029Spjd	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2409185029Spjd	 */
2410185029Spjd	extra = desfree;
2411185029Spjd
2412185029Spjd	/*
2413185029Spjd	 * check that we're out of range of the pageout scanner.  It starts to
2414185029Spjd	 * schedule paging if freemem is less than lotsfree and needfree.
2415185029Spjd	 * lotsfree is the high-water mark for pageout, and needfree is the
2416185029Spjd	 * number of needed free pages.  We add extra pages here to make sure
2417185029Spjd	 * the scanner doesn't start up while we're freeing memory.
2418185029Spjd	 */
2419185029Spjd	if (freemem < lotsfree + needfree + extra)
2420185029Spjd		return (1);
2421185029Spjd
2422185029Spjd	/*
2423168404Spjd	 * check to make sure that swapfs has enough space so that anon
2424185029Spjd	 * reservations can still succeed. anon_resvmem() checks that the
2425168404Spjd	 * availrmem is greater than swapfs_minfree, and the number of reserved
2426168404Spjd	 * swap pages.  We also add a bit of extra here just to prevent
2427168404Spjd	 * circumstances from getting really dire.
2428168404Spjd	 */
2429168404Spjd	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2430168404Spjd		return (1);
2431168404Spjd
2432168404Spjd#if defined(__i386)
2433168404Spjd	/*
2434168404Spjd	 * If we're on an i386 platform, it's possible that we'll exhaust the
2435168404Spjd	 * kernel heap space before we ever run out of available physical
2436168404Spjd	 * memory.  Most checks of the size of the heap_area compare against
2437168404Spjd	 * tune.t_minarmem, which is the minimum available real memory that we
2438168404Spjd	 * can have in the system.  However, this is generally fixed at 25 pages
2439168404Spjd	 * which is so low that it's useless.  In this comparison, we seek to
2440168404Spjd	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2441185029Spjd	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
2442168404Spjd	 * free)
2443168404Spjd	 */
2444168404Spjd	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
2445168404Spjd	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
2446168404Spjd		return (1);
2447168404Spjd#endif
2448219089Spjd#else	/* !sun */
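	/*
	 * On FreeBSD, reclaim whenever more than 3/4 of the kmem arena
	 * is in use.
	 */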
2449175633Spjd	if (kmem_used() > (kmem_size() * 3) / 4)
2450168404Spjd		return (1);
2451219089Spjd#endif	/* sun */
2452168404Spjd
2453168404Spjd#else
2454168404Spjd	if (spa_get_random(100) == 0)
2455168404Spjd		return (1);
2456168404Spjd#endif
2457168404Spjd	return (0);
2458168404Spjd}
2459168404Spjd
2460208454Spjdextern kmem_cache_t	*zio_buf_cache[];
2461208454Spjdextern kmem_cache_t	*zio_data_buf_cache[];
2462208454Spjd
2463168404Spjdstatic void
2464168404Spjdarc_kmem_reap_now(arc_reclaim_strategy_t strat)
2465168404Spjd{
2466168404Spjd	size_t			i;
2467168404Spjd	kmem_cache_t		*prev_cache = NULL;
2468168404Spjd	kmem_cache_t		*prev_data_cache = NULL;
2469168404Spjd
2470168404Spjd#ifdef _KERNEL
2471185029Spjd	if (arc_meta_used >= arc_meta_limit) {
2472185029Spjd		/*
2473185029Spjd		 * We are exceeding our meta-data cache limit.
2474185029Spjd		 * Purge some DNLC entries to release holds on meta-data.
2475185029Spjd		 */
2476185029Spjd		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2477185029Spjd	}
2478168404Spjd#if defined(__i386)
2479168404Spjd	/*
2480168404Spjd	 * Reclaim unused memory from all kmem caches.
2481168404Spjd	 */
2482168404Spjd	kmem_reap();
2483168404Spjd#endif
2484168404Spjd#endif
2485168404Spjd
2486168404Spjd	/*
2487185029Spjd	 * An aggressive reclamation will shrink the cache size as well as
2488168404Spjd	 * reap free buffers from the arc kmem caches.
2489168404Spjd	 */
2490168404Spjd	if (strat == ARC_RECLAIM_AGGR)
2491168404Spjd		arc_shrink();
2492168404Spjd
2493168404Spjd	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
2494168404Spjd		if (zio_buf_cache[i] != prev_cache) {
2495168404Spjd			prev_cache = zio_buf_cache[i];
2496168404Spjd			kmem_cache_reap_now(zio_buf_cache[i]);
2497168404Spjd		}
2498168404Spjd		if (zio_data_buf_cache[i] != prev_data_cache) {
2499168404Spjd			prev_data_cache = zio_data_buf_cache[i];
2500168404Spjd			kmem_cache_reap_now(zio_data_buf_cache[i]);
2501168404Spjd		}
2502168404Spjd	}
2503168404Spjd	kmem_cache_reap_now(buf_cache);
2504168404Spjd	kmem_cache_reap_now(hdr_cache);
2505168404Spjd}
2506168404Spjd
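/*
 * Reclaim thread: wakes up about once a second (or when signalled) to
 * reap the kmem caches and shrink the ARC when memory is tight, run any
 * pending user eviction callbacks, and re-run arc_adjust().
 */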
2507168404Spjdstatic void
2508168404Spjdarc_reclaim_thread(void *dummy __unused)
2509168404Spjd{
2510168404Spjd	clock_t			growtime = 0;
2511168404Spjd	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
2512168404Spjd	callb_cpr_t		cpr;
2513168404Spjd
2514168404Spjd	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
2515168404Spjd
2516168404Spjd	mutex_enter(&arc_reclaim_thr_lock);
2517168404Spjd	while (arc_thread_exit == 0) {
2518168404Spjd		if (arc_reclaim_needed()) {
2519168404Spjd
2520168404Spjd			if (arc_no_grow) {
2521168404Spjd				if (last_reclaim == ARC_RECLAIM_CONS) {
2522168404Spjd					last_reclaim = ARC_RECLAIM_AGGR;
2523168404Spjd				} else {
2524168404Spjd					last_reclaim = ARC_RECLAIM_CONS;
2525168404Spjd				}
2526168404Spjd			} else {
2527168404Spjd				arc_no_grow = TRUE;
2528168404Spjd				last_reclaim = ARC_RECLAIM_AGGR;
2529168404Spjd				membar_producer();
2530168404Spjd			}
2531168404Spjd
2532168404Spjd			/* reset the growth delay for every reclaim */
2533219089Spjd			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
2534168404Spjd
2535185029Spjd			if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
2536168404Spjd				/*
2537185029Spjd				 * If needfree is TRUE, our vm_lowmem hook
2538168404Spjd				 * was called; in that case we must free some
2539168404Spjd				 * memory, so switch to aggressive mode.
2540168404Spjd				 */
2541168404Spjd				arc_no_grow = TRUE;
2542168404Spjd				last_reclaim = ARC_RECLAIM_AGGR;
2543168404Spjd			}
2544168404Spjd			arc_kmem_reap_now(last_reclaim);
2545185029Spjd			arc_warm = B_TRUE;
2546185029Spjd
2547219089Spjd		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
2548168404Spjd			arc_no_grow = FALSE;
2549168404Spjd		}
2550168404Spjd
2551209275Smm		arc_adjust();
2552168404Spjd
2553168404Spjd		if (arc_eviction_list != NULL)
2554168404Spjd			arc_do_user_evicts();
2555168404Spjd
2556211762Savg#ifdef _KERNEL
2557211762Savg		if (needfree) {
2558185029Spjd			needfree = 0;
2559185029Spjd			wakeup(&needfree);
2560211762Savg		}
2561168404Spjd#endif
2562168404Spjd
2563168404Spjd		/* block until needed, or one second, whichever is shorter */
2564168404Spjd		CALLB_CPR_SAFE_BEGIN(&cpr);
2565168404Spjd		(void) cv_timedwait(&arc_reclaim_thr_cv,
2566168404Spjd		    &arc_reclaim_thr_lock, hz);
2567168404Spjd		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
2568168404Spjd	}
2569168404Spjd
2570168404Spjd	arc_thread_exit = 0;
2571168404Spjd	cv_broadcast(&arc_reclaim_thr_cv);
2572168404Spjd	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
2573168404Spjd	thread_exit();
2574168404Spjd}
2575168404Spjd
2576168404Spjd/*
2577168404Spjd * Adapt arc info given the number of bytes we are trying to add and
2578168404Spjd * the state that we are coming from.  This function is only called
2579168404Spjd * when we are adding new content to the cache.
2580168404Spjd */
2581168404Spjdstatic void
2582168404Spjdarc_adapt(int bytes, arc_state_t *state)
2583168404Spjd{
2584168404Spjd	int mult;
2585208373Smm	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
2586168404Spjd
2587185029Spjd	if (state == arc_l2c_only)
2588185029Spjd		return;
2589185029Spjd
2590168404Spjd	ASSERT(bytes > 0);
2591168404Spjd	/*
2592168404Spjd	 * Adapt the target size of the MRU list:
2593168404Spjd	 *	- if we just hit in the MRU ghost list, then increase
2594168404Spjd	 *	  the target size of the MRU list.
2595168404Spjd	 *	- if we just hit in the MFU ghost list, then increase
2596168404Spjd	 *	  the target size of the MFU list by decreasing the
2597168404Spjd	 *	  target size of the MRU list.
2598168404Spjd	 */
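	/*
	 * The step is scaled by the ratio of the two ghost list sizes
	 * (capped at 10x below); for example, if the MFU ghost list is
	 * about four times the size of the MRU ghost list, a hit in the
	 * MRU ghost list grows arc_p by roughly 4 * bytes.
	 */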
2599168404Spjd	if (state == arc_mru_ghost) {
2600168404Spjd		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
2601168404Spjd		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
2602209275Smm		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
2603168404Spjd
2604208373Smm		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
2605168404Spjd	} else if (state == arc_mfu_ghost) {
2606208373Smm		uint64_t delta;
2607208373Smm
2608168404Spjd		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
2609168404Spjd		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
2610209275Smm		mult = MIN(mult, 10);
2611168404Spjd
2612208373Smm		delta = MIN(bytes * mult, arc_p);
2613208373Smm		arc_p = MAX(arc_p_min, arc_p - delta);
2614168404Spjd	}
2615168404Spjd	ASSERT((int64_t)arc_p >= 0);
2616168404Spjd
2617168404Spjd	if (arc_reclaim_needed()) {
2618168404Spjd		cv_signal(&arc_reclaim_thr_cv);
2619168404Spjd		return;
2620168404Spjd	}
2621168404Spjd
2622168404Spjd	if (arc_no_grow)
2623168404Spjd		return;
2624168404Spjd
2625168404Spjd	if (arc_c >= arc_c_max)
2626168404Spjd		return;
2627168404Spjd
2628168404Spjd	/*
2629168404Spjd	 * If we're within (2 * maxblocksize) bytes of the target
2630168404Spjd	 * cache size, increment the target cache size
2631168404Spjd	 */
2632168404Spjd	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
2633168404Spjd		atomic_add_64(&arc_c, (int64_t)bytes);
2634168404Spjd		if (arc_c > arc_c_max)
2635168404Spjd			arc_c = arc_c_max;
2636168404Spjd		else if (state == arc_anon)
2637168404Spjd			atomic_add_64(&arc_p, (int64_t)bytes);
2638168404Spjd		if (arc_p > arc_c)
2639168404Spjd			arc_p = arc_c;
2640168404Spjd	}
2641168404Spjd	ASSERT((int64_t)arc_p >= 0);
2642168404Spjd}
2643168404Spjd
2644168404Spjd/*
2645168404Spjd * Check if the cache has reached its limits and eviction is required
2646168404Spjd * prior to insert.
2647168404Spjd */
2648168404Spjdstatic int
2649185029Spjdarc_evict_needed(arc_buf_contents_t type)
2650168404Spjd{
2651185029Spjd	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
2652185029Spjd		return (1);
2653185029Spjd
2654219089Spjd#ifdef sun
2655185029Spjd#ifdef _KERNEL
2656185029Spjd	/*
2657185029Spjd	 * If zio data pages are being allocated out of a separate heap segment,
2658185029Spjd	 * then enforce that the size of available vmem for this area remains
2659185029Spjd	 * above about 1/32nd free.
2660185029Spjd	 */
2661185029Spjd	if (type == ARC_BUFC_DATA && zio_arena != NULL &&
2662185029Spjd	    vmem_size(zio_arena, VMEM_FREE) <
2663185029Spjd	    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
2664185029Spjd		return (1);
2665185029Spjd#endif
2666219089Spjd#endif	/* sun */
2667185029Spjd
2668168404Spjd	if (arc_reclaim_needed())
2669168404Spjd		return (1);
2670168404Spjd
2671168404Spjd	return (arc_size > arc_c);
2672168404Spjd}
2673168404Spjd
2674168404Spjd/*
2675168404Spjd * The buffer, supplied as the first argument, needs a data block.
2676168404Spjd * So, if we are at cache max, determine which cache should be victimized.
2677168404Spjd * We have the following cases:
2678168404Spjd *
2679168404Spjd * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
2680168404Spjd * In this situation if we're out of space, but the resident size of the MFU is
2681168404Spjd * under the limit, victimize the MFU cache to satisfy this insertion request.
2682168404Spjd *
2683168404Spjd * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
2684168404Spjd * Here, we've used up all of the available space for the MRU, so we need to
2685168404Spjd * evict from our own cache instead.  Evict from the set of resident MRU
2686168404Spjd * entries.
2687168404Spjd *
2688168404Spjd * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
2689168404Spjd * c minus p represents the MFU space in the cache, since p is the size of the
2690168404Spjd * cache that is dedicated to the MRU.  In this situation there's still space on
2691168404Spjd * the MFU side, so the MRU side needs to be victimized.
2692168404Spjd *
2693168404Spjd * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
2694168404Spjd * MFU's resident set is consuming more space than it has been allotted.  In
2695168404Spjd * this situation, we must victimize our own cache, the MFU, for this insertion.
2696168404Spjd */
2697168404Spjdstatic void
2698168404Spjdarc_get_data_buf(arc_buf_t *buf)
2699168404Spjd{
2700168404Spjd	arc_state_t		*state = buf->b_hdr->b_state;
2701168404Spjd	uint64_t		size = buf->b_hdr->b_size;
2702168404Spjd	arc_buf_contents_t	type = buf->b_hdr->b_type;
2703168404Spjd
2704168404Spjd	arc_adapt(size, state);
2705168404Spjd
2706168404Spjd	/*
2707168404Spjd	 * We have not yet reached cache maximum size,
2708168404Spjd	 * just allocate a new buffer.
2709168404Spjd	 */
2710185029Spjd	if (!arc_evict_needed(type)) {
2711168404Spjd		if (type == ARC_BUFC_METADATA) {
2712168404Spjd			buf->b_data = zio_buf_alloc(size);
2713208373Smm			arc_space_consume(size, ARC_SPACE_DATA);
2714168404Spjd		} else {
2715168404Spjd			ASSERT(type == ARC_BUFC_DATA);
2716168404Spjd			buf->b_data = zio_data_buf_alloc(size);
2717208373Smm			ARCSTAT_INCR(arcstat_data_size, size);
2718185029Spjd			atomic_add_64(&arc_size, size);
2719168404Spjd		}
2720168404Spjd		goto out;
2721168404Spjd	}
2722168404Spjd
2723168404Spjd	/*
2724168404Spjd	 * If we are prefetching from the mfu ghost list, this buffer
2725168404Spjd	 * will end up on the mru list; so steal space from there.
2726168404Spjd	 */
2727168404Spjd	if (state == arc_mfu_ghost)
2728168404Spjd		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
2729168404Spjd	else if (state == arc_mru_ghost)
2730168404Spjd		state = arc_mru;
2731168404Spjd
2732168404Spjd	if (state == arc_mru || state == arc_anon) {
2733168404Spjd		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
2734208373Smm		state = (arc_mfu->arcs_lsize[type] >= size &&
2735185029Spjd		    arc_p > mru_used) ? arc_mfu : arc_mru;
2736168404Spjd	} else {
2737168404Spjd		/* MFU cases */
2738168404Spjd		uint64_t mfu_space = arc_c - arc_p;
2739208373Smm		state =  (arc_mru->arcs_lsize[type] >= size &&
2740185029Spjd		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
2741168404Spjd	}
2742209962Smm	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
2743168404Spjd		if (type == ARC_BUFC_METADATA) {
2744168404Spjd			buf->b_data = zio_buf_alloc(size);
2745208373Smm			arc_space_consume(size, ARC_SPACE_DATA);
2746168404Spjd		} else {
2747168404Spjd			ASSERT(type == ARC_BUFC_DATA);
2748168404Spjd			buf->b_data = zio_data_buf_alloc(size);
2749208373Smm			ARCSTAT_INCR(arcstat_data_size, size);
2750185029Spjd			atomic_add_64(&arc_size, size);
2751168404Spjd		}
2752168404Spjd		ARCSTAT_BUMP(arcstat_recycle_miss);
2753168404Spjd	}
2754168404Spjd	ASSERT(buf->b_data != NULL);
2755168404Spjdout:
2756168404Spjd	/*
2757168404Spjd	 * Update the state size.  Note that ghost states have a
2758168404Spjd	 * "ghost size" and so don't need to be updated.
2759168404Spjd	 */
2760168404Spjd	if (!GHOST_STATE(buf->b_hdr->b_state)) {
2761168404Spjd		arc_buf_hdr_t *hdr = buf->b_hdr;
2762168404Spjd
2763168404Spjd		atomic_add_64(&hdr->b_state->arcs_size, size);
2764168404Spjd		if (list_link_active(&hdr->b_arc_node)) {
2765168404Spjd			ASSERT(refcount_is_zero(&hdr->b_refcnt));
2766185029Spjd			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
2767168404Spjd		}
2768168404Spjd		/*
2769168404Spjd		 * If we are growing the cache, and we are adding anonymous
2770168404Spjd		 * data, and we have outgrown arc_p, update arc_p
2771168404Spjd		 */
2772168404Spjd		if (arc_size < arc_c && hdr->b_state == arc_anon &&
2773168404Spjd		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
2774168404Spjd			arc_p = MIN(arc_c, arc_p + size);
2775168404Spjd	}
2776205231Skmacy	ARCSTAT_BUMP(arcstat_allocated);
2777168404Spjd}
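
/*
 * Illustrative sketch (not part of the original source): the four
 * victim-selection cases described above arc_get_data_buf(), restated as
 * standalone C.  The demo_* names are hypothetical, and the real code
 * additionally requires the chosen list to hold enough evictable bytes
 * of the requested type before it is used.
 */
#if 0
typedef enum { DEMO_MRU, DEMO_MFU } demo_state_t;

static demo_state_t
demo_pick_victim(demo_state_t insert_for, uint64_t c, uint64_t p,
    uint64_t anon_plus_mru_size, uint64_t mfu_size)
{
	if (insert_for == DEMO_MRU) {
		/* cases 1 and 2: is there still room on the MRU side? */
		return (p > anon_plus_mru_size ? DEMO_MFU : DEMO_MRU);
	}
	/* cases 3 and 4: (c - p) is the space dedicated to the MFU side */
	return (c - p > mfu_size ? DEMO_MRU : DEMO_MFU);
}
#endif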
2778168404Spjd
2779168404Spjd/*
2780168404Spjd * This routine is called whenever a buffer is accessed.
2781168404Spjd * NOTE: the hash lock is dropped in this function.
2782168404Spjd */
2783168404Spjdstatic void
2784168404Spjdarc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
2785168404Spjd{
2786219089Spjd	clock_t now;
2787219089Spjd
2788168404Spjd	ASSERT(MUTEX_HELD(hash_lock));
2789168404Spjd
2790168404Spjd	if (buf->b_state == arc_anon) {
2791168404Spjd		/*
2792168404Spjd		 * This buffer is not in the cache, and does not
2793168404Spjd		 * appear in our "ghost" list.  Add the new buffer
2794168404Spjd		 * to the MRU state.
2795168404Spjd		 */
2796168404Spjd
2797168404Spjd		ASSERT(buf->b_arc_access == 0);
2798219089Spjd		buf->b_arc_access = ddi_get_lbolt();
2799168404Spjd		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2800168404Spjd		arc_change_state(arc_mru, buf, hash_lock);
2801168404Spjd
2802168404Spjd	} else if (buf->b_state == arc_mru) {
2803219089Spjd		now = ddi_get_lbolt();
2804219089Spjd
2805168404Spjd		/*
2806168404Spjd		 * If this buffer is here because of a prefetch, then either:
2807168404Spjd		 * - clear the flag if this is a "referencing" read
2808168404Spjd		 *   (any subsequent access will bump this into the MFU state).
2809168404Spjd		 * or
2810168404Spjd		 * - move the buffer to the head of the list if this is
2811168404Spjd		 *   another prefetch (to make it less likely to be evicted).
2812168404Spjd		 */
2813168404Spjd		if ((buf->b_flags & ARC_PREFETCH) != 0) {
2814168404Spjd			if (refcount_count(&buf->b_refcnt) == 0) {
2815168404Spjd				ASSERT(list_link_active(&buf->b_arc_node));
2816168404Spjd			} else {
2817168404Spjd				buf->b_flags &= ~ARC_PREFETCH;
2818168404Spjd				ARCSTAT_BUMP(arcstat_mru_hits);
2819168404Spjd			}
2820219089Spjd			buf->b_arc_access = now;
2821168404Spjd			return;
2822168404Spjd		}
2823168404Spjd
2824168404Spjd		/*
2825168404Spjd		 * This buffer has been "accessed" only once so far,
2826168404Spjd		 * but it is still in the cache. Move it to the MFU
2827168404Spjd		 * state.
2828168404Spjd		 */
2829219089Spjd		if (now > buf->b_arc_access + ARC_MINTIME) {
2830168404Spjd			/*
2831168404Spjd			 * More than 125ms have passed since we
2832168404Spjd			 * instantiated this buffer.  Move it to the
2833168404Spjd			 * most frequently used state.
2834168404Spjd			 */
2835219089Spjd			buf->b_arc_access = now;
2836168404Spjd			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2837168404Spjd			arc_change_state(arc_mfu, buf, hash_lock);
2838168404Spjd		}
2839168404Spjd		ARCSTAT_BUMP(arcstat_mru_hits);
2840168404Spjd	} else if (buf->b_state == arc_mru_ghost) {
2841168404Spjd		arc_state_t	*new_state;
2842168404Spjd		/*
2843168404Spjd		 * This buffer has been "accessed" recently, but
2844168404Spjd		 * was evicted from the cache.  Move it to the
2845168404Spjd		 * MFU state.
2846168404Spjd		 */
2847168404Spjd
2848168404Spjd		if (buf->b_flags & ARC_PREFETCH) {
2849168404Spjd			new_state = arc_mru;
2850168404Spjd			if (refcount_count(&buf->b_refcnt) > 0)
2851168404Spjd				buf->b_flags &= ~ARC_PREFETCH;
2852168404Spjd			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
2853168404Spjd		} else {
2854168404Spjd			new_state = arc_mfu;
2855168404Spjd			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2856168404Spjd		}
2857168404Spjd
2858219089Spjd		buf->b_arc_access = ddi_get_lbolt();
2859168404Spjd		arc_change_state(new_state, buf, hash_lock);
2860168404Spjd
2861168404Spjd		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
2862168404Spjd	} else if (buf->b_state == arc_mfu) {
2863168404Spjd		/*
2864168404Spjd		 * This buffer has been accessed more than once and is
2865168404Spjd		 * still in the cache.  Keep it in the MFU state.
2866168404Spjd		 *
2867168404Spjd		 * NOTE: an add_reference() that occurred when we did
2868168404Spjd		 * the arc_read() will have kicked this off the list.
2869168404Spjd		 * If it was a prefetch, we will explicitly move it to
2870168404Spjd		 * the head of the list now.
2871168404Spjd		 */
2872168404Spjd		if ((buf->b_flags & ARC_PREFETCH) != 0) {
2873168404Spjd			ASSERT(refcount_count(&buf->b_refcnt) == 0);
2874168404Spjd			ASSERT(list_link_active(&buf->b_arc_node));
2875168404Spjd		}
2876168404Spjd		ARCSTAT_BUMP(arcstat_mfu_hits);
2877219089Spjd		buf->b_arc_access = ddi_get_lbolt();
2878168404Spjd	} else if (buf->b_state == arc_mfu_ghost) {
2879168404Spjd		arc_state_t	*new_state = arc_mfu;
2880168404Spjd		/*
2881168404Spjd		 * This buffer has been accessed more than once but has
2882168404Spjd		 * been evicted from the cache.  Move it back to the
2883168404Spjd		 * MFU state.
2884168404Spjd		 */
2885168404Spjd
2886168404Spjd		if (buf->b_flags & ARC_PREFETCH) {
2887168404Spjd			/*
2888168404Spjd			 * This is a prefetch access...
2889168404Spjd			 * move this block back to the MRU state.
2890168404Spjd			 */
2891240415Smm			ASSERT0(refcount_count(&buf->b_refcnt));
2892168404Spjd			new_state = arc_mru;
2893168404Spjd		}
2894168404Spjd
2895219089Spjd		buf->b_arc_access = ddi_get_lbolt();
2896168404Spjd		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2897168404Spjd		arc_change_state(new_state, buf, hash_lock);
2898168404Spjd
2899168404Spjd		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
2900185029Spjd	} else if (buf->b_state == arc_l2c_only) {
2901185029Spjd		/*
2902185029Spjd		 * This buffer is on the 2nd Level ARC.
2903185029Spjd		 */
2904185029Spjd
2905219089Spjd		buf->b_arc_access = ddi_get_lbolt();
2906185029Spjd		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
2907185029Spjd		arc_change_state(arc_mfu, buf, hash_lock);
2908168404Spjd	} else {
2909168404Spjd		ASSERT(!"invalid arc state");
2910168404Spjd	}
2911168404Spjd}
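
/*
 * Summary of the transitions implemented above (illustrative, derived from
 * the code; not part of the original source):
 *
 *	current state	access type			new state
 *	-------------	-----------			---------
 *	anon		any				mru
 *	mru		demand, > ARC_MINTIME old	mfu
 *	mru		demand, recent or prefetch	mru (stays)
 *	mru_ghost	demand				mfu
 *	mru_ghost	prefetch			mru
 *	mfu		any				mfu (stays)
 *	mfu_ghost	demand				mfu
 *	mfu_ghost	prefetch			mru
 *	l2c_only	any				mfu
 */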
2912168404Spjd
2913168404Spjd/* a generic arc_done_func_t which you can use */
2914168404Spjd/* ARGSUSED */
2915168404Spjdvoid
2916168404Spjdarc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
2917168404Spjd{
2918219089Spjd	if (zio == NULL || zio->io_error == 0)
2919219089Spjd		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
2920248571Smm	VERIFY(arc_buf_remove_ref(buf, arg));
2921168404Spjd}
2922168404Spjd
2923185029Spjd/* a generic arc_done_func_t */
2924168404Spjdvoid
2925168404Spjdarc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
2926168404Spjd{
2927168404Spjd	arc_buf_t **bufp = arg;
2928168404Spjd	if (zio && zio->io_error) {
2929248571Smm		VERIFY(arc_buf_remove_ref(buf, arg));
2930168404Spjd		*bufp = NULL;
2931168404Spjd	} else {
2932168404Spjd		*bufp = buf;
2933219089Spjd		ASSERT(buf->b_data);
2934168404Spjd	}
2935168404Spjd}
2936168404Spjd
2937168404Spjdstatic void
2938168404Spjdarc_read_done(zio_t *zio)
2939168404Spjd{
2940168404Spjd	arc_buf_hdr_t	*hdr, *found;
2941168404Spjd	arc_buf_t	*buf;
2942168404Spjd	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
2943168404Spjd	kmutex_t	*hash_lock;
2944168404Spjd	arc_callback_t	*callback_list, *acb;
2945168404Spjd	int		freeable = FALSE;
2946168404Spjd
2947168404Spjd	buf = zio->io_private;
2948168404Spjd	hdr = buf->b_hdr;
2949168404Spjd
2950168404Spjd	/*
2951168404Spjd	 * The hdr was inserted into hash-table and removed from lists
2952168404Spjd	 * prior to starting I/O.  We should find this header, since
2953168404Spjd	 * it's in the hash table, and it should be legit since it's
2954168404Spjd	 * not possible to evict it during the I/O.  The only possible
2955168404Spjd	 * reason for it not to be found is if we were freed during the
2956168404Spjd	 * read.
2957168404Spjd	 */
2958209962Smm	found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
2959168404Spjd	    &hash_lock);
2960168404Spjd
2961168404Spjd	ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
2962185029Spjd	    (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
2963185029Spjd	    (found == hdr && HDR_L2_READING(hdr)));
2964168404Spjd
2965185029Spjd	hdr->b_flags &= ~ARC_L2_EVICTED;
2966185029Spjd	if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
2967185029Spjd		hdr->b_flags &= ~ARC_L2CACHE;
2968206796Spjd
2969168404Spjd	/* byteswap if necessary */
2970168404Spjd	callback_list = hdr->b_acb;
2971168404Spjd	ASSERT(callback_list != NULL);
2972209101Smm	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
2973236884Smm		dmu_object_byteswap_t bswap =
2974236884Smm		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
2975185029Spjd		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
2976185029Spjd		    byteswap_uint64_array :
2977236884Smm		    dmu_ot_byteswap[bswap].ob_func;
2978185029Spjd		func(buf->b_data, hdr->b_size);
2979185029Spjd	}
2980168404Spjd
2981185029Spjd	arc_cksum_compute(buf, B_FALSE);
2982240133Smm#ifdef illumos
2983240133Smm	arc_buf_watch(buf);
2984240133Smm#endif /* illumos */
2985168404Spjd
2986219089Spjd	if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
2987219089Spjd		/*
2988219089Spjd		 * Only call arc_access on anonymous buffers.  This is because
2989219089Spjd		 * if we've issued an I/O for an evicted buffer, we've already
2990219089Spjd		 * called arc_access (to prevent any simultaneous readers from
2991219089Spjd		 * getting confused).
2992219089Spjd		 */
2993219089Spjd		arc_access(hdr, hash_lock);
2994219089Spjd	}
2995219089Spjd
2996168404Spjd	/* create copies of the data buffer for the callers */
2997168404Spjd	abuf = buf;
2998168404Spjd	for (acb = callback_list; acb; acb = acb->acb_next) {
2999168404Spjd		if (acb->acb_done) {
3000242845Sdelphij			if (abuf == NULL) {
3001242845Sdelphij				ARCSTAT_BUMP(arcstat_duplicate_reads);
3002168404Spjd				abuf = arc_buf_clone(buf);
3003242845Sdelphij			}
3004168404Spjd			acb->acb_buf = abuf;
3005168404Spjd			abuf = NULL;
3006168404Spjd		}
3007168404Spjd	}
3008168404Spjd	hdr->b_acb = NULL;
3009168404Spjd	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3010168404Spjd	ASSERT(!HDR_BUF_AVAILABLE(hdr));
3011219089Spjd	if (abuf == buf) {
3012219089Spjd		ASSERT(buf->b_efunc == NULL);
3013219089Spjd		ASSERT(hdr->b_datacnt == 1);
3014168404Spjd		hdr->b_flags |= ARC_BUF_AVAILABLE;
3015219089Spjd	}
3016168404Spjd
3017168404Spjd	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
3018168404Spjd
3019168404Spjd	if (zio->io_error != 0) {
3020168404Spjd		hdr->b_flags |= ARC_IO_ERROR;
3021168404Spjd		if (hdr->b_state != arc_anon)
3022168404Spjd			arc_change_state(arc_anon, hdr, hash_lock);
3023168404Spjd		if (HDR_IN_HASH_TABLE(hdr))
3024168404Spjd			buf_hash_remove(hdr);
3025168404Spjd		freeable = refcount_is_zero(&hdr->b_refcnt);
3026168404Spjd	}
3027168404Spjd
3028168404Spjd	/*
3029168404Spjd	 * Broadcast before we drop the hash_lock to avoid the possibility
3030168404Spjd	 * that the hdr (and hence the cv) might be freed before we get to
3031168404Spjd	 * the cv_broadcast().
3032168404Spjd	 */
3033168404Spjd	cv_broadcast(&hdr->b_cv);
3034168404Spjd
3035168404Spjd	if (hash_lock) {
3036168404Spjd		mutex_exit(hash_lock);
3037168404Spjd	} else {
3038168404Spjd		/*
3039168404Spjd		 * This block was freed while we waited for the read to
3040168404Spjd		 * complete.  It has been removed from the hash table and
3041168404Spjd		 * moved to the anonymous state (so that it won't show up
3042168404Spjd		 * in the cache).
3043168404Spjd		 */
3044168404Spjd		ASSERT3P(hdr->b_state, ==, arc_anon);
3045168404Spjd		freeable = refcount_is_zero(&hdr->b_refcnt);
3046168404Spjd	}
3047168404Spjd
3048168404Spjd	/* execute each callback and free its structure */
3049168404Spjd	while ((acb = callback_list) != NULL) {
3050168404Spjd		if (acb->acb_done)
3051168404Spjd			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3052168404Spjd
3053168404Spjd		if (acb->acb_zio_dummy != NULL) {
3054168404Spjd			acb->acb_zio_dummy->io_error = zio->io_error;
3055168404Spjd			zio_nowait(acb->acb_zio_dummy);
3056168404Spjd		}
3057168404Spjd
3058168404Spjd		callback_list = acb->acb_next;
3059168404Spjd		kmem_free(acb, sizeof (arc_callback_t));
3060168404Spjd	}
3061168404Spjd
3062168404Spjd	if (freeable)
3063168404Spjd		arc_hdr_destroy(hdr);
3064168404Spjd}
3065168404Spjd
3066168404Spjd/*
3067168404Spjd * "Read" the block block at the specified DVA (in bp) via the
3068168404Spjd * cache.  If the block is found in the cache, invoke the provided
3069168404Spjd * callback immediately and return.  Note that the `zio' parameter
3070168404Spjd * in the callback will be NULL in this case, since no IO was
3071168404Spjd * required.  If the block is not in the cache pass the read request
3072168404Spjd * on to the spa with a substitute callback function, so that the
3073168404Spjd * requested block will be added to the cache.
3074168404Spjd *
3075168404Spjd * If a read request arrives for a block that has a read in-progress,
3076168404Spjd * either wait for the in-progress read to complete (and return the
3077168404Spjd * results); or, if this is a read with a "done" func, add a record
3078168404Spjd * to the read to invoke the "done" func when the read completes,
3079168404Spjd * and return; or just return.
3080168404Spjd *
3081168404Spjd * arc_read_done() will invoke all the requested "done" functions
3082168404Spjd * for readers of this block.
3083168404Spjd */
3084168404Spjdint
3085246666Smmarc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3086246666Smm    void *private, int priority, int zio_flags, uint32_t *arc_flags,
3087246666Smm    const zbookmark_t *zb)
3088168404Spjd{
3089168404Spjd	arc_buf_hdr_t *hdr;
3090247187Smm	arc_buf_t *buf = NULL;
3091168404Spjd	kmutex_t *hash_lock;
3092185029Spjd	zio_t *rzio;
3093228103Smm	uint64_t guid = spa_load_guid(spa);
3094168404Spjd
3095168404Spjdtop:
3096219089Spjd	hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3097219089Spjd	    &hash_lock);
3098168404Spjd	if (hdr && hdr->b_datacnt > 0) {
3099168404Spjd
3100168404Spjd		*arc_flags |= ARC_CACHED;
3101168404Spjd
3102168404Spjd		if (HDR_IO_IN_PROGRESS(hdr)) {
3103168404Spjd
3104168404Spjd			if (*arc_flags & ARC_WAIT) {
3105168404Spjd				cv_wait(&hdr->b_cv, hash_lock);
3106168404Spjd				mutex_exit(hash_lock);
3107168404Spjd				goto top;
3108168404Spjd			}
3109168404Spjd			ASSERT(*arc_flags & ARC_NOWAIT);
3110168404Spjd
3111168404Spjd			if (done) {
3112168404Spjd				arc_callback_t	*acb = NULL;
3113168404Spjd
3114168404Spjd				acb = kmem_zalloc(sizeof (arc_callback_t),
3115168404Spjd				    KM_SLEEP);
3116168404Spjd				acb->acb_done = done;
3117168404Spjd				acb->acb_private = private;
3118168404Spjd				if (pio != NULL)
3119168404Spjd					acb->acb_zio_dummy = zio_null(pio,
3120209962Smm					    spa, NULL, NULL, NULL, zio_flags);
3121168404Spjd
3122168404Spjd				ASSERT(acb->acb_done != NULL);
3123168404Spjd				acb->acb_next = hdr->b_acb;
3124168404Spjd				hdr->b_acb = acb;
3125168404Spjd				add_reference(hdr, hash_lock, private);
3126168404Spjd				mutex_exit(hash_lock);
3127168404Spjd				return (0);
3128168404Spjd			}
3129168404Spjd			mutex_exit(hash_lock);
3130168404Spjd			return (0);
3131168404Spjd		}
3132168404Spjd
3133168404Spjd		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3134168404Spjd
3135168404Spjd		if (done) {
3136168404Spjd			add_reference(hdr, hash_lock, private);
3137168404Spjd			/*
3138168404Spjd			 * If this block is already in use, create a new
3139168404Spjd			 * copy of the data so that we will be guaranteed
3140168404Spjd			 * that arc_release() will always succeed.
3141168404Spjd			 */
3142168404Spjd			buf = hdr->b_buf;
3143168404Spjd			ASSERT(buf);
3144168404Spjd			ASSERT(buf->b_data);
3145168404Spjd			if (HDR_BUF_AVAILABLE(hdr)) {
3146168404Spjd				ASSERT(buf->b_efunc == NULL);
3147168404Spjd				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3148168404Spjd			} else {
3149168404Spjd				buf = arc_buf_clone(buf);
3150168404Spjd			}
3151219089Spjd
3152168404Spjd		} else if (*arc_flags & ARC_PREFETCH &&
3153168404Spjd		    refcount_count(&hdr->b_refcnt) == 0) {
3154168404Spjd			hdr->b_flags |= ARC_PREFETCH;
3155168404Spjd		}
3156168404Spjd		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3157168404Spjd		arc_access(hdr, hash_lock);
3158185029Spjd		if (*arc_flags & ARC_L2CACHE)
3159185029Spjd			hdr->b_flags |= ARC_L2CACHE;
3160251478Sdelphij		if (*arc_flags & ARC_L2COMPRESS)
3161251478Sdelphij			hdr->b_flags |= ARC_L2COMPRESS;
3162168404Spjd		mutex_exit(hash_lock);
3163168404Spjd		ARCSTAT_BUMP(arcstat_hits);
3164168404Spjd		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3165168404Spjd		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3166168404Spjd		    data, metadata, hits);
3167168404Spjd
3168168404Spjd		if (done)
3169168404Spjd			done(NULL, buf, private);
3170168404Spjd	} else {
3171168404Spjd		uint64_t size = BP_GET_LSIZE(bp);
3172168404Spjd		arc_callback_t	*acb;
3173185029Spjd		vdev_t *vd = NULL;
3174247187Smm		uint64_t addr = 0;
3175208373Smm		boolean_t devw = B_FALSE;
3176168404Spjd
3177168404Spjd		if (hdr == NULL) {
3178168404Spjd			/* this block is not in the cache */
3179168404Spjd			arc_buf_hdr_t	*exists;
3180168404Spjd			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3181168404Spjd			buf = arc_buf_alloc(spa, size, private, type);
3182168404Spjd			hdr = buf->b_hdr;
3183168404Spjd			hdr->b_dva = *BP_IDENTITY(bp);
3184219089Spjd			hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3185168404Spjd			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
3186168404Spjd			exists = buf_hash_insert(hdr, &hash_lock);
3187168404Spjd			if (exists) {
3188168404Spjd				/* somebody beat us to the hash insert */
3189168404Spjd				mutex_exit(hash_lock);
3190219089Spjd				buf_discard_identity(hdr);
3191168404Spjd				(void) arc_buf_remove_ref(buf, private);
3192168404Spjd				goto top; /* restart the IO request */
3193168404Spjd			}
3194168404Spjd			/* if this is a prefetch, we don't have a reference */
3195168404Spjd			if (*arc_flags & ARC_PREFETCH) {
3196168404Spjd				(void) remove_reference(hdr, hash_lock,
3197168404Spjd				    private);
3198168404Spjd				hdr->b_flags |= ARC_PREFETCH;
3199168404Spjd			}
3200185029Spjd			if (*arc_flags & ARC_L2CACHE)
3201185029Spjd				hdr->b_flags |= ARC_L2CACHE;
3202251478Sdelphij			if (*arc_flags & ARC_L2COMPRESS)
3203251478Sdelphij				hdr->b_flags |= ARC_L2COMPRESS;
3204168404Spjd			if (BP_GET_LEVEL(bp) > 0)
3205168404Spjd				hdr->b_flags |= ARC_INDIRECT;
3206168404Spjd		} else {
3207168404Spjd			/* this block is in the ghost cache */
3208168404Spjd			ASSERT(GHOST_STATE(hdr->b_state));
3209168404Spjd			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3210240415Smm			ASSERT0(refcount_count(&hdr->b_refcnt));
3211168404Spjd			ASSERT(hdr->b_buf == NULL);
3212168404Spjd
3213168404Spjd			/* if this is a prefetch, we don't have a reference */
3214168404Spjd			if (*arc_flags & ARC_PREFETCH)
3215168404Spjd				hdr->b_flags |= ARC_PREFETCH;
3216168404Spjd			else
3217168404Spjd				add_reference(hdr, hash_lock, private);
3218185029Spjd			if (*arc_flags & ARC_L2CACHE)
3219185029Spjd				hdr->b_flags |= ARC_L2CACHE;
3220251478Sdelphij			if (*arc_flags & ARC_L2COMPRESS)
3221251478Sdelphij				hdr->b_flags |= ARC_L2COMPRESS;
3222185029Spjd			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3223168404Spjd			buf->b_hdr = hdr;
3224168404Spjd			buf->b_data = NULL;
3225168404Spjd			buf->b_efunc = NULL;
3226168404Spjd			buf->b_private = NULL;
3227168404Spjd			buf->b_next = NULL;
3228168404Spjd			hdr->b_buf = buf;
3229168404Spjd			ASSERT(hdr->b_datacnt == 0);
3230168404Spjd			hdr->b_datacnt = 1;
3231219089Spjd			arc_get_data_buf(buf);
3232219089Spjd			arc_access(hdr, hash_lock);
3233168404Spjd		}
3234168404Spjd
3235219089Spjd		ASSERT(!GHOST_STATE(hdr->b_state));
3236219089Spjd
3237168404Spjd		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3238168404Spjd		acb->acb_done = done;
3239168404Spjd		acb->acb_private = private;
3240168404Spjd
3241168404Spjd		ASSERT(hdr->b_acb == NULL);
3242168404Spjd		hdr->b_acb = acb;
3243168404Spjd		hdr->b_flags |= ARC_IO_IN_PROGRESS;
3244168404Spjd
3245185029Spjd		if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
3246185029Spjd		    (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
3247208373Smm			devw = hdr->b_l2hdr->b_dev->l2ad_writing;
3248185029Spjd			addr = hdr->b_l2hdr->b_daddr;
3249185029Spjd			/*
3250185029Spjd			 * Lock out device removal.
3251185029Spjd			 */
3252185029Spjd			if (vdev_is_dead(vd) ||
3253185029Spjd			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3254185029Spjd				vd = NULL;
3255185029Spjd		}
3256185029Spjd
3257168404Spjd		mutex_exit(hash_lock);
3258168404Spjd
3259251629Sdelphij		/*
3260251629Sdelphij		 * At this point, we have a level 1 cache miss.  Try again in
3261251629Sdelphij		 * L2ARC if possible.
3262251629Sdelphij		 */
3263168404Spjd		ASSERT3U(hdr->b_size, ==, size);
3264219089Spjd		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3265219089Spjd		    uint64_t, size, zbookmark_t *, zb);
3266168404Spjd		ARCSTAT_BUMP(arcstat_misses);
3267168404Spjd		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
3268168404Spjd		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
3269168404Spjd		    data, metadata, misses);
3270228392Spjd#ifdef _KERNEL
3271228392Spjd		curthread->td_ru.ru_inblock++;
3272228392Spjd#endif
3273168404Spjd
3274208373Smm		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3275185029Spjd			/*
3276185029Spjd			 * Read from the L2ARC if the following are true:
3277185029Spjd			 * 1. The L2ARC vdev was previously cached.
3278185029Spjd			 * 2. This buffer still has L2ARC metadata.
3279185029Spjd			 * 3. This buffer isn't currently writing to the L2ARC.
3280185029Spjd			 * 4. The L2ARC entry wasn't evicted, which may
3281185029Spjd			 *    also have invalidated the vdev.
3282208373Smm			 * 5. This isn't a prefetch while l2arc_noprefetch is set.
3283185029Spjd			 */
3284185029Spjd			if (hdr->b_l2hdr != NULL &&
3285208373Smm			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3286208373Smm			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3287185029Spjd				l2arc_read_callback_t *cb;
3288185029Spjd
3289185029Spjd				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3290185029Spjd				ARCSTAT_BUMP(arcstat_l2_hits);
3291185029Spjd
3292185029Spjd				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3293185029Spjd				    KM_SLEEP);
3294185029Spjd				cb->l2rcb_buf = buf;
3295185029Spjd				cb->l2rcb_spa = spa;
3296185029Spjd				cb->l2rcb_bp = *bp;
3297185029Spjd				cb->l2rcb_zb = *zb;
3298185029Spjd				cb->l2rcb_flags = zio_flags;
3299251478Sdelphij				cb->l2rcb_compress = hdr->b_l2hdr->b_compress;
3300185029Spjd
3301247187Smm				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3302247187Smm				    addr + size < vd->vdev_psize -
3303247187Smm				    VDEV_LABEL_END_SIZE);
3304247187Smm
3305185029Spjd				/*
3306185029Spjd				 * l2arc read.  The SCL_L2ARC lock will be
3307185029Spjd				 * released by l2arc_read_done().
3308251478Sdelphij				 * Issue a null zio if the underlying buffer
3309251478Sdelphij				 * was squashed to zero size by compression.
3310185029Spjd				 */
3311251478Sdelphij				if (hdr->b_l2hdr->b_compress ==
3312251478Sdelphij				    ZIO_COMPRESS_EMPTY) {
3313251478Sdelphij					rzio = zio_null(pio, spa, vd,
3314251478Sdelphij					    l2arc_read_done, cb,
3315251478Sdelphij					    zio_flags | ZIO_FLAG_DONT_CACHE |
3316251478Sdelphij					    ZIO_FLAG_CANFAIL |
3317251478Sdelphij					    ZIO_FLAG_DONT_PROPAGATE |
3318251478Sdelphij					    ZIO_FLAG_DONT_RETRY);
3319251478Sdelphij				} else {
3320251478Sdelphij					rzio = zio_read_phys(pio, vd, addr,
3321251478Sdelphij					    hdr->b_l2hdr->b_asize,
3322251478Sdelphij					    buf->b_data, ZIO_CHECKSUM_OFF,
3323251478Sdelphij					    l2arc_read_done, cb, priority,
3324251478Sdelphij					    zio_flags | ZIO_FLAG_DONT_CACHE |
3325251478Sdelphij					    ZIO_FLAG_CANFAIL |
3326251478Sdelphij					    ZIO_FLAG_DONT_PROPAGATE |
3327251478Sdelphij					    ZIO_FLAG_DONT_RETRY, B_FALSE);
3328251478Sdelphij				}
3329185029Spjd				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3330185029Spjd				    zio_t *, rzio);
3331251478Sdelphij				ARCSTAT_INCR(arcstat_l2_read_bytes,
3332251478Sdelphij				    hdr->b_l2hdr->b_asize);
3333185029Spjd
3334185029Spjd				if (*arc_flags & ARC_NOWAIT) {
3335185029Spjd					zio_nowait(rzio);
3336185029Spjd					return (0);
3337185029Spjd				}
3338185029Spjd
3339185029Spjd				ASSERT(*arc_flags & ARC_WAIT);
3340185029Spjd				if (zio_wait(rzio) == 0)
3341185029Spjd					return (0);
3342185029Spjd
3343185029Spjd				/* l2arc read error; goto zio_read() */
3344185029Spjd			} else {
3345185029Spjd				DTRACE_PROBE1(l2arc__miss,
3346185029Spjd				    arc_buf_hdr_t *, hdr);
3347185029Spjd				ARCSTAT_BUMP(arcstat_l2_misses);
3348185029Spjd				if (HDR_L2_WRITING(hdr))
3349185029Spjd					ARCSTAT_BUMP(arcstat_l2_rw_clash);
3350185029Spjd				spa_config_exit(spa, SCL_L2ARC, vd);
3351185029Spjd			}
3352208373Smm		} else {
3353208373Smm			if (vd != NULL)
3354208373Smm				spa_config_exit(spa, SCL_L2ARC, vd);
3355208373Smm			if (l2arc_ndev != 0) {
3356208373Smm				DTRACE_PROBE1(l2arc__miss,
3357208373Smm				    arc_buf_hdr_t *, hdr);
3358208373Smm				ARCSTAT_BUMP(arcstat_l2_misses);
3359208373Smm			}
3360185029Spjd		}
3361185029Spjd
3362168404Spjd		rzio = zio_read(pio, spa, bp, buf->b_data, size,
3363185029Spjd		    arc_read_done, buf, priority, zio_flags, zb);
3364168404Spjd
3365168404Spjd		if (*arc_flags & ARC_WAIT)
3366168404Spjd			return (zio_wait(rzio));
3367168404Spjd
3368168404Spjd		ASSERT(*arc_flags & ARC_NOWAIT);
3369168404Spjd		zio_nowait(rzio);
3370168404Spjd	}
3371168404Spjd	return (0);
3372168404Spjd}
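
/*
 * Illustrative sketch (not part of the original source): how a caller
 * typically drives the arc_read() interface described in the block comment
 * above it.  demo_read_block_sync() is hypothetical; arc_getbuf_func() is
 * the generic done callback defined earlier in this file.
 */
#if 0
static int
demo_read_block_sync(spa_t *spa, const blkptr_t *bp, const zbookmark_t *zb)
{
	arc_buf_t *abuf = NULL;
	uint32_t aflags = ARC_WAIT;
	int err;

	err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
	if (err == 0 && abuf != NULL) {
		/* ... consume abuf->b_data ... */
		(void) arc_buf_remove_ref(abuf, &abuf);
	}
	return (err);
}
#endif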
3373168404Spjd
3374168404Spjdvoid
3375168404Spjdarc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3376168404Spjd{
3377168404Spjd	ASSERT(buf->b_hdr != NULL);
3378168404Spjd	ASSERT(buf->b_hdr->b_state != arc_anon);
3379168404Spjd	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
3380219089Spjd	ASSERT(buf->b_efunc == NULL);
3381219089Spjd	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3382219089Spjd
3383168404Spjd	buf->b_efunc = func;
3384168404Spjd	buf->b_private = private;
3385168404Spjd}
3386168404Spjd
3387168404Spjd/*
3388251520Sdelphij * Notify the arc that a block was freed, and thus will never be used again.
3389251520Sdelphij */
3390251520Sdelphijvoid
3391251520Sdelphijarc_freed(spa_t *spa, const blkptr_t *bp)
3392251520Sdelphij{
3393251520Sdelphij	arc_buf_hdr_t *hdr;
3394251520Sdelphij	kmutex_t *hash_lock;
3395251520Sdelphij	uint64_t guid = spa_load_guid(spa);
3396251520Sdelphij
3397251520Sdelphij	hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
3398251520Sdelphij	    &hash_lock);
3399251520Sdelphij	if (hdr == NULL)
3400251520Sdelphij		return;
3401251520Sdelphij	if (HDR_BUF_AVAILABLE(hdr)) {
3402251520Sdelphij		arc_buf_t *buf = hdr->b_buf;
3403251520Sdelphij		add_reference(hdr, hash_lock, FTAG);
3404251520Sdelphij		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3405251520Sdelphij		mutex_exit(hash_lock);
3406251520Sdelphij
3407251520Sdelphij		arc_release(buf, FTAG);
3408251520Sdelphij		(void) arc_buf_remove_ref(buf, FTAG);
3409251520Sdelphij	} else {
3410251520Sdelphij		mutex_exit(hash_lock);
3411251520Sdelphij	}
3412251520Sdelphij
3413251520Sdelphij}
3414251520Sdelphij
3415251520Sdelphij/*
3416168404Spjd * This is used by the DMU to let the ARC know that a buffer is
3417168404Spjd * being evicted, so the ARC should clean up.  If this arc buf
3418168404Spjd * is not yet in the evicted state, it will be put there.
3419168404Spjd */
3420168404Spjdint
3421168404Spjdarc_buf_evict(arc_buf_t *buf)
3422168404Spjd{
3423168404Spjd	arc_buf_hdr_t *hdr;
3424168404Spjd	kmutex_t *hash_lock;
3425168404Spjd	arc_buf_t **bufp;
3426205231Skmacy	list_t *list, *evicted_list;
3427205231Skmacy	kmutex_t *lock, *evicted_lock;
3428206796Spjd
3429219089Spjd	mutex_enter(&buf->b_evict_lock);
3430168404Spjd	hdr = buf->b_hdr;
3431168404Spjd	if (hdr == NULL) {
3432168404Spjd		/*
3433168404Spjd		 * We are in arc_do_user_evicts().
3434168404Spjd		 */
3435168404Spjd		ASSERT(buf->b_data == NULL);
3436219089Spjd		mutex_exit(&buf->b_evict_lock);
3437168404Spjd		return (0);
3438185029Spjd	} else if (buf->b_data == NULL) {
3439185029Spjd		arc_buf_t copy = *buf; /* structure assignment */
3440185029Spjd		/*
3441185029Spjd		 * We are on the eviction list; process this buffer now
3442185029Spjd		 * but let arc_do_user_evicts() do the reaping.
3443185029Spjd		 */
3444185029Spjd		buf->b_efunc = NULL;
3445219089Spjd		mutex_exit(&buf->b_evict_lock);
3446185029Spjd		VERIFY(copy.b_efunc(&copy) == 0);
3447185029Spjd		return (1);
3448168404Spjd	}
3449168404Spjd	hash_lock = HDR_LOCK(hdr);
3450168404Spjd	mutex_enter(hash_lock);
3451219089Spjd	hdr = buf->b_hdr;
3452219089Spjd	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3453168404Spjd
3454168404Spjd	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
3455168404Spjd	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
3456168404Spjd
3457168404Spjd	/*
3458168404Spjd	 * Pull this buffer off of the hdr
3459168404Spjd	 */
3460168404Spjd	bufp = &hdr->b_buf;
3461168404Spjd	while (*bufp != buf)
3462168404Spjd		bufp = &(*bufp)->b_next;
3463168404Spjd	*bufp = buf->b_next;
3464168404Spjd
3465168404Spjd	ASSERT(buf->b_data != NULL);
3466168404Spjd	arc_buf_destroy(buf, FALSE, FALSE);
3467168404Spjd
3468168404Spjd	if (hdr->b_datacnt == 0) {
3469168404Spjd		arc_state_t *old_state = hdr->b_state;
3470168404Spjd		arc_state_t *evicted_state;
3471168404Spjd
3472219089Spjd		ASSERT(hdr->b_buf == NULL);
3473168404Spjd		ASSERT(refcount_is_zero(&hdr->b_refcnt));
3474168404Spjd
3475168404Spjd		evicted_state =
3476168404Spjd		    (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
3477168404Spjd
3478205231Skmacy		get_buf_info(hdr, old_state, &list, &lock);
3479205231Skmacy		get_buf_info(hdr, evicted_state, &evicted_list, &evicted_lock);
3480205231Skmacy		mutex_enter(lock);
3481205231Skmacy		mutex_enter(evicted_lock);
3482168404Spjd
3483168404Spjd		arc_change_state(evicted_state, hdr, hash_lock);
3484168404Spjd		ASSERT(HDR_IN_HASH_TABLE(hdr));
3485185029Spjd		hdr->b_flags |= ARC_IN_HASH_TABLE;
3486185029Spjd		hdr->b_flags &= ~ARC_BUF_AVAILABLE;
3487168404Spjd
3488205231Skmacy		mutex_exit(evicted_lock);
3489205231Skmacy		mutex_exit(lock);
3490168404Spjd	}
3491168404Spjd	mutex_exit(hash_lock);
3492219089Spjd	mutex_exit(&buf->b_evict_lock);
3493168404Spjd
3494168404Spjd	VERIFY(buf->b_efunc(buf) == 0);
3495168404Spjd	buf->b_efunc = NULL;
3496168404Spjd	buf->b_private = NULL;
3497168404Spjd	buf->b_hdr = NULL;
3498219089Spjd	buf->b_next = NULL;
3499168404Spjd	kmem_cache_free(buf_cache, buf);
3500168404Spjd	return (1);
3501168404Spjd}
3502168404Spjd
3503168404Spjd/*
3504251629Sdelphij * Release this buffer from the cache, making it an anonymous buffer.  This
3505251629Sdelphij * must be done after a read and prior to modifying the buffer contents.
3506168404Spjd * If the buffer has more than one reference, we must make
3507185029Spjd * a new hdr for the buffer.
3508168404Spjd */
3509168404Spjdvoid
3510168404Spjdarc_release(arc_buf_t *buf, void *tag)
3511168404Spjd{
3512185029Spjd	arc_buf_hdr_t *hdr;
3513219089Spjd	kmutex_t *hash_lock = NULL;
3514185029Spjd	l2arc_buf_hdr_t *l2hdr;
3515185029Spjd	uint64_t buf_size;
3516168404Spjd
3517219089Spjd	/*
3518219089Spjd	 * It would be nice to assert that if it's DMU metadata (level >
3519219089Spjd	 * 0 || it's the dnode file), then it must be syncing context.
3520219089Spjd	 * But we don't know that information at this level.
3521219089Spjd	 */
3522219089Spjd
3523219089Spjd	mutex_enter(&buf->b_evict_lock);
3524185029Spjd	hdr = buf->b_hdr;
3525185029Spjd
3526168404Spjd	/* this buffer is not on any list */
3527168404Spjd	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
3528168404Spjd
3529168404Spjd	if (hdr->b_state == arc_anon) {
3530168404Spjd		/* this buffer is already released */
3531168404Spjd		ASSERT(buf->b_efunc == NULL);
3532208373Smm	} else {
3533208373Smm		hash_lock = HDR_LOCK(hdr);
3534208373Smm		mutex_enter(hash_lock);
3535219089Spjd		hdr = buf->b_hdr;
3536219089Spjd		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
3537168404Spjd	}
3538168404Spjd
3539185029Spjd	l2hdr = hdr->b_l2hdr;
3540185029Spjd	if (l2hdr) {
3541185029Spjd		mutex_enter(&l2arc_buflist_mtx);
3542185029Spjd		hdr->b_l2hdr = NULL;
3543258388Savg		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
3544185029Spjd	}
3545247187Smm	buf_size = hdr->b_size;
3546185029Spjd
3547168404Spjd	/*
3548168404Spjd	 * Do we have more than one buf?
3549168404Spjd	 */
3550185029Spjd	if (hdr->b_datacnt > 1) {
3551168404Spjd		arc_buf_hdr_t *nhdr;
3552168404Spjd		arc_buf_t **bufp;
3553168404Spjd		uint64_t blksz = hdr->b_size;
3554209962Smm		uint64_t spa = hdr->b_spa;
3555168404Spjd		arc_buf_contents_t type = hdr->b_type;
3556185029Spjd		uint32_t flags = hdr->b_flags;
3557168404Spjd
3558185029Spjd		ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
3559168404Spjd		/*
3560219089Spjd		 * Pull the data off of this hdr and attach it to
3561219089Spjd		 * a new anonymous hdr.
3562168404Spjd		 */
3563168404Spjd		(void) remove_reference(hdr, hash_lock, tag);
3564168404Spjd		bufp = &hdr->b_buf;
3565168404Spjd		while (*bufp != buf)
3566168404Spjd			bufp = &(*bufp)->b_next;
3567219089Spjd		*bufp = buf->b_next;
3568168404Spjd		buf->b_next = NULL;
3569168404Spjd
3570168404Spjd		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
3571168404Spjd		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
3572168404Spjd		if (refcount_is_zero(&hdr->b_refcnt)) {
3573185029Spjd			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
3574185029Spjd			ASSERT3U(*size, >=, hdr->b_size);
3575185029Spjd			atomic_add_64(size, -hdr->b_size);
3576168404Spjd		}
3577242845Sdelphij
3578242845Sdelphij		/*
3579242845Sdelphij		 * We're releasing a duplicate user data buffer, update
3580242845Sdelphij		 * our statistics accordingly.
3581242845Sdelphij		 */
3582242845Sdelphij		if (hdr->b_type == ARC_BUFC_DATA) {
3583242845Sdelphij			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
3584242845Sdelphij			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
3585242845Sdelphij			    -hdr->b_size);
3586242845Sdelphij		}
3587168404Spjd		hdr->b_datacnt -= 1;
3588168404Spjd		arc_cksum_verify(buf);
3589240133Smm#ifdef illumos
3590240133Smm		arc_buf_unwatch(buf);
3591240133Smm#endif /* illumos */
3592168404Spjd
3593168404Spjd		mutex_exit(hash_lock);
3594168404Spjd
3595185029Spjd		nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
3596168404Spjd		nhdr->b_size = blksz;
3597168404Spjd		nhdr->b_spa = spa;
3598168404Spjd		nhdr->b_type = type;
3599168404Spjd		nhdr->b_buf = buf;
3600168404Spjd		nhdr->b_state = arc_anon;
3601168404Spjd		nhdr->b_arc_access = 0;
3602185029Spjd		nhdr->b_flags = flags & ARC_L2_WRITING;
3603185029Spjd		nhdr->b_l2hdr = NULL;
3604168404Spjd		nhdr->b_datacnt = 1;
3605168404Spjd		nhdr->b_freeze_cksum = NULL;
3606168404Spjd		(void) refcount_add(&nhdr->b_refcnt, tag);
3607168404Spjd		buf->b_hdr = nhdr;
3608219089Spjd		mutex_exit(&buf->b_evict_lock);
3609168404Spjd		atomic_add_64(&arc_anon->arcs_size, blksz);
3610168404Spjd	} else {
3611219089Spjd		mutex_exit(&buf->b_evict_lock);
3612168404Spjd		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
3613168404Spjd		ASSERT(!list_link_active(&hdr->b_arc_node));
3614168404Spjd		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3615219089Spjd		if (hdr->b_state != arc_anon)
3616219089Spjd			arc_change_state(arc_anon, hdr, hash_lock);
3617168404Spjd		hdr->b_arc_access = 0;
3618219089Spjd		if (hash_lock)
3619219089Spjd			mutex_exit(hash_lock);
3620185029Spjd
3621219089Spjd		buf_discard_identity(hdr);
3622168404Spjd		arc_buf_thaw(buf);
3623168404Spjd	}
3624168404Spjd	buf->b_efunc = NULL;
3625168404Spjd	buf->b_private = NULL;
3626185029Spjd
3627185029Spjd	if (l2hdr) {
3628251478Sdelphij		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
3629248572Ssmh		trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
3630248574Ssmh		    hdr->b_size, 0);
3631185029Spjd		kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
3632185029Spjd		ARCSTAT_INCR(arcstat_l2_size, -buf_size);
3633185029Spjd		mutex_exit(&l2arc_buflist_mtx);
3634185029Spjd	}
3635168404Spjd}
3636168404Spjd
3637168404Spjdint
3638168404Spjdarc_released(arc_buf_t *buf)
3639168404Spjd{
3640185029Spjd	int released;
3641185029Spjd
3642219089Spjd	mutex_enter(&buf->b_evict_lock);
3643185029Spjd	released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
3644219089Spjd	mutex_exit(&buf->b_evict_lock);
3645185029Spjd	return (released);
3646168404Spjd}
3647168404Spjd
3648168404Spjdint
3649168404Spjdarc_has_callback(arc_buf_t *buf)
3650168404Spjd{
3651185029Spjd	int callback;
3652185029Spjd
3653219089Spjd	mutex_enter(&buf->b_evict_lock);
3654185029Spjd	callback = (buf->b_efunc != NULL);
3655219089Spjd	mutex_exit(&buf->b_evict_lock);
3656185029Spjd	return (callback);
3657168404Spjd}
3658168404Spjd
3659168404Spjd#ifdef ZFS_DEBUG
3660168404Spjdint
3661168404Spjdarc_referenced(arc_buf_t *buf)
3662168404Spjd{
3663185029Spjd	int referenced;
3664185029Spjd
3665219089Spjd	mutex_enter(&buf->b_evict_lock);
3666185029Spjd	referenced = (refcount_count(&buf->b_hdr->b_refcnt));
3667219089Spjd	mutex_exit(&buf->b_evict_lock);
3668185029Spjd	return (referenced);
3669168404Spjd}
3670168404Spjd#endif
3671168404Spjd
3672168404Spjdstatic void
3673168404Spjdarc_write_ready(zio_t *zio)
3674168404Spjd{
3675168404Spjd	arc_write_callback_t *callback = zio->io_private;
3676168404Spjd	arc_buf_t *buf = callback->awcb_buf;
3677185029Spjd	arc_buf_hdr_t *hdr = buf->b_hdr;
3678168404Spjd
3679185029Spjd	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
3680185029Spjd	callback->awcb_ready(zio, buf, callback->awcb_private);
3681185029Spjd
3682185029Spjd	/*
3683185029Spjd	 * If the IO is already in progress, then this is a re-write
3684185029Spjd	 * attempt, so we need to thaw and re-compute the cksum.
3685185029Spjd	 * It is the responsibility of the callback to handle the
3686185029Spjd	 * accounting for any re-write attempt.
3687185029Spjd	 */
3688185029Spjd	if (HDR_IO_IN_PROGRESS(hdr)) {
3689185029Spjd		mutex_enter(&hdr->b_freeze_lock);
3690185029Spjd		if (hdr->b_freeze_cksum != NULL) {
3691185029Spjd			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
3692185029Spjd			hdr->b_freeze_cksum = NULL;
3693185029Spjd		}
3694185029Spjd		mutex_exit(&hdr->b_freeze_lock);
3695168404Spjd	}
3696185029Spjd	arc_cksum_compute(buf, B_FALSE);
3697185029Spjd	hdr->b_flags |= ARC_IO_IN_PROGRESS;
3698168404Spjd}
3699168404Spjd
3700168404Spjdstatic void
3701168404Spjdarc_write_done(zio_t *zio)
3702168404Spjd{
3703168404Spjd	arc_write_callback_t *callback = zio->io_private;
3704168404Spjd	arc_buf_t *buf = callback->awcb_buf;
3705168404Spjd	arc_buf_hdr_t *hdr = buf->b_hdr;
3706168404Spjd
3707219089Spjd	ASSERT(hdr->b_acb == NULL);
3708168404Spjd
3709219089Spjd	if (zio->io_error == 0) {
3710219089Spjd		hdr->b_dva = *BP_IDENTITY(zio->io_bp);
3711219089Spjd		hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
3712219089Spjd		hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
3713219089Spjd	} else {
3714219089Spjd		ASSERT(BUF_EMPTY(hdr));
3715219089Spjd	}
3716219089Spjd
3717168404Spjd	/*
3718168404Spjd	 * If the block to be written was all-zero, we may have
3719168404Spjd	 * compressed it away.  In this case no write was performed
3720219089Spjd	 * so there will be no dva/birth/checksum.  The buffer must
3721219089Spjd	 * therefore remain anonymous (and uncached).
3722168404Spjd	 */
3723168404Spjd	if (!BUF_EMPTY(hdr)) {
3724168404Spjd		arc_buf_hdr_t *exists;
3725168404Spjd		kmutex_t *hash_lock;
3726168404Spjd
3727219089Spjd		ASSERT(zio->io_error == 0);
3728219089Spjd
3729168404Spjd		arc_cksum_verify(buf);
3730168404Spjd
3731168404Spjd		exists = buf_hash_insert(hdr, &hash_lock);
3732168404Spjd		if (exists) {
3733168404Spjd			/*
3734168404Spjd			 * This can only happen if we overwrite for
3735168404Spjd			 * sync-to-convergence, because we remove
3736168404Spjd			 * buffers from the hash table when we arc_free().
3737168404Spjd			 */
3738219089Spjd			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
3739219089Spjd				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3740219089Spjd					panic("bad overwrite, hdr=%p exists=%p",
3741219089Spjd					    (void *)hdr, (void *)exists);
3742219089Spjd				ASSERT(refcount_is_zero(&exists->b_refcnt));
3743219089Spjd				arc_change_state(arc_anon, exists, hash_lock);
3744219089Spjd				mutex_exit(hash_lock);
3745219089Spjd				arc_hdr_destroy(exists);
3746219089Spjd				exists = buf_hash_insert(hdr, &hash_lock);
3747219089Spjd				ASSERT3P(exists, ==, NULL);
3748243524Smm			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
3749243524Smm				/* nopwrite */
3750243524Smm				ASSERT(zio->io_prop.zp_nopwrite);
3751243524Smm				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
3752243524Smm					panic("bad nopwrite, hdr=%p exists=%p",
3753243524Smm					    (void *)hdr, (void *)exists);
3754219089Spjd			} else {
3755219089Spjd				/* Dedup */
3756219089Spjd				ASSERT(hdr->b_datacnt == 1);
3757219089Spjd				ASSERT(hdr->b_state == arc_anon);
3758219089Spjd				ASSERT(BP_GET_DEDUP(zio->io_bp));
3759219089Spjd				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
3760219089Spjd			}
3761168404Spjd		}
3762168404Spjd		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3763185029Spjd		/* if it's not anon, we are doing a scrub */
3764219089Spjd		if (!exists && hdr->b_state == arc_anon)
3765185029Spjd			arc_access(hdr, hash_lock);
3766168404Spjd		mutex_exit(hash_lock);
3767168404Spjd	} else {
3768168404Spjd		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
3769168404Spjd	}
3770168404Spjd
3771219089Spjd	ASSERT(!refcount_is_zero(&hdr->b_refcnt));
3772219089Spjd	callback->awcb_done(zio, buf, callback->awcb_private);
3773168404Spjd
3774168404Spjd	kmem_free(callback, sizeof (arc_write_callback_t));
3775168404Spjd}
3776168404Spjd
3777168404Spjdzio_t *
3778219089Spjdarc_write(zio_t *pio, spa_t *spa, uint64_t txg,
3779251478Sdelphij    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
3780251478Sdelphij    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
3781251478Sdelphij    void *private, int priority, int zio_flags, const zbookmark_t *zb)
3782168404Spjd{
3783168404Spjd	arc_buf_hdr_t *hdr = buf->b_hdr;
3784168404Spjd	arc_write_callback_t *callback;
3785185029Spjd	zio_t *zio;
3786168404Spjd
3787185029Spjd	ASSERT(ready != NULL);
3788219089Spjd	ASSERT(done != NULL);
3789168404Spjd	ASSERT(!HDR_IO_ERROR(hdr));
3790168404Spjd	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
3791219089Spjd	ASSERT(hdr->b_acb == NULL);
3792185029Spjd	if (l2arc)
3793185029Spjd		hdr->b_flags |= ARC_L2CACHE;
3794251478Sdelphij	if (l2arc_compress)
3795251478Sdelphij		hdr->b_flags |= ARC_L2COMPRESS;
3796168404Spjd	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
3797168404Spjd	callback->awcb_ready = ready;
3798168404Spjd	callback->awcb_done = done;
3799168404Spjd	callback->awcb_private = private;
3800168404Spjd	callback->awcb_buf = buf;
3801168404Spjd
3802219089Spjd	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
3803185029Spjd	    arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
3804185029Spjd
3805168404Spjd	return (zio);
3806168404Spjd}
3807168404Spjd
3808185029Spjdstatic int
3809209962Smmarc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
3810185029Spjd{
3811185029Spjd#ifdef _KERNEL
3812219089Spjd	uint64_t available_memory =
3813219089Spjd	    ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count);
3814185029Spjd	static uint64_t page_load = 0;
3815185029Spjd	static uint64_t last_txg = 0;
3816185029Spjd
3817219089Spjd#ifdef sun
3818185029Spjd#if defined(__i386)
3819185029Spjd	available_memory =
3820185029Spjd	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
3821185029Spjd#endif
3822219089Spjd#endif	/* sun */
3823185029Spjd	if (available_memory >= zfs_write_limit_max)
3824185029Spjd		return (0);
3825185029Spjd
3826185029Spjd	if (txg > last_txg) {
3827185029Spjd		last_txg = txg;
3828185029Spjd		page_load = 0;
3829185029Spjd	}
3830185029Spjd	/*
3831185029Spjd	 * If we are in pageout, we know that memory is already tight,
3832185029Spjd	 * the arc is already going to be evicting, so we just want to
3833185029Spjd	 * continue to let page writes occur as quickly as possible.
3834185029Spjd	 */
3835185029Spjd	if (curproc == pageproc) {
3836185029Spjd		if (page_load > available_memory / 4)
3837249195Smm			return (SET_ERROR(ERESTART));
3838185029Spjd		/* Note: reserve is inflated, so we deflate */
3839185029Spjd		page_load += reserve / 8;
3840185029Spjd		return (0);
3841185029Spjd	} else if (page_load > 0 && arc_reclaim_needed()) {
3842185029Spjd		/* memory is low, delay before restarting */
3843185029Spjd		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3844249195Smm		return (SET_ERROR(EAGAIN));
3845185029Spjd	}
3846185029Spjd	page_load = 0;
3847185029Spjd
3848185029Spjd	if (arc_size > arc_c_min) {
3849185029Spjd		uint64_t evictable_memory =
3850185029Spjd		    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
3851185029Spjd		    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
3852185029Spjd		    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
3853185029Spjd		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
3854185029Spjd		available_memory += MIN(evictable_memory, arc_size - arc_c_min);
3855185029Spjd	}
3856185029Spjd
3857185029Spjd	if (inflight_data > available_memory / 4) {
3858185029Spjd		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
3859249195Smm		return (SET_ERROR(ERESTART));
3860185029Spjd	}
3861185029Spjd#endif
3862185029Spjd	return (0);
3863185029Spjd}
3864185029Spjd
3865168404Spjdvoid
3866185029Spjdarc_tempreserve_clear(uint64_t reserve)
3867168404Spjd{
3868185029Spjd	atomic_add_64(&arc_tempreserve, -reserve);
3869168404Spjd	ASSERT((int64_t)arc_tempreserve >= 0);
3870168404Spjd}
3871168404Spjd
3872168404Spjdint
3873185029Spjdarc_tempreserve_space(uint64_t reserve, uint64_t txg)
3874168404Spjd{
3875185029Spjd	int error;
3876209962Smm	uint64_t anon_size;
3877185029Spjd
3878168404Spjd#ifdef ZFS_DEBUG
3879168404Spjd	/*
3880168404Spjd	 * Once in a while, fail for no reason.  Everything should cope.
3881168404Spjd	 */
3882168404Spjd	if (spa_get_random(10000) == 0) {
3883168404Spjd		dprintf("forcing random failure\n");
3884249195Smm		return (SET_ERROR(ERESTART));
3885168404Spjd	}
3886168404Spjd#endif
3887185029Spjd	if (reserve > arc_c/4 && !arc_no_grow)
3888185029Spjd		arc_c = MIN(arc_c_max, reserve * 4);
3889185029Spjd	if (reserve > arc_c)
3890249195Smm		return (SET_ERROR(ENOMEM));
3891168404Spjd
3892168404Spjd	/*
3893209962Smm	 * Don't count loaned bufs as in flight dirty data to prevent long
3894209962Smm	 * network delays from blocking transactions that are ready to be
3895209962Smm	 * assigned to a txg.
3896209962Smm	 */
3897209962Smm	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
3898209962Smm
3899209962Smm	/*
3900185029Spjd	 * Writes will, almost always, require additional memory allocations
3901251631Sdelphij	 * in order to compress/encrypt/etc the data.  We therefore need to
3902185029Spjd	 * make sure that there is sufficient available memory for this.
3903185029Spjd	 */
3904209962Smm	if (error = arc_memory_throttle(reserve, anon_size, txg))
3905185029Spjd		return (error);
3906185029Spjd
3907185029Spjd	/*
3908168404Spjd	 * Throttle writes when the amount of dirty data in the cache
3909168404Spjd	 * gets too large.  We try to keep the cache less than half full
3910168404Spjd	 * of dirty blocks so that our sync times don't grow too large.
3911168404Spjd	 * Note: if two requests come in concurrently, we might let them
3912168404Spjd	 * both succeed, when one of them should fail.  Not a huge deal.
3913168404Spjd	 */
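	/*
	 * Illustrative example (not part of the original source): with
	 * arc_c = 4GB, the reservation below is refused once
	 * reserve + arc_tempreserve + anon_size exceeds 2GB while
	 * anon_size alone exceeds 1GB.
	 */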
3914209962Smm
3915209962Smm	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
3916209962Smm	    anon_size > arc_c / 4) {
3917185029Spjd		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
3918185029Spjd		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
3919185029Spjd		    arc_tempreserve>>10,
3920185029Spjd		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
3921185029Spjd		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
3922185029Spjd		    reserve>>10, arc_c>>10);
3923249195Smm		return (SET_ERROR(ERESTART));
3924168404Spjd	}
3925185029Spjd	atomic_add_64(&arc_tempreserve, reserve);
3926168404Spjd	return (0);
3927168404Spjd}
3928168404Spjd
3929168582Spjdstatic kmutex_t arc_lowmem_lock;
3930168404Spjd#ifdef _KERNEL
3931168566Spjdstatic eventhandler_tag arc_event_lowmem = NULL;
3932168404Spjd
3933168404Spjdstatic void
3934168566Spjdarc_lowmem(void *arg __unused, int howto __unused)
3935168404Spjd{
3936168404Spjd
3937168566Spjd	/* Serialize access via arc_lowmem_lock. */
3938168566Spjd	mutex_enter(&arc_lowmem_lock);
3939219089Spjd	mutex_enter(&arc_reclaim_thr_lock);
3940185029Spjd	needfree = 1;
3941168404Spjd	cv_signal(&arc_reclaim_thr_cv);
3942241773Savg
3943241773Savg	/*
3944241773Savg	 * It is unsafe to block here in arbitrary threads, because we can come
3945241773Savg	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
3946241773Savg	 * with ARC reclaim thread.
3947241773Savg	 */
3948241773Savg	if (curproc == pageproc) {
3949241773Savg		while (needfree)
3950241773Savg			msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
3951241773Savg	}
3952219089Spjd	mutex_exit(&arc_reclaim_thr_lock);
3953168566Spjd	mutex_exit(&arc_lowmem_lock);
3954168404Spjd}
3955168404Spjd#endif
3956168404Spjd
3957168404Spjdvoid
3958168404Spjdarc_init(void)
3959168404Spjd{
3960219089Spjd	int i, prefetch_tunable_set = 0;
3961205231Skmacy
3962168404Spjd	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
3963168404Spjd	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
3964168566Spjd	mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
3965168404Spjd
3966168404Spjd	/* Convert seconds to clock ticks */
3967168404Spjd	arc_min_prefetch_lifespan = 1 * hz;
3968168404Spjd
3969168404Spjd	/* Start out with 1/8 of all memory */
3970168566Spjd	arc_c = kmem_size() / 8;
3971219089Spjd
3972219089Spjd#ifdef sun
3973192360Skmacy#ifdef _KERNEL
3974192360Skmacy	/*
3975192360Skmacy	 * On architectures where the physical memory can be larger
3976192360Skmacy	 * than the addressable space (intel in 32-bit mode), we may
3977192360Skmacy	 * need to limit the cache to 1/8 of VM size.
3978192360Skmacy	 */
3979192360Skmacy	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
3980192360Skmacy#endif
3981219089Spjd#endif	/* sun */
3982168566Spjd	/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
3983168566Spjd	arc_c_min = MAX(arc_c / 4, 64<<18);
3984168566Spjd	/* set max to 1/2 of all memory, or all but 1GB, whichever is more */
3985168404Spjd	if (arc_c * 8 >= 1<<30)
3986168404Spjd		arc_c_max = (arc_c * 8) - (1<<30);
3987168404Spjd	else
3988168404Spjd		arc_c_max = arc_c_min;
3989175633Spjd	arc_c_max = MAX(arc_c * 5, arc_c_max);
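
	/*
	 * Illustrative example (not part of the original source): with an
	 * 8GB kmem_size(), arc_c starts at 1GB, arc_c_min becomes
	 * MAX(256MB, 16MB) = 256MB, and arc_c_max becomes
	 * MAX(5GB, 8GB - 1GB) = 7GB before any tunable overrides.
	 */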
3990219089Spjd
3991168481Spjd#ifdef _KERNEL
3992168404Spjd	/*
3993168404Spjd	 * Allow the tunables to override our calculations if they are
3994168404Spjd	 * reasonable (i.e. over 16MB)
3995168404Spjd	 */
3996219089Spjd	if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size())
3997168404Spjd		arc_c_max = zfs_arc_max;
3998219089Spjd	if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max)
3999168404Spjd		arc_c_min = zfs_arc_min;
4000168481Spjd#endif
4001219089Spjd
4002168404Spjd	arc_c = arc_c_max;
4003168404Spjd	arc_p = (arc_c >> 1);
4004168404Spjd
4005185029Spjd	/* limit meta-data to 1/4 of the arc capacity */
4006185029Spjd	arc_meta_limit = arc_c_max / 4;
4007185029Spjd
4008185029Spjd	/* Allow the tunable to override if it is reasonable */
4009185029Spjd	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4010185029Spjd		arc_meta_limit = zfs_arc_meta_limit;
4011185029Spjd
4012185029Spjd	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
4013185029Spjd		arc_c_min = arc_meta_limit / 2;
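	/*
	 * Continuing the hypothetical 16GB example above: arc_meta_limit
	 * becomes 15GB / 4 ~= 3.75GB, and because arc_c_min (512MB) is
	 * below arc_meta_limit / 2 ~= 1.875GB and zfs_arc_min was not
	 * set, arc_c_min is raised to ~1.875GB.
	 */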
4014185029Spjd
4015208373Smm	if (zfs_arc_grow_retry > 0)
4016208373Smm		arc_grow_retry = zfs_arc_grow_retry;
4017208373Smm
4018208373Smm	if (zfs_arc_shrink_shift > 0)
4019208373Smm		arc_shrink_shift = zfs_arc_shrink_shift;
4020208373Smm
4021208373Smm	if (zfs_arc_p_min_shift > 0)
4022208373Smm		arc_p_min_shift = zfs_arc_p_min_shift;
4023208373Smm
4024168404Spjd	/* if kmem_flags are set, lets try to use less memory */
4025168404Spjd	if (kmem_debugging())
4026168404Spjd		arc_c = arc_c / 2;
4027168404Spjd	if (arc_c < arc_c_min)
4028168404Spjd		arc_c = arc_c_min;
4029168404Spjd
4030168473Spjd	zfs_arc_min = arc_c_min;
4031168473Spjd	zfs_arc_max = arc_c_max;
4032168473Spjd
4033168404Spjd	arc_anon = &ARC_anon;
4034168404Spjd	arc_mru = &ARC_mru;
4035168404Spjd	arc_mru_ghost = &ARC_mru_ghost;
4036168404Spjd	arc_mfu = &ARC_mfu;
4037168404Spjd	arc_mfu_ghost = &ARC_mfu_ghost;
4038185029Spjd	arc_l2c_only = &ARC_l2c_only;
4039168404Spjd	arc_size = 0;
4040168404Spjd
4041205231Skmacy	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4042205231Skmacy		mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
4043205231Skmacy		    NULL, MUTEX_DEFAULT, NULL);
4044205231Skmacy		mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
4045205231Skmacy		    NULL, MUTEX_DEFAULT, NULL);
4046205231Skmacy		mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
4047205231Skmacy		    NULL, MUTEX_DEFAULT, NULL);
4048205231Skmacy		mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
4049205231Skmacy		    NULL, MUTEX_DEFAULT, NULL);
4050205231Skmacy		mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
4051205231Skmacy		    NULL, MUTEX_DEFAULT, NULL);
4052205231Skmacy		mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
4053205231Skmacy		    NULL, MUTEX_DEFAULT, NULL);
4054206796Spjd
4055205231Skmacy		list_create(&arc_mru->arcs_lists[i],
4056205231Skmacy		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4057205231Skmacy		list_create(&arc_mru_ghost->arcs_lists[i],
4058205231Skmacy		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4059205231Skmacy		list_create(&arc_mfu->arcs_lists[i],
4060205231Skmacy		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4061205231Skmacy		list_create(&arc_mfu_ghost->arcs_lists[i],
4062205231Skmacy		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4065205231Skmacy		list_create(&arc_l2c_only->arcs_lists[i],
4066205231Skmacy		    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
4067205231Skmacy	}
4068168404Spjd
4069168404Spjd	buf_init();
4070168404Spjd
4071168404Spjd	arc_thread_exit = 0;
4072168404Spjd	arc_eviction_list = NULL;
4073168404Spjd	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4074168404Spjd	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4075168404Spjd
4076168404Spjd	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4077168404Spjd	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4078168404Spjd
4079168404Spjd	if (arc_ksp != NULL) {
4080168404Spjd		arc_ksp->ks_data = &arc_stats;
4081168404Spjd		kstat_install(arc_ksp);
4082168404Spjd	}
4083168404Spjd
4084168404Spjd	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4085168404Spjd	    TS_RUN, minclsyspri);
4086168404Spjd
4087168404Spjd#ifdef _KERNEL
4088168566Spjd	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
4089168404Spjd	    EVENTHANDLER_PRI_FIRST);
4090168404Spjd#endif
4091168404Spjd
4092168404Spjd	arc_dead = FALSE;
4093185029Spjd	arc_warm = B_FALSE;
4094168566Spjd
4095185029Spjd	if (zfs_write_limit_max == 0)
4096185029Spjd		zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
4097185029Spjd	else
4098185029Spjd		zfs_write_limit_shift = 0;
4099185029Spjd	mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
4100185029Spjd
4101168566Spjd#ifdef _KERNEL
4102194043Skmacy	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
4103193953Skmacy		prefetch_tunable_set = 1;
4104206796Spjd
4105193878Skmacy#ifdef __i386__
4106193953Skmacy	if (prefetch_tunable_set == 0) {
4107196863Strasz		printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
4108196863Strasz		    "-- to enable,\n");
4109196863Strasz		printf("            add \"vfs.zfs.prefetch_disable=0\" "
4110196863Strasz		    "to /boot/loader.conf.\n");
4111219089Spjd		zfs_prefetch_disable = 1;
4112193878Skmacy	}
4113206796Spjd#else
4114193878Skmacy	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
4115193953Skmacy	    prefetch_tunable_set == 0) {
4116196863Strasz		printf("ZFS NOTICE: Prefetch is disabled by default if less "
4117196941Strasz		    "than 4GB of RAM is present;\n"
4118196863Strasz		    "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
4119196863Strasz		    "to /boot/loader.conf.\n");
4120219089Spjd		zfs_prefetch_disable = 1;
4121193878Skmacy	}
4122206796Spjd#endif
4123175633Spjd	/* Warn about ZFS memory and address space requirements. */
4124168696Spjd	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
4125168987Sbmah		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
4126168987Sbmah		    "expect unstable behavior.\n");
4127175633Spjd	}
4128175633Spjd	if (kmem_size() < 512 * (1 << 20)) {
4129173419Spjd		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
4130168987Sbmah		    "expect unstable behavior.\n");
4131185029Spjd		printf("             Consider tuning vm.kmem_size and "
4132173419Spjd		    "vm.kmem_size_max\n");
4133185029Spjd		printf("             in /boot/loader.conf.\n");
4134168566Spjd	}
4135168566Spjd#endif
4136168404Spjd}
4137168404Spjd
4138168404Spjdvoid
4139168404Spjdarc_fini(void)
4140168404Spjd{
4141205231Skmacy	int i;
4142206796Spjd
4143168404Spjd	mutex_enter(&arc_reclaim_thr_lock);
4144168404Spjd	arc_thread_exit = 1;
4145168404Spjd	cv_signal(&arc_reclaim_thr_cv);
4146168404Spjd	while (arc_thread_exit != 0)
4147168404Spjd		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4148168404Spjd	mutex_exit(&arc_reclaim_thr_lock);
4149168404Spjd
4150185029Spjd	arc_flush(NULL);
4151168404Spjd
4152168404Spjd	arc_dead = TRUE;
4153168404Spjd
4154168404Spjd	if (arc_ksp != NULL) {
4155168404Spjd		kstat_delete(arc_ksp);
4156168404Spjd		arc_ksp = NULL;
4157168404Spjd	}
4158168404Spjd
4159168404Spjd	mutex_destroy(&arc_eviction_mtx);
4160168404Spjd	mutex_destroy(&arc_reclaim_thr_lock);
4161168404Spjd	cv_destroy(&arc_reclaim_thr_cv);
4162168404Spjd
4163205231Skmacy	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4164205231Skmacy		list_destroy(&arc_mru->arcs_lists[i]);
4165205231Skmacy		list_destroy(&arc_mru_ghost->arcs_lists[i]);
4166205231Skmacy		list_destroy(&arc_mfu->arcs_lists[i]);
4167205231Skmacy		list_destroy(&arc_mfu_ghost->arcs_lists[i]);
4168206795Spjd		list_destroy(&arc_l2c_only->arcs_lists[i]);
4169168404Spjd
4170205231Skmacy		mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
4171205231Skmacy		mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
4172205231Skmacy		mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
4173205231Skmacy		mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
4174205231Skmacy		mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
4175206795Spjd		mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
4176205231Skmacy	}
4177206796Spjd
4178185029Spjd	mutex_destroy(&zfs_write_limit_lock);
4179185029Spjd
4180168404Spjd	buf_fini();
4181168404Spjd
4182209962Smm	ASSERT(arc_loaned_bytes == 0);
4183209962Smm
4184168582Spjd	mutex_destroy(&arc_lowmem_lock);
4185168404Spjd#ifdef _KERNEL
4186168566Spjd	if (arc_event_lowmem != NULL)
4187168566Spjd		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
4188168404Spjd#endif
4189168404Spjd}
4190185029Spjd
4191185029Spjd/*
4192185029Spjd * Level 2 ARC
4193185029Spjd *
4194185029Spjd * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
4195185029Spjd * It uses dedicated storage devices to hold cached data, which are populated
4196185029Spjd * using large infrequent writes.  The main role of this cache is to boost
4197185029Spjd * the performance of random read workloads.  The intended L2ARC devices
4198185029Spjd * include short-stroked disks, solid state disks, and other media with
4199185029Spjd * substantially faster read latency than disk.
4200185029Spjd *
4201185029Spjd *                 +-----------------------+
4202185029Spjd *                 |         ARC           |
4203185029Spjd *                 +-----------------------+
4204185029Spjd *                    |         ^     ^
4205185029Spjd *                    |         |     |
4206185029Spjd *      l2arc_feed_thread()    arc_read()
4207185029Spjd *                    |         |     |
4208185029Spjd *                    |  l2arc read   |
4209185029Spjd *                    V         |     |
4210185029Spjd *               +---------------+    |
4211185029Spjd *               |     L2ARC     |    |
4212185029Spjd *               +---------------+    |
4213185029Spjd *                   |    ^           |
4214185029Spjd *          l2arc_write() |           |
4215185029Spjd *                   |    |           |
4216185029Spjd *                   V    |           |
4217185029Spjd *                 +-------+      +-------+
4218185029Spjd *                 | vdev  |      | vdev  |
4219185029Spjd *                 | cache |      | cache |
4220185029Spjd *                 +-------+      +-------+
4221185029Spjd *                 +=========+     .-----.
4222185029Spjd *                 :  L2ARC  :    |-_____-|
4223185029Spjd *                 : devices :    | Disks |
4224185029Spjd *                 +=========+    `-_____-'
4225185029Spjd *
4226185029Spjd * Read requests are satisfied from the following sources, in order:
4227185029Spjd *
4228185029Spjd *	1) ARC
4229185029Spjd *	2) vdev cache of L2ARC devices
4230185029Spjd *	3) L2ARC devices
4231185029Spjd *	4) vdev cache of disks
4232185029Spjd *	5) disks
4233185029Spjd *
4234185029Spjd * Some L2ARC device types exhibit extremely slow write performance.
4235185029Spjd * To accommodate this there are some significant differences between
4236185029Spjd * the L2ARC and traditional cache design:
4237185029Spjd *
4238185029Spjd * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
4239185029Spjd * the ARC behave as usual, freeing buffers and placing headers on ghost
4240185029Spjd * lists.  The ARC does not send buffers to the L2ARC during eviction as
4241185029Spjd * this would add inflated write latencies for all ARC memory pressure.
4242185029Spjd *
4243185029Spjd * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4244185029Spjd * It does this by periodically scanning buffers from the eviction-end of
4245185029Spjd * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4246251478Sdelphij * not already there. It scans until a headroom of buffers is satisfied,
4247251478Sdelphij * which itself is a buffer for ARC eviction. If a compressible buffer is
4248251478Sdelphij * found during scanning and selected for writing to an L2ARC device, we
4249251478Sdelphij * temporarily boost scanning headroom during the next scan cycle to make
4250251478Sdelphij * sure we adapt to compression effects (which might significantly reduce
4251251478Sdelphij * the data volume we write to L2ARC). The thread that does this is
4252185029Spjd * l2arc_feed_thread(), illustrated below; example sizes are included to
4253185029Spjd * provide a better sense of ratio than this diagram:
4254185029Spjd *
4255185029Spjd *	       head -->                        tail
4256185029Spjd *	        +---------------------+----------+
4257185029Spjd *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
4258185029Spjd *	        +---------------------+----------+   |   o L2ARC eligible
4259185029Spjd *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
4260185029Spjd *	        +---------------------+----------+   |
4261185029Spjd *	             15.9 Gbytes      ^ 32 Mbytes    |
4262185029Spjd *	                           headroom          |
4263185029Spjd *	                                      l2arc_feed_thread()
4264185029Spjd *	                                             |
4265185029Spjd *	                 l2arc write hand <--[oooo]--'
4266185029Spjd *	                         |           8 Mbyte
4267185029Spjd *	                         |          write max
4268185029Spjd *	                         V
4269185029Spjd *		  +==============================+
4270185029Spjd *	L2ARC dev |####|#|###|###|    |####| ... |
4271185029Spjd *	          +==============================+
4272185029Spjd *	                     32 Gbytes
4273185029Spjd *
4274185029Spjd * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4275185029Spjd * evicted, then the L2ARC has cached a buffer much sooner than it probably
4276185029Spjd * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
4277185029Spjd * safe to say that this is an uncommon case, since buffers at the end of
4278185029Spjd * the ARC lists have moved there due to inactivity.
4279185029Spjd *
4280185029Spjd * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4281185029Spjd * then the L2ARC simply misses copying some buffers.  This serves as a
4282185029Spjd * pressure valve to prevent heavy read workloads from both stalling the ARC
4283185029Spjd * with waits and clogging the L2ARC with writes.  This also helps prevent
4284185029Spjd * the potential for the L2ARC to churn if it attempts to cache content too
4285185029Spjd * quickly, such as during backups of the entire pool.
4286185029Spjd *
4287185029Spjd * 5. After system boot and before the ARC has filled main memory, there are
4288185029Spjd * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4289185029Spjd * lists can remain mostly static.  Instead of searching from tail of these
4290185029Spjd * lists as pictured, the l2arc_feed_thread() will search from the list heads
4291185029Spjd * for eligible buffers, greatly increasing its chance of finding them.
4292185029Spjd *
4293185029Spjd * The L2ARC device write speed is also boosted during this time so that
4294185029Spjd * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
4295185029Spjd * there are no L2ARC reads, and no fear of degrading read performance
4296185029Spjd * through increased writes.
4297185029Spjd *
4298185029Spjd * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4299185029Spjd * the vdev queue can aggregate them into larger and fewer writes.  Each
4300185029Spjd * device is written to in a rotor fashion, sweeping writes through
4301185029Spjd * available space then repeating.
4302185029Spjd *
4303185029Spjd * 7. The L2ARC does not store dirty content.  It never needs to flush
4304185029Spjd * write buffers back to disk based storage.
4305185029Spjd *
4306185029Spjd * 8. If an ARC buffer is written (and dirtied) which also exists in the
4307185029Spjd * L2ARC, the now stale L2ARC buffer is immediately dropped.
4308185029Spjd *
4309185029Spjd * The performance of the L2ARC can be tweaked by a number of tunables, which
4310185029Spjd * may be necessary for different workloads:
4311185029Spjd *
4312185029Spjd *	l2arc_write_max		max write bytes per interval
4313185029Spjd *	l2arc_write_boost	extra write bytes during device warmup
4314185029Spjd *	l2arc_noprefetch	skip caching prefetched buffers
4315185029Spjd *	l2arc_headroom		number of max device writes to precache
4316251478Sdelphij *	l2arc_headroom_boost	when we find compressed buffers during ARC
4317251478Sdelphij *				scanning, we multiply headroom by this
4318251478Sdelphij *				percentage factor for the next scan cycle,
4319251478Sdelphij *				since more compressed buffers are likely to
4320251478Sdelphij *				be present
4321185029Spjd *	l2arc_feed_secs		seconds between L2ARC writing
4322185029Spjd *
4323185029Spjd * Tunables may be removed or added as future performance improvements are
4324185029Spjd * integrated, and also may become zpool properties.
4325208373Smm *
4326208373Smm * There are three key functions that control how the L2ARC warms up:
4327208373Smm *
4328208373Smm *	l2arc_write_eligible()	check if a buffer is eligible to cache
4329208373Smm *	l2arc_write_size()	calculate how much to write
4330208373Smm *	l2arc_write_interval()	calculate sleep delay between writes
4331208373Smm *
4332208373Smm * These three functions determine what to write, how much, and how quickly
4333208373Smm * to send writes.
4334185029Spjd */
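
/*
 * Illustrative note only, assuming the defaults in effect at this
 * revision (l2arc_headroom of 2 and l2arc_headroom_boost of 200):
 * each feed cycle scans up to twice the write target from each
 * eligible list, and a cycle that successfully compressed a buffer
 * scans up to four times the write target (2 * 200%) on its next pass.
 */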
4335185029Spjd
4336208373Smmstatic boolean_t
4337209962Smml2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
4338208373Smm{
4339208373Smm	/*
4340208373Smm	 * A buffer is *not* eligible for the L2ARC if it:
4341208373Smm	 * 1. belongs to a different spa.
4342208373Smm	 * 2. is already cached on the L2ARC.
4343208373Smm	 * 3. has an I/O in progress (it may be an incomplete read).
4344208373Smm	 * 4. is flagged not eligible (zfs property).
4345208373Smm	 */
4346209962Smm	if (ab->b_spa != spa_guid) {
4347208373Smm		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
4348208373Smm		return (B_FALSE);
4349208373Smm	}
4350208373Smm	if (ab->b_l2hdr != NULL) {
4351208373Smm		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
4352208373Smm		return (B_FALSE);
4353208373Smm	}
4354208373Smm	if (HDR_IO_IN_PROGRESS(ab)) {
4355208373Smm		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
4356208373Smm		return (B_FALSE);
4357208373Smm	}
4358208373Smm	if (!HDR_L2CACHE(ab)) {
4359208373Smm		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
4360208373Smm		return (B_FALSE);
4361208373Smm	}
4362208373Smm
4363208373Smm	return (B_TRUE);
4364208373Smm}
4365208373Smm
4366208373Smmstatic uint64_t
4367251478Sdelphijl2arc_write_size(void)
4368208373Smm{
4369208373Smm	uint64_t size;
4370208373Smm
4371251478Sdelphij	/*
4372251478Sdelphij	 * Make sure our globals have meaningful values in case the user
4373251478Sdelphij	 * altered them.
4374251478Sdelphij	 */
4375251478Sdelphij	size = l2arc_write_max;
4376251478Sdelphij	if (size == 0) {
4377251478Sdelphij		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4378251478Sdelphij		    "be greater than zero, resetting it to the default (%d)",
4379251478Sdelphij		    L2ARC_WRITE_SIZE);
4380251478Sdelphij		size = l2arc_write_max = L2ARC_WRITE_SIZE;
4381251478Sdelphij	}
4382208373Smm
4383208373Smm	if (arc_warm == B_FALSE)
4384251478Sdelphij		size += l2arc_write_boost;
4385208373Smm
4386208373Smm	return (size);
4387208373Smm
4388208373Smm}
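
/*
 * For example, assuming the stock 8MB defaults for both
 * l2arc_write_max and l2arc_write_boost: before the ARC has warmed
 * up this returns 8MB + 8MB = 16MB per feed cycle, and once
 * arc_warm is set it drops back to 8MB.
 */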
4389208373Smm
4390208373Smmstatic clock_t
4391208373Smml2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4392208373Smm{
4393219089Spjd	clock_t interval, next, now;
4394208373Smm
4395208373Smm	/*
4396208373Smm	 * If the ARC lists are busy, increase our write rate; if the
4397208373Smm	 * lists are stale, idle back.  This is achieved by checking
4398208373Smm	 * how much we previously wrote - if it was more than half of
4399208373Smm	 * what we wanted, schedule the next write much sooner.
4400208373Smm	 */
4401208373Smm	if (l2arc_feed_again && wrote > (wanted / 2))
4402208373Smm		interval = (hz * l2arc_feed_min_ms) / 1000;
4403208373Smm	else
4404208373Smm		interval = hz * l2arc_feed_secs;
4405208373Smm
4406219089Spjd	now = ddi_get_lbolt();
4407219089Spjd	next = MAX(now, MIN(now + interval, began + interval));
4408208373Smm
4409208373Smm	return (next);
4410208373Smm}
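
/*
 * Illustrative arithmetic, assuming hz = 1000 and the defaults of
 * l2arc_feed_secs = 1 and l2arc_feed_min_ms = 200: if the previous
 * pass wrote more than half of what it wanted, the next feed is
 * scheduled roughly 200ms after that pass began; otherwise it is
 * scheduled a full second out, clamped so it never lands in the past.
 */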
4411208373Smm
4412185029Spjdstatic void
4413185029Spjdl2arc_hdr_stat_add(void)
4414185029Spjd{
4415185029Spjd	ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
4416185029Spjd	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
4417185029Spjd}
4418185029Spjd
4419185029Spjdstatic void
4420185029Spjdl2arc_hdr_stat_remove(void)
4421185029Spjd{
4422185029Spjd	ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
4423185029Spjd	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
4424185029Spjd}
4425185029Spjd
4426185029Spjd/*
4427185029Spjd * Cycle through L2ARC devices.  This is how L2ARC load balances.
4428185029Spjd * If a device is returned, this also returns holding the spa config lock.
4429185029Spjd */
4430185029Spjdstatic l2arc_dev_t *
4431185029Spjdl2arc_dev_get_next(void)
4432185029Spjd{
4433185029Spjd	l2arc_dev_t *first, *next = NULL;
4434185029Spjd
4435185029Spjd	/*
4436185029Spjd	 * Lock out the removal of spas (spa_namespace_lock), then removal
4437185029Spjd	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
4438185029Spjd	 * both locks will be dropped and a spa config lock held instead.
4439185029Spjd	 */
4440185029Spjd	mutex_enter(&spa_namespace_lock);
4441185029Spjd	mutex_enter(&l2arc_dev_mtx);
4442185029Spjd
4443185029Spjd	/* if there are no vdevs, there is nothing to do */
4444185029Spjd	if (l2arc_ndev == 0)
4445185029Spjd		goto out;
4446185029Spjd
4447185029Spjd	first = NULL;
4448185029Spjd	next = l2arc_dev_last;
4449185029Spjd	do {
4450185029Spjd		/* loop around the list looking for a non-faulted vdev */
4451185029Spjd		if (next == NULL) {
4452185029Spjd			next = list_head(l2arc_dev_list);
4453185029Spjd		} else {
4454185029Spjd			next = list_next(l2arc_dev_list, next);
4455185029Spjd			if (next == NULL)
4456185029Spjd				next = list_head(l2arc_dev_list);
4457185029Spjd		}
4458185029Spjd
4459185029Spjd		/* if we have come back to the start, bail out */
4460185029Spjd		if (first == NULL)
4461185029Spjd			first = next;
4462185029Spjd		else if (next == first)
4463185029Spjd			break;
4464185029Spjd
4465185029Spjd	} while (vdev_is_dead(next->l2ad_vdev));
4466185029Spjd
4467185029Spjd	/* if we were unable to find any usable vdevs, return NULL */
4468185029Spjd	if (vdev_is_dead(next->l2ad_vdev))
4469185029Spjd		next = NULL;
4470185029Spjd
4471185029Spjd	l2arc_dev_last = next;
4472185029Spjd
4473185029Spjdout:
4474185029Spjd	mutex_exit(&l2arc_dev_mtx);
4475185029Spjd
4476185029Spjd	/*
4477185029Spjd	 * Grab the config lock to prevent the 'next' device from being
4478185029Spjd	 * removed while we are writing to it.
4479185029Spjd	 */
4480185029Spjd	if (next != NULL)
4481185029Spjd		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
4482185029Spjd	mutex_exit(&spa_namespace_lock);
4483185029Spjd
4484185029Spjd	return (next);
4485185029Spjd}
4486185029Spjd
4487185029Spjd/*
4488185029Spjd * Free buffers that were tagged for destruction.
4489185029Spjd */
4490185029Spjdstatic void
4491185029Spjdl2arc_do_free_on_write()
4492185029Spjd{
4493185029Spjd	list_t *buflist;
4494185029Spjd	l2arc_data_free_t *df, *df_prev;
4495185029Spjd
4496185029Spjd	mutex_enter(&l2arc_free_on_write_mtx);
4497185029Spjd	buflist = l2arc_free_on_write;
4498185029Spjd
4499185029Spjd	for (df = list_tail(buflist); df; df = df_prev) {
4500185029Spjd		df_prev = list_prev(buflist, df);
4501185029Spjd		ASSERT(df->l2df_data != NULL);
4502185029Spjd		ASSERT(df->l2df_func != NULL);
4503185029Spjd		df->l2df_func(df->l2df_data, df->l2df_size);
4504185029Spjd		list_remove(buflist, df);
4505185029Spjd		kmem_free(df, sizeof (l2arc_data_free_t));
4506185029Spjd	}
4507185029Spjd
4508185029Spjd	mutex_exit(&l2arc_free_on_write_mtx);
4509185029Spjd}
4510185029Spjd
4511185029Spjd/*
4512185029Spjd * A write to a cache device has completed.  Update all headers to allow
4513185029Spjd * reads from these buffers to begin.
4514185029Spjd */
4515185029Spjdstatic void
4516185029Spjdl2arc_write_done(zio_t *zio)
4517185029Spjd{
4518185029Spjd	l2arc_write_callback_t *cb;
4519185029Spjd	l2arc_dev_t *dev;
4520185029Spjd	list_t *buflist;
4521185029Spjd	arc_buf_hdr_t *head, *ab, *ab_prev;
4522185029Spjd	l2arc_buf_hdr_t *abl2;
4523185029Spjd	kmutex_t *hash_lock;
4524185029Spjd
4525185029Spjd	cb = zio->io_private;
4526185029Spjd	ASSERT(cb != NULL);
4527185029Spjd	dev = cb->l2wcb_dev;
4528185029Spjd	ASSERT(dev != NULL);
4529185029Spjd	head = cb->l2wcb_head;
4530185029Spjd	ASSERT(head != NULL);
4531185029Spjd	buflist = dev->l2ad_buflist;
4532185029Spjd	ASSERT(buflist != NULL);
4533185029Spjd	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
4534185029Spjd	    l2arc_write_callback_t *, cb);
4535185029Spjd
4536185029Spjd	if (zio->io_error != 0)
4537185029Spjd		ARCSTAT_BUMP(arcstat_l2_writes_error);
4538185029Spjd
4539185029Spjd	mutex_enter(&l2arc_buflist_mtx);
4540185029Spjd
4541185029Spjd	/*
4542185029Spjd	 * All writes completed, or an error was hit.
4543185029Spjd	 */
4544185029Spjd	for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
4545185029Spjd		ab_prev = list_prev(buflist, ab);
4546185029Spjd
4547185029Spjd		hash_lock = HDR_LOCK(ab);
4548185029Spjd		if (!mutex_tryenter(hash_lock)) {
4549185029Spjd			/*
4550185029Spjd			 * This buffer misses out.  It may be in a stage
4551185029Spjd			 * of eviction.  Its ARC_L2_WRITING flag will be
4552185029Spjd			 * left set, denying reads to this buffer.
4553185029Spjd			 */
4554185029Spjd			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
4555185029Spjd			continue;
4556185029Spjd		}
4557185029Spjd
4558251478Sdelphij		abl2 = ab->b_l2hdr;
4559251478Sdelphij
4560251478Sdelphij		/*
4561251478Sdelphij		 * Release the temporary compressed buffer as soon as possible.
4562251478Sdelphij		 */
4563251478Sdelphij		if (abl2->b_compress != ZIO_COMPRESS_OFF)
4564251478Sdelphij			l2arc_release_cdata_buf(ab);
4565251478Sdelphij
4566185029Spjd		if (zio->io_error != 0) {
4567185029Spjd			/*
4568185029Spjd			 * Error - drop L2ARC entry.
4569185029Spjd			 */
4570185029Spjd			list_remove(buflist, ab);
4571251478Sdelphij			ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4572185029Spjd			ab->b_l2hdr = NULL;
4573248572Ssmh			trim_map_free(abl2->b_dev->l2ad_vdev, abl2->b_daddr,
4574248574Ssmh			    ab->b_size, 0);
4575185029Spjd			kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4576185029Spjd			ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4577185029Spjd		}
4578185029Spjd
4579185029Spjd		/*
4580185029Spjd		 * Allow ARC to begin reads to this L2ARC entry.
4581185029Spjd		 */
4582185029Spjd		ab->b_flags &= ~ARC_L2_WRITING;
4583185029Spjd
4584185029Spjd		mutex_exit(hash_lock);
4585185029Spjd	}
4586185029Spjd
4587185029Spjd	atomic_inc_64(&l2arc_writes_done);
4588185029Spjd	list_remove(buflist, head);
4589185029Spjd	kmem_cache_free(hdr_cache, head);
4590185029Spjd	mutex_exit(&l2arc_buflist_mtx);
4591185029Spjd
4592185029Spjd	l2arc_do_free_on_write();
4593185029Spjd
4594185029Spjd	kmem_free(cb, sizeof (l2arc_write_callback_t));
4595185029Spjd}
4596185029Spjd
4597185029Spjd/*
4598185029Spjd * A read to a cache device completed.  Validate buffer contents before
4599185029Spjd * handing over to the regular ARC routines.
4600185029Spjd */
4601185029Spjdstatic void
4602185029Spjdl2arc_read_done(zio_t *zio)
4603185029Spjd{
4604185029Spjd	l2arc_read_callback_t *cb;
4605185029Spjd	arc_buf_hdr_t *hdr;
4606185029Spjd	arc_buf_t *buf;
4607185029Spjd	kmutex_t *hash_lock;
4608185029Spjd	int equal;
4609185029Spjd
4610185029Spjd	ASSERT(zio->io_vd != NULL);
4611185029Spjd	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
4612185029Spjd
4613185029Spjd	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
4614185029Spjd
4615185029Spjd	cb = zio->io_private;
4616185029Spjd	ASSERT(cb != NULL);
4617185029Spjd	buf = cb->l2rcb_buf;
4618185029Spjd	ASSERT(buf != NULL);
4619185029Spjd
4620219089Spjd	hash_lock = HDR_LOCK(buf->b_hdr);
4621185029Spjd	mutex_enter(hash_lock);
4622219089Spjd	hdr = buf->b_hdr;
4623219089Spjd	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4624185029Spjd
4625185029Spjd	/*
4626251478Sdelphij	 * If the buffer was compressed, decompress it first.
4627251478Sdelphij	 */
4628251478Sdelphij	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
4629251478Sdelphij		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
4630251478Sdelphij	ASSERT(zio->io_data != NULL);
4631251478Sdelphij
4632251478Sdelphij	/*
4633185029Spjd	 * Check this survived the L2ARC journey.
4634185029Spjd	 */
4635185029Spjd	equal = arc_cksum_equal(buf);
4636185029Spjd	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
4637185029Spjd		mutex_exit(hash_lock);
4638185029Spjd		zio->io_private = buf;
4639185029Spjd		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
4640185029Spjd		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
4641185029Spjd		arc_read_done(zio);
4642185029Spjd	} else {
4643185029Spjd		mutex_exit(hash_lock);
4644185029Spjd		/*
4645185029Spjd		 * Buffer didn't survive caching.  Increment stats and
4646185029Spjd		 * reissue to the original storage device.
4647185029Spjd		 */
4648185029Spjd		if (zio->io_error != 0) {
4649185029Spjd			ARCSTAT_BUMP(arcstat_l2_io_error);
4650185029Spjd		} else {
4651249195Smm			zio->io_error = SET_ERROR(EIO);
4652185029Spjd		}
4653185029Spjd		if (!equal)
4654185029Spjd			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
4655185029Spjd
4656185029Spjd		/*
4657185029Spjd		 * If there's no waiter, issue an async i/o to the primary
4658185029Spjd		 * storage now.  If there *is* a waiter, the caller must
4659185029Spjd		 * issue the i/o in a context where it's OK to block.
4660185029Spjd		 */
4661209962Smm		if (zio->io_waiter == NULL) {
4662209962Smm			zio_t *pio = zio_unique_parent(zio);
4663209962Smm
4664209962Smm			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
4665209962Smm
4666209962Smm			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
4667185029Spjd			    buf->b_data, zio->io_size, arc_read_done, buf,
4668185029Spjd			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
4669209962Smm		}
4670185029Spjd	}
4671185029Spjd
4672185029Spjd	kmem_free(cb, sizeof (l2arc_read_callback_t));
4673185029Spjd}
4674185029Spjd
4675185029Spjd/*
4676185029Spjd * This is the list priority from which the L2ARC will search for pages to
4677185029Spjd * cache.  This is used within loops (0..2 * ARC_BUFC_NUMLISTS - 1) to
4678185029Spjd * cycle through lists in the desired order.  This order can have a
4679185029Spjd * significant effect on cache performance.
4680185029Spjd *
4681185029Spjd * Currently the metadata lists are hit first, MFU then MRU, followed by
4682185029Spjd * the data lists.  This function returns a locked list, and also returns
4683185029Spjd * the lock pointer.
4684185029Spjd */
4685185029Spjdstatic list_t *
4686185029Spjdl2arc_list_locked(int list_num, kmutex_t **lock)
4687185029Spjd{
4688247187Smm	list_t *list = NULL;
4689205231Skmacy	int idx;
4690185029Spjd
4691206796Spjd	ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
4692206796Spjd
4693205231Skmacy	if (list_num < ARC_BUFC_NUMMETADATALISTS) {
4694205231Skmacy		idx = list_num;
4695205231Skmacy		list = &arc_mfu->arcs_lists[idx];
4696205231Skmacy		*lock = ARCS_LOCK(arc_mfu, idx);
4697206796Spjd	} else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
4698205231Skmacy		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4699205231Skmacy		list = &arc_mru->arcs_lists[idx];
4700205231Skmacy		*lock = ARCS_LOCK(arc_mru, idx);
4701206796Spjd	} else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
4702205231Skmacy		ARC_BUFC_NUMDATALISTS)) {
4703205231Skmacy		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
4704205231Skmacy		list = &arc_mfu->arcs_lists[idx];
4705205231Skmacy		*lock = ARCS_LOCK(arc_mfu, idx);
4706205231Skmacy	} else {
4707205231Skmacy		idx = list_num - ARC_BUFC_NUMLISTS;
4708205231Skmacy		list = &arc_mru->arcs_lists[idx];
4709205231Skmacy		*lock = ARCS_LOCK(arc_mru, idx);
4710185029Spjd	}
4711185029Spjd
4712185029Spjd	ASSERT(!(MUTEX_HELD(*lock)));
4713185029Spjd	mutex_enter(*lock);
4714185029Spjd	return (list);
4715185029Spjd}
4716185029Spjd
4717185029Spjd/*
4718185029Spjd * Evict buffers from the device write hand to the distance specified in
4719185029Spjd * bytes.  This distance may span populated buffers, it may span nothing.
4720185029Spjd * This is clearing a region on the L2ARC device ready for writing.
4721185029Spjd * If the 'all' boolean is set, every buffer is evicted.
4722185029Spjd */
4723185029Spjdstatic void
4724185029Spjdl2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
4725185029Spjd{
4726185029Spjd	list_t *buflist;
4727185029Spjd	l2arc_buf_hdr_t *abl2;
4728185029Spjd	arc_buf_hdr_t *ab, *ab_prev;
4729185029Spjd	kmutex_t *hash_lock;
4730185029Spjd	uint64_t taddr;
4731185029Spjd
4732185029Spjd	buflist = dev->l2ad_buflist;
4733185029Spjd
4734185029Spjd	if (buflist == NULL)
4735185029Spjd		return;
4736185029Spjd
4737185029Spjd	if (!all && dev->l2ad_first) {
4738185029Spjd		/*
4739185029Spjd		 * This is the first sweep through the device.  There is
4740185029Spjd		 * nothing to evict.
4741185029Spjd		 */
4742185029Spjd		return;
4743185029Spjd	}
4744185029Spjd
4745185029Spjd	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
4746185029Spjd		/*
4747185029Spjd		 * When nearing the end of the device, evict to the end
4748185029Spjd		 * before the device write hand jumps to the start.
4749185029Spjd		 */
4750185029Spjd		taddr = dev->l2ad_end;
4751185029Spjd	} else {
4752185029Spjd		taddr = dev->l2ad_hand + distance;
4753185029Spjd	}
4754185029Spjd	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
4755185029Spjd	    uint64_t, taddr, boolean_t, all);
4756185029Spjd
4757185029Spjdtop:
4758185029Spjd	mutex_enter(&l2arc_buflist_mtx);
4759185029Spjd	for (ab = list_tail(buflist); ab; ab = ab_prev) {
4760185029Spjd		ab_prev = list_prev(buflist, ab);
4761185029Spjd
4762185029Spjd		hash_lock = HDR_LOCK(ab);
4763185029Spjd		if (!mutex_tryenter(hash_lock)) {
4764185029Spjd			/*
4765185029Spjd			 * Missed the hash lock.  Retry.
4766185029Spjd			 */
4767185029Spjd			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
4768185029Spjd			mutex_exit(&l2arc_buflist_mtx);
4769185029Spjd			mutex_enter(hash_lock);
4770185029Spjd			mutex_exit(hash_lock);
4771185029Spjd			goto top;
4772185029Spjd		}
4773185029Spjd
4774185029Spjd		if (HDR_L2_WRITE_HEAD(ab)) {
4775185029Spjd			/*
4776185029Spjd			 * We hit a write head node.  Leave it for
4777185029Spjd			 * l2arc_write_done().
4778185029Spjd			 */
4779185029Spjd			list_remove(buflist, ab);
4780185029Spjd			mutex_exit(hash_lock);
4781185029Spjd			continue;
4782185029Spjd		}
4783185029Spjd
4784185029Spjd		if (!all && ab->b_l2hdr != NULL &&
4785185029Spjd		    (ab->b_l2hdr->b_daddr > taddr ||
4786185029Spjd		    ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
4787185029Spjd			/*
4788185029Spjd			 * We've evicted to the target address,
4789185029Spjd			 * or the end of the device.
4790185029Spjd			 */
4791185029Spjd			mutex_exit(hash_lock);
4792185029Spjd			break;
4793185029Spjd		}
4794185029Spjd
4795185029Spjd		if (HDR_FREE_IN_PROGRESS(ab)) {
4796185029Spjd			/*
4797185029Spjd			 * Already on the path to destruction.
4798185029Spjd			 */
4799185029Spjd			mutex_exit(hash_lock);
4800185029Spjd			continue;
4801185029Spjd		}
4802185029Spjd
4803185029Spjd		if (ab->b_state == arc_l2c_only) {
4804185029Spjd			ASSERT(!HDR_L2_READING(ab));
4805185029Spjd			/*
4806185029Spjd			 * This doesn't exist in the ARC.  Destroy.
4807185029Spjd			 * arc_hdr_destroy() will call list_remove()
4808185029Spjd			 * and decrement arcstat_l2_size.
4809185029Spjd			 */
4810185029Spjd			arc_change_state(arc_anon, ab, hash_lock);
4811185029Spjd			arc_hdr_destroy(ab);
4812185029Spjd		} else {
4813185029Spjd			/*
4814185029Spjd			 * Invalidate issued or about to be issued
4815185029Spjd			 * reads, since we may be about to write
4816185029Spjd			 * over this location.
4817185029Spjd			 */
4818185029Spjd			if (HDR_L2_READING(ab)) {
4819185029Spjd				ARCSTAT_BUMP(arcstat_l2_evict_reading);
4820185029Spjd				ab->b_flags |= ARC_L2_EVICTED;
4821185029Spjd			}
4822185029Spjd
4823185029Spjd			/*
4824185029Spjd			 * Tell ARC this no longer exists in L2ARC.
4825185029Spjd			 */
4826185029Spjd			if (ab->b_l2hdr != NULL) {
4827185029Spjd				abl2 = ab->b_l2hdr;
4828251478Sdelphij				ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
4829185029Spjd				ab->b_l2hdr = NULL;
4830185029Spjd				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
4831185029Spjd				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
4832185029Spjd			}
4833185029Spjd			list_remove(buflist, ab);
4834185029Spjd
4835185029Spjd			/*
4836185029Spjd			 * This may have been leftover after a
4837185029Spjd			 * failed write.
4838185029Spjd			 */
4839185029Spjd			ab->b_flags &= ~ARC_L2_WRITING;
4840185029Spjd		}
4841185029Spjd		mutex_exit(hash_lock);
4842185029Spjd	}
4843185029Spjd	mutex_exit(&l2arc_buflist_mtx);
4844185029Spjd
4845219089Spjd	vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
4846185029Spjd	dev->l2ad_evict = taddr;
4847185029Spjd}
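
/*
 * For instance, with a 16MB write target the feed thread asks
 * l2arc_evict() to clear 16MB ahead of the write hand; once the hand
 * is within 32MB (2 * distance) of l2ad_end, eviction runs to the end
 * of the device so the next write pass can wrap back to l2ad_start.
 */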
4848185029Spjd
4849185029Spjd/*
4850185029Spjd * Find and write ARC buffers to the L2ARC device.
4851185029Spjd *
4852185029Spjd * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
4853185029Spjd * for reading until they have completed writing.
4854251478Sdelphij * The headroom_boost is an in-out parameter used to maintain headroom boost
4855251478Sdelphij * state between calls to this function.
4856251478Sdelphij *
4857251478Sdelphij * Returns the number of bytes actually written (which may be smaller than
4858251478Sdelphij * the delta by which the device hand has changed due to alignment).
4859185029Spjd */
4860208373Smmstatic uint64_t
4861251478Sdelphijl2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
4862251478Sdelphij    boolean_t *headroom_boost)
4863185029Spjd{
4864185029Spjd	arc_buf_hdr_t *ab, *ab_prev, *head;
4865185029Spjd	list_t *list;
4866251478Sdelphij	uint64_t write_asize, write_psize, write_sz, headroom,
4867251478Sdelphij	    buf_compress_minsz;
4868185029Spjd	void *buf_data;
4869251478Sdelphij	kmutex_t *list_lock;
4870251478Sdelphij	boolean_t full;
4871185029Spjd	l2arc_write_callback_t *cb;
4872185029Spjd	zio_t *pio, *wzio;
4873228103Smm	uint64_t guid = spa_load_guid(spa);
4874251478Sdelphij	const boolean_t do_headroom_boost = *headroom_boost;
4875185029Spjd	int try;
4876185029Spjd
4877185029Spjd	ASSERT(dev->l2ad_vdev != NULL);
4878185029Spjd
4879251478Sdelphij	/* Lower the flag now, we might want to raise it again later. */
4880251478Sdelphij	*headroom_boost = B_FALSE;
4881251478Sdelphij
4882185029Spjd	pio = NULL;
4883251478Sdelphij	write_sz = write_asize = write_psize = 0;
4884185029Spjd	full = B_FALSE;
4885185029Spjd	head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
4886185029Spjd	head->b_flags |= ARC_L2_WRITE_HEAD;
4887185029Spjd
4888205231Skmacy	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
4889185029Spjd	/*
4890251478Sdelphij	 * We will want to try to compress buffers that are at least 2x the
4891251478Sdelphij	 * device sector size.
4892251478Sdelphij	 */
4893251478Sdelphij	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
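	/*
	 * For instance, a vdev with 512-byte sectors (ashift 9) gives a
	 * buf_compress_minsz of 1KB, while a 4KB-sector vdev (ashift 12)
	 * gives 8KB; smaller buffers are written out uncompressed.
	 */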
4894251478Sdelphij
4895251478Sdelphij	/*
4896185029Spjd	 * Copy buffers for L2ARC writing.
4897185029Spjd	 */
4898185029Spjd	mutex_enter(&l2arc_buflist_mtx);
4899206796Spjd	for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
4900251478Sdelphij		uint64_t passed_sz = 0;
4901251478Sdelphij
4902185029Spjd		list = l2arc_list_locked(try, &list_lock);
4903205231Skmacy		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
4904185029Spjd
4905185029Spjd		/*
4906185029Spjd		 * L2ARC fast warmup.
4907185029Spjd		 *
4908185029Spjd		 * Until the ARC is warm and starts to evict, read from the
4909185029Spjd		 * head of the ARC lists rather than the tail.
4910185029Spjd		 */
4911185029Spjd		if (arc_warm == B_FALSE)
4912185029Spjd			ab = list_head(list);
4913185029Spjd		else
4914185029Spjd			ab = list_tail(list);
4915206796Spjd		if (ab == NULL)
4916205231Skmacy			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
4917185029Spjd
4918251478Sdelphij		headroom = target_sz * l2arc_headroom;
4919251478Sdelphij		if (do_headroom_boost)
4920251478Sdelphij			headroom = (headroom * l2arc_headroom_boost) / 100;
4921251478Sdelphij
4922185029Spjd		for (; ab; ab = ab_prev) {
4923251478Sdelphij			l2arc_buf_hdr_t *l2hdr;
4924251478Sdelphij			kmutex_t *hash_lock;
4925251478Sdelphij			uint64_t buf_sz;
4926251478Sdelphij
4927185029Spjd			if (arc_warm == B_FALSE)
4928185029Spjd				ab_prev = list_next(list, ab);
4929185029Spjd			else
4930185029Spjd				ab_prev = list_prev(list, ab);
4931205231Skmacy			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size);
4932206796Spjd
4933185029Spjd			hash_lock = HDR_LOCK(ab);
4934251478Sdelphij			if (!mutex_tryenter(hash_lock)) {
4935205231Skmacy				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
4936185029Spjd				/*
4937185029Spjd				 * Skip this buffer rather than waiting.
4938185029Spjd				 */
4939185029Spjd				continue;
4940185029Spjd			}
4941185029Spjd
4942185029Spjd			passed_sz += ab->b_size;
4943185029Spjd			if (passed_sz > headroom) {
4944185029Spjd				/*
4945185029Spjd				 * Searched too far.
4946185029Spjd				 */
4947185029Spjd				mutex_exit(hash_lock);
4948205231Skmacy				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
4949185029Spjd				break;
4950185029Spjd			}
4951185029Spjd
4952209962Smm			if (!l2arc_write_eligible(guid, ab)) {
4953185029Spjd				mutex_exit(hash_lock);
4954185029Spjd				continue;
4955185029Spjd			}
4956185029Spjd
4957185029Spjd			if ((write_sz + ab->b_size) > target_sz) {
4958185029Spjd				full = B_TRUE;
4959185029Spjd				mutex_exit(hash_lock);
4960205231Skmacy				ARCSTAT_BUMP(arcstat_l2_write_full);
4961185029Spjd				break;
4962185029Spjd			}
4963185029Spjd
4964185029Spjd			if (pio == NULL) {
4965185029Spjd				/*
4966185029Spjd				 * Insert a dummy header on the buflist so
4967185029Spjd				 * l2arc_write_done() can find where the
4968185029Spjd				 * write buffers begin without searching.
4969185029Spjd				 */
4970185029Spjd				list_insert_head(dev->l2ad_buflist, head);
4971185029Spjd
4972185029Spjd				cb = kmem_alloc(
4973185029Spjd				    sizeof (l2arc_write_callback_t), KM_SLEEP);
4974185029Spjd				cb->l2wcb_dev = dev;
4975185029Spjd				cb->l2wcb_head = head;
4976185029Spjd				pio = zio_root(spa, l2arc_write_done, cb,
4977185029Spjd				    ZIO_FLAG_CANFAIL);
4978205231Skmacy				ARCSTAT_BUMP(arcstat_l2_write_pios);
4979185029Spjd			}
4980185029Spjd
4981185029Spjd			/*
4982185029Spjd			 * Create and add a new L2ARC header.
4983185029Spjd			 */
4984251478Sdelphij			l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
4985251478Sdelphij			l2hdr->b_dev = dev;
4986251478Sdelphij			ab->b_flags |= ARC_L2_WRITING;
4987185029Spjd
4988251478Sdelphij			/*
4989251478Sdelphij			 * Temporarily stash the data buffer in b_tmp_cdata.
4990251478Sdelphij			 * The subsequent write step will pick it up from
4991251478Sdelphij			 * there. This is because we can't access ab->b_buf
4992251478Sdelphij			 * without holding the hash_lock, which in turn we
4993251478Sdelphij			 * can't access without holding the ARC list locks
4994251478Sdelphij			 * (which we want to avoid during compression/writing).
4995251478Sdelphij			 */
4996251478Sdelphij			l2hdr->b_compress = ZIO_COMPRESS_OFF;
4997251478Sdelphij			l2hdr->b_asize = ab->b_size;
4998251478Sdelphij			l2hdr->b_tmp_cdata = ab->b_buf->b_data;
4999251478Sdelphij
5000185029Spjd			buf_sz = ab->b_size;
5001251478Sdelphij			ab->b_l2hdr = l2hdr;
5002185029Spjd
5003251478Sdelphij			list_insert_head(dev->l2ad_buflist, ab);
5004251478Sdelphij
5005185029Spjd			/*
5006185029Spjd			 * Compute and store the buffer cksum before
5007185029Spjd			 * writing.  On debug the cksum is verified first.
5008185029Spjd			 */
5009185029Spjd			arc_cksum_verify(ab->b_buf);
5010185029Spjd			arc_cksum_compute(ab->b_buf, B_TRUE);
5011185029Spjd
5012185029Spjd			mutex_exit(hash_lock);
5013185029Spjd
5014251478Sdelphij			write_sz += buf_sz;
5015251478Sdelphij		}
5016251478Sdelphij
5017251478Sdelphij		mutex_exit(list_lock);
5018251478Sdelphij
5019251478Sdelphij		if (full == B_TRUE)
5020251478Sdelphij			break;
5021251478Sdelphij	}
5022251478Sdelphij
5023251478Sdelphij	/* No buffers selected for writing? */
5024251478Sdelphij	if (pio == NULL) {
5025251478Sdelphij		ASSERT0(write_sz);
5026251478Sdelphij		mutex_exit(&l2arc_buflist_mtx);
5027251478Sdelphij		kmem_cache_free(hdr_cache, head);
5028251478Sdelphij		return (0);
5029251478Sdelphij	}
5030251478Sdelphij
5031251478Sdelphij	/*
5032251478Sdelphij	 * Now start writing the buffers. We start at the write head
5033251478Sdelphij	 * and work backwards, retracing the course of the buffer selector
5034251478Sdelphij	 * loop above.
5035251478Sdelphij	 */
5036251478Sdelphij	for (ab = list_prev(dev->l2ad_buflist, head); ab;
5037251478Sdelphij	    ab = list_prev(dev->l2ad_buflist, ab)) {
5038251478Sdelphij		l2arc_buf_hdr_t *l2hdr;
5039251478Sdelphij		uint64_t buf_sz;
5040251478Sdelphij
5041251478Sdelphij		/*
5042251478Sdelphij		 * We shouldn't need to lock the buffer here, since we flagged
5043251478Sdelphij		 * it as ARC_L2_WRITING in the previous step, but we must take
5044251478Sdelphij		 * care to only access its L2 cache parameters. In particular,
5045251478Sdelphij		 * ab->b_buf may be invalid by now due to ARC eviction.
5046251478Sdelphij		 */
5047251478Sdelphij		l2hdr = ab->b_l2hdr;
5048251478Sdelphij		l2hdr->b_daddr = dev->l2ad_hand;
5049251478Sdelphij
5050251478Sdelphij		if ((ab->b_flags & ARC_L2COMPRESS) &&
5051251478Sdelphij		    l2hdr->b_asize >= buf_compress_minsz) {
5052251478Sdelphij			if (l2arc_compress_buf(l2hdr)) {
5053251478Sdelphij				/*
5054251478Sdelphij				 * If compression succeeded, enable headroom
5055251478Sdelphij				 * boost on the next scan cycle.
5056251478Sdelphij				 */
5057251478Sdelphij				*headroom_boost = B_TRUE;
5058251478Sdelphij			}
5059251478Sdelphij		}
5060251478Sdelphij
5061251478Sdelphij		/*
5062251478Sdelphij		 * Pick up the buffer data we had previously stashed away
5063251478Sdelphij		 * (and now potentially also compressed).
5064251478Sdelphij		 */
5065251478Sdelphij		buf_data = l2hdr->b_tmp_cdata;
5066251478Sdelphij		buf_sz = l2hdr->b_asize;
5067251478Sdelphij
5068251478Sdelphij		/* Compression may have squashed the buffer to zero length. */
5069251478Sdelphij		if (buf_sz != 0) {
5070251478Sdelphij			uint64_t buf_p_sz;
5071251478Sdelphij
5072185029Spjd			wzio = zio_write_phys(pio, dev->l2ad_vdev,
5073185029Spjd			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5074185029Spjd			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5075185029Spjd			    ZIO_FLAG_CANFAIL, B_FALSE);
5076185029Spjd
5077185029Spjd			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5078185029Spjd			    zio_t *, wzio);
5079185029Spjd			(void) zio_nowait(wzio);
5080185029Spjd
5081251478Sdelphij			write_asize += buf_sz;
5082185029Spjd			/*
5083185029Spjd			 * Keep the clock hand suitably device-aligned.
5084185029Spjd			 */
5085251478Sdelphij			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5086251478Sdelphij			write_psize += buf_p_sz;
5087251478Sdelphij			dev->l2ad_hand += buf_p_sz;
5088185029Spjd		}
5089251478Sdelphij	}
5090185029Spjd
5091185029Spjd	mutex_exit(&l2arc_buflist_mtx);
5092185029Spjd
5093251478Sdelphij	ASSERT3U(write_asize, <=, target_sz);
5094185029Spjd	ARCSTAT_BUMP(arcstat_l2_writes_sent);
5095251478Sdelphij	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5096185029Spjd	ARCSTAT_INCR(arcstat_l2_size, write_sz);
5097251478Sdelphij	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
5098251478Sdelphij	vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
5099185029Spjd
5100185029Spjd	/*
5101185029Spjd	 * Bump device hand to the device start if it is approaching the end.
5102185029Spjd	 * l2arc_evict() will already have evicted ahead for this case.
5103185029Spjd	 */
5104185029Spjd	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
5105219089Spjd		vdev_space_update(dev->l2ad_vdev,
5106219089Spjd		    dev->l2ad_end - dev->l2ad_hand, 0, 0);
5107185029Spjd		dev->l2ad_hand = dev->l2ad_start;
5108185029Spjd		dev->l2ad_evict = dev->l2ad_start;
5109185029Spjd		dev->l2ad_first = B_FALSE;
5110185029Spjd	}
5111185029Spjd
5112208373Smm	dev->l2ad_writing = B_TRUE;
5113185029Spjd	(void) zio_wait(pio);
5114208373Smm	dev->l2ad_writing = B_FALSE;
5115208373Smm
5116251478Sdelphij	return (write_asize);
5117185029Spjd}
5118185029Spjd
5119185029Spjd/*
5120251478Sdelphij * Compresses an L2ARC buffer.
5121251478Sdelphij * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
5122251478Sdelphij * size in l2hdr->b_asize. This routine tries to compress the data and
5123251478Sdelphij * depending on the compression result there are three possible outcomes:
5124251478Sdelphij * *) The buffer was incompressible. The original l2hdr contents were left
5125251478Sdelphij *    untouched and are ready for writing to an L2 device.
5126251478Sdelphij * *) The buffer was all-zeros, so there is no need to write it to an L2
5127251478Sdelphij *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
5128251478Sdelphij *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
5129251478Sdelphij * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
5130251478Sdelphij *    data buffer which holds the compressed data to be written, and b_asize
5131251478Sdelphij *    tells us how much data there is. b_compress is set to the appropriate
5132251478Sdelphij *    compression algorithm. Once writing is done, invoke
5133251478Sdelphij *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
5134251478Sdelphij *
5135251478Sdelphij * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
5136251478Sdelphij * buffer was incompressible).
5137251478Sdelphij */
5138251478Sdelphijstatic boolean_t
5139251478Sdelphijl2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
5140251478Sdelphij{
5141251478Sdelphij	void *cdata;
5142251478Sdelphij	size_t csize, len;
5143251478Sdelphij
5144251478Sdelphij	ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
5145251478Sdelphij	ASSERT(l2hdr->b_tmp_cdata != NULL);
5146251478Sdelphij
5147251478Sdelphij	len = l2hdr->b_asize;
5148251478Sdelphij	cdata = zio_data_buf_alloc(len);
5149251478Sdelphij	csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
5150256889Ssmh	    cdata, l2hdr->b_asize, (size_t)(1ULL << l2hdr->b_dev->l2ad_vdev->vdev_ashift));
5151251478Sdelphij
5152251478Sdelphij	if (csize == 0) {
5153251478Sdelphij		/* zero block, indicate that there's nothing to write */
5154251478Sdelphij		zio_data_buf_free(cdata, len);
5155251478Sdelphij		l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
5156251478Sdelphij		l2hdr->b_asize = 0;
5157251478Sdelphij		l2hdr->b_tmp_cdata = NULL;
5158251478Sdelphij		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
5159251478Sdelphij		return (B_TRUE);
5160251478Sdelphij	} else if (csize > 0 && csize < len) {
5161251478Sdelphij		/*
5162251478Sdelphij		 * Compression succeeded, we'll keep the cdata around for
5163251478Sdelphij		 * writing and release it afterwards.
5164251478Sdelphij		 */
5165251478Sdelphij		l2hdr->b_compress = ZIO_COMPRESS_LZ4;
5166251478Sdelphij		l2hdr->b_asize = csize;
5167251478Sdelphij		l2hdr->b_tmp_cdata = cdata;
5168251478Sdelphij		ARCSTAT_BUMP(arcstat_l2_compress_successes);
5169251478Sdelphij		return (B_TRUE);
5170251478Sdelphij	} else {
5171251478Sdelphij		/*
5172251478Sdelphij		 * Compression failed, release the compressed buffer.
5173251478Sdelphij		 * l2hdr will be left unmodified.
5174251478Sdelphij		 */
5175251478Sdelphij		zio_data_buf_free(cdata, len);
5176251478Sdelphij		ARCSTAT_BUMP(arcstat_l2_compress_failures);
5177251478Sdelphij		return (B_FALSE);
5178251478Sdelphij	}
5179251478Sdelphij}
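
/*
 * A sketch of the three outcomes above for a hypothetical 128KB buffer
 * (len = 131072): an all-zero buffer compresses to csize 0 and is
 * marked ZIO_COMPRESS_EMPTY with b_asize 0; a typical metadata buffer
 * might compress to, say, csize 40960, leaving b_compress set to
 * ZIO_COMPRESS_LZ4 and b_asize = 40960; and an already compressed or
 * encrypted payload yields csize >= len, so the l2hdr is left
 * untouched and the buffer is later written out as-is.
 */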
5180251478Sdelphij
5181251478Sdelphij/*
5182251478Sdelphij * Decompresses a zio read back from an l2arc device. On success, the
5183251478Sdelphij * underlying zio's io_data buffer is overwritten by the uncompressed
5184251478Sdelphij * version. On decompression error (corrupt compressed stream), the
5185251478Sdelphij * zio->io_error value is set to signal an I/O error.
5186251478Sdelphij *
5187251478Sdelphij * Please note that the compressed data stream is not checksummed, so
5188251478Sdelphij * if the underlying device is experiencing data corruption, we may feed
5189251478Sdelphij * corrupt data to the decompressor, so the decompressor needs to be
5190251478Sdelphij * able to handle this situation (LZ4 does).
5191251478Sdelphij */
5192251478Sdelphijstatic void
5193251478Sdelphijl2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
5194251478Sdelphij{
5195251478Sdelphij	ASSERT(L2ARC_IS_VALID_COMPRESS(c));
5196251478Sdelphij
5197251478Sdelphij	if (zio->io_error != 0) {
5198251478Sdelphij		/*
5199251478Sdelphij		 * An io error has occurred, just restore the original io
5200251478Sdelphij		 * size in preparation for a main pool read.
5201251478Sdelphij		 */
5202251478Sdelphij		zio->io_orig_size = zio->io_size = hdr->b_size;
5203251478Sdelphij		return;
5204251478Sdelphij	}
5205251478Sdelphij
5206251478Sdelphij	if (c == ZIO_COMPRESS_EMPTY) {
5207251478Sdelphij		/*
5208251478Sdelphij		 * An empty buffer results in a null zio, which means we
5209251478Sdelphij		 * need to fill its io_data after we're done restoring the
5210251478Sdelphij		 * buffer's contents.
5211251478Sdelphij		 */
5212251478Sdelphij		ASSERT(hdr->b_buf != NULL);
5213251478Sdelphij		bzero(hdr->b_buf->b_data, hdr->b_size);
5214251478Sdelphij		zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
5215251478Sdelphij	} else {
5216251478Sdelphij		ASSERT(zio->io_data != NULL);
5217251478Sdelphij		/*
5218251478Sdelphij		 * We copy the compressed data from the start of the arc buffer
5219251478Sdelphij		 * (the zio_read will have pulled in only what we need, the
5220251478Sdelphij		 * rest is garbage which we will overwrite at decompression)
5221251478Sdelphij		 * and then decompress back to the ARC data buffer. This way we
5222251478Sdelphij		 * can minimize copying by simply decompressing back over the
5223251478Sdelphij		 * original compressed data (rather than decompressing to an
5224251478Sdelphij		 * aux buffer and then copying back the uncompressed buffer,
5225251478Sdelphij		 * which is likely to be much larger).
5226251478Sdelphij		 */
5227251478Sdelphij		uint64_t csize;
5228251478Sdelphij		void *cdata;
5229251478Sdelphij
5230251478Sdelphij		csize = zio->io_size;
5231251478Sdelphij		cdata = zio_data_buf_alloc(csize);
5232251478Sdelphij		bcopy(zio->io_data, cdata, csize);
5233251478Sdelphij		if (zio_decompress_data(c, cdata, zio->io_data, csize,
5234251478Sdelphij		    hdr->b_size) != 0)
5235251478Sdelphij			zio->io_error = EIO;
5236251478Sdelphij		zio_data_buf_free(cdata, csize);
5237251478Sdelphij	}
5238251478Sdelphij
5239251478Sdelphij	/* Restore the expected uncompressed IO size. */
5240251478Sdelphij	zio->io_orig_size = zio->io_size = hdr->b_size;
5241251478Sdelphij}
5242251478Sdelphij
5243251478Sdelphij/*
5244251478Sdelphij * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5245251478Sdelphij * This buffer serves as a temporary holder of compressed data while
5246251478Sdelphij * the buffer entry is being written to an l2arc device. Once that is
5247251478Sdelphij * done, we can dispose of it.
5248251478Sdelphij */
5249251478Sdelphijstatic void
5250251478Sdelphijl2arc_release_cdata_buf(arc_buf_hdr_t *ab)
5251251478Sdelphij{
5252251478Sdelphij	l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
5253251478Sdelphij
5254251478Sdelphij	if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
5255251478Sdelphij		/*
5256251478Sdelphij		 * If the data was compressed, then we've allocated a
5257251478Sdelphij		 * temporary buffer for it, so now we need to release it.
5258251478Sdelphij		 */
5259251478Sdelphij		ASSERT(l2hdr->b_tmp_cdata != NULL);
5260251478Sdelphij		zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
5261251478Sdelphij	}
5262251478Sdelphij	l2hdr->b_tmp_cdata = NULL;
5263251478Sdelphij}
5264251478Sdelphij
5265251478Sdelphij/*
5266185029Spjd * This thread feeds the L2ARC at regular intervals.  This is the beating
5267185029Spjd * heart of the L2ARC.
5268185029Spjd */
5269185029Spjdstatic void
5270185029Spjdl2arc_feed_thread(void *dummy __unused)
5271185029Spjd{
5272185029Spjd	callb_cpr_t cpr;
5273185029Spjd	l2arc_dev_t *dev;
5274185029Spjd	spa_t *spa;
5275208373Smm	uint64_t size, wrote;
5276219089Spjd	clock_t begin, next = ddi_get_lbolt();
5277251478Sdelphij	boolean_t headroom_boost = B_FALSE;
5278185029Spjd
5279185029Spjd	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5280185029Spjd
5281185029Spjd	mutex_enter(&l2arc_feed_thr_lock);
5282185029Spjd
5283185029Spjd	while (l2arc_thread_exit == 0) {
5284185029Spjd		CALLB_CPR_SAFE_BEGIN(&cpr);
5285185029Spjd		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
5286219089Spjd		    next - ddi_get_lbolt());
5287185029Spjd		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
5288219089Spjd		next = ddi_get_lbolt() + hz;
5289185029Spjd
5290185029Spjd		/*
5291185029Spjd		 * Quick check for L2ARC devices.
5292185029Spjd		 */
5293185029Spjd		mutex_enter(&l2arc_dev_mtx);
5294185029Spjd		if (l2arc_ndev == 0) {
5295185029Spjd			mutex_exit(&l2arc_dev_mtx);
5296185029Spjd			continue;
5297185029Spjd		}
5298185029Spjd		mutex_exit(&l2arc_dev_mtx);
5299219089Spjd		begin = ddi_get_lbolt();
5300185029Spjd
5301185029Spjd		/*
5302185029Spjd		 * This selects the next l2arc device to write to, and in
5303185029Spjd		 * doing so the next spa to feed from: dev->l2ad_spa.   This
5304185029Spjd		 * will return NULL if there are now no l2arc devices or if
5305185029Spjd		 * they are all faulted.
5306185029Spjd		 *
5307185029Spjd		 * If a device is returned, its spa's config lock is also
5308185029Spjd		 * held to prevent device removal.  l2arc_dev_get_next()
5309185029Spjd		 * will grab and release l2arc_dev_mtx.
5310185029Spjd		 */
5311185029Spjd		if ((dev = l2arc_dev_get_next()) == NULL)
5312185029Spjd			continue;
5313185029Spjd
5314185029Spjd		spa = dev->l2ad_spa;
5315185029Spjd		ASSERT(spa != NULL);
5316185029Spjd
5317185029Spjd		/*
5318219089Spjd		 * If the pool is read-only then force the feed thread to
5319219089Spjd		 * sleep a little longer.
5320219089Spjd		 */
5321219089Spjd		if (!spa_writeable(spa)) {
5322219089Spjd			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5323219089Spjd			spa_config_exit(spa, SCL_L2ARC, dev);
5324219089Spjd			continue;
5325219089Spjd		}
5326219089Spjd
5327219089Spjd		/*
5328185029Spjd		 * Avoid contributing to memory pressure.
5329185029Spjd		 */
5330185029Spjd		if (arc_reclaim_needed()) {
5331185029Spjd			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5332185029Spjd			spa_config_exit(spa, SCL_L2ARC, dev);
5333185029Spjd			continue;
5334185029Spjd		}
5335185029Spjd
5336185029Spjd		ARCSTAT_BUMP(arcstat_l2_feeds);
5337185029Spjd
5338251478Sdelphij		size = l2arc_write_size();
5339185029Spjd
5340185029Spjd		/*
5341185029Spjd		 * Evict L2ARC buffers that will be overwritten.
5342185029Spjd		 */
5343185029Spjd		l2arc_evict(dev, size, B_FALSE);
5344185029Spjd
5345185029Spjd		/*
5346185029Spjd		 * Write ARC buffers.
5347185029Spjd		 */
5348251478Sdelphij		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5349208373Smm
5350208373Smm		/*
5351208373Smm		 * Calculate interval between writes.
5352208373Smm		 */
5353208373Smm		next = l2arc_write_interval(begin, size, wrote);
5354185029Spjd		spa_config_exit(spa, SCL_L2ARC, dev);
5355185029Spjd	}
5356185029Spjd
5357185029Spjd	l2arc_thread_exit = 0;
5358185029Spjd	cv_broadcast(&l2arc_feed_thr_cv);
5359185029Spjd	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
5360185029Spjd	thread_exit();
5361185029Spjd}
5362185029Spjd
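/*
 * Determine whether the given vdev is currently registered as an L2ARC
 * device, i.e. whether it appears on the global l2arc_dev_list.
 */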
5363185029Spjdboolean_t
5364185029Spjdl2arc_vdev_present(vdev_t *vd)
5365185029Spjd{
5366185029Spjd	l2arc_dev_t *dev;
5367185029Spjd
5368185029Spjd	mutex_enter(&l2arc_dev_mtx);
5369185029Spjd	for (dev = list_head(l2arc_dev_list); dev != NULL;
5370185029Spjd	    dev = list_next(l2arc_dev_list, dev)) {
5371185029Spjd		if (dev->l2ad_vdev == vd)
5372185029Spjd			break;
5373185029Spjd	}
5374185029Spjd	mutex_exit(&l2arc_dev_mtx);
5375185029Spjd
5376185029Spjd	return (dev != NULL);
5377185029Spjd}
5378185029Spjd
5379185029Spjd/*
5380185029Spjd * Add a vdev for use by the L2ARC.  By this point the spa has already
5381185029Spjd * validated the vdev and opened it.
5382185029Spjd */
5383185029Spjdvoid
5384219089Spjdl2arc_add_vdev(spa_t *spa, vdev_t *vd)
5385185029Spjd{
5386185029Spjd	l2arc_dev_t *adddev;
5387185029Spjd
5388185029Spjd	ASSERT(!l2arc_vdev_present(vd));
5389185029Spjd
5390255753Sgibbs	vdev_ashift_optimize(vd);
5391255753Sgibbs
5392185029Spjd	/*
5393185029Spjd	 * Create a new l2arc device entry.
5394185029Spjd	 */
5395185029Spjd	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5396185029Spjd	adddev->l2ad_spa = spa;
5397185029Spjd	adddev->l2ad_vdev = vd;
5398219089Spjd	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5399219089Spjd	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
5400185029Spjd	adddev->l2ad_hand = adddev->l2ad_start;
5401185029Spjd	adddev->l2ad_evict = adddev->l2ad_start;
5402185029Spjd	adddev->l2ad_first = B_TRUE;
5403208373Smm	adddev->l2ad_writing = B_FALSE;
5404185029Spjd
5405185029Spjd	/*
5406185029Spjd	 * This is a list of all ARC buffers that are still valid on the
5407185029Spjd	 * device.
5408185029Spjd	 */
5409185029Spjd	adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
5410185029Spjd	list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5411185029Spjd	    offsetof(arc_buf_hdr_t, b_l2node));
5412185029Spjd
5413219089Spjd	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5414185029Spjd
5415185029Spjd	/*
5416185029Spjd	 * Add device to global list
5417185029Spjd	 */
5418185029Spjd	mutex_enter(&l2arc_dev_mtx);
5419185029Spjd	list_insert_head(l2arc_dev_list, adddev);
5420185029Spjd	atomic_inc_64(&l2arc_ndev);
5421185029Spjd	mutex_exit(&l2arc_dev_mtx);
5422185029Spjd}
5423185029Spjd
5424185029Spjd/*
5425185029Spjd * Remove a vdev from the L2ARC.
5426185029Spjd */
5427185029Spjdvoid
5428185029Spjdl2arc_remove_vdev(vdev_t *vd)
5429185029Spjd{
5430185029Spjd	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
5431185029Spjd
5432185029Spjd	/*
5433185029Spjd	 * Find the device by vdev
5434185029Spjd	 */
5435185029Spjd	mutex_enter(&l2arc_dev_mtx);
5436185029Spjd	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
5437185029Spjd		nextdev = list_next(l2arc_dev_list, dev);
5438185029Spjd		if (vd == dev->l2ad_vdev) {
5439185029Spjd			remdev = dev;
5440185029Spjd			break;
5441185029Spjd		}
5442185029Spjd	}
5443185029Spjd	ASSERT(remdev != NULL);
5444185029Spjd
5445185029Spjd	/*
5446185029Spjd	 * Remove device from global list
5447185029Spjd	 */
5448185029Spjd	list_remove(l2arc_dev_list, remdev);
5449185029Spjd	l2arc_dev_last = NULL;		/* may have been invalidated */
5450185029Spjd	atomic_dec_64(&l2arc_ndev);
5451185029Spjd	mutex_exit(&l2arc_dev_mtx);
5452185029Spjd
5453185029Spjd	/*
5454185029Spjd	 * Clear all buflists and ARC references; this flushes the L2ARC device.
5455185029Spjd	 */
5456185029Spjd	l2arc_evict(remdev, 0, B_TRUE);
5457185029Spjd	list_destroy(remdev->l2ad_buflist);
5458185029Spjd	kmem_free(remdev->l2ad_buflist, sizeof (list_t));
5459185029Spjd	kmem_free(remdev, sizeof (l2arc_dev_t));
5460185029Spjd}
5461185029Spjd
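/*
 * Initialize the global L2ARC state: the feed thread lock and condition
 * variable, the device and buflist locks, and the global device and
 * free-on-write lists.
 */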
5462185029Spjdvoid
5463185029Spjdl2arc_init(void)
5464185029Spjd{
5465185029Spjd	l2arc_thread_exit = 0;
5466185029Spjd	l2arc_ndev = 0;
5467185029Spjd	l2arc_writes_sent = 0;
5468185029Spjd	l2arc_writes_done = 0;
5469185029Spjd
5470185029Spjd	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
5471185029Spjd	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
5472185029Spjd	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
5473185029Spjd	mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
5474185029Spjd	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
5475185029Spjd
5476185029Spjd	l2arc_dev_list = &L2ARC_dev_list;
5477185029Spjd	l2arc_free_on_write = &L2ARC_free_on_write;
5478185029Spjd	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
5479185029Spjd	    offsetof(l2arc_dev_t, l2ad_node));
5480185029Spjd	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
5481185029Spjd	    offsetof(l2arc_data_free_t, l2df_list_node));
5482185029Spjd}
5483185029Spjd
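/*
 * Tear down the global L2ARC state created by l2arc_init(), releasing any
 * buffers still queued for free-on-write.
 */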
5484185029Spjdvoid
5485185029Spjdl2arc_fini(void)
5486185029Spjd{
5487185029Spjd	/*
5488185029Spjd	 * This is called from dmu_fini(), which is called from spa_fini().
5489185029Spjd	 * Because of this, we can assume that all l2arc devices have
5490185029Spjd	 * already been removed when the pools themselves were removed.
5491185029Spjd	 */
5492185029Spjd
5493185029Spjd	l2arc_do_free_on_write();
5494185029Spjd
5495185029Spjd	mutex_destroy(&l2arc_feed_thr_lock);
5496185029Spjd	cv_destroy(&l2arc_feed_thr_cv);
5497185029Spjd	mutex_destroy(&l2arc_dev_mtx);
5498185029Spjd	mutex_destroy(&l2arc_buflist_mtx);
5499185029Spjd	mutex_destroy(&l2arc_free_on_write_mtx);
5500185029Spjd
5501185029Spjd	list_destroy(l2arc_dev_list);
5502185029Spjd	list_destroy(l2arc_free_on_write);
5503185029Spjd}
5504185029Spjd
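/*
 * Create the L2ARC feed thread.  This is a no-op if the pool has not been
 * opened for writing.
 */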
5505185029Spjdvoid
5506185029Spjdl2arc_start(void)
5507185029Spjd{
5508209962Smm	if (!(spa_mode_global & FWRITE))
5509185029Spjd		return;
5510185029Spjd
5511185029Spjd	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
5512185029Spjd	    TS_RUN, minclsyspri);
5513185029Spjd}
5514185029Spjd
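/*
 * Ask the feed thread to exit and wait until it has acknowledged by
 * clearing l2arc_thread_exit.
 */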
5515185029Spjdvoid
5516185029Spjdl2arc_stop(void)
5517185029Spjd{
5518209962Smm	if (!(spa_mode_global & FWRITE))
5519185029Spjd		return;
5520185029Spjd
5521185029Spjd	mutex_enter(&l2arc_feed_thr_lock);
5522185029Spjd	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
5523185029Spjd	l2arc_thread_exit = 1;
5524185029Spjd	while (l2arc_thread_exit != 0)
5525185029Spjd		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
5526185029Spjd	mutex_exit(&l2arc_feed_thr_lock);
5527185029Spjd}
5528