arc.c revision 286570
1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23277826Sdelphij * Copyright (c) 2012, Joyent, Inc. All rights reserved.
24268123Sdelphij * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
25260835Sdelphij * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
26268085Sdelphij * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
27168404Spjd */
28168404Spjd
29168404Spjd/*
30168404Spjd * DVA-based Adjustable Replacement Cache
31168404Spjd *
32168404Spjd * While much of the theory of operation used here is
33168404Spjd * based on the self-tuning, low overhead replacement cache
34168404Spjd * presented by Megiddo and Modha at FAST 2003, there are some
35168404Spjd * significant differences:
36168404Spjd *
37168404Spjd * 1. The Megiddo and Modha model assumes any page is evictable.
38168404Spjd * Pages in its cache cannot be "locked" into memory.  This makes
39168404Spjd * the eviction algorithm simple: evict the last page in the list.
40168404Spjd * This also makes the performance characteristics easy to reason
41168404Spjd * about.  Our cache is not so simple.  At any given moment, some
42168404Spjd * subset of the blocks in the cache are un-evictable because we
43168404Spjd * have handed out a reference to them.  Blocks are only evictable
44168404Spjd * when there are no external references active.  This makes
45168404Spjd * eviction far more problematic:  we choose to evict the evictable
46168404Spjd * blocks that are the "lowest" in the list.
47168404Spjd *
48168404Spjd * There are times when it is not possible to evict the requested
49168404Spjd * space.  In these circumstances we are unable to adjust the cache
50168404Spjd * size.  To prevent the cache growing unbounded at these times we
51185029Spjd * implement a "cache throttle" that slows the flow of new data
52185029Spjd * into the cache until we can make space available.
53168404Spjd *
54168404Spjd * 2. The Megiddo and Modha model assumes a fixed cache size.
55168404Spjd * Pages are evicted when the cache is full and there is a cache
56168404Spjd * miss.  Our model has a variable sized cache.  It grows with
57185029Spjd * high use, but also tries to react to memory pressure from the
58168404Spjd * operating system: decreasing its size when system memory is
59168404Spjd * tight.
60168404Spjd *
61168404Spjd * 3. The Megiddo and Modha model assumes a fixed page size. All
62251631Sdelphij * elements of the cache are therefore exactly the same size.  So
63168404Spjd * when adjusting the cache size following a cache miss, it's simply
64168404Spjd * a matter of choosing a single page to evict.  In our model, we
65168404Spjd * have variable-sized cache blocks (ranging from 512 bytes to
66251631Sdelphij * 128K bytes).  We therefore choose a set of blocks to evict to make
67168404Spjd * space for a cache miss that approximates as closely as possible
68168404Spjd * the space used by the new block.
69168404Spjd *
70168404Spjd * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
71168404Spjd * by N. Megiddo & D. Modha, FAST 2003
72168404Spjd */
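
/*
 * Illustrative sketch only (not the actual eviction code): with
 * variable-sized blocks, making room for a miss means walking a list
 * from its least-recently-used end and accumulating evictable buffers
 * until roughly the requested number of bytes has been freed:
 *
 *	uint64_t freed = 0;
 *	for (hdr = list_tail(list); hdr != NULL && freed < bytes_needed;
 *	    hdr = list_prev(list, hdr)) {
 *		if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt))
 *			freed += hdr->b_size;
 *	}
 *
 * The real logic, in the arc_evict_*() routines below, additionally
 * honors buffer type, spa and lock-contention constraints.
 */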
73168404Spjd
74168404Spjd/*
75168404Spjd * The locking model:
76168404Spjd *
77168404Spjd * A new reference to a cache buffer can be obtained in two
78168404Spjd * ways: 1) via a hash table lookup using the DVA as a key,
79185029Spjd * or 2) via one of the ARC lists.  The arc_read() interface
80168404Spjd * uses method 1, while the internal arc algorithms for
81251631Sdelphij * adjusting the cache use method 2.  We therefore provide two
82168404Spjd * types of locks: 1) the hash table lock array, and 2) the
83168404Spjd * arc list locks.
84168404Spjd *
85168404Spjd * Buffers do not have their own mutexes; rather, they rely on the
86168404Spjd * hash table mutexes for the bulk of their protection (i.e. most
87168404Spjd * fields in the arc_buf_hdr_t are protected by these mutexes).
88168404Spjd *
89168404Spjd * buf_hash_find() returns the appropriate mutex (held) when it
90168404Spjd * locates the requested buffer in the hash table.  It returns
91168404Spjd * NULL for the mutex if the buffer was not in the table.
92168404Spjd *
93168404Spjd * buf_hash_remove() expects the appropriate hash mutex to be
94168404Spjd * already held before it is invoked.
95168404Spjd *
96168404Spjd * Each arc state also has a mutex which is used to protect the
97168404Spjd * buffer list associated with the state.  When attempting to
98168404Spjd * obtain a hash table lock while holding an arc list lock, you
99168404Spjd * must use mutex_tryenter() to avoid deadlock.  Also note that
100168404Spjd * the active state mutex must be held before the ghost state mutex.
101168404Spjd *
102168404Spjd * Arc buffers may have an associated eviction callback function.
103168404Spjd * This function will be invoked prior to removing the buffer (e.g.
104168404Spjd * in arc_do_user_evicts()).  Note however that the data associated
105168404Spjd * with the buffer may be evicted prior to the callback.  The callback
106168404Spjd * must be made with *no locks held* (to prevent deadlock).  Additionally,
107168404Spjd * the users of callbacks must ensure that their private data is
108268858Sdelphij * protected from simultaneous callbacks from arc_clear_callback()
109168404Spjd * and arc_do_user_evicts().
110168404Spjd *
111168404Spjd * Note that the majority of the performance stats are manipulated
112168404Spjd * with atomic operations.
113185029Spjd *
114286570Smav * The L2ARC uses the l2ad_mtx on each vdev for the following:
115185029Spjd *
116185029Spjd *	- L2ARC buflist creation
117185029Spjd *	- L2ARC buflist eviction
118185029Spjd *	- L2ARC write completion, which walks L2ARC buflists
119185029Spjd *	- ARC header destruction, as it removes from L2ARC buflists
120185029Spjd *	- ARC header release, as it removes from L2ARC buflists
121168404Spjd */
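
/*
 * A minimal sketch of the hash-lock pattern described above (the helpers
 * are declared later in this file; this is not actual ARC code):
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *hdr = buf_hash_find(guid, bp, &hash_lock);
 *	if (hdr != NULL) {
 *		...examine hdr while hash_lock is held...
 *		mutex_exit(hash_lock);
 *	}
 *
 * Conversely, while holding an arc list lock a hash lock may only be
 * taken with mutex_tryenter(); on failure the buffer is skipped and
 * arcstat_mutex_miss is bumped rather than risking a deadlock.
 */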
122168404Spjd
123168404Spjd#include <sys/spa.h>
124168404Spjd#include <sys/zio.h>
125251478Sdelphij#include <sys/zio_compress.h>
126168404Spjd#include <sys/zfs_context.h>
127168404Spjd#include <sys/arc.h>
128168404Spjd#include <sys/refcount.h>
129185029Spjd#include <sys/vdev.h>
130219089Spjd#include <sys/vdev_impl.h>
131258632Savg#include <sys/dsl_pool.h>
132168404Spjd#ifdef _KERNEL
133168404Spjd#include <sys/dnlc.h>
134168404Spjd#endif
135168404Spjd#include <sys/callb.h>
136168404Spjd#include <sys/kstat.h>
137248572Ssmh#include <sys/trim_map.h>
138219089Spjd#include <zfs_fletcher.h>
139168404Spjd#include <sys/sdt.h>
140168404Spjd
141191902Skmacy#include <vm/vm_pageout.h>
142272483Ssmh#include <machine/vmparam.h>
143191902Skmacy
144240133Smm#ifdef illumos
145240133Smm#ifndef _KERNEL
146240133Smm/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
147240133Smmboolean_t arc_watch = B_FALSE;
148240133Smmint arc_procfd;
149240133Smm#endif
150240133Smm#endif /* illumos */
151240133Smm
152168404Spjdstatic kmutex_t		arc_reclaim_thr_lock;
153168404Spjdstatic kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
154168404Spjdstatic uint8_t		arc_thread_exit;
155168404Spjd
156168404Spjd#define	ARC_REDUCE_DNLC_PERCENT	3
157168404Spjduint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
158168404Spjd
159168404Spjdtypedef enum arc_reclaim_strategy {
160168404Spjd	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
161168404Spjd	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
162168404Spjd} arc_reclaim_strategy_t;
163168404Spjd
164258632Savg/*
165258632Savg * The number of iterations through arc_evict_*() before we
166258632Savg * drop & reacquire the lock.
167258632Savg */
168258632Savgint arc_evict_iterations = 100;
169258632Savg
170168404Spjd/* number of seconds before growing cache again */
171168404Spjdstatic int		arc_grow_retry = 60;
172168404Spjd
173208373Smm/* shift of arc_c for calculating both min and max arc_p */
174208373Smmstatic int		arc_p_min_shift = 4;
175208373Smm
176208373Smm/* log2(fraction of arc to reclaim) */
177208373Smmstatic int		arc_shrink_shift = 5;
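
/*
 * For example, with the default arc_shrink_shift of 5 a reclaim pass
 * frees roughly arc_c >> 5, i.e. 1/32 (about 3%) of the current target
 * cache size (see arc_shrink() below).
 */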
178208373Smm
179168404Spjd/*
180168404Spjd * minimum lifespan of a prefetch block in clock ticks
181168404Spjd * (initialized in arc_init())
182168404Spjd */
183168404Spjdstatic int		arc_min_prefetch_lifespan;
184168404Spjd
185258632Savg/*
186258632Savg * If this percent of memory is free, don't throttle.
187258632Savg */
188258632Savgint arc_lotsfree_percent = 10;
189258632Savg
190208373Smmstatic int arc_dead;
191194043Skmacyextern int zfs_prefetch_disable;
192168404Spjd
193168404Spjd/*
194185029Spjd * The arc has filled available memory and has now warmed up.
195185029Spjd */
196185029Spjdstatic boolean_t arc_warm;
197185029Spjd
198185029Spjduint64_t zfs_arc_max;
199185029Spjduint64_t zfs_arc_min;
200185029Spjduint64_t zfs_arc_meta_limit = 0;
201275780Sdelphijuint64_t zfs_arc_meta_min = 0;
202208373Smmint zfs_arc_grow_retry = 0;
203208373Smmint zfs_arc_shrink_shift = 0;
204208373Smmint zfs_arc_p_min_shift = 0;
205242845Sdelphijint zfs_disable_dup_eviction = 0;
206269230Sdelphijuint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
207272483Ssmhu_int zfs_arc_free_target = 0;
208185029Spjd
209270759Ssmhstatic int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
210275748Sdelphijstatic int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
211270759Ssmh
212270759Ssmh#ifdef _KERNEL
213270759Ssmhstatic void
214270759Ssmharc_free_target_init(void *unused __unused)
215270759Ssmh{
216270759Ssmh
217272483Ssmh	zfs_arc_free_target = vm_pageout_wakeup_thresh;
218270759Ssmh}
219270759SsmhSYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
220270759Ssmh    arc_free_target_init, NULL);
221270759Ssmh
222185029SpjdTUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
223275780SdelphijTUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
224273026SdelphijTUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
225168473SpjdSYSCTL_DECL(_vfs_zfs);
226217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
227168473Spjd    "Maximum ARC size");
228217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
229168473Spjd    "Minimum ARC size");
230269230SdelphijSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
231269230Sdelphij    &zfs_arc_average_blocksize, 0,
232269230Sdelphij    "ARC average blocksize");
233273026SdelphijSYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
234273026Sdelphij    &arc_shrink_shift, 0,
235273026Sdelphij    "log2(fraction of arc to reclaim)");
236273026Sdelphij
237270759Ssmh/*
238270759Ssmh * We don't have a tunable for arc_free_target due to the dependency on
239270759Ssmh * pagedaemon initialisation.
240270759Ssmh */
241270759SsmhSYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
242270759Ssmh    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
243270759Ssmh    sysctl_vfs_zfs_arc_free_target, "IU",
244270759Ssmh    "Desired number of free pages below which ARC triggers reclaim");
245168404Spjd
246270759Ssmhstatic int
247270759Ssmhsysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
248270759Ssmh{
249270759Ssmh	u_int val;
250270759Ssmh	int err;
251270759Ssmh
252270759Ssmh	val = zfs_arc_free_target;
253270759Ssmh	err = sysctl_handle_int(oidp, &val, 0, req);
254270759Ssmh	if (err != 0 || req->newptr == NULL)
255270759Ssmh		return (err);
256270759Ssmh
257272483Ssmh	if (val < minfree)
258270759Ssmh		return (EINVAL);
259272483Ssmh	if (val > vm_cnt.v_page_count)
260270759Ssmh		return (EINVAL);
261270759Ssmh
262270759Ssmh	zfs_arc_free_target = val;
263270759Ssmh
264270759Ssmh	return (0);
265270759Ssmh}
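
/*
 * Example (hypothetical value): the threshold can be inspected or tuned
 * at runtime with sysctl(8), subject to the bounds checked above:
 *
 *	# sysctl vfs.zfs.arc_free_target
 *	# sysctl vfs.zfs.arc_free_target=32768
 */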
266275748Sdelphij
267275748Sdelphij/*
268275748Sdelphij * Must be declared here, before the definition of the corresponding kstat
269275748Sdelphij * macro; the macro uses the same name and would otherwise confuse the compiler.
270275748Sdelphij */
271275748SdelphijSYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
272275748Sdelphij    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
273275748Sdelphij    sysctl_vfs_zfs_arc_meta_limit, "QU",
274275748Sdelphij    "ARC metadata limit");
275272483Ssmh#endif
276270759Ssmh
277168404Spjd/*
278185029Spjd * Note that buffers can be in one of 6 states:
279168404Spjd *	ARC_anon	- anonymous (discussed below)
280168404Spjd *	ARC_mru		- recently used, currently cached
281168404Spjd *	ARC_mru_ghost	- recently used, no longer in cache
282168404Spjd *	ARC_mfu		- frequently used, currently cached
283168404Spjd *	ARC_mfu_ghost	- frequently used, no longer in cache
284185029Spjd *	ARC_l2c_only	- exists in L2ARC but not other states
285185029Spjd * When there are no active references to the buffer, they are
286185029Spjd * linked onto a list in one of these arc states.  These are
287185029Spjd * the only buffers that can be evicted or deleted.  Within each
288185029Spjd * state there are multiple lists, one for meta-data and one for
289185029Spjd * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
290185029Spjd * etc.) is tracked separately so that it can be managed more
291185029Spjd * explicitly: favored over data, limited explicitly.
292168404Spjd *
293168404Spjd * Anonymous buffers are buffers that are not associated with
294168404Spjd * a DVA.  These are buffers that hold dirty block copies
295168404Spjd * before they are written to stable storage.  By definition,
296168404Spjd * they are "ref'd" and are considered part of arc_mru
297168404Spjd * that cannot be freed.  Generally, they will acquire a DVA
298168404Spjd * as they are written and migrate onto the arc_mru list.
299185029Spjd *
300185029Spjd * The ARC_l2c_only state is for buffers that are in the second
301185029Spjd * level ARC but no longer in any of the ARC_m* lists.  The second
302185029Spjd * level ARC itself may also contain buffers that are in any of
303185029Spjd * the ARC_m* states - meaning that a buffer can exist in two
304185029Spjd * places.  The reason for the ARC_l2c_only state is to keep the
305185029Spjd * buffer header in the hash table, so that reads that hit the
306185029Spjd * second level ARC benefit from these fast lookups.
307168404Spjd */
308168404Spjd
309205264Skmacy#define	ARCS_LOCK_PAD		CACHE_LINE_SIZE
310205231Skmacystruct arcs_lock {
311205231Skmacy	kmutex_t	arcs_lock;
312205231Skmacy#ifdef _KERNEL
313205231Skmacy	unsigned char	pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
314205231Skmacy#endif
315205231Skmacy};
316205231Skmacy
317205231Skmacy/*
318205231Skmacy * Must be a power of two for mask use to work.
320205231Skmacy */
321205231Skmacy#define ARC_BUFC_NUMDATALISTS		16
322205231Skmacy#define ARC_BUFC_NUMMETADATALISTS	16
323206796Spjd#define ARC_BUFC_NUMLISTS	(ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
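
/*
 * Because these counts are powers of two, a list index can be derived
 * from a buffer's hash with a mask rather than a modulo, e.g. (sketch):
 *
 *	idx = buf_hash(spa, dva, birth) & (ARC_BUFC_NUMLISTS - 1);
 */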
324205231Skmacy
325168404Spjdtypedef struct arc_state {
326185029Spjd	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
327185029Spjd	uint64_t arcs_size;	/* total amount of data in this state */
328205231Skmacy	list_t	arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
329205264Skmacy	struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
330168404Spjd} arc_state_t;
331168404Spjd
332206796Spjd#define ARCS_LOCK(s, i)	(&((s)->arcs_locks[(i)].arcs_lock))
333205231Skmacy
334185029Spjd/* The 6 states: */
335168404Spjdstatic arc_state_t ARC_anon;
336168404Spjdstatic arc_state_t ARC_mru;
337168404Spjdstatic arc_state_t ARC_mru_ghost;
338168404Spjdstatic arc_state_t ARC_mfu;
339168404Spjdstatic arc_state_t ARC_mfu_ghost;
340185029Spjdstatic arc_state_t ARC_l2c_only;
341168404Spjd
342168404Spjdtypedef struct arc_stats {
343168404Spjd	kstat_named_t arcstat_hits;
344168404Spjd	kstat_named_t arcstat_misses;
345168404Spjd	kstat_named_t arcstat_demand_data_hits;
346168404Spjd	kstat_named_t arcstat_demand_data_misses;
347168404Spjd	kstat_named_t arcstat_demand_metadata_hits;
348168404Spjd	kstat_named_t arcstat_demand_metadata_misses;
349168404Spjd	kstat_named_t arcstat_prefetch_data_hits;
350168404Spjd	kstat_named_t arcstat_prefetch_data_misses;
351168404Spjd	kstat_named_t arcstat_prefetch_metadata_hits;
352168404Spjd	kstat_named_t arcstat_prefetch_metadata_misses;
353168404Spjd	kstat_named_t arcstat_mru_hits;
354168404Spjd	kstat_named_t arcstat_mru_ghost_hits;
355168404Spjd	kstat_named_t arcstat_mfu_hits;
356168404Spjd	kstat_named_t arcstat_mfu_ghost_hits;
357205231Skmacy	kstat_named_t arcstat_allocated;
358168404Spjd	kstat_named_t arcstat_deleted;
359205231Skmacy	kstat_named_t arcstat_stolen;
360168404Spjd	kstat_named_t arcstat_recycle_miss;
361251629Sdelphij	/*
362251629Sdelphij	 * Number of buffers that could not be evicted because the hash lock
363251629Sdelphij	 * was held by another thread.  The lock may not necessarily be held
364251629Sdelphij	 * by something using the same buffer, since hash locks are shared
365251629Sdelphij	 * by multiple buffers.
366251629Sdelphij	 */
367168404Spjd	kstat_named_t arcstat_mutex_miss;
368251629Sdelphij	/*
369251629Sdelphij	 * Number of buffers skipped because they have I/O in progress, are
370251629Sdelphij	 * indirect prefetch buffers that have not lived long enough, or are
371251629Sdelphij	 * not from the spa we're trying to evict from.
372251629Sdelphij	 */
373168404Spjd	kstat_named_t arcstat_evict_skip;
374208373Smm	kstat_named_t arcstat_evict_l2_cached;
375208373Smm	kstat_named_t arcstat_evict_l2_eligible;
376208373Smm	kstat_named_t arcstat_evict_l2_ineligible;
377168404Spjd	kstat_named_t arcstat_hash_elements;
378168404Spjd	kstat_named_t arcstat_hash_elements_max;
379168404Spjd	kstat_named_t arcstat_hash_collisions;
380168404Spjd	kstat_named_t arcstat_hash_chains;
381168404Spjd	kstat_named_t arcstat_hash_chain_max;
382168404Spjd	kstat_named_t arcstat_p;
383168404Spjd	kstat_named_t arcstat_c;
384168404Spjd	kstat_named_t arcstat_c_min;
385168404Spjd	kstat_named_t arcstat_c_max;
386168404Spjd	kstat_named_t arcstat_size;
387185029Spjd	kstat_named_t arcstat_hdr_size;
388208373Smm	kstat_named_t arcstat_data_size;
389208373Smm	kstat_named_t arcstat_other_size;
390185029Spjd	kstat_named_t arcstat_l2_hits;
391185029Spjd	kstat_named_t arcstat_l2_misses;
392185029Spjd	kstat_named_t arcstat_l2_feeds;
393185029Spjd	kstat_named_t arcstat_l2_rw_clash;
394208373Smm	kstat_named_t arcstat_l2_read_bytes;
395208373Smm	kstat_named_t arcstat_l2_write_bytes;
396185029Spjd	kstat_named_t arcstat_l2_writes_sent;
397185029Spjd	kstat_named_t arcstat_l2_writes_done;
398185029Spjd	kstat_named_t arcstat_l2_writes_error;
399185029Spjd	kstat_named_t arcstat_l2_writes_hdr_miss;
400185029Spjd	kstat_named_t arcstat_l2_evict_lock_retry;
401185029Spjd	kstat_named_t arcstat_l2_evict_reading;
402286570Smav	kstat_named_t arcstat_l2_evict_l1cached;
403185029Spjd	kstat_named_t arcstat_l2_free_on_write;
404274172Savg	kstat_named_t arcstat_l2_cdata_free_on_write;
405185029Spjd	kstat_named_t arcstat_l2_abort_lowmem;
406185029Spjd	kstat_named_t arcstat_l2_cksum_bad;
407185029Spjd	kstat_named_t arcstat_l2_io_error;
408185029Spjd	kstat_named_t arcstat_l2_size;
409251478Sdelphij	kstat_named_t arcstat_l2_asize;
410185029Spjd	kstat_named_t arcstat_l2_hdr_size;
411251478Sdelphij	kstat_named_t arcstat_l2_compress_successes;
412251478Sdelphij	kstat_named_t arcstat_l2_compress_zeros;
413251478Sdelphij	kstat_named_t arcstat_l2_compress_failures;
414205231Skmacy	kstat_named_t arcstat_l2_write_trylock_fail;
415205231Skmacy	kstat_named_t arcstat_l2_write_passed_headroom;
416205231Skmacy	kstat_named_t arcstat_l2_write_spa_mismatch;
417206796Spjd	kstat_named_t arcstat_l2_write_in_l2;
418205231Skmacy	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
419205231Skmacy	kstat_named_t arcstat_l2_write_not_cacheable;
420205231Skmacy	kstat_named_t arcstat_l2_write_full;
421205231Skmacy	kstat_named_t arcstat_l2_write_buffer_iter;
422205231Skmacy	kstat_named_t arcstat_l2_write_pios;
423205231Skmacy	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
424205231Skmacy	kstat_named_t arcstat_l2_write_buffer_list_iter;
425205231Skmacy	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
426242845Sdelphij	kstat_named_t arcstat_memory_throttle_count;
427242845Sdelphij	kstat_named_t arcstat_duplicate_buffers;
428242845Sdelphij	kstat_named_t arcstat_duplicate_buffers_size;
429242845Sdelphij	kstat_named_t arcstat_duplicate_reads;
430275748Sdelphij	kstat_named_t arcstat_meta_used;
431275748Sdelphij	kstat_named_t arcstat_meta_limit;
432275748Sdelphij	kstat_named_t arcstat_meta_max;
433275780Sdelphij	kstat_named_t arcstat_meta_min;
434168404Spjd} arc_stats_t;
435168404Spjd
436168404Spjdstatic arc_stats_t arc_stats = {
437168404Spjd	{ "hits",			KSTAT_DATA_UINT64 },
438168404Spjd	{ "misses",			KSTAT_DATA_UINT64 },
439168404Spjd	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
440168404Spjd	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
441168404Spjd	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
442168404Spjd	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
443168404Spjd	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
444168404Spjd	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
445168404Spjd	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
446168404Spjd	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
447168404Spjd	{ "mru_hits",			KSTAT_DATA_UINT64 },
448168404Spjd	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
449168404Spjd	{ "mfu_hits",			KSTAT_DATA_UINT64 },
450168404Spjd	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
451205231Skmacy	{ "allocated",			KSTAT_DATA_UINT64 },
452168404Spjd	{ "deleted",			KSTAT_DATA_UINT64 },
453205231Skmacy	{ "stolen",			KSTAT_DATA_UINT64 },
454168404Spjd	{ "recycle_miss",		KSTAT_DATA_UINT64 },
455168404Spjd	{ "mutex_miss",			KSTAT_DATA_UINT64 },
456168404Spjd	{ "evict_skip",			KSTAT_DATA_UINT64 },
457208373Smm	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
458208373Smm	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
459208373Smm	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
460168404Spjd	{ "hash_elements",		KSTAT_DATA_UINT64 },
461168404Spjd	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
462168404Spjd	{ "hash_collisions",		KSTAT_DATA_UINT64 },
463168404Spjd	{ "hash_chains",		KSTAT_DATA_UINT64 },
464168404Spjd	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
465168404Spjd	{ "p",				KSTAT_DATA_UINT64 },
466168404Spjd	{ "c",				KSTAT_DATA_UINT64 },
467168404Spjd	{ "c_min",			KSTAT_DATA_UINT64 },
468168404Spjd	{ "c_max",			KSTAT_DATA_UINT64 },
469185029Spjd	{ "size",			KSTAT_DATA_UINT64 },
470185029Spjd	{ "hdr_size",			KSTAT_DATA_UINT64 },
471208373Smm	{ "data_size",			KSTAT_DATA_UINT64 },
472208373Smm	{ "other_size",			KSTAT_DATA_UINT64 },
473185029Spjd	{ "l2_hits",			KSTAT_DATA_UINT64 },
474185029Spjd	{ "l2_misses",			KSTAT_DATA_UINT64 },
475185029Spjd	{ "l2_feeds",			KSTAT_DATA_UINT64 },
476185029Spjd	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
477208373Smm	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
478208373Smm	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
479185029Spjd	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
480185029Spjd	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
481185029Spjd	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
482185029Spjd	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
483185029Spjd	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
484185029Spjd	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
485286570Smav	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
486185029Spjd	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
487274172Savg	{ "l2_cdata_free_on_write",	KSTAT_DATA_UINT64 },
488185029Spjd	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
489185029Spjd	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
490185029Spjd	{ "l2_io_error",		KSTAT_DATA_UINT64 },
491185029Spjd	{ "l2_size",			KSTAT_DATA_UINT64 },
492251478Sdelphij	{ "l2_asize",			KSTAT_DATA_UINT64 },
493185029Spjd	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
494251478Sdelphij	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
495251478Sdelphij	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
496251478Sdelphij	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
497206796Spjd	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
498206796Spjd	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
499206796Spjd	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
500206796Spjd	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
501206796Spjd	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
502206796Spjd	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
503206796Spjd	{ "l2_write_full",		KSTAT_DATA_UINT64 },
504206796Spjd	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
505206796Spjd	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
506206796Spjd	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
507206796Spjd	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
508242845Sdelphij	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
509242845Sdelphij	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
510242845Sdelphij	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
511242845Sdelphij	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
512275748Sdelphij	{ "duplicate_reads",		KSTAT_DATA_UINT64 },
513275748Sdelphij	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
514275748Sdelphij	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
515275780Sdelphij	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
516275780Sdelphij	{ "arc_meta_min",		KSTAT_DATA_UINT64 }
517168404Spjd};
518168404Spjd
519168404Spjd#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
520168404Spjd
521168404Spjd#define	ARCSTAT_INCR(stat, val) \
522251631Sdelphij	atomic_add_64(&arc_stats.stat.value.ui64, (val))
523168404Spjd
524206796Spjd#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
525168404Spjd#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
526168404Spjd
527168404Spjd#define	ARCSTAT_MAX(stat, val) {					\
528168404Spjd	uint64_t m;							\
529168404Spjd	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
530168404Spjd	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
531168404Spjd		continue;						\
532168404Spjd}
533168404Spjd
534168404Spjd#define	ARCSTAT_MAXSTAT(stat) \
535168404Spjd	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
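
/*
 * ARCSTAT_MAX() is a lock-free maximum update: it re-reads the current
 * value and retries atomic_cas_64() until either the stored maximum is
 * already >= val or the compare-and-swap succeeds.  Typical use (see
 * buf_hash_insert() below):
 *
 *	ARCSTAT_MAX(arcstat_hash_chain_max, i);
 */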
536168404Spjd
537168404Spjd/*
538168404Spjd * We define a macro to allow ARC hits/misses to be easily broken down by
539168404Spjd * two separate conditions, giving a total of four different subtypes for
540168404Spjd * each of hits and misses (so eight statistics total).
541168404Spjd */
542168404Spjd#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
543168404Spjd	if (cond1) {							\
544168404Spjd		if (cond2) {						\
545168404Spjd			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
546168404Spjd		} else {						\
547168404Spjd			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
548168404Spjd		}							\
549168404Spjd	} else {							\
550168404Spjd		if (cond2) {						\
551168404Spjd			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
552168404Spjd		} else {						\
553168404Spjd			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
554168404Spjd		}							\
555168404Spjd	}
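
/*
 * An illustrative ARCSTAT_CONDSTAT() invocation (mirroring how the read
 * path classifies hits; shown here only as an example):
 *
 *	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch,
 *	    !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
 *
 * exactly one of arcstat_demand_data_hits, arcstat_demand_metadata_hits,
 * arcstat_prefetch_data_hits or arcstat_prefetch_metadata_hits is bumped.
 */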
556168404Spjd
557168404Spjdkstat_t			*arc_ksp;
558206796Spjdstatic arc_state_t	*arc_anon;
559168404Spjdstatic arc_state_t	*arc_mru;
560168404Spjdstatic arc_state_t	*arc_mru_ghost;
561168404Spjdstatic arc_state_t	*arc_mfu;
562168404Spjdstatic arc_state_t	*arc_mfu_ghost;
563185029Spjdstatic arc_state_t	*arc_l2c_only;
564168404Spjd
565168404Spjd/*
566168404Spjd * There are several ARC variables that are critical to export as kstats --
567168404Spjd * but we don't want to have to grovel around in the kstat whenever we wish to
568168404Spjd * manipulate them.  For these variables, we therefore define them to be in
569168404Spjd * terms of the statistic variable.  This assures that we are not introducing
570168404Spjd * the possibility of inconsistency by having shadow copies of the variables,
571168404Spjd * while still allowing the code to be readable.
572168404Spjd */
573168404Spjd#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
574168404Spjd#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
575168404Spjd#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
576168404Spjd#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
577168404Spjd#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
578275748Sdelphij#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
579275780Sdelphij#define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
580275748Sdelphij#define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
581275748Sdelphij#define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */
582168404Spjd
583251478Sdelphij#define	L2ARC_IS_VALID_COMPRESS(_c_) \
584251478Sdelphij	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
585251478Sdelphij
586168404Spjdstatic int		arc_no_grow;	/* Don't try to grow cache size */
587168404Spjdstatic uint64_t		arc_tempreserve;
588209962Smmstatic uint64_t		arc_loaned_bytes;
589168404Spjd
590168404Spjdtypedef struct arc_callback arc_callback_t;
591168404Spjd
592168404Spjdstruct arc_callback {
593168404Spjd	void			*acb_private;
594168404Spjd	arc_done_func_t		*acb_done;
595168404Spjd	arc_buf_t		*acb_buf;
596168404Spjd	zio_t			*acb_zio_dummy;
597168404Spjd	arc_callback_t		*acb_next;
598168404Spjd};
599168404Spjd
600168404Spjdtypedef struct arc_write_callback arc_write_callback_t;
601168404Spjd
602168404Spjdstruct arc_write_callback {
603168404Spjd	void		*awcb_private;
604168404Spjd	arc_done_func_t	*awcb_ready;
605258632Savg	arc_done_func_t	*awcb_physdone;
606168404Spjd	arc_done_func_t	*awcb_done;
607168404Spjd	arc_buf_t	*awcb_buf;
608168404Spjd};
609168404Spjd
610286570Smav/*
611286570Smav * ARC buffers are separated into multiple structs as a memory saving measure:
612286570Smav *   - Common fields struct, always defined, and embedded within it:
613286570Smav *       - L2-only fields, always allocated but undefined when not in L2ARC
614286570Smav *       - L1-only fields, only allocated when in L1ARC
615286570Smav *
616286570Smav *           Buffer in L1                     Buffer only in L2
617286570Smav *    +------------------------+          +------------------------+
618286570Smav *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
619286570Smav *    |                        |          |                        |
620286570Smav *    |                        |          |                        |
621286570Smav *    |                        |          |                        |
622286570Smav *    +------------------------+          +------------------------+
623286570Smav *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
624286570Smav *    | (undefined if L1-only) |          |                        |
625286570Smav *    +------------------------+          +------------------------+
626286570Smav *    | l1arc_buf_hdr_t        |
627286570Smav *    |                        |
628286570Smav *    |                        |
629286570Smav *    |                        |
630286570Smav *    |                        |
631286570Smav *    +------------------------+
632286570Smav *
633286570Smav * Because it's possible for the L2ARC to become extremely large, we can wind
634286570Smav * up eating a lot of memory in L2ARC buffer headers, so the size of a header
635286570Smav * is minimized by only allocating the fields necessary for an L1-cached buffer
636286570Smav * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
637286570Smav * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
638286570Smav * words in pointers. arc_hdr_realloc() is used to switch a header between
639286570Smav * these two allocation states.
640286570Smav */
641286570Smavtypedef struct l1arc_buf_hdr {
642168404Spjd	kmutex_t		b_freeze_lock;
643286570Smav#ifdef ZFS_DEBUG
644286570Smav	/*
645286570Smav	 * used for debugging with kmem_flags - by allocating and freeing
646286570Smav	 * b_thawed when the buffer is thawed, we get a record of the stack
647286570Smav	 * trace that thawed it.
648286570Smav	 */
649219089Spjd	void			*b_thawed;
650286570Smav#endif
651168404Spjd
652168404Spjd	arc_buf_t		*b_buf;
653168404Spjd	uint32_t		b_datacnt;
654286570Smav	/* for waiting on writes to complete */
655168404Spjd	kcondvar_t		b_cv;
656168404Spjd
657168404Spjd	/* protected by arc state mutex */
658168404Spjd	arc_state_t		*b_state;
659168404Spjd	list_node_t		b_arc_node;
660168404Spjd
661168404Spjd	/* updated atomically */
662168404Spjd	clock_t			b_arc_access;
663168404Spjd
664168404Spjd	/* self protecting */
665168404Spjd	refcount_t		b_refcnt;
666185029Spjd
667286570Smav	arc_callback_t		*b_acb;
668286570Smav	/* temporary buffer holder for in-flight compressed data */
669286570Smav	void			*b_tmp_cdata;
670286570Smav} l1arc_buf_hdr_t;
671286570Smav
672286570Smavtypedef struct l2arc_dev l2arc_dev_t;
673286570Smav
674286570Smavtypedef struct l2arc_buf_hdr {
675286570Smav	/* protected by arc_buf_hdr mutex */
676286570Smav	l2arc_dev_t		*b_dev;		/* L2ARC device */
677286570Smav	uint64_t		b_daddr;	/* disk address, offset byte */
678286570Smav	/* real alloc'd buffer size depending on b_compress applied */
679286570Smav	int32_t			b_asize;
680286570Smav
681185029Spjd	list_node_t		b_l2node;
682286570Smav} l2arc_buf_hdr_t;
683286570Smav
684286570Smavstruct arc_buf_hdr {
685286570Smav	/* protected by hash lock */
686286570Smav	dva_t			b_dva;
687286570Smav	uint64_t		b_birth;
688286570Smav	/*
689286570Smav	 * Even though this checksum is only set/verified when a buffer is in
690286570Smav	 * the L1 cache, it needs to be in the set of common fields because it
691286570Smav	 * must be preserved from the time before a buffer is written out to
692286570Smav	 * L2ARC until after it is read back in.
693286570Smav	 */
694286570Smav	zio_cksum_t		*b_freeze_cksum;
695286570Smav
696286570Smav	arc_buf_hdr_t		*b_hash_next;
697286570Smav	arc_flags_t		b_flags;
698286570Smav
699286570Smav	/* immutable */
700286570Smav	int32_t			b_size;
701286570Smav	uint64_t		b_spa;
702286570Smav
703286570Smav	/* L2ARC fields. Undefined when not in L2ARC. */
704286570Smav	l2arc_buf_hdr_t		b_l2hdr;
705286570Smav	/* L1ARC fields. Undefined when in l2arc_only state */
706286570Smav	l1arc_buf_hdr_t		b_l1hdr;
707168404Spjd};
708168404Spjd
709275748Sdelphij#ifdef _KERNEL
710275748Sdelphijstatic int
711275748Sdelphijsysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
712275748Sdelphij{
713275748Sdelphij	uint64_t val;
714275748Sdelphij	int err;
715275748Sdelphij
716275748Sdelphij	val = arc_meta_limit;
717275748Sdelphij	err = sysctl_handle_64(oidp, &val, 0, req);
718275748Sdelphij	if (err != 0 || req->newptr == NULL)
719275748Sdelphij		return (err);
720275748Sdelphij
721275748Sdelphij	if (val <= 0 || val > arc_c_max)
722275748Sdelphij		return (EINVAL);
723275748Sdelphij
724275748Sdelphij	arc_meta_limit = val;
725275748Sdelphij	return (0);
726275748Sdelphij}
727275748Sdelphij#endif
728275748Sdelphij
729168404Spjdstatic arc_buf_t *arc_eviction_list;
730168404Spjdstatic kmutex_t arc_eviction_mtx;
731168404Spjdstatic arc_buf_hdr_t arc_eviction_hdr;
732168404Spjd
733168404Spjd#define	GHOST_STATE(state)	\
734185029Spjd	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
735185029Spjd	(state) == arc_l2c_only)
736168404Spjd
737275811Sdelphij#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
738275811Sdelphij#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
739275811Sdelphij#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
740275811Sdelphij#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
741275811Sdelphij#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
742275811Sdelphij#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
743286570Smav
744275811Sdelphij#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
745286570Smav#define	HDR_L2COMPRESS(hdr)	((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
746275811Sdelphij#define	HDR_L2_READING(hdr)	\
747286570Smav	    (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
748286570Smav	    ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
749275811Sdelphij#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
750275811Sdelphij#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
751275811Sdelphij#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
752168404Spjd
753286570Smav#define	HDR_ISTYPE_METADATA(hdr)	\
754286570Smav	    ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
755286570Smav#define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))
756286570Smav
757286570Smav#define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
758286570Smav#define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
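
/*
 * Sketch of how the HAS_L1HDR/HAS_L2HDR flags gate sub-header access
 * (the real code asserts these before touching b_l1hdr/b_l2hdr):
 *
 *	if (HDR_HAS_L1HDR(hdr))
 *		datacnt = hdr->b_l1hdr.b_datacnt;
 *	if (HDR_HAS_L2HDR(hdr))
 *		dev = hdr->b_l2hdr.b_dev;
 */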
759286570Smav
760286570Smav/* For storing compression mode in b_flags */
761286570Smav#define	HDR_COMPRESS_OFFSET	24
762286570Smav#define	HDR_COMPRESS_NBITS	7
763286570Smav
764286570Smav#define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET(hdr->b_flags, \
765286570Smav	    HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS))
766286570Smav#define	HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \
767286570Smav	    HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp))
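
/*
 * Example (sketch): the compression type of an L2ARC-resident buffer is
 * kept in bits 24..30 of b_flags and accessed through the macros above:
 *
 *	HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4);
 *	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
 *		...decompress the buffer after reading it back...
 */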
768286570Smav
769168404Spjd/*
770185029Spjd * Other sizes
771185029Spjd */
772185029Spjd
773286570Smav#define	HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
774286570Smav#define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
775185029Spjd
776185029Spjd/*
777168404Spjd * Hash table routines
778168404Spjd */
779168404Spjd
780205253Skmacy#define	HT_LOCK_PAD	CACHE_LINE_SIZE
781168404Spjd
782168404Spjdstruct ht_lock {
783168404Spjd	kmutex_t	ht_lock;
784168404Spjd#ifdef _KERNEL
785168404Spjd	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
786168404Spjd#endif
787168404Spjd};
788168404Spjd
789168404Spjd#define	BUF_LOCKS 256
790168404Spjdtypedef struct buf_hash_table {
791168404Spjd	uint64_t ht_mask;
792168404Spjd	arc_buf_hdr_t **ht_table;
793205264Skmacy	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
794168404Spjd} buf_hash_table_t;
795168404Spjd
796168404Spjdstatic buf_hash_table_t buf_hash_table;
797168404Spjd
798168404Spjd#define	BUF_HASH_INDEX(spa, dva, birth) \
799168404Spjd	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
800168404Spjd#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
801168404Spjd#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
802219089Spjd#define	HDR_LOCK(hdr) \
803219089Spjd	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
804168404Spjd
805168404Spjduint64_t zfs_crc64_table[256];
806168404Spjd
807185029Spjd/*
808185029Spjd * Level 2 ARC
809185029Spjd */
810185029Spjd
811272707Savg#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
812251478Sdelphij#define	L2ARC_HEADROOM		2			/* num of writes */
813251478Sdelphij/*
814251478Sdelphij * If we discover during ARC scan any buffers to be compressed, we boost
815251478Sdelphij * our headroom for the next scanning cycle by this percentage multiple.
816251478Sdelphij */
817251478Sdelphij#define	L2ARC_HEADROOM_BOOST	200
818208373Smm#define	L2ARC_FEED_SECS		1		/* caching interval secs */
819208373Smm#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
820185029Spjd
821185029Spjd#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
822185029Spjd#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
823185029Spjd
824251631Sdelphij/* L2ARC Performance Tunables */
825185029Spjduint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
826185029Spjduint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
827185029Spjduint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
828251478Sdelphijuint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
829185029Spjduint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
830208373Smmuint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
831219089Spjdboolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
832208373Smmboolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
833208373Smmboolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
834185029Spjd
835217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
836205231Skmacy    &l2arc_write_max, 0, "max write size");
837217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
838205231Skmacy    &l2arc_write_boost, 0, "extra write during warmup");
839217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
840205231Skmacy    &l2arc_headroom, 0, "number of dev writes");
841217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
842205231Skmacy    &l2arc_feed_secs, 0, "interval seconds");
843217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
844208373Smm    &l2arc_feed_min_ms, 0, "min interval milliseconds");
845205231Skmacy
846205231SkmacySYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
847205231Skmacy    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
848208373SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
849208373Smm    &l2arc_feed_again, 0, "turbo warmup");
850208373SmmSYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
851208373Smm    &l2arc_norw, 0, "no reads during writes");
852205231Skmacy
853217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
854205231Skmacy    &ARC_anon.arcs_size, 0, "size of anonymous state");
855217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
856205231Skmacy    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
857217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
858205231Skmacy    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");
859205231Skmacy
860217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
861205231Skmacy    &ARC_mru.arcs_size, 0, "size of mru state");
862217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
863205231Skmacy    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
864217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
865205231Skmacy    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
866205231Skmacy
867217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
868205231Skmacy    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
869217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
870205231Skmacy    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
871205231Skmacy    "size of metadata in mru ghost state");
872217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
873205231Skmacy    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
874205231Skmacy    "size of data in mru ghost state");
875205231Skmacy
876217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
877205231Skmacy    &ARC_mfu.arcs_size, 0, "size of mfu state");
878217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
879205231Skmacy    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
880217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
881205231Skmacy    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
882205231Skmacy
883217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
884205231Skmacy    &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
885217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
886205231Skmacy    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
887205231Skmacy    "size of metadata in mfu ghost state");
888217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
889205231Skmacy    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
890205231Skmacy    "size of data in mfu ghost state");
891205231Skmacy
892217367SmdfSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
893205231Skmacy    &ARC_l2c_only.arcs_size, 0, "size of mru state");
894205231Skmacy
895185029Spjd/*
896185029Spjd * L2ARC Internals
897185029Spjd */
898286570Smavstruct l2arc_dev {
899185029Spjd	vdev_t			*l2ad_vdev;	/* vdev */
900185029Spjd	spa_t			*l2ad_spa;	/* spa */
901185029Spjd	uint64_t		l2ad_hand;	/* next write location */
902185029Spjd	uint64_t		l2ad_start;	/* first addr on device */
903185029Spjd	uint64_t		l2ad_end;	/* last addr on device */
904185029Spjd	uint64_t		l2ad_evict;	/* last addr eviction reached */
905185029Spjd	boolean_t		l2ad_first;	/* first sweep through */
906208373Smm	boolean_t		l2ad_writing;	/* currently writing */
907286570Smav	kmutex_t		l2ad_mtx;	/* lock for buffer list */
908286570Smav	list_t			l2ad_buflist;	/* buffer list */
909185029Spjd	list_node_t		l2ad_node;	/* device list node */
910286570Smav};
911185029Spjd
912185029Spjdstatic list_t L2ARC_dev_list;			/* device list */
913185029Spjdstatic list_t *l2arc_dev_list;			/* device list pointer */
914185029Spjdstatic kmutex_t l2arc_dev_mtx;			/* device list mutex */
915185029Spjdstatic l2arc_dev_t *l2arc_dev_last;		/* last device used */
916185029Spjdstatic list_t L2ARC_free_on_write;		/* free after write buf list */
917185029Spjdstatic list_t *l2arc_free_on_write;		/* free after write list ptr */
918185029Spjdstatic kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
919185029Spjdstatic uint64_t l2arc_ndev;			/* number of devices */
920185029Spjd
921185029Spjdtypedef struct l2arc_read_callback {
922251478Sdelphij	arc_buf_t		*l2rcb_buf;		/* read buffer */
923251478Sdelphij	spa_t			*l2rcb_spa;		/* spa */
924251478Sdelphij	blkptr_t		l2rcb_bp;		/* original blkptr */
925268123Sdelphij	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
926251478Sdelphij	int			l2rcb_flags;		/* original flags */
927251478Sdelphij	enum zio_compress	l2rcb_compress;		/* applied compress */
928185029Spjd} l2arc_read_callback_t;
929185029Spjd
930185029Spjdtypedef struct l2arc_write_callback {
931185029Spjd	l2arc_dev_t	*l2wcb_dev;		/* device info */
932185029Spjd	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
933185029Spjd} l2arc_write_callback_t;
934185029Spjd
935185029Spjdtypedef struct l2arc_data_free {
936185029Spjd	/* protected by l2arc_free_on_write_mtx */
937185029Spjd	void		*l2df_data;
938185029Spjd	size_t		l2df_size;
939185029Spjd	void		(*l2df_func)(void *, size_t);
940185029Spjd	list_node_t	l2df_list_node;
941185029Spjd} l2arc_data_free_t;
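
/*
 * Sketch (assumed shape, not the exact implementation) of how a deferred
 * free is queued while an L2ARC write is still in flight:
 *
 *	l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
 *	df->l2df_data = buf->b_data;
 *	df->l2df_size = hdr->b_size;
 *	df->l2df_func = zio_data_buf_free;
 *	mutex_enter(&l2arc_free_on_write_mtx);
 *	list_insert_head(l2arc_free_on_write, df);
 *	mutex_exit(&l2arc_free_on_write_mtx);
 */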
942185029Spjd
943185029Spjdstatic kmutex_t l2arc_feed_thr_lock;
944185029Spjdstatic kcondvar_t l2arc_feed_thr_cv;
945185029Spjdstatic uint8_t l2arc_thread_exit;
946185029Spjd
947275811Sdelphijstatic void arc_get_data_buf(arc_buf_t *);
948275811Sdelphijstatic void arc_access(arc_buf_hdr_t *, kmutex_t *);
949275811Sdelphijstatic int arc_evict_needed(arc_buf_contents_t);
950275811Sdelphijstatic void arc_evict_ghost(arc_state_t *, uint64_t, int64_t);
951275811Sdelphijstatic void arc_buf_watch(arc_buf_t *);
952275811Sdelphij
953286570Smavstatic arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
954286570Smavstatic uint32_t arc_bufc_to_flags(arc_buf_contents_t);
955286570Smav
956275811Sdelphijstatic boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
957275811Sdelphijstatic void l2arc_read_done(zio_t *);
958185029Spjd
959286570Smavstatic boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
960275811Sdelphijstatic void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
961275811Sdelphijstatic void l2arc_release_cdata_buf(arc_buf_hdr_t *);
962251478Sdelphij
963168404Spjdstatic uint64_t
964209962Smmbuf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
965168404Spjd{
966168404Spjd	uint8_t *vdva = (uint8_t *)dva;
967168404Spjd	uint64_t crc = -1ULL;
968168404Spjd	int i;
969168404Spjd
970168404Spjd	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
971168404Spjd
972168404Spjd	for (i = 0; i < sizeof (dva_t); i++)
973168404Spjd		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
974168404Spjd
975209962Smm	crc ^= (spa>>8) ^ birth;
976168404Spjd
977168404Spjd	return (crc);
978168404Spjd}
979168404Spjd
980168404Spjd#define	BUF_EMPTY(buf)						\
981168404Spjd	((buf)->b_dva.dva_word[0] == 0 &&			\
982286570Smav	(buf)->b_dva.dva_word[1] == 0)
983168404Spjd
984168404Spjd#define	BUF_EQUAL(spa, dva, birth, buf)				\
985168404Spjd	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
986168404Spjd	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
987168404Spjd	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
988168404Spjd
989219089Spjdstatic void
990219089Spjdbuf_discard_identity(arc_buf_hdr_t *hdr)
991219089Spjd{
992219089Spjd	hdr->b_dva.dva_word[0] = 0;
993219089Spjd	hdr->b_dva.dva_word[1] = 0;
994219089Spjd	hdr->b_birth = 0;
995219089Spjd}
996219089Spjd
997168404Spjdstatic arc_buf_hdr_t *
998268075Sdelphijbuf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
999168404Spjd{
1000268075Sdelphij	const dva_t *dva = BP_IDENTITY(bp);
1001268075Sdelphij	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
1002168404Spjd	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
1003168404Spjd	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1004275811Sdelphij	arc_buf_hdr_t *hdr;
1005168404Spjd
1006168404Spjd	mutex_enter(hash_lock);
1007275811Sdelphij	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
1008275811Sdelphij	    hdr = hdr->b_hash_next) {
1009275811Sdelphij		if (BUF_EQUAL(spa, dva, birth, hdr)) {
1010168404Spjd			*lockp = hash_lock;
1011275811Sdelphij			return (hdr);
1012168404Spjd		}
1013168404Spjd	}
1014168404Spjd	mutex_exit(hash_lock);
1015168404Spjd	*lockp = NULL;
1016168404Spjd	return (NULL);
1017168404Spjd}
1018168404Spjd
1019168404Spjd/*
1020168404Spjd * Insert an entry into the hash table.  If there is already an element
1021168404Spjd * equal to elem in the hash table, then the already existing element
1022168404Spjd * will be returned and the new element will not be inserted.
1023168404Spjd * Otherwise returns NULL.
1024286570Smav * If lockp == NULL, the caller is assumed to already hold the hash lock.
1025168404Spjd */
1026168404Spjdstatic arc_buf_hdr_t *
1027275811Sdelphijbuf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
1028168404Spjd{
1029275811Sdelphij	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1030168404Spjd	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1031275811Sdelphij	arc_buf_hdr_t *fhdr;
1032168404Spjd	uint32_t i;
1033168404Spjd
1034275811Sdelphij	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
1035275811Sdelphij	ASSERT(hdr->b_birth != 0);
1036275811Sdelphij	ASSERT(!HDR_IN_HASH_TABLE(hdr));
1037286570Smav
1038286570Smav	if (lockp != NULL) {
1039286570Smav		*lockp = hash_lock;
1040286570Smav		mutex_enter(hash_lock);
1041286570Smav	} else {
1042286570Smav		ASSERT(MUTEX_HELD(hash_lock));
1043286570Smav	}
1044286570Smav
1045275811Sdelphij	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
1046275811Sdelphij	    fhdr = fhdr->b_hash_next, i++) {
1047275811Sdelphij		if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
1048275811Sdelphij			return (fhdr);
1049168404Spjd	}
1050168404Spjd
1051275811Sdelphij	hdr->b_hash_next = buf_hash_table.ht_table[idx];
1052275811Sdelphij	buf_hash_table.ht_table[idx] = hdr;
1053275811Sdelphij	hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
1054168404Spjd
1055168404Spjd	/* collect some hash table performance data */
1056168404Spjd	if (i > 0) {
1057168404Spjd		ARCSTAT_BUMP(arcstat_hash_collisions);
1058168404Spjd		if (i == 1)
1059168404Spjd			ARCSTAT_BUMP(arcstat_hash_chains);
1060168404Spjd
1061168404Spjd		ARCSTAT_MAX(arcstat_hash_chain_max, i);
1062168404Spjd	}
1063168404Spjd
1064168404Spjd	ARCSTAT_BUMP(arcstat_hash_elements);
1065168404Spjd	ARCSTAT_MAXSTAT(arcstat_hash_elements);
1066168404Spjd
1067168404Spjd	return (NULL);
1068168404Spjd}
1069168404Spjd
1070168404Spjdstatic void
1071275811Sdelphijbuf_hash_remove(arc_buf_hdr_t *hdr)
1072168404Spjd{
1073275811Sdelphij	arc_buf_hdr_t *fhdr, **hdrp;
1074275811Sdelphij	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1075168404Spjd
1076168404Spjd	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
1077275811Sdelphij	ASSERT(HDR_IN_HASH_TABLE(hdr));
1078168404Spjd
1079275811Sdelphij	hdrp = &buf_hash_table.ht_table[idx];
1080275811Sdelphij	while ((fhdr = *hdrp) != hdr) {
1081275811Sdelphij		ASSERT(fhdr != NULL);
1082275811Sdelphij		hdrp = &fhdr->b_hash_next;
1083168404Spjd	}
1084275811Sdelphij	*hdrp = hdr->b_hash_next;
1085275811Sdelphij	hdr->b_hash_next = NULL;
1086275811Sdelphij	hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;
1087168404Spjd
1088168404Spjd	/* collect some hash table performance data */
1089168404Spjd	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1090168404Spjd
1091168404Spjd	if (buf_hash_table.ht_table[idx] &&
1092168404Spjd	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1093168404Spjd		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1094168404Spjd}
1095168404Spjd
1096168404Spjd/*
1097168404Spjd * Global data structures and functions for the buf kmem cache.
1098168404Spjd */
1099286570Smavstatic kmem_cache_t *hdr_full_cache;
1100286570Smavstatic kmem_cache_t *hdr_l2only_cache;
1101168404Spjdstatic kmem_cache_t *buf_cache;
1102168404Spjd
1103168404Spjdstatic void
1104168404Spjdbuf_fini(void)
1105168404Spjd{
1106168404Spjd	int i;
1107168404Spjd
1108168404Spjd	kmem_free(buf_hash_table.ht_table,
1109168404Spjd	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
1110168404Spjd	for (i = 0; i < BUF_LOCKS; i++)
1111168404Spjd		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1112286570Smav	kmem_cache_destroy(hdr_full_cache);
1113286570Smav	kmem_cache_destroy(hdr_l2only_cache);
1114168404Spjd	kmem_cache_destroy(buf_cache);
1115168404Spjd}
1116168404Spjd
1117168404Spjd/*
1118168404Spjd * Constructor callback - called when the cache is empty
1119168404Spjd * and a new buf is requested.
1120168404Spjd */
1121168404Spjd/* ARGSUSED */
1122168404Spjdstatic int
1123286570Smavhdr_full_cons(void *vbuf, void *unused, int kmflag)
1124168404Spjd{
1125275811Sdelphij	arc_buf_hdr_t *hdr = vbuf;
1126168404Spjd
1127286570Smav	bzero(hdr, HDR_FULL_SIZE);
1128286570Smav	cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
1129286570Smav	refcount_create(&hdr->b_l1hdr.b_refcnt);
1130286570Smav	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1131286570Smav	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1132185029Spjd
1133168404Spjd	return (0);
1134168404Spjd}
1135168404Spjd
1136185029Spjd/* ARGSUSED */
1137185029Spjdstatic int
1138286570Smavhdr_l2only_cons(void *vbuf, void *unused, int kmflag)
1139286570Smav{
1140286570Smav	arc_buf_hdr_t *hdr = vbuf;
1141286570Smav
1142286570Smav	bzero(hdr, HDR_L2ONLY_SIZE);
1143286570Smav	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1144286570Smav
1145286570Smav	return (0);
1146286570Smav}
1147286570Smav
1148286570Smav/* ARGSUSED */
1149286570Smavstatic int
1150185029Spjdbuf_cons(void *vbuf, void *unused, int kmflag)
1151185029Spjd{
1152185029Spjd	arc_buf_t *buf = vbuf;
1153185029Spjd
1154185029Spjd	bzero(buf, sizeof (arc_buf_t));
1155219089Spjd	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1156208373Smm	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1157208373Smm
1158185029Spjd	return (0);
1159185029Spjd}
1160185029Spjd
1161168404Spjd/*
1162168404Spjd * Destructor callback - called when a cached buf is
1163168404Spjd * no longer required.
1164168404Spjd */
1165168404Spjd/* ARGSUSED */
1166168404Spjdstatic void
1167286570Smavhdr_full_dest(void *vbuf, void *unused)
1168168404Spjd{
1169275811Sdelphij	arc_buf_hdr_t *hdr = vbuf;
1170168404Spjd
1171275811Sdelphij	ASSERT(BUF_EMPTY(hdr));
1172286570Smav	cv_destroy(&hdr->b_l1hdr.b_cv);
1173286570Smav	refcount_destroy(&hdr->b_l1hdr.b_refcnt);
1174286570Smav	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
1175286570Smav	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1176168404Spjd}
1177168404Spjd
1178185029Spjd/* ARGSUSED */
1179185029Spjdstatic void
1180286570Smavhdr_l2only_dest(void *vbuf, void *unused)
1181286570Smav{
1182286570Smav	arc_buf_hdr_t *hdr = vbuf;
1183286570Smav
1184286570Smav	ASSERT(BUF_EMPTY(hdr));
1185286570Smav	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1186286570Smav}
1187286570Smav
1188286570Smav/* ARGSUSED */
1189286570Smavstatic void
1190185029Spjdbuf_dest(void *vbuf, void *unused)
1191185029Spjd{
1192185029Spjd	arc_buf_t *buf = vbuf;
1193185029Spjd
1194219089Spjd	mutex_destroy(&buf->b_evict_lock);
1195208373Smm	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1196185029Spjd}
1197185029Spjd
1198168404Spjd/*
1199168404Spjd * Reclaim callback -- invoked when memory is low.
1200168404Spjd */
1201168404Spjd/* ARGSUSED */
1202168404Spjdstatic void
1203168404Spjdhdr_recl(void *unused)
1204168404Spjd{
1205168404Spjd	dprintf("hdr_recl called\n");
1206168404Spjd	/*
1207168404Spjd	 * umem calls the reclaim func when we destroy the buf cache,
1208168404Spjd	 * which is after we do arc_fini().
1209168404Spjd	 */
1210168404Spjd	if (!arc_dead)
1211168404Spjd		cv_signal(&arc_reclaim_thr_cv);
1212168404Spjd}
1213168404Spjd
1214168404Spjdstatic void
1215168404Spjdbuf_init(void)
1216168404Spjd{
1217168404Spjd	uint64_t *ct;
1218168404Spjd	uint64_t hsize = 1ULL << 12;
1219168404Spjd	int i, j;
1220168404Spjd
1221168404Spjd	/*
1222168404Spjd	 * The hash table is big enough to fill all of physical memory
1223269230Sdelphij	 * with an average block size of zfs_arc_average_blocksize (default 8K).
1224269230Sdelphij	 * By default, the table will take up
1225269230Sdelphij	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1226168404Spjd	 */
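	/*
	 * Worked example (illustrative only, assuming a hypothetical machine
	 * with 16 GB of physical memory and the default 8K average block
	 * size): the loop below doubles hsize from 2^12 until
	 * hsize * 8K >= 16 GB, stopping at hsize = 2^21 buckets because
	 * 2^21 * 8K == 16 GB.  With 8-byte pointers the table then occupies
	 * 2^21 * 8 bytes == 16 MB, i.e. the advertised 1 MB per GB of RAM.
	 */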
1227269230Sdelphij	while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
1228168404Spjd		hsize <<= 1;
1229168404Spjdretry:
1230168404Spjd	buf_hash_table.ht_mask = hsize - 1;
1231168404Spjd	buf_hash_table.ht_table =
1232168404Spjd	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1233168404Spjd	if (buf_hash_table.ht_table == NULL) {
1234168404Spjd		ASSERT(hsize > (1ULL << 8));
1235168404Spjd		hsize >>= 1;
1236168404Spjd		goto retry;
1237168404Spjd	}
1238168404Spjd
1239286570Smav	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
1240286570Smav	    0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
1241286570Smav	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
1242286570Smav	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
1243286570Smav	    NULL, NULL, 0);
1244168404Spjd	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1245185029Spjd	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1246168404Spjd
1247168404Spjd	for (i = 0; i < 256; i++)
1248168404Spjd		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1249168404Spjd			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1250168404Spjd
1251168404Spjd	for (i = 0; i < BUF_LOCKS; i++) {
1252168404Spjd		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1253168404Spjd		    NULL, MUTEX_DEFAULT, NULL);
1254168404Spjd	}
1255168404Spjd}
1256168404Spjd
1257286570Smav/*
1258286570Smav * Transition between the two allocation states for the arc_buf_hdr struct.
1259286570Smav * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
1260286570Smav * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
1261286570Smav * version is used when a cache buffer is only in the L2ARC in order to reduce
1262286570Smav * memory usage.
1263286570Smav */
1264286570Smavstatic arc_buf_hdr_t *
1265286570Smavarc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
1266286570Smav{
1267286570Smav	ASSERT(HDR_HAS_L2HDR(hdr));
1268286570Smav
1269286570Smav	arc_buf_hdr_t *nhdr;
1270286570Smav	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
1271286570Smav
1272286570Smav	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
1273286570Smav	    (old == hdr_l2only_cache && new == hdr_full_cache));
1274286570Smav
1275286570Smav	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
1276286570Smav
1277286570Smav	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
1278286570Smav	buf_hash_remove(hdr);
1279286570Smav
1280286570Smav	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
1281286570Smav	if (new == hdr_full_cache) {
1282286570Smav		nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
1283286570Smav		/*
1284286570Smav		 * arc_access and arc_change_state need to be aware that a
1285286570Smav		 * header has just come out of L2ARC, so we set its state to
1286286570Smav		 * l2c_only even though it's about to change.
1287286570Smav		 */
1288286570Smav		nhdr->b_l1hdr.b_state = arc_l2c_only;
1289286570Smav	} else {
1290286570Smav		ASSERT(hdr->b_l1hdr.b_buf == NULL);
1291286570Smav		ASSERT0(hdr->b_l1hdr.b_datacnt);
1292286570Smav		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
1293286570Smav		/*
1294286570Smav		 * We might be removing the L1hdr of a buffer which was just
1295286570Smav		 * written out to L2ARC. If such a buffer is compressed then we
1296286570Smav		 * need to free its b_tmp_cdata before destroying the header.
1297286570Smav		 */
1298286570Smav		if (hdr->b_l1hdr.b_tmp_cdata != NULL &&
1299286570Smav		    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
1300286570Smav			l2arc_release_cdata_buf(hdr);
1301286570Smav		nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
1302286570Smav	}
1303286570Smav	/*
1304286570Smav	 * The header has been reallocated so we need to re-insert it into any
1305286570Smav	 * lists it was on.
1306286570Smav	 */
1307286570Smav	(void) buf_hash_insert(nhdr, NULL);
1308286570Smav
1309286570Smav	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
1310286570Smav
1311286570Smav	mutex_enter(&dev->l2ad_mtx);
1312286570Smav
1313286570Smav	/*
1314286570Smav	 * We must place the realloc'ed header back into the list at
1315286570Smav	 * the same spot. Otherwise, if it's placed earlier in the list,
1316286570Smav	 * l2arc_write_buffers() could find it during the function's
1317286570Smav	 * write phase, and try to write it out to the l2arc.
1318286570Smav	 */
1319286570Smav	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
1320286570Smav	list_remove(&dev->l2ad_buflist, hdr);
1321286570Smav
1322286570Smav	mutex_exit(&dev->l2ad_mtx);
1323286570Smav
1324286570Smav	buf_discard_identity(hdr);
1325286570Smav	hdr->b_freeze_cksum = NULL;
1326286570Smav	kmem_cache_free(old, hdr);
1327286570Smav
1328286570Smav	return (nhdr);
1329286570Smav}
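
/*
 * Usage sketch for the two realloc directions described above (illustrative
 * only, not additional mechanism; the header's hash lock is assumed to be
 * held, as asserted in arc_hdr_realloc()):
 *
 *	Dropping an L1+L2 cached header down to L2-only, as arc_evict_ghost()
 *	does later in this file:
 *		hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache);
 *
 *	Promoting an L2-only header back to a full header, as arc_read() does
 *	when bringing a buffer out of the L2ARC:
 *		hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache);
 */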
1330286570Smav
1331286570Smav
1332168404Spjd#define	ARC_MINTIME	(hz>>4) /* 62 ms */
1333168404Spjd
1334168404Spjdstatic void
1335168404Spjdarc_cksum_verify(arc_buf_t *buf)
1336168404Spjd{
1337168404Spjd	zio_cksum_t zc;
1338168404Spjd
1339168404Spjd	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1340168404Spjd		return;
1341168404Spjd
1342286570Smav	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1343286570Smav	if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
1344286570Smav		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1345168404Spjd		return;
1346168404Spjd	}
1347168404Spjd	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1348168404Spjd	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1349168404Spjd		panic("buffer modified while frozen!");
1350286570Smav	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1351168404Spjd}
1352168404Spjd
1353185029Spjdstatic int
1354185029Spjdarc_cksum_equal(arc_buf_t *buf)
1355185029Spjd{
1356185029Spjd	zio_cksum_t zc;
1357185029Spjd	int equal;
1358185029Spjd
1359286570Smav	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1360185029Spjd	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1361185029Spjd	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1362286570Smav	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1363185029Spjd
1364185029Spjd	return (equal);
1365185029Spjd}
1366185029Spjd
1367168404Spjdstatic void
1368185029Spjdarc_cksum_compute(arc_buf_t *buf, boolean_t force)
1369168404Spjd{
1370185029Spjd	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1371168404Spjd		return;
1372168404Spjd
1373286570Smav	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1374168404Spjd	if (buf->b_hdr->b_freeze_cksum != NULL) {
1375286570Smav		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1376168404Spjd		return;
1377168404Spjd	}
1378168404Spjd	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1379168404Spjd	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1380168404Spjd	    buf->b_hdr->b_freeze_cksum);
1381286570Smav	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1382240133Smm#ifdef illumos
1383240133Smm	arc_buf_watch(buf);
1384277300Ssmh#endif
1385168404Spjd}
1386168404Spjd
1387240133Smm#ifdef illumos
1388240133Smm#ifndef _KERNEL
1389240133Smmtypedef struct procctl {
1390240133Smm	long cmd;
1391240133Smm	prwatch_t prwatch;
1392240133Smm} procctl_t;
1393240133Smm#endif
1394240133Smm
1395240133Smm/* ARGSUSED */
1396240133Smmstatic void
1397240133Smmarc_buf_unwatch(arc_buf_t *buf)
1398240133Smm{
1399240133Smm#ifndef _KERNEL
1400240133Smm	if (arc_watch) {
1401240133Smm		int result;
1402240133Smm		procctl_t ctl;
1403240133Smm		ctl.cmd = PCWATCH;
1404240133Smm		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1405240133Smm		ctl.prwatch.pr_size = 0;
1406240133Smm		ctl.prwatch.pr_wflags = 0;
1407240133Smm		result = write(arc_procfd, &ctl, sizeof (ctl));
1408240133Smm		ASSERT3U(result, ==, sizeof (ctl));
1409240133Smm	}
1410240133Smm#endif
1411240133Smm}
1412240133Smm
1413240133Smm/* ARGSUSED */
1414240133Smmstatic void
1415240133Smmarc_buf_watch(arc_buf_t *buf)
1416240133Smm{
1417240133Smm#ifndef _KERNEL
1418240133Smm	if (arc_watch) {
1419240133Smm		int result;
1420240133Smm		procctl_t ctl;
1421240133Smm		ctl.cmd = PCWATCH;
1422240133Smm		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1423240133Smm		ctl.prwatch.pr_size = buf->b_hdr->b_size;
1424240133Smm		ctl.prwatch.pr_wflags = WA_WRITE;
1425240133Smm		result = write(arc_procfd, &ctl, sizeof (ctl));
1426240133Smm		ASSERT3U(result, ==, sizeof (ctl));
1427240133Smm	}
1428240133Smm#endif
1429240133Smm}
1430240133Smm#endif /* illumos */
1431240133Smm
1432286570Smavstatic arc_buf_contents_t
1433286570Smavarc_buf_type(arc_buf_hdr_t *hdr)
1434286570Smav{
1435286570Smav	if (HDR_ISTYPE_METADATA(hdr)) {
1436286570Smav		return (ARC_BUFC_METADATA);
1437286570Smav	} else {
1438286570Smav		return (ARC_BUFC_DATA);
1439286570Smav	}
1440286570Smav}
1441286570Smav
1442286570Smavstatic uint32_t
1443286570Smavarc_bufc_to_flags(arc_buf_contents_t type)
1444286570Smav{
1445286570Smav	switch (type) {
1446286570Smav	case ARC_BUFC_DATA:
1447286570Smav		/* metadata field is 0 if buffer contains normal data */
1448286570Smav		return (0);
1449286570Smav	case ARC_BUFC_METADATA:
1450286570Smav		return (ARC_FLAG_BUFC_METADATA);
1451286570Smav	default:
1452286570Smav		break;
1453286570Smav	}
1454286570Smav	panic("undefined ARC buffer type!");
1455286570Smav	return ((uint32_t)-1);
1456286570Smav}
1457286570Smav
1458168404Spjdvoid
1459168404Spjdarc_buf_thaw(arc_buf_t *buf)
1460168404Spjd{
1461185029Spjd	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1462286570Smav		if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
1463185029Spjd			panic("modifying non-anon buffer!");
1464286570Smav		if (HDR_IO_IN_PROGRESS(buf->b_hdr))
1465185029Spjd			panic("modifying buffer while i/o in progress!");
1466185029Spjd		arc_cksum_verify(buf);
1467185029Spjd	}
1468168404Spjd
1469286570Smav	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1470168404Spjd	if (buf->b_hdr->b_freeze_cksum != NULL) {
1471168404Spjd		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1472168404Spjd		buf->b_hdr->b_freeze_cksum = NULL;
1473168404Spjd	}
1474219089Spjd
1475286570Smav#ifdef ZFS_DEBUG
1476219089Spjd	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1477286570Smav		if (buf->b_hdr->b_l1hdr.b_thawed != NULL)
1478286570Smav			kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1);
1479286570Smav		buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
1480219089Spjd	}
1481286570Smav#endif
1482219089Spjd
1483286570Smav	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1484240133Smm
1485240133Smm#ifdef illumos
1486240133Smm	arc_buf_unwatch(buf);
1487277300Ssmh#endif
1488168404Spjd}
1489168404Spjd
1490168404Spjdvoid
1491168404Spjdarc_buf_freeze(arc_buf_t *buf)
1492168404Spjd{
1493219089Spjd	kmutex_t *hash_lock;
1494219089Spjd
1495168404Spjd	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1496168404Spjd		return;
1497168404Spjd
1498219089Spjd	hash_lock = HDR_LOCK(buf->b_hdr);
1499219089Spjd	mutex_enter(hash_lock);
1500219089Spjd
1501168404Spjd	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1502286570Smav	    buf->b_hdr->b_l1hdr.b_state == arc_anon);
1503185029Spjd	arc_cksum_compute(buf, B_FALSE);
1504219089Spjd	mutex_exit(hash_lock);
1505240133Smm
1506168404Spjd}
1507168404Spjd
1508168404Spjdstatic void
1509275811Sdelphijget_buf_info(arc_buf_hdr_t *hdr, arc_state_t *state, list_t **list, kmutex_t **lock)
1510205231Skmacy{
1511275811Sdelphij	uint64_t buf_hashid = buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1512205231Skmacy
1513286570Smav	if (arc_buf_type(hdr) == ARC_BUFC_METADATA)
1514206796Spjd		buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
1515205231Skmacy	else {
1516206796Spjd		buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
1517205231Skmacy		buf_hashid += ARC_BUFC_NUMMETADATALISTS;
1518205231Skmacy	}
1519205231Skmacy
1520205231Skmacy	*list = &state->arcs_lists[buf_hashid];
1521205231Skmacy	*lock = ARCS_LOCK(state, buf_hashid);
1522205231Skmacy}
1523205231Skmacy
1524205231Skmacy
1525205231Skmacystatic void
1526275811Sdelphijadd_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1527168404Spjd{
1528286570Smav	ASSERT(HDR_HAS_L1HDR(hdr));
1529168404Spjd	ASSERT(MUTEX_HELD(hash_lock));
1530286570Smav	arc_state_t *state = hdr->b_l1hdr.b_state;
1531168404Spjd
1532286570Smav	if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
1533286570Smav	    (state != arc_anon)) {
1534286570Smav		/* We don't use the L2-only state list. */
1535286570Smav		if (state != arc_l2c_only) {
1536286570Smav			uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
1537286570Smav			uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
1538286570Smav			list_t *list;
1539286570Smav			kmutex_t *lock;
1540168404Spjd
1541286570Smav			get_buf_info(hdr, state, &list, &lock);
1542286570Smav			ASSERT(!MUTEX_HELD(lock));
1543286570Smav			mutex_enter(lock);
1544286570Smav			ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
1545286570Smav			list_remove(list, hdr);
1546286570Smav			if (GHOST_STATE(state)) {
1547286570Smav				ASSERT0(hdr->b_l1hdr.b_datacnt);
1548286570Smav				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
1549286570Smav				delta = hdr->b_size;
1550286570Smav			}
1551286570Smav			ASSERT(delta > 0);
1552286570Smav			ASSERT3U(*size, >=, delta);
1553286570Smav			atomic_add_64(size, -delta);
1554286570Smav			mutex_exit(lock);
1555168404Spjd		}
1556185029Spjd		/* remove the prefetch flag if we get a reference */
1557286570Smav		hdr->b_flags &= ~ARC_FLAG_PREFETCH;
1558168404Spjd	}
1559168404Spjd}
1560168404Spjd
1561168404Spjdstatic int
1562275811Sdelphijremove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1563168404Spjd{
1564168404Spjd	int cnt;
1565286570Smav	arc_state_t *state = hdr->b_l1hdr.b_state;
1566168404Spjd
1567286570Smav	ASSERT(HDR_HAS_L1HDR(hdr));
1568168404Spjd	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1569168404Spjd	ASSERT(!GHOST_STATE(state));
1570168404Spjd
1571286570Smav	/*
1572286570Smav	 * arc_l2c_only counts as a ghost state so we don't need to explicitly
1573286570Smav	 * check to prevent usage of the arc_l2c_only list.
1574286570Smav	 */
1575286570Smav	if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
1576168404Spjd	    (state != arc_anon)) {
1577286570Smav		uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
1578205231Skmacy		list_t *list;
1579205231Skmacy		kmutex_t *lock;
1580185029Spjd
1581275811Sdelphij		get_buf_info(hdr, state, &list, &lock);
1582205231Skmacy		ASSERT(!MUTEX_HELD(lock));
1583205231Skmacy		mutex_enter(lock);
1584286570Smav		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
1585275811Sdelphij		list_insert_head(list, hdr);
1586286570Smav		ASSERT(hdr->b_l1hdr.b_datacnt > 0);
1587286570Smav		atomic_add_64(size, hdr->b_size *
1588286570Smav		    hdr->b_l1hdr.b_datacnt);
1589206794Spjd		mutex_exit(lock);
1590168404Spjd	}
1591168404Spjd	return (cnt);
1592168404Spjd}
1593168404Spjd
1594168404Spjd/*
1595168404Spjd * Move the supplied buffer to the indicated state.  The mutex
1596168404Spjd * for the buffer must be held by the caller.
1597168404Spjd */
1598168404Spjdstatic void
1599275811Sdelphijarc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
1600275811Sdelphij    kmutex_t *hash_lock)
1601168404Spjd{
1602286570Smav	arc_state_t *old_state;
1603286570Smav	int64_t refcnt;
1604286570Smav	uint32_t datacnt;
1605168404Spjd	uint64_t from_delta, to_delta;
1606286570Smav	arc_buf_contents_t buftype = arc_buf_type(hdr);
1607205231Skmacy	list_t *list;
1608205231Skmacy	kmutex_t *lock;
1609168404Spjd
1610286570Smav	/*
1611286570Smav	 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
1612286570Smav	 * in arc_read() when bringing a buffer out of the L2ARC.  However, the
1613286570Smav	 * L1 hdr doesn't always exist when we change state to arc_anon before
1614286570Smav	 * destroying a header, in which case reallocating to add the L1 hdr is
1615286570Smav	 * pointless.
1616286570Smav	 */
1617286570Smav	if (HDR_HAS_L1HDR(hdr)) {
1618286570Smav		old_state = hdr->b_l1hdr.b_state;
1619286570Smav		refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
1620286570Smav		datacnt = hdr->b_l1hdr.b_datacnt;
1621286570Smav	} else {
1622286570Smav		old_state = arc_l2c_only;
1623286570Smav		refcnt = 0;
1624286570Smav		datacnt = 0;
1625286570Smav	}
1626286570Smav
1627168404Spjd	ASSERT(MUTEX_HELD(hash_lock));
1628258632Savg	ASSERT3P(new_state, !=, old_state);
1629286570Smav	ASSERT(refcnt == 0 || datacnt > 0);
1630286570Smav	ASSERT(!GHOST_STATE(new_state) || datacnt == 0);
1631286570Smav	ASSERT(old_state != arc_anon || datacnt <= 1);
1632168404Spjd
1633286570Smav	from_delta = to_delta = datacnt * hdr->b_size;
1634168404Spjd
1635168404Spjd	/*
1636168404Spjd	 * If this buffer is evictable, transfer it from the
1637168404Spjd	 * old state list to the new state list.
1638168404Spjd	 */
1639168404Spjd	if (refcnt == 0) {
1640286570Smav		if (old_state != arc_anon && old_state != arc_l2c_only) {
1641205231Skmacy			int use_mutex;
1642286570Smav			uint64_t *size = &old_state->arcs_lsize[buftype];
1643168404Spjd
1644275811Sdelphij			get_buf_info(hdr, old_state, &list, &lock);
1645205231Skmacy			use_mutex = !MUTEX_HELD(lock);
1646168404Spjd			if (use_mutex)
1647205231Skmacy				mutex_enter(lock);
1648168404Spjd
1649286570Smav			ASSERT(HDR_HAS_L1HDR(hdr));
1650286570Smav			ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
1651275811Sdelphij			list_remove(list, hdr);
1652168404Spjd
1653168404Spjd			/*
1654168404Spjd			 * If prefetching out of the ghost cache,
1655219089Spjd			 * we will have a non-zero datacnt.
1656168404Spjd			 */
1657286570Smav			if (GHOST_STATE(old_state) && datacnt == 0) {
1658168404Spjd				/* ghost elements have a ghost size */
1659286570Smav				ASSERT(hdr->b_l1hdr.b_buf == NULL);
1660275811Sdelphij				from_delta = hdr->b_size;
1661168404Spjd			}
1662185029Spjd			ASSERT3U(*size, >=, from_delta);
1663185029Spjd			atomic_add_64(size, -from_delta);
1664168404Spjd
1665168404Spjd			if (use_mutex)
1666205231Skmacy				mutex_exit(lock);
1667168404Spjd		}
1668286570Smav		if (new_state != arc_anon && new_state != arc_l2c_only) {
1669206796Spjd			int use_mutex;
1670286570Smav			uint64_t *size = &new_state->arcs_lsize[buftype];
1671168404Spjd
1672286570Smav			/*
1673286570Smav			 * An L1 header always exists here, since if we're
1674286570Smav			 * moving to some L1-cached state (i.e. not l2c_only or
1675286570Smav			 * anonymous), we realloc the header to add an L1hdr
1676286570Smav			 * beforehand.
1677286570Smav			 */
1678286570Smav			ASSERT(HDR_HAS_L1HDR(hdr));
1679275811Sdelphij			get_buf_info(hdr, new_state, &list, &lock);
1680205231Skmacy			use_mutex = !MUTEX_HELD(lock);
1681168404Spjd			if (use_mutex)
1682205231Skmacy				mutex_enter(lock);
1683168404Spjd
1684275811Sdelphij			list_insert_head(list, hdr);
1685168404Spjd
1686168404Spjd			/* ghost elements have a ghost size */
1687168404Spjd			if (GHOST_STATE(new_state)) {
1688286570Smav				ASSERT(datacnt == 0);
1689286570Smav				ASSERT(hdr->b_l1hdr.b_buf == NULL);
1690275811Sdelphij				to_delta = hdr->b_size;
1691168404Spjd			}
1692185029Spjd			atomic_add_64(size, to_delta);
1693168404Spjd
1694168404Spjd			if (use_mutex)
1695205231Skmacy				mutex_exit(lock);
1696168404Spjd		}
1697168404Spjd	}
1698168404Spjd
1699275811Sdelphij	ASSERT(!BUF_EMPTY(hdr));
1700275811Sdelphij	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
1701275811Sdelphij		buf_hash_remove(hdr);
1702168404Spjd
1703286570Smav	/* adjust state sizes (ignore arc_l2c_only) */
1704286570Smav	if (to_delta && new_state != arc_l2c_only)
1705168404Spjd		atomic_add_64(&new_state->arcs_size, to_delta);
1706286570Smav	if (from_delta && old_state != arc_l2c_only) {
1707168404Spjd		ASSERT3U(old_state->arcs_size, >=, from_delta);
1708168404Spjd		atomic_add_64(&old_state->arcs_size, -from_delta);
1709168404Spjd	}
1710286570Smav	if (HDR_HAS_L1HDR(hdr))
1711286570Smav		hdr->b_l1hdr.b_state = new_state;
1712185029Spjd
1713286570Smav	/*
1714286570Smav	 * L2 headers should never be on the L2 state list since they don't
1715286570Smav	 * have L1 headers allocated.
1716286570Smav	 */
1717286570Smav	ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
1718286570Smav	    list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
1719168404Spjd}
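
/*
 * Minimal caller-side sketch of the locking contract noted above
 * arc_change_state() (an illustration of existing usage, not a new
 * interface; it mirrors what arc_evict() and arc_evict_ghost() do later
 * in this file):
 *
 *	kmutex_t *hash_lock = HDR_LOCK(hdr);
 *
 *	mutex_enter(hash_lock);
 *	arc_change_state(arc_anon, hdr, hash_lock);
 *	mutex_exit(hash_lock);
 */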
1720168404Spjd
1721185029Spjdvoid
1722208373Smmarc_space_consume(uint64_t space, arc_space_type_t type)
1723185029Spjd{
1724208373Smm	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1725208373Smm
1726208373Smm	switch (type) {
1727208373Smm	case ARC_SPACE_DATA:
1728208373Smm		ARCSTAT_INCR(arcstat_data_size, space);
1729208373Smm		break;
1730208373Smm	case ARC_SPACE_OTHER:
1731208373Smm		ARCSTAT_INCR(arcstat_other_size, space);
1732208373Smm		break;
1733208373Smm	case ARC_SPACE_HDRS:
1734208373Smm		ARCSTAT_INCR(arcstat_hdr_size, space);
1735208373Smm		break;
1736208373Smm	case ARC_SPACE_L2HDRS:
1737208373Smm		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1738208373Smm		break;
1739208373Smm	}
1740208373Smm
1741275748Sdelphij	ARCSTAT_INCR(arcstat_meta_used, space);
1742185029Spjd	atomic_add_64(&arc_size, space);
1743185029Spjd}
1744185029Spjd
1745185029Spjdvoid
1746208373Smmarc_space_return(uint64_t space, arc_space_type_t type)
1747185029Spjd{
1748208373Smm	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1749208373Smm
1750208373Smm	switch (type) {
1751208373Smm	case ARC_SPACE_DATA:
1752208373Smm		ARCSTAT_INCR(arcstat_data_size, -space);
1753208373Smm		break;
1754208373Smm	case ARC_SPACE_OTHER:
1755208373Smm		ARCSTAT_INCR(arcstat_other_size, -space);
1756208373Smm		break;
1757208373Smm	case ARC_SPACE_HDRS:
1758208373Smm		ARCSTAT_INCR(arcstat_hdr_size, -space);
1759208373Smm		break;
1760208373Smm	case ARC_SPACE_L2HDRS:
1761208373Smm		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1762208373Smm		break;
1763208373Smm	}
1764208373Smm
1765185029Spjd	ASSERT(arc_meta_used >= space);
1766185029Spjd	if (arc_meta_max < arc_meta_used)
1767185029Spjd		arc_meta_max = arc_meta_used;
1768275748Sdelphij	ARCSTAT_INCR(arcstat_meta_used, -space);
1769185029Spjd	ASSERT(arc_size >= space);
1770185029Spjd	atomic_add_64(&arc_size, -space);
1771185029Spjd}
1772185029Spjd
1773168404Spjdarc_buf_t *
1774286570Smavarc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type)
1775168404Spjd{
1776168404Spjd	arc_buf_hdr_t *hdr;
1777168404Spjd	arc_buf_t *buf;
1778168404Spjd
1779168404Spjd	ASSERT3U(size, >, 0);
1780286570Smav	hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
1781168404Spjd	ASSERT(BUF_EMPTY(hdr));
1782286570Smav	ASSERT3P(hdr->b_freeze_cksum, ==, NULL);
1783168404Spjd	hdr->b_size = size;
1784228103Smm	hdr->b_spa = spa_load_guid(spa);
1785286570Smav
1786185029Spjd	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1787168404Spjd	buf->b_hdr = hdr;
1788168404Spjd	buf->b_data = NULL;
1789168404Spjd	buf->b_efunc = NULL;
1790168404Spjd	buf->b_private = NULL;
1791168404Spjd	buf->b_next = NULL;
1792286570Smav
1793286570Smav	hdr->b_flags = arc_bufc_to_flags(type);
1794286570Smav	hdr->b_flags |= ARC_FLAG_HAS_L1HDR;
1795286570Smav
1796286570Smav	hdr->b_l1hdr.b_buf = buf;
1797286570Smav	hdr->b_l1hdr.b_state = arc_anon;
1798286570Smav	hdr->b_l1hdr.b_arc_access = 0;
1799286570Smav	hdr->b_l1hdr.b_datacnt = 1;
1800286570Smav
1801168404Spjd	arc_get_data_buf(buf);
1802286570Smav	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
1803286570Smav	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
1804168404Spjd
1805168404Spjd	return (buf);
1806168404Spjd}
1807168404Spjd
1808209962Smmstatic char *arc_onloan_tag = "onloan";
1809209962Smm
1810209962Smm/*
1811209962Smm * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1812209962Smm * flight data by arc_tempreserve_space() until they are "returned". Loaned
1813209962Smm * buffers must be returned to the arc before they can be used by the DMU or
1814209962Smm * freed.
1815209962Smm */
1816209962Smmarc_buf_t *
1817209962Smmarc_loan_buf(spa_t *spa, int size)
1818209962Smm{
1819209962Smm	arc_buf_t *buf;
1820209962Smm
1821209962Smm	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1822209962Smm
1823209962Smm	atomic_add_64(&arc_loaned_bytes, size);
1824209962Smm	return (buf);
1825209962Smm}
1826209962Smm
1827209962Smm/*
1828209962Smm * Return a loaned arc buffer to the arc.
1829209962Smm */
1830209962Smmvoid
1831209962Smmarc_return_buf(arc_buf_t *buf, void *tag)
1832209962Smm{
1833209962Smm	arc_buf_hdr_t *hdr = buf->b_hdr;
1834209962Smm
1835209962Smm	ASSERT(buf->b_data != NULL);
1836286570Smav	ASSERT(HDR_HAS_L1HDR(hdr));
1837286570Smav	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
1838286570Smav	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
1839209962Smm
1840209962Smm	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1841209962Smm}
1842209962Smm
1843219089Spjd/* Detach an arc_buf from a dbuf (tag) */
1844219089Spjdvoid
1845219089Spjdarc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1846219089Spjd{
1847286570Smav	arc_buf_hdr_t *hdr = buf->b_hdr;
1848219089Spjd
1849219089Spjd	ASSERT(buf->b_data != NULL);
1850286570Smav	ASSERT(HDR_HAS_L1HDR(hdr));
1851286570Smav	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
1852286570Smav	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
1853219089Spjd	buf->b_efunc = NULL;
1854219089Spjd	buf->b_private = NULL;
1855219089Spjd
1856219089Spjd	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1857219089Spjd}
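
/*
 * Sketch of the loan/return cycle described above arc_loan_buf()
 * (illustrative only; "db_tag" is a hypothetical stand-in for whatever tag
 * the eventual owner, e.g. a dbuf, would use):
 *
 *	arc_buf_t *buf = arc_loan_buf(spa, size);
 *
 *	(fill in buf->b_data)
 *
 *	arc_return_buf(buf, db_tag);	(ownership passes to db_tag)
 *
 * and, to detach a buffer already referenced by db_tag and place it back
 * on loan:
 *
 *	arc_loan_inuse_buf(buf, db_tag);
 */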
1858219089Spjd
1859168404Spjdstatic arc_buf_t *
1860168404Spjdarc_buf_clone(arc_buf_t *from)
1861168404Spjd{
1862168404Spjd	arc_buf_t *buf;
1863168404Spjd	arc_buf_hdr_t *hdr = from->b_hdr;
1864168404Spjd	uint64_t size = hdr->b_size;
1865168404Spjd
1866286570Smav	ASSERT(HDR_HAS_L1HDR(hdr));
1867286570Smav	ASSERT(hdr->b_l1hdr.b_state != arc_anon);
1868219089Spjd
1869185029Spjd	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1870168404Spjd	buf->b_hdr = hdr;
1871168404Spjd	buf->b_data = NULL;
1872168404Spjd	buf->b_efunc = NULL;
1873168404Spjd	buf->b_private = NULL;
1874286570Smav	buf->b_next = hdr->b_l1hdr.b_buf;
1875286570Smav	hdr->b_l1hdr.b_buf = buf;
1876168404Spjd	arc_get_data_buf(buf);
1877168404Spjd	bcopy(from->b_data, buf->b_data, size);
1878242845Sdelphij
1879242845Sdelphij	/*
1880242845Sdelphij	 * This buffer already exists in the arc so create a duplicate
1881242845Sdelphij	 * copy for the caller.  If the buffer is associated with user data
1882242845Sdelphij	 * then track the size and number of duplicates.  These stats will be
1883242845Sdelphij	 * updated as duplicate buffers are created and destroyed.
1884242845Sdelphij	 */
1885286570Smav	if (HDR_ISTYPE_DATA(hdr)) {
1886242845Sdelphij		ARCSTAT_BUMP(arcstat_duplicate_buffers);
1887242845Sdelphij		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
1888242845Sdelphij	}
1889286570Smav	hdr->b_l1hdr.b_datacnt += 1;
1890168404Spjd	return (buf);
1891168404Spjd}
1892168404Spjd
1893168404Spjdvoid
1894168404Spjdarc_buf_add_ref(arc_buf_t *buf, void* tag)
1895168404Spjd{
1896168404Spjd	arc_buf_hdr_t *hdr;
1897168404Spjd	kmutex_t *hash_lock;
1898168404Spjd
1899168404Spjd	/*
1900185029Spjd	 * Check to see if this buffer is evicted.  Callers
1901185029Spjd	 * must verify b_data != NULL to know if the add_ref
1902185029Spjd	 * was successful.
1903168404Spjd	 */
1904219089Spjd	mutex_enter(&buf->b_evict_lock);
1905185029Spjd	if (buf->b_data == NULL) {
1906219089Spjd		mutex_exit(&buf->b_evict_lock);
1907168404Spjd		return;
1908168404Spjd	}
1909219089Spjd	hash_lock = HDR_LOCK(buf->b_hdr);
1910219089Spjd	mutex_enter(hash_lock);
1911185029Spjd	hdr = buf->b_hdr;
1912286570Smav	ASSERT(HDR_HAS_L1HDR(hdr));
1913219089Spjd	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
1914219089Spjd	mutex_exit(&buf->b_evict_lock);
1915168404Spjd
1916286570Smav	ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
1917286570Smav	    hdr->b_l1hdr.b_state == arc_mfu);
1918286570Smav
1919168404Spjd	add_reference(hdr, hash_lock, tag);
1920208373Smm	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1921168404Spjd	arc_access(hdr, hash_lock);
1922168404Spjd	mutex_exit(hash_lock);
1923168404Spjd	ARCSTAT_BUMP(arcstat_hits);
1924286570Smav	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
1925286570Smav	    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
1926168404Spjd	    data, metadata, hits);
1927168404Spjd}
1928168404Spjd
1929274172Savgstatic void
1930274172Savgarc_buf_free_on_write(void *data, size_t size,
1931274172Savg    void (*free_func)(void *, size_t))
1932274172Savg{
1933274172Savg	l2arc_data_free_t *df;
1934274172Savg
1935274172Savg	df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
1936274172Savg	df->l2df_data = data;
1937274172Savg	df->l2df_size = size;
1938274172Savg	df->l2df_func = free_func;
1939274172Savg	mutex_enter(&l2arc_free_on_write_mtx);
1940274172Savg	list_insert_head(l2arc_free_on_write, df);
1941274172Savg	mutex_exit(&l2arc_free_on_write_mtx);
1942274172Savg}
1943274172Savg
1944185029Spjd/*
1945185029Spjd * Free the arc data buffer.  If it is an l2arc write in progress,
1946185029Spjd * the buffer is placed on l2arc_free_on_write to be freed later.
1947185029Spjd */
1948168404Spjdstatic void
1949240133Smmarc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
1950185029Spjd{
1951240133Smm	arc_buf_hdr_t *hdr = buf->b_hdr;
1952240133Smm
1953185029Spjd	if (HDR_L2_WRITING(hdr)) {
1954274172Savg		arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
1955185029Spjd		ARCSTAT_BUMP(arcstat_l2_free_on_write);
1956185029Spjd	} else {
1957240133Smm		free_func(buf->b_data, hdr->b_size);
1958185029Spjd	}
1959185029Spjd}
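
/*
 * Illustrative summary of how the free function above is chosen (this only
 * restates the calls made from arc_buf_destroy() below):
 *
 *	if (type == ARC_BUFC_METADATA)
 *		arc_buf_data_free(buf, zio_buf_free);
 *	else
 *		arc_buf_data_free(buf, zio_data_buf_free);
 *
 * When HDR_L2_WRITING() is set, the actual free is deferred onto the
 * l2arc_free_on_write list and happens once the in-flight L2ARC write
 * completes.
 */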
1960185029Spjd
1961268858Sdelphij/*
1962268858Sdelphij * Free up buf->b_data and if 'remove' is set, then pull the
1963268858Sdelphij * arc_buf_t off of the arc_buf_hdr_t's list and free it.
1964268858Sdelphij */
1965185029Spjdstatic void
1966274172Savgarc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
1967274172Savg{
1968286570Smav	ASSERT(HDR_HAS_L2HDR(hdr));
1969286570Smav	ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));
1970274172Savg
1971286570Smav	/*
1972286570Smav	 * The b_tmp_cdata field is linked off of the b_l1hdr, so if
1973286570Smav	 * that doesn't exist, the header is in the arc_l2c_only state,
1974286570Smav	 * and there isn't anything to free (it's already been freed).
1975286570Smav	 */
1976286570Smav	if (!HDR_HAS_L1HDR(hdr))
1977286570Smav		return;
1978274172Savg
1979286570Smav	if (hdr->b_l1hdr.b_tmp_cdata == NULL)
1980274172Savg		return;
1981274172Savg
1982274172Savg	ASSERT(HDR_L2_WRITING(hdr));
1983286570Smav	arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size,
1984274172Savg	    zio_data_buf_free);
1985286570Smav
1986274172Savg	ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
1987286570Smav	hdr->b_l1hdr.b_tmp_cdata = NULL;
1988274172Savg}
1989274172Savg
1990274172Savgstatic void
1991268858Sdelphijarc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
1992168404Spjd{
1993168404Spjd	arc_buf_t **bufp;
1994168404Spjd
1995168404Spjd	/* free up data associated with the buf */
1996286570Smav	if (buf->b_data != NULL) {
1997286570Smav		arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
1998168404Spjd		uint64_t size = buf->b_hdr->b_size;
1999286570Smav		arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
2000168404Spjd
2001168404Spjd		arc_cksum_verify(buf);
2002240133Smm#ifdef illumos
2003240133Smm		arc_buf_unwatch(buf);
2004277300Ssmh#endif
2005219089Spjd
2006168404Spjd		if (!recycle) {
2007168404Spjd			if (type == ARC_BUFC_METADATA) {
2008240133Smm				arc_buf_data_free(buf, zio_buf_free);
2009208373Smm				arc_space_return(size, ARC_SPACE_DATA);
2010168404Spjd			} else {
2011168404Spjd				ASSERT(type == ARC_BUFC_DATA);
2012240133Smm				arc_buf_data_free(buf, zio_data_buf_free);
2013208373Smm				ARCSTAT_INCR(arcstat_data_size, -size);
2014185029Spjd				atomic_add_64(&arc_size, -size);
2015168404Spjd			}
2016168404Spjd		}
2017286570Smav		if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
2018185029Spjd			uint64_t *cnt = &state->arcs_lsize[type];
2019185029Spjd
2020286570Smav			ASSERT(refcount_is_zero(
2021286570Smav			    &buf->b_hdr->b_l1hdr.b_refcnt));
2022286570Smav			ASSERT(state != arc_anon && state != arc_l2c_only);
2023185029Spjd
2024185029Spjd			ASSERT3U(*cnt, >=, size);
2025185029Spjd			atomic_add_64(cnt, -size);
2026168404Spjd		}
2027168404Spjd		ASSERT3U(state->arcs_size, >=, size);
2028168404Spjd		atomic_add_64(&state->arcs_size, -size);
2029168404Spjd		buf->b_data = NULL;
2030242845Sdelphij
2031242845Sdelphij		/*
2032242845Sdelphij		 * If we're destroying a duplicate buffer make sure
2033242845Sdelphij		 * that the appropriate statistics are updated.
2034242845Sdelphij		 */
2035286570Smav		if (buf->b_hdr->b_l1hdr.b_datacnt > 1 &&
2036286570Smav		    HDR_ISTYPE_DATA(buf->b_hdr)) {
2037242845Sdelphij			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
2038242845Sdelphij			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
2039242845Sdelphij		}
2040286570Smav		ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0);
2041286570Smav		buf->b_hdr->b_l1hdr.b_datacnt -= 1;
2042168404Spjd	}
2043168404Spjd
2044168404Spjd	/* only remove the buf if requested */
2045268858Sdelphij	if (!remove)
2046168404Spjd		return;
2047168404Spjd
2048168404Spjd	/* remove the buf from the hdr list */
2049286570Smav	for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf;
2050286570Smav	    bufp = &(*bufp)->b_next)
2051168404Spjd		continue;
2052168404Spjd	*bufp = buf->b_next;
2053219089Spjd	buf->b_next = NULL;
2054168404Spjd
2055168404Spjd	ASSERT(buf->b_efunc == NULL);
2056168404Spjd
2057168404Spjd	/* clean up the buf */
2058168404Spjd	buf->b_hdr = NULL;
2059168404Spjd	kmem_cache_free(buf_cache, buf);
2060168404Spjd}
2061168404Spjd
2062168404Spjdstatic void
2063168404Spjdarc_hdr_destroy(arc_buf_hdr_t *hdr)
2064168404Spjd{
2065286570Smav	if (HDR_HAS_L1HDR(hdr)) {
2066286570Smav		ASSERT(hdr->b_l1hdr.b_buf == NULL ||
2067286570Smav		    hdr->b_l1hdr.b_datacnt > 0);
2068286570Smav		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2069286570Smav		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
2070286570Smav	}
2071168404Spjd	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2072286570Smav	ASSERT(!HDR_IN_HASH_TABLE(hdr));
2073168404Spjd
2074286570Smav	if (HDR_HAS_L2HDR(hdr)) {
2075286570Smav		l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
2076286570Smav		boolean_t buflist_held = MUTEX_HELD(&l2hdr->b_dev->l2ad_mtx);
2077286570Smav
2078219089Spjd		if (!buflist_held) {
2079286570Smav			mutex_enter(&l2hdr->b_dev->l2ad_mtx);
2080286570Smav			l2hdr = &hdr->b_l2hdr;
2081219089Spjd		}
2082219089Spjd
2083286570Smav		trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
2084286570Smav		    l2hdr->b_asize, 0);
2085286570Smav		list_remove(&l2hdr->b_dev->l2ad_buflist, hdr);
2086219089Spjd
2087286570Smav		/*
2088286570Smav		 * We don't want to leak the b_tmp_cdata buffer that was
2089286570Smav		 * allocated in l2arc_write_buffers()
2090286570Smav		 */
2091286570Smav		arc_buf_l2_cdata_free(hdr);
2092286570Smav
2093286570Smav		ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
2094286570Smav		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
2095286570Smav
2096219089Spjd		if (!buflist_held)
2097286570Smav			mutex_exit(&l2hdr->b_dev->l2ad_mtx);
2098286570Smav
2099286570Smav		hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
2100185029Spjd	}
2101185029Spjd
2102286570Smav	if (!BUF_EMPTY(hdr))
2103219089Spjd		buf_discard_identity(hdr);
2104168404Spjd	if (hdr->b_freeze_cksum != NULL) {
2105168404Spjd		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
2106168404Spjd		hdr->b_freeze_cksum = NULL;
2107168404Spjd	}
2108286570Smav
2109286570Smav	if (HDR_HAS_L1HDR(hdr)) {
2110286570Smav		while (hdr->b_l1hdr.b_buf) {
2111286570Smav			arc_buf_t *buf = hdr->b_l1hdr.b_buf;
2112286570Smav
2113286570Smav			if (buf->b_efunc != NULL) {
2114286570Smav				mutex_enter(&arc_eviction_mtx);
2115286570Smav				mutex_enter(&buf->b_evict_lock);
2116286570Smav				ASSERT(buf->b_hdr != NULL);
2117286570Smav				arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
2118286570Smav				    FALSE);
2119286570Smav				hdr->b_l1hdr.b_buf = buf->b_next;
2120286570Smav				buf->b_hdr = &arc_eviction_hdr;
2121286570Smav				buf->b_next = arc_eviction_list;
2122286570Smav				arc_eviction_list = buf;
2123286570Smav				mutex_exit(&buf->b_evict_lock);
2124286570Smav				mutex_exit(&arc_eviction_mtx);
2125286570Smav			} else {
2126286570Smav				arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
2127286570Smav				    TRUE);
2128286570Smav			}
2129286570Smav		}
2130286570Smav#ifdef ZFS_DEBUG
2131286570Smav		if (hdr->b_l1hdr.b_thawed != NULL) {
2132286570Smav			kmem_free(hdr->b_l1hdr.b_thawed, 1);
2133286570Smav			hdr->b_l1hdr.b_thawed = NULL;
2134286570Smav		}
2135286570Smav#endif
2136219089Spjd	}
2137168404Spjd
2138168404Spjd	ASSERT3P(hdr->b_hash_next, ==, NULL);
2139286570Smav	if (HDR_HAS_L1HDR(hdr)) {
2140286570Smav		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
2141286570Smav		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
2142286570Smav		kmem_cache_free(hdr_full_cache, hdr);
2143286570Smav	} else {
2144286570Smav		kmem_cache_free(hdr_l2only_cache, hdr);
2145286570Smav	}
2146168404Spjd}
2147168404Spjd
2148168404Spjdvoid
2149168404Spjdarc_buf_free(arc_buf_t *buf, void *tag)
2150168404Spjd{
2151168404Spjd	arc_buf_hdr_t *hdr = buf->b_hdr;
2152286570Smav	int hashed = hdr->b_l1hdr.b_state != arc_anon;
2153168404Spjd
2154168404Spjd	ASSERT(buf->b_efunc == NULL);
2155168404Spjd	ASSERT(buf->b_data != NULL);
2156168404Spjd
2157168404Spjd	if (hashed) {
2158168404Spjd		kmutex_t *hash_lock = HDR_LOCK(hdr);
2159168404Spjd
2160168404Spjd		mutex_enter(hash_lock);
2161219089Spjd		hdr = buf->b_hdr;
2162219089Spjd		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2163219089Spjd
2164168404Spjd		(void) remove_reference(hdr, hash_lock, tag);
2165286570Smav		if (hdr->b_l1hdr.b_datacnt > 1) {
2166168404Spjd			arc_buf_destroy(buf, FALSE, TRUE);
2167219089Spjd		} else {
2168286570Smav			ASSERT(buf == hdr->b_l1hdr.b_buf);
2169219089Spjd			ASSERT(buf->b_efunc == NULL);
2170275811Sdelphij			hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
2171219089Spjd		}
2172168404Spjd		mutex_exit(hash_lock);
2173168404Spjd	} else if (HDR_IO_IN_PROGRESS(hdr)) {
2174168404Spjd		int destroy_hdr;
2175168404Spjd		/*
2176168404Spjd		 * We are in the middle of an async write.  Don't destroy
2177168404Spjd		 * this buffer unless the write completes before we finish
2178168404Spjd		 * decrementing the reference count.
2179168404Spjd		 */
2180168404Spjd		mutex_enter(&arc_eviction_mtx);
2181168404Spjd		(void) remove_reference(hdr, NULL, tag);
2182286570Smav		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2183168404Spjd		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
2184168404Spjd		mutex_exit(&arc_eviction_mtx);
2185168404Spjd		if (destroy_hdr)
2186168404Spjd			arc_hdr_destroy(hdr);
2187168404Spjd	} else {
2188219089Spjd		if (remove_reference(hdr, NULL, tag) > 0)
2189168404Spjd			arc_buf_destroy(buf, FALSE, TRUE);
2190219089Spjd		else
2191168404Spjd			arc_hdr_destroy(hdr);
2192168404Spjd	}
2193168404Spjd}
2194168404Spjd
2195248571Smmboolean_t
2196168404Spjdarc_buf_remove_ref(arc_buf_t *buf, void* tag)
2197168404Spjd{
2198168404Spjd	arc_buf_hdr_t *hdr = buf->b_hdr;
2199168404Spjd	kmutex_t *hash_lock = HDR_LOCK(hdr);
2200248571Smm	boolean_t no_callback = (buf->b_efunc == NULL);
2201168404Spjd
2202286570Smav	if (hdr->b_l1hdr.b_state == arc_anon) {
2203286570Smav		ASSERT(hdr->b_l1hdr.b_datacnt == 1);
2204168404Spjd		arc_buf_free(buf, tag);
2205168404Spjd		return (no_callback);
2206168404Spjd	}
2207168404Spjd
2208168404Spjd	mutex_enter(hash_lock);
2209219089Spjd	hdr = buf->b_hdr;
2210286570Smav	ASSERT(hdr->b_l1hdr.b_datacnt > 0);
2211219089Spjd	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2212286570Smav	ASSERT(hdr->b_l1hdr.b_state != arc_anon);
2213168404Spjd	ASSERT(buf->b_data != NULL);
2214168404Spjd
2215168404Spjd	(void) remove_reference(hdr, hash_lock, tag);
2216286570Smav	if (hdr->b_l1hdr.b_datacnt > 1) {
2217168404Spjd		if (no_callback)
2218168404Spjd			arc_buf_destroy(buf, FALSE, TRUE);
2219168404Spjd	} else if (no_callback) {
2220286570Smav		ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
2221219089Spjd		ASSERT(buf->b_efunc == NULL);
2222275811Sdelphij		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
2223168404Spjd	}
2224286570Smav	ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 ||
2225286570Smav	    refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2226168404Spjd	mutex_exit(hash_lock);
2227168404Spjd	return (no_callback);
2228168404Spjd}
2229168404Spjd
2230286570Smavint32_t
2231168404Spjdarc_buf_size(arc_buf_t *buf)
2232168404Spjd{
2233168404Spjd	return (buf->b_hdr->b_size);
2234168404Spjd}
2235168404Spjd
2236168404Spjd/*
2237242845Sdelphij * Called from the DMU to determine if the current buffer should be
2238242845Sdelphij * evicted. In order to ensure proper locking, the eviction must be initiated
2239242845Sdelphij * from the DMU. Return true if the buffer is associated with user data and
2240242845Sdelphij * duplicate buffers still exist.
2241242845Sdelphij */
2242242845Sdelphijboolean_t
2243242845Sdelphijarc_buf_eviction_needed(arc_buf_t *buf)
2244242845Sdelphij{
2245242845Sdelphij	arc_buf_hdr_t *hdr;
2246242845Sdelphij	boolean_t evict_needed = B_FALSE;
2247242845Sdelphij
2248242845Sdelphij	if (zfs_disable_dup_eviction)
2249242845Sdelphij		return (B_FALSE);
2250242845Sdelphij
2251242845Sdelphij	mutex_enter(&buf->b_evict_lock);
2252242845Sdelphij	hdr = buf->b_hdr;
2253242845Sdelphij	if (hdr == NULL) {
2254242845Sdelphij		/*
2255242845Sdelphij		 * We are in arc_do_user_evicts(); let that function
2256242845Sdelphij		 * perform the eviction.
2257242845Sdelphij		 */
2258242845Sdelphij		ASSERT(buf->b_data == NULL);
2259242845Sdelphij		mutex_exit(&buf->b_evict_lock);
2260242845Sdelphij		return (B_FALSE);
2261242845Sdelphij	} else if (buf->b_data == NULL) {
2262242845Sdelphij		/*
2263242845Sdelphij		 * We have already been added to the arc eviction list;
2264242845Sdelphij		 * recommend eviction.
2265242845Sdelphij		 */
2266242845Sdelphij		ASSERT3P(hdr, ==, &arc_eviction_hdr);
2267242845Sdelphij		mutex_exit(&buf->b_evict_lock);
2268242845Sdelphij		return (B_TRUE);
2269242845Sdelphij	}
2270242845Sdelphij
2271286570Smav	if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
2272242845Sdelphij		evict_needed = B_TRUE;
2273242845Sdelphij
2274242845Sdelphij	mutex_exit(&buf->b_evict_lock);
2275242845Sdelphij	return (evict_needed);
2276242845Sdelphij}
2277242845Sdelphij
2278242845Sdelphij/*
2279168404Spjd * Evict buffers from the list until we've removed the specified number of
2280168404Spjd * bytes.  Move the removed buffers to the appropriate evict state.
2281168404Spjd * If the recycle flag is set, then attempt to "recycle" a buffer:
2282168404Spjd * - look for a buffer to evict that is `bytes' long.
2283168404Spjd * - return the data block from this buffer rather than freeing it.
2284168404Spjd * This flag is used by callers that are trying to make space for a
2285168404Spjd * new buffer in a full arc cache.
2286185029Spjd *
2287185029Spjd * This function makes a "best effort".  It skips over any buffers
2288185029Spjd * it can't get a hash_lock on, and so may not catch all candidates.
2289185029Spjd * It may also return without evicting as much space as requested.
2290168404Spjd */
2291168404Spjdstatic void *
2292209962Smmarc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
2293168404Spjd    arc_buf_contents_t type)
2294168404Spjd{
2295168404Spjd	arc_state_t *evicted_state;
2296168404Spjd	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
2297205231Skmacy	int64_t bytes_remaining;
2298275811Sdelphij	arc_buf_hdr_t *hdr, *hdr_prev = NULL;
2299205231Skmacy	list_t *evicted_list, *list, *evicted_list_start, *list_start;
2300205231Skmacy	kmutex_t *lock, *evicted_lock;
2301168404Spjd	kmutex_t *hash_lock;
2302168404Spjd	boolean_t have_lock;
2303168404Spjd	void *stolen = NULL;
2304258632Savg	arc_buf_hdr_t marker = { 0 };
2305258632Savg	int count = 0;
2306205231Skmacy	static int evict_metadata_offset, evict_data_offset;
2307258632Savg	int i, idx, offset, list_count, lists;
2308168404Spjd
2309168404Spjd	ASSERT(state == arc_mru || state == arc_mfu);
2310168404Spjd
2311168404Spjd	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2312206796Spjd
2313275780Sdelphij	/*
2314275780Sdelphij	 * Decide which "type" (data vs metadata) to recycle from.
2315275780Sdelphij	 *
2316275780Sdelphij	 * If we are over the metadata limit, recycle from metadata.
2317275780Sdelphij	 * If we are under the metadata minimum, recycle from data.
2318275780Sdelphij	 * Otherwise, recycle from whichever type has the oldest (least
2319275780Sdelphij	 * recently accessed) header.  This is not yet implemented.
2320275780Sdelphij	 */
2321275780Sdelphij	if (recycle) {
2322275780Sdelphij		arc_buf_contents_t realtype;
2323275780Sdelphij		if (state->arcs_lsize[ARC_BUFC_DATA] == 0) {
2324275780Sdelphij			realtype = ARC_BUFC_METADATA;
2325275780Sdelphij		} else if (state->arcs_lsize[ARC_BUFC_METADATA] == 0) {
2326275780Sdelphij			realtype = ARC_BUFC_DATA;
2327275780Sdelphij		} else if (arc_meta_used >= arc_meta_limit) {
2328275780Sdelphij			realtype = ARC_BUFC_METADATA;
2329275780Sdelphij		} else if (arc_meta_used <= arc_meta_min) {
2330275780Sdelphij			realtype = ARC_BUFC_DATA;
2331286570Smav#ifdef illumos
2332286570Smav		} else if (HDR_HAS_L1HDR(data_hdr) &&
2333286570Smav		    HDR_HAS_L1HDR(metadata_hdr) &&
2334286570Smav		    data_hdr->b_l1hdr.b_arc_access <
2335286570Smav		    metadata_hdr->b_l1hdr.b_arc_access) {
2336286570Smav			realtype = ARC_BUFC_DATA;
2337275780Sdelphij		} else {
2338286570Smav			realtype = ARC_BUFC_METADATA;
2339275780Sdelphij#else
2340286570Smav		} else {
2341275780Sdelphij			/* TODO */
2342275780Sdelphij			realtype = type;
2343275780Sdelphij#endif
2344275780Sdelphij		}
2345275780Sdelphij		if (realtype != type) {
2346275780Sdelphij			/*
2347275780Sdelphij			 * If we want to evict from a different list,
2348275780Sdelphij			 * we can not recycle, because DATA vs METADATA
2349275780Sdelphij			 * buffers are segregated into different kmem
2350275780Sdelphij			 * caches (and vmem arenas).
2351275780Sdelphij			 */
2352275780Sdelphij			type = realtype;
2353275780Sdelphij			recycle = B_FALSE;
2354275780Sdelphij		}
2355275780Sdelphij	}
2356275780Sdelphij
2357205231Skmacy	if (type == ARC_BUFC_METADATA) {
2358205231Skmacy		offset = 0;
2359205231Skmacy		list_count = ARC_BUFC_NUMMETADATALISTS;
2360205231Skmacy		list_start = &state->arcs_lists[0];
2361205231Skmacy		evicted_list_start = &evicted_state->arcs_lists[0];
2362205231Skmacy		idx = evict_metadata_offset;
2363205231Skmacy	} else {
2364205231Skmacy		offset = ARC_BUFC_NUMMETADATALISTS;
2365205231Skmacy		list_start = &state->arcs_lists[offset];
2366205231Skmacy		evicted_list_start = &evicted_state->arcs_lists[offset];
2367205231Skmacy		list_count = ARC_BUFC_NUMDATALISTS;
2368205231Skmacy		idx = evict_data_offset;
2369205231Skmacy	}
2370205231Skmacy	bytes_remaining = evicted_state->arcs_lsize[type];
2371258632Savg	lists = 0;
2372206796Spjd
2373205231Skmacyevict_start:
2374205231Skmacy	list = &list_start[idx];
2375205231Skmacy	evicted_list = &evicted_list_start[idx];
2376205231Skmacy	lock = ARCS_LOCK(state, (offset + idx));
2377206796Spjd	evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
2378168404Spjd
2379286570Smav	/*
2380286570Smav	 * The ghost list lock must be acquired first in order to prevent
2381286570Smav	 * a 3 party deadlock:
2382286570Smav	 * a 3-party deadlock:
2383286570Smav	 *  - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by
2384286570Smav	 *    l2ad_mtx in arc_hdr_realloc
2385286570Smav	 *  - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx
2386286570Smav	 *  - arc_evict acquires arc_*_ghost->arcs_mtx, followed by
2387286570Smav	 *    arc_*_ghost->arcs_mtx and forms a deadlock cycle.
2388286570Smav	 *
2389286570Smav	 * This situation is avoided by acquiring the ghost list lock first.
2390286570Smav	 */
2391286570Smav	mutex_enter(evicted_lock);
2392205231Skmacy	mutex_enter(lock);
2393205231Skmacy
2394275811Sdelphij	for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
2395275811Sdelphij		hdr_prev = list_prev(list, hdr);
2396286570Smav		if (HDR_HAS_L1HDR(hdr)) {
2397286570Smav			bytes_remaining -=
2398286570Smav			    (hdr->b_size * hdr->b_l1hdr.b_datacnt);
2399286570Smav		}
2400168404Spjd		/* prefetch buffers have a minimum lifespan */
2401275811Sdelphij		if (HDR_IO_IN_PROGRESS(hdr) ||
2402275811Sdelphij		    (spa && hdr->b_spa != spa) ||
2403286570Smav		    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
2404286570Smav		    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
2405219089Spjd		    arc_min_prefetch_lifespan)) {
2406168404Spjd			skipped++;
2407168404Spjd			continue;
2408168404Spjd		}
2409168404Spjd		/* "lookahead" for better eviction candidate */
2410275811Sdelphij		if (recycle && hdr->b_size != bytes &&
2411275811Sdelphij		    hdr_prev && hdr_prev->b_size == bytes)
2412168404Spjd			continue;
2413258632Savg
2414258632Savg		/* ignore markers */
2415275811Sdelphij		if (hdr->b_spa == 0)
2416258632Savg			continue;
2417258632Savg
2418258632Savg		/*
2419258632Savg		 * It may take a long time to evict all the bufs requested.
2420258632Savg		 * To avoid blocking all arc activity, periodically drop
2421258632Savg		 * the arcs_mtx and give other threads a chance to run
2422258632Savg		 * before reacquiring the lock.
2423258632Savg		 *
2424258632Savg		 * If we are looking for a buffer to recycle, we are in
2425258632Savg		 * the hot code path, so don't sleep.
2426258632Savg		 */
2427258632Savg		if (!recycle && count++ > arc_evict_iterations) {
2428275811Sdelphij			list_insert_after(list, hdr, &marker);
2429286570Smav			mutex_exit(lock);
2430258632Savg			mutex_exit(evicted_lock);
2431258632Savg			kpreempt(KPREEMPT_SYNC);
2432286570Smav			mutex_enter(evicted_lock);
2433258632Savg			mutex_enter(lock);
2434275811Sdelphij			hdr_prev = list_prev(list, &marker);
2435258632Savg			list_remove(list, &marker);
2436258632Savg			count = 0;
2437258632Savg			continue;
2438258632Savg		}
2439258632Savg
2440275811Sdelphij		hash_lock = HDR_LOCK(hdr);
2441168404Spjd		have_lock = MUTEX_HELD(hash_lock);
2442168404Spjd		if (have_lock || mutex_tryenter(hash_lock)) {
2443286570Smav			ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
2444286570Smav			ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
2445286570Smav			while (hdr->b_l1hdr.b_buf) {
2446286570Smav				arc_buf_t *buf = hdr->b_l1hdr.b_buf;
2447219089Spjd				if (!mutex_tryenter(&buf->b_evict_lock)) {
2448185029Spjd					missed += 1;
2449185029Spjd					break;
2450185029Spjd				}
2451286570Smav				if (buf->b_data != NULL) {
2452275811Sdelphij					bytes_evicted += hdr->b_size;
2453286570Smav					if (recycle &&
2454286570Smav					    arc_buf_type(hdr) == type &&
2455275811Sdelphij					    hdr->b_size == bytes &&
2456275811Sdelphij					    !HDR_L2_WRITING(hdr)) {
2457168404Spjd						stolen = buf->b_data;
2458168404Spjd						recycle = FALSE;
2459168404Spjd					}
2460168404Spjd				}
2461286570Smav				if (buf->b_efunc != NULL) {
2462168404Spjd					mutex_enter(&arc_eviction_mtx);
2463168404Spjd					arc_buf_destroy(buf,
2464168404Spjd					    buf->b_data == stolen, FALSE);
2465286570Smav					hdr->b_l1hdr.b_buf = buf->b_next;
2466168404Spjd					buf->b_hdr = &arc_eviction_hdr;
2467168404Spjd					buf->b_next = arc_eviction_list;
2468168404Spjd					arc_eviction_list = buf;
2469168404Spjd					mutex_exit(&arc_eviction_mtx);
2470219089Spjd					mutex_exit(&buf->b_evict_lock);
2471168404Spjd				} else {
2472219089Spjd					mutex_exit(&buf->b_evict_lock);
2473168404Spjd					arc_buf_destroy(buf,
2474168404Spjd					    buf->b_data == stolen, TRUE);
2475168404Spjd				}
2476168404Spjd			}
2477208373Smm
2478286570Smav			if (HDR_HAS_L2HDR(hdr)) {
2479208373Smm				ARCSTAT_INCR(arcstat_evict_l2_cached,
2480275811Sdelphij				    hdr->b_size);
2481208373Smm			} else {
2482275811Sdelphij				if (l2arc_write_eligible(hdr->b_spa, hdr)) {
2483208373Smm					ARCSTAT_INCR(arcstat_evict_l2_eligible,
2484275811Sdelphij					    hdr->b_size);
2485208373Smm				} else {
2486208373Smm					ARCSTAT_INCR(
2487208373Smm					    arcstat_evict_l2_ineligible,
2488275811Sdelphij					    hdr->b_size);
2489208373Smm				}
2490208373Smm			}
2491208373Smm
2492286570Smav			if (hdr->b_l1hdr.b_datacnt == 0) {
2493275811Sdelphij				arc_change_state(evicted_state, hdr, hash_lock);
2494275811Sdelphij				ASSERT(HDR_IN_HASH_TABLE(hdr));
2495275811Sdelphij				hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
2496275811Sdelphij				hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
2497275811Sdelphij				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
2498185029Spjd			}
2499168404Spjd			if (!have_lock)
2500168404Spjd				mutex_exit(hash_lock);
2501168404Spjd			if (bytes >= 0 && bytes_evicted >= bytes)
2502168404Spjd				break;
2503205231Skmacy			if (bytes_remaining > 0) {
2504205231Skmacy				mutex_exit(evicted_lock);
2505205231Skmacy				mutex_exit(lock);
2506206796Spjd				idx  = ((idx + 1) & (list_count - 1));
2507258632Savg				lists++;
2508205231Skmacy				goto evict_start;
2509205231Skmacy			}
2510168404Spjd		} else {
2511168404Spjd			missed += 1;
2512168404Spjd		}
2513168404Spjd	}
2514168404Spjd
2515286570Smav	mutex_exit(lock);
2516205231Skmacy	mutex_exit(evicted_lock);
2517206796Spjd
2518206796Spjd	idx  = ((idx + 1) & (list_count - 1));
2519258632Savg	lists++;
2520168404Spjd
2521205231Skmacy	if (bytes_evicted < bytes) {
2522258632Savg		if (lists < list_count)
2523205231Skmacy			goto evict_start;
2524205231Skmacy		else
2525205231Skmacy			dprintf("only evicted %lld bytes from %x",
2526205231Skmacy			    (longlong_t)bytes_evicted, state);
2527205231Skmacy	}
2528206796Spjd	if (type == ARC_BUFC_METADATA)
2529205231Skmacy		evict_metadata_offset = idx;
2530205231Skmacy	else
2531205231Skmacy		evict_data_offset = idx;
2532206796Spjd
2533168404Spjd	if (skipped)
2534168404Spjd		ARCSTAT_INCR(arcstat_evict_skip, skipped);
2535168404Spjd
2536168404Spjd	if (missed)
2537168404Spjd		ARCSTAT_INCR(arcstat_mutex_miss, missed);
2538168404Spjd
2539185029Spjd	/*
2540258632Savg	 * Note: we have just evicted some data into the ghost state,
2541258632Savg	 * potentially putting the ghost size over the desired size.  Rather
2542258632Savg	 * than evicting from the ghost list in this hot code path, leave
2543258632Savg	 * this chore to the arc_reclaim_thread().
2544185029Spjd	 */
2545185029Spjd
2546205231Skmacy	if (stolen)
2547205231Skmacy		ARCSTAT_BUMP(arcstat_stolen);
2548168404Spjd	return (stolen);
2549168404Spjd}
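
/*
 * Caller-side sketch of the "recycle" path documented above arc_evict()
 * (illustrative only; the in-tree consumer of the recycle flag is
 * arc_get_data_buf(), which lies outside this excerpt, so the surrounding
 * code here is hypothetical):
 *
 *	void *data;
 *
 *	(try to steal a same-sized evictable data block instead of evicting
 *	and then allocating separately)
 *	data = arc_evict(arc_mru, 0, size, TRUE, ARC_BUFC_DATA);
 *	if (data == NULL)
 *		data = zio_data_buf_alloc(size);
 *
 * Passing spa == 0 matches headers from any pool, and a failed recycle
 * simply returns NULL so the caller falls back to a fresh allocation.
 */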
2550168404Spjd
2551168404Spjd/*
2552168404Spjd * Remove buffers from the list until we've removed the specified number of
2553168404Spjd * bytes.  Destroy the buffers that are removed.
2554168404Spjd */
2555168404Spjdstatic void
2556209962Smmarc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
2557168404Spjd{
2558275811Sdelphij	arc_buf_hdr_t *hdr, *hdr_prev;
2559219089Spjd	arc_buf_hdr_t marker = { 0 };
2560205231Skmacy	list_t *list, *list_start;
2561205231Skmacy	kmutex_t *hash_lock, *lock;
2562168404Spjd	uint64_t bytes_deleted = 0;
2563168404Spjd	uint64_t bufs_skipped = 0;
2564258632Savg	int count = 0;
2565205231Skmacy	static int evict_offset;
2566205231Skmacy	int list_count, idx = evict_offset;
2567258632Savg	int offset, lists = 0;
2568168404Spjd
2569168404Spjd	ASSERT(GHOST_STATE(state));
2570205231Skmacy
2571205231Skmacy	/*
2572205231Skmacy	 * data lists come after metadata lists
2573205231Skmacy	 */
2574205231Skmacy	list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
2575205231Skmacy	list_count = ARC_BUFC_NUMDATALISTS;
2576205231Skmacy	offset = ARC_BUFC_NUMMETADATALISTS;
2577206796Spjd
2578205231Skmacyevict_start:
2579205231Skmacy	list = &list_start[idx];
2580205231Skmacy	lock = ARCS_LOCK(state, idx + offset);
2581205231Skmacy
2582205231Skmacy	mutex_enter(lock);
2583275811Sdelphij	for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
2584275811Sdelphij		hdr_prev = list_prev(list, hdr);
2585286570Smav		if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES)
2586275811Sdelphij			panic("invalid hdr=%p", (void *)hdr);
2587275811Sdelphij		if (spa && hdr->b_spa != spa)
2588185029Spjd			continue;
2589219089Spjd
2590219089Spjd		/* ignore markers */
2591275811Sdelphij		if (hdr->b_spa == 0)
2592219089Spjd			continue;
2593219089Spjd
2594275811Sdelphij		hash_lock = HDR_LOCK(hdr);
2595219089Spjd		/* caller may be trying to modify this buffer, skip it */
2596219089Spjd		if (MUTEX_HELD(hash_lock))
2597219089Spjd			continue;
2598258632Savg
2599258632Savg		/*
2600258632Savg		 * It may take a long time to evict all the bufs requested.
2601258632Savg		 * To avoid blocking all arc activity, periodically drop
2602258632Savg		 * the arcs_mtx and give other threads a chance to run
2603258632Savg		 * before reacquiring the lock.
2604258632Savg		 */
2605258632Savg		if (count++ > arc_evict_iterations) {
2606275811Sdelphij			list_insert_after(list, hdr, &marker);
2607258632Savg			mutex_exit(lock);
2608258632Savg			kpreempt(KPREEMPT_SYNC);
2609258632Savg			mutex_enter(lock);
2610275811Sdelphij			hdr_prev = list_prev(list, &marker);
2611258632Savg			list_remove(list, &marker);
2612258632Savg			count = 0;
2613258632Savg			continue;
2614258632Savg		}
2615168404Spjd		if (mutex_tryenter(hash_lock)) {
2616275811Sdelphij			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2617286570Smav			ASSERT(!HDR_HAS_L1HDR(hdr) ||
2618286570Smav			    hdr->b_l1hdr.b_buf == NULL);
2619168404Spjd			ARCSTAT_BUMP(arcstat_deleted);
2620275811Sdelphij			bytes_deleted += hdr->b_size;
2621185029Spjd
2622286570Smav			if (HDR_HAS_L2HDR(hdr)) {
2623185029Spjd				/*
2624185029Spjd				 * This buffer is cached on the 2nd Level ARC;
2625185029Spjd				 * don't destroy the header.
2626185029Spjd				 */
2627275811Sdelphij				arc_change_state(arc_l2c_only, hdr, hash_lock);
2628286570Smav				/*
2629286570Smav				 * dropping from L1+L2 cached to L2-only,
2630286570Smav				 * realloc to remove the L1 header.
2631286570Smav				 */
2632286570Smav				hdr = arc_hdr_realloc(hdr, hdr_full_cache,
2633286570Smav				    hdr_l2only_cache);
2634185029Spjd				mutex_exit(hash_lock);
2635185029Spjd			} else {
2636275811Sdelphij				arc_change_state(arc_anon, hdr, hash_lock);
2637185029Spjd				mutex_exit(hash_lock);
2638275811Sdelphij				arc_hdr_destroy(hdr);
2639185029Spjd			}
2640185029Spjd
2641275811Sdelphij			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
2642168404Spjd			if (bytes >= 0 && bytes_deleted >= bytes)
2643168404Spjd				break;
2644219089Spjd		} else if (bytes < 0) {
2645219089Spjd			/*
2646219089Spjd			 * Insert a list marker and then wait for the
2647219089Spjd			 * hash lock to become available. Once it's
2648219089Spjd			 * available, restart from where we left off.
2649219089Spjd			 */
2650275811Sdelphij			list_insert_after(list, hdr, &marker);
2651219089Spjd			mutex_exit(lock);
2652219089Spjd			mutex_enter(hash_lock);
2653219089Spjd			mutex_exit(hash_lock);
2654219089Spjd			mutex_enter(lock);
2655275811Sdelphij			hdr_prev = list_prev(list, &marker);
2656219089Spjd			list_remove(list, &marker);
2657258632Savg		} else {
2658168404Spjd			bufs_skipped += 1;
2659258632Savg		}
2660258632Savg
2661168404Spjd	}
2662205231Skmacy	mutex_exit(lock);
2663206796Spjd	idx  = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
2664258632Savg	lists++;
2665206796Spjd
2666258632Savg	if (lists < list_count)
2667205231Skmacy		goto evict_start;
2668206796Spjd
2669205231Skmacy	evict_offset = idx;
2670205231Skmacy	if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
2671185029Spjd	    (bytes < 0 || bytes_deleted < bytes)) {
2672205231Skmacy		list_start = &state->arcs_lists[0];
2673205231Skmacy		list_count = ARC_BUFC_NUMMETADATALISTS;
2674258632Savg		offset = lists = 0;
2675205231Skmacy		goto evict_start;
2676185029Spjd	}
2677185029Spjd
2678168404Spjd	if (bufs_skipped) {
2679168404Spjd		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2680168404Spjd		ASSERT(bytes >= 0);
2681168404Spjd	}
2682168404Spjd
2683168404Spjd	if (bytes_deleted < bytes)
2684168404Spjd		dprintf("only deleted %lld bytes from %p",
2685168404Spjd		    (longlong_t)bytes_deleted, state);
2686168404Spjd}
2687168404Spjd
2688168404Spjdstatic void
2689168404Spjdarc_adjust(void)
2690168404Spjd{
2691208373Smm	int64_t adjustment, delta;
2692168404Spjd
2693208373Smm	/*
2694208373Smm	 * Adjust MRU size
2695208373Smm	 */
2696168404Spjd
2697209275Smm	adjustment = MIN((int64_t)(arc_size - arc_c),
2698209275Smm	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2699209275Smm	    arc_p));
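	/*
	 * Worked example (illustrative values only): with arc_size = 10GB,
	 * arc_c = 8GB, arc_p = 4GB and anon + MRU + metadata totalling 6GB,
	 * both terms above are 2GB, so up to 2GB is evicted from the MRU
	 * side, data buffers first and then metadata.
	 */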
2700208373Smm
2701208373Smm	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2702208373Smm		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2703209962Smm		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
2704208373Smm		adjustment -= delta;
2705168404Spjd	}
2706168404Spjd
2707208373Smm	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2708208373Smm		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2709209962Smm		(void) arc_evict(arc_mru, 0, delta, FALSE,
2710185029Spjd		    ARC_BUFC_METADATA);
2711185029Spjd	}
2712185029Spjd
2713208373Smm	/*
2714208373Smm	 * Adjust MFU size
2715208373Smm	 */
2716168404Spjd
2717208373Smm	adjustment = arc_size - arc_c;
2718208373Smm
2719208373Smm	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2720208373Smm		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2721209962Smm		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2722208373Smm		adjustment -= delta;
2723168404Spjd	}
2724168404Spjd
2725208373Smm	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2726208373Smm		int64_t delta = MIN(adjustment,
2727208373Smm		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2728209962Smm		(void) arc_evict(arc_mfu, 0, delta, FALSE,
2729208373Smm		    ARC_BUFC_METADATA);
2730208373Smm	}
2731168404Spjd
2732208373Smm	/*
2733208373Smm	 * Adjust ghost lists
2734208373Smm	 */
2735168404Spjd
2736208373Smm	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2737168404Spjd
2738208373Smm	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2739208373Smm		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2740209962Smm		arc_evict_ghost(arc_mru_ghost, 0, delta);
2741208373Smm	}
2742185029Spjd
2743208373Smm	adjustment =
2744208373Smm	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2745208373Smm
2746208373Smm	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2747208373Smm		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2748209962Smm		arc_evict_ghost(arc_mfu_ghost, 0, delta);
2749168404Spjd	}
2750168404Spjd}
2751168404Spjd
2752168404Spjdstatic void
2753168404Spjdarc_do_user_evicts(void)
2754168404Spjd{
2755191903Skmacy	static arc_buf_t *tmp_arc_eviction_list;
2756191903Skmacy
2757191903Skmacy	/*
2758191903Skmacy	 * Move the list aside first to avoid a LOR (lock order reversal)
2759191903Skmacy	 */
2760206796Spjdrestart:
2761168404Spjd	mutex_enter(&arc_eviction_mtx);
2762191903Skmacy	tmp_arc_eviction_list = arc_eviction_list;
2763191903Skmacy	arc_eviction_list = NULL;
2764191903Skmacy	mutex_exit(&arc_eviction_mtx);
2765191903Skmacy
2766191903Skmacy	while (tmp_arc_eviction_list != NULL) {
2767191903Skmacy		arc_buf_t *buf = tmp_arc_eviction_list;
2768191903Skmacy		tmp_arc_eviction_list = buf->b_next;
2769219089Spjd		mutex_enter(&buf->b_evict_lock);
2770168404Spjd		buf->b_hdr = NULL;
2771219089Spjd		mutex_exit(&buf->b_evict_lock);
2772168404Spjd
2773168404Spjd		if (buf->b_efunc != NULL)
2774268858Sdelphij			VERIFY0(buf->b_efunc(buf->b_private));
2775168404Spjd
2776168404Spjd		buf->b_efunc = NULL;
2777168404Spjd		buf->b_private = NULL;
2778168404Spjd		kmem_cache_free(buf_cache, buf);
2779168404Spjd	}
2780191903Skmacy
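	/*
	 * More buffers may have been queued for eviction while we processed
	 * the local list without arc_eviction_mtx held; if so, go around
	 * again.
	 */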
2781191903Skmacy	if (arc_eviction_list != NULL)
2782191903Skmacy		goto restart;
2783168404Spjd}
2784168404Spjd
2785168404Spjd/*
2786185029Spjd * Flush all *evictable* data from the cache for the given spa.
2787168404Spjd * NOTE: this will not touch "active" (i.e. referenced) data.
2788168404Spjd */
2789168404Spjdvoid
2790185029Spjdarc_flush(spa_t *spa)
2791168404Spjd{
2792209962Smm	uint64_t guid = 0;
2793209962Smm
2794286570Smav	if (spa != NULL)
2795228103Smm		guid = spa_load_guid(spa);
2796209962Smm
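	/*
	 * Note on the breaks below: arcs_lsize counts evictable buffers from
	 * every pool, so when flushing a single pool it may never reach zero;
	 * in that case we make only one pass per list instead of looping.
	 */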
2797205231Skmacy	while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
2798209962Smm		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2799286570Smav		if (spa != NULL)
2800185029Spjd			break;
2801185029Spjd	}
2802205231Skmacy	while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
2803209962Smm		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2804286570Smav		if (spa != NULL)
2805185029Spjd			break;
2806185029Spjd	}
2807205231Skmacy	while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
2808209962Smm		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2809286570Smav		if (spa != NULL)
2810185029Spjd			break;
2811185029Spjd	}
2812205231Skmacy	while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
2813209962Smm		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2814286570Smav		if (spa != NULL)
2815185029Spjd			break;
2816185029Spjd	}
2817168404Spjd
2818209962Smm	arc_evict_ghost(arc_mru_ghost, guid, -1);
2819209962Smm	arc_evict_ghost(arc_mfu_ghost, guid, -1);
2820168404Spjd
2821168404Spjd	mutex_enter(&arc_reclaim_thr_lock);
2822168404Spjd	arc_do_user_evicts();
2823168404Spjd	mutex_exit(&arc_reclaim_thr_lock);
2824185029Spjd	ASSERT(spa || arc_eviction_list == NULL);
2825168404Spjd}
2826168404Spjd
2827168404Spjdvoid
2828168404Spjdarc_shrink(void)
2829168404Spjd{
2830270759Ssmh
2831168404Spjd	if (arc_c > arc_c_min) {
2832168404Spjd		uint64_t to_free;
2833168404Spjd
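		/*
		 * Illustrative example: with arc_c = 8GB and arc_shrink_shift
		 * set to 5, to_free is 8GB / 32 = 256MB, i.e. each aggressive
		 * reclaim pass trims the target size by roughly 3%.
		 */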
2834277452Swill		to_free = arc_c >> arc_shrink_shift;
2835272483Ssmh		DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
2836272483Ssmh			arc_c_min, uint64_t, arc_p, uint64_t, to_free);
2837168404Spjd		if (arc_c > arc_c_min + to_free)
2838168404Spjd			atomic_add_64(&arc_c, -to_free);
2839168404Spjd		else
2840168404Spjd			arc_c = arc_c_min;
2841168404Spjd
2842168404Spjd		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2843168404Spjd		if (arc_c > arc_size)
2844168404Spjd			arc_c = MAX(arc_size, arc_c_min);
2845168404Spjd		if (arc_p > arc_c)
2846168404Spjd			arc_p = (arc_c >> 1);
2847272483Ssmh
2848272483Ssmh		DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
2849272483Ssmh			arc_p);
2850272483Ssmh
2851168404Spjd		ASSERT(arc_c >= arc_c_min);
2852168404Spjd		ASSERT((int64_t)arc_p >= 0);
2853168404Spjd	}
2854168404Spjd
2855270759Ssmh	if (arc_size > arc_c) {
2856270759Ssmh		DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
2857270759Ssmh			uint64_t, arc_c);
2858168404Spjd		arc_adjust();
2859270759Ssmh	}
2860168404Spjd}
2861168404Spjd
2862185029Spjdstatic int needfree = 0;
2863168404Spjd
2864168404Spjdstatic int
2865168404Spjdarc_reclaim_needed(void)
2866168404Spjd{
2867168404Spjd
2868168404Spjd#ifdef _KERNEL
2869219089Spjd
2870270759Ssmh	if (needfree) {
2871270759Ssmh		DTRACE_PROBE(arc__reclaim_needfree);
2872197816Skmacy		return (1);
2873270759Ssmh	}
2874168404Spjd
2875191902Skmacy	/*
2876212780Savg	 * Cooperate with pagedaemon when it's time for it to scan
2877212780Savg	 * and reclaim some pages.
2878191902Skmacy	 */
2879272483Ssmh	if (freemem < zfs_arc_free_target) {
2880272483Ssmh		DTRACE_PROBE2(arc__reclaim_freemem, uint64_t,
2881272483Ssmh		    freemem, uint64_t, zfs_arc_free_target);
2882191902Skmacy		return (1);
2883270759Ssmh	}
2884191902Skmacy
2885277300Ssmh#ifdef illumos
2886168404Spjd	/*
2887185029Spjd	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
2888185029Spjd	 */
2889185029Spjd	extra = desfree;
2890185029Spjd
2891185029Spjd	/*
2892185029Spjd	 * check that we're out of range of the pageout scanner.  It starts to
2893185029Spjd	 * schedule paging if freemem is less than lotsfree and needfree.
2894185029Spjd	 * lotsfree is the high-water mark for pageout, and needfree is the
2895185029Spjd	 * number of needed free pages.  We add extra pages here to make sure
2896185029Spjd	 * the scanner doesn't start up while we're freeing memory.
2897185029Spjd	 */
2898185029Spjd	if (freemem < lotsfree + needfree + extra)
2899185029Spjd		return (1);
2900185029Spjd
2901185029Spjd	/*
2902168404Spjd	 * check to make sure that swapfs has enough space so that anon
2903185029Spjd	 * reservations can still succeed. anon_resvmem() checks that the
2904168404Spjd	 * availrmem is greater than swapfs_minfree, and the number of reserved
2905168404Spjd	 * swap pages.  We also add a bit of extra here just to prevent
2906168404Spjd	 * circumstances from getting really dire.
2907168404Spjd	 */
2908168404Spjd	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
2909168404Spjd		return (1);
2910168404Spjd
2911168404Spjd	/*
2912272483Ssmh	 * Check that we have enough availrmem that memory locking (e.g., via
2913272483Ssmh	 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
2914272483Ssmh	 * stores the number of pages that cannot be locked; when availrmem
2915272483Ssmh	 * drops below pages_pp_maximum, page locking mechanisms such as
2916272483Ssmh	 * page_pp_lock() will fail.)
2917272483Ssmh	 */
2918272483Ssmh	if (availrmem <= pages_pp_maximum)
2919272483Ssmh		return (1);
2920272483Ssmh
2921277300Ssmh#endif	/* illumos */
2922272483Ssmh#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
2923272483Ssmh	/*
2924168404Spjd	 * If we're on an i386 platform, it's possible that we'll exhaust the
2925168404Spjd	 * kernel heap space before we ever run out of available physical
2926168404Spjd	 * memory.  Most checks of the size of the heap_area compare against
2927168404Spjd	 * tune.t_minarmem, which is the minimum available real memory that we
2928168404Spjd	 * can have in the system.  However, this is generally fixed at 25 pages
2929168404Spjd	 * which is so low that it's useless.  In this comparison, we seek to
2930168404Spjd	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
2931185029Spjd	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
2932168404Spjd	 * free)
2933168404Spjd	 */
2934272483Ssmh	if (vmem_size(heap_arena, VMEM_FREE) <
2935272483Ssmh	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) {
2936270861Ssmh		DTRACE_PROBE2(arc__reclaim_used, uint64_t,
2937272483Ssmh		    vmem_size(heap_arena, VMEM_FREE), uint64_t,
2938272483Ssmh		    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2);
2939270861Ssmh		return (1);
2940270861Ssmh	}
2941281026Smav#define	zio_arena	NULL
2942281026Smav#else
2943281026Smav#define	zio_arena	heap_arena
2944270861Ssmh#endif
2945281026Smav
2946272483Ssmh	/*
2947272483Ssmh	 * If zio data pages are being allocated out of a separate heap segment,
2948272483Ssmh	 * then enforce that the size of available vmem for this arena remains
2949272483Ssmh	 * above about 1/16th free.
2950272483Ssmh	 *
2951272483Ssmh	 * Note: The 1/16th arena free requirement was put in place
2952272483Ssmh	 * to aggressively evict memory from the arc in order to avoid
2953272483Ssmh	 * memory fragmentation issues.
2954272483Ssmh	 */
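	/*
	 * For example, with 16GB currently allocated from the zio arena the
	 * check below triggers once less than 1GB of arena space is free.
	 */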
2955272483Ssmh	if (zio_arena != NULL &&
2956272483Ssmh	    vmem_size(zio_arena, VMEM_FREE) <
2957272483Ssmh	    (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
2958272483Ssmh		return (1);
2959281026Smav
2960281026Smav	/*
2961281026Smav	 * The above limits know nothing about real KVA fragmentation.
2962281026Smav	 * Start aggressive reclamation if too little sequential KVA is left.
2963281026Smav	 */
2964281109Smav	if (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) {
2965281109Smav		DTRACE_PROBE2(arc__reclaim_maxfree, uint64_t,
2966281109Smav		    vmem_size(heap_arena, VMEM_MAXFREE),
2967281109Smav		    uint64_t, zfs_max_recordsize);
2968281026Smav		return (1);
2969281109Smav	}
2970281026Smav
2971272483Ssmh#else	/* _KERNEL */
2972168404Spjd	if (spa_get_random(100) == 0)
2973168404Spjd		return (1);
2974272483Ssmh#endif	/* _KERNEL */
2975270759Ssmh	DTRACE_PROBE(arc__reclaim_no);
2976270759Ssmh
2977168404Spjd	return (0);
2978168404Spjd}
2979168404Spjd
2980208454Spjdextern kmem_cache_t	*zio_buf_cache[];
2981208454Spjdextern kmem_cache_t	*zio_data_buf_cache[];
2982272527Sdelphijextern kmem_cache_t	*range_seg_cache;
2983208454Spjd
2984278040Ssmhstatic __noinline void
2985168404Spjdarc_kmem_reap_now(arc_reclaim_strategy_t strat)
2986168404Spjd{
2987168404Spjd	size_t			i;
2988168404Spjd	kmem_cache_t		*prev_cache = NULL;
2989168404Spjd	kmem_cache_t		*prev_data_cache = NULL;
2990168404Spjd
2991272483Ssmh	DTRACE_PROBE(arc__kmem_reap_start);
2992168404Spjd#ifdef _KERNEL
2993185029Spjd	if (arc_meta_used >= arc_meta_limit) {
2994185029Spjd		/*
2995185029Spjd		 * We are exceeding our meta-data cache limit.
2996185029Spjd		 * Purge some DNLC entries to release holds on meta-data.
2997185029Spjd		 */
2998185029Spjd		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
2999185029Spjd	}
3000168404Spjd#if defined(__i386)
3001168404Spjd	/*
3002168404Spjd	 * Reclaim unused memory from all kmem caches.
3003168404Spjd	 */
3004168404Spjd	kmem_reap();
3005168404Spjd#endif
3006168404Spjd#endif
3007168404Spjd
3008168404Spjd	/*
3009185029Spjd	 * An aggressive reclamation will shrink the cache size as well as
3010168404Spjd	 * reap free buffers from the arc kmem caches.
3011168404Spjd	 */
3012168404Spjd	if (strat == ARC_RECLAIM_AGGR)
3013168404Spjd		arc_shrink();
3014168404Spjd
3015168404Spjd	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
3016168404Spjd		if (zio_buf_cache[i] != prev_cache) {
3017168404Spjd			prev_cache = zio_buf_cache[i];
3018168404Spjd			kmem_cache_reap_now(zio_buf_cache[i]);
3019168404Spjd		}
3020168404Spjd		if (zio_data_buf_cache[i] != prev_data_cache) {
3021168404Spjd			prev_data_cache = zio_data_buf_cache[i];
3022168404Spjd			kmem_cache_reap_now(zio_data_buf_cache[i]);
3023168404Spjd		}
3024168404Spjd	}
3025168404Spjd	kmem_cache_reap_now(buf_cache);
3026286570Smav	kmem_cache_reap_now(hdr_full_cache);
3027286570Smav	kmem_cache_reap_now(hdr_l2only_cache);
3028272506Sdelphij	kmem_cache_reap_now(range_seg_cache);
3029272483Ssmh
3030277300Ssmh#ifdef illumos
3031272483Ssmh	/*
3032272483Ssmh	 * Ask the vmem arena to reclaim unused memory from its
3033272483Ssmh	 * quantum caches.
3034272483Ssmh	 */
3035272483Ssmh	if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
3036272483Ssmh		vmem_qcache_reap(zio_arena);
3037272483Ssmh#endif
3038272483Ssmh	DTRACE_PROBE(arc__kmem_reap_end);
3039168404Spjd}
3040168404Spjd
3041168404Spjdstatic void
3042168404Spjdarc_reclaim_thread(void *dummy __unused)
3043168404Spjd{
3044168404Spjd	clock_t			growtime = 0;
3045168404Spjd	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
3046168404Spjd	callb_cpr_t		cpr;
3047168404Spjd
3048168404Spjd	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
3049168404Spjd
3050168404Spjd	mutex_enter(&arc_reclaim_thr_lock);
3051168404Spjd	while (arc_thread_exit == 0) {
3052168404Spjd		if (arc_reclaim_needed()) {
3053168404Spjd
3054168404Spjd			if (arc_no_grow) {
3055168404Spjd				if (last_reclaim == ARC_RECLAIM_CONS) {
3056272483Ssmh					DTRACE_PROBE(arc__reclaim_aggr_no_grow);
3057168404Spjd					last_reclaim = ARC_RECLAIM_AGGR;
3058168404Spjd				} else {
3059168404Spjd					last_reclaim = ARC_RECLAIM_CONS;
3060168404Spjd				}
3061168404Spjd			} else {
3062168404Spjd				arc_no_grow = TRUE;
3063168404Spjd				last_reclaim = ARC_RECLAIM_AGGR;
3064272483Ssmh				DTRACE_PROBE(arc__reclaim_aggr);
3065168404Spjd				membar_producer();
3066168404Spjd			}
3067168404Spjd
3068168404Spjd			/* reset the growth delay for every reclaim */
3069219089Spjd			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
3070168404Spjd
3071185029Spjd			if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
3072168404Spjd				/*
3073185029Spjd				 * If needfree is TRUE, our vm_lowmem hook
3074168404Spjd				 * was called and in that case we must free some
3075168404Spjd				 * memory, so switch to aggressive mode.
3076168404Spjd				 */
3077168404Spjd				arc_no_grow = TRUE;
3078168404Spjd				last_reclaim = ARC_RECLAIM_AGGR;
3079168404Spjd			}
3080168404Spjd			arc_kmem_reap_now(last_reclaim);
3081185029Spjd			arc_warm = B_TRUE;
3082185029Spjd
3083219089Spjd		} else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
3084168404Spjd			arc_no_grow = FALSE;
3085168404Spjd		}
3086168404Spjd
3087209275Smm		arc_adjust();
3088168404Spjd
3089168404Spjd		if (arc_eviction_list != NULL)
3090168404Spjd			arc_do_user_evicts();
3091168404Spjd
3092211762Savg#ifdef _KERNEL
3093211762Savg		if (needfree) {
3094185029Spjd			needfree = 0;
3095185029Spjd			wakeup(&needfree);
3096211762Savg		}
3097168404Spjd#endif
3098168404Spjd
3099168404Spjd		/* block until needed, or one second, whichever is shorter */
3100168404Spjd		CALLB_CPR_SAFE_BEGIN(&cpr);
3101168404Spjd		(void) cv_timedwait(&arc_reclaim_thr_cv,
3102168404Spjd		    &arc_reclaim_thr_lock, hz);
3103168404Spjd		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
3104168404Spjd	}
3105168404Spjd
3106168404Spjd	arc_thread_exit = 0;
3107168404Spjd	cv_broadcast(&arc_reclaim_thr_cv);
3108168404Spjd	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
3109168404Spjd	thread_exit();
3110168404Spjd}
3111168404Spjd
3112168404Spjd/*
3113168404Spjd * Adapt arc info given the number of bytes we are trying to add and
3114168404Spjd * the state that we are coming from.  This function is only called
3115168404Spjd * when we are adding new content to the cache.
3116168404Spjd */
3117168404Spjdstatic void
3118168404Spjdarc_adapt(int bytes, arc_state_t *state)
3119168404Spjd{
3120168404Spjd	int mult;
3121208373Smm	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
3122168404Spjd
3123185029Spjd	if (state == arc_l2c_only)
3124185029Spjd		return;
3125185029Spjd
3126168404Spjd	ASSERT(bytes > 0);
3127168404Spjd	/*
3128168404Spjd	 * Adapt the target size of the MRU list:
3129168404Spjd	 *	- if we just hit in the MRU ghost list, then increase
3130168404Spjd	 *	  the target size of the MRU list.
3131168404Spjd	 *	- if we just hit in the MFU ghost list, then increase
3132168404Spjd	 *	  the target size of the MFU list by decreasing the
3133168404Spjd	 *	  target size of the MRU list.
3134168404Spjd	 */
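	/*
	 * Illustrative example: if the MFU ghost list is three times the
	 * size of the MRU ghost list, a hit in the MRU ghost list bumps
	 * arc_p by 3 * bytes (the multiplier is clamped to 10 below),
	 * shifting the target split toward recently used data.
	 */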
3135168404Spjd	if (state == arc_mru_ghost) {
3136168404Spjd		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
3137168404Spjd		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
3138209275Smm		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
3139168404Spjd
3140208373Smm		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
3141168404Spjd	} else if (state == arc_mfu_ghost) {
3142208373Smm		uint64_t delta;
3143208373Smm
3144168404Spjd		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
3145168404Spjd		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
3146209275Smm		mult = MIN(mult, 10);
3147168404Spjd
3148208373Smm		delta = MIN(bytes * mult, arc_p);
3149208373Smm		arc_p = MAX(arc_p_min, arc_p - delta);
3150168404Spjd	}
3151168404Spjd	ASSERT((int64_t)arc_p >= 0);
3152168404Spjd
3153168404Spjd	if (arc_reclaim_needed()) {
3154168404Spjd		cv_signal(&arc_reclaim_thr_cv);
3155168404Spjd		return;
3156168404Spjd	}
3157168404Spjd
3158168404Spjd	if (arc_no_grow)
3159168404Spjd		return;
3160168404Spjd
3161168404Spjd	if (arc_c >= arc_c_max)
3162168404Spjd		return;
3163168404Spjd
3164168404Spjd	/*
3165168404Spjd	 * If we're within (2 * maxblocksize) bytes of the target
3166168404Spjd	 * cache size, increment the target cache size
3167168404Spjd	 */
3168168404Spjd	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
3169272483Ssmh		DTRACE_PROBE1(arc__inc_adapt, int, bytes);
3170168404Spjd		atomic_add_64(&arc_c, (int64_t)bytes);
3171168404Spjd		if (arc_c > arc_c_max)
3172168404Spjd			arc_c = arc_c_max;
3173168404Spjd		else if (state == arc_anon)
3174168404Spjd			atomic_add_64(&arc_p, (int64_t)bytes);
3175168404Spjd		if (arc_p > arc_c)
3176168404Spjd			arc_p = arc_c;
3177168404Spjd	}
3178168404Spjd	ASSERT((int64_t)arc_p >= 0);
3179168404Spjd}
3180168404Spjd
3181168404Spjd/*
3182168404Spjd * Check if the cache has reached its limits and eviction is required
3183168404Spjd * prior to insert.
3184168404Spjd */
3185168404Spjdstatic int
3186185029Spjdarc_evict_needed(arc_buf_contents_t type)
3187168404Spjd{
3188185029Spjd	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
3189185029Spjd		return (1);
3190185029Spjd
3191168404Spjd	if (arc_reclaim_needed())
3192168404Spjd		return (1);
3193168404Spjd
3194168404Spjd	return (arc_size > arc_c);
3195168404Spjd}
3196168404Spjd
3197168404Spjd/*
3198168404Spjd * The buffer, supplied as the first argument, needs a data block.
3199168404Spjd * So, if we are at cache max, determine which cache should be victimized.
3200168404Spjd * We have the following cases:
3201168404Spjd *
3202168404Spjd * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
3203168404Spjd * In this situation if we're out of space, but the resident size of the MFU is
3204168404Spjd * under the limit, victimize the MFU cache to satisfy this insertion request.
3205168404Spjd *
3206168404Spjd * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
3207168404Spjd * Here, we've used up all of the available space for the MRU, so we need to
3208168404Spjd * evict from our own cache instead.  Evict from the set of resident MRU
3209168404Spjd * entries.
3210168404Spjd *
3211168404Spjd * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
3212168404Spjd * c minus p represents the MFU space in the cache, since p is the size of the
3213168404Spjd * cache that is dedicated to the MRU.  In this situation there's still space on
3214168404Spjd * the MFU side, so the MRU side needs to be victimized.
3215168404Spjd *
3216168404Spjd * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
3217168404Spjd * MFU's resident set is consuming more space than it has been allotted.  In
3218168404Spjd * this situation, we must victimize our own cache, the MFU, for this insertion.
3219168404Spjd */
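/*
 * Informally, the four cases above reduce to: on an MRU insert, steal from
 * the MFU side while anon + MRU is still under arc_p, otherwise evict our own
 * MRU buffers; on an MFU insert, steal from the MRU side while the MFU is
 * still under (c - p), otherwise evict our own MFU buffers.  This is only a
 * summary; the authoritative logic is the state selection in the function
 * body below.
 */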
3220168404Spjdstatic void
3221168404Spjdarc_get_data_buf(arc_buf_t *buf)
3222168404Spjd{
3223286570Smav	arc_state_t		*state = buf->b_hdr->b_l1hdr.b_state;
3224168404Spjd	uint64_t		size = buf->b_hdr->b_size;
3225286570Smav	arc_buf_contents_t	type = arc_buf_type(buf->b_hdr);
3226168404Spjd
3227168404Spjd	arc_adapt(size, state);
3228168404Spjd
3229168404Spjd	/*
3230168404Spjd	 * We have not yet reached cache maximum size,
3231168404Spjd	 * just allocate a new buffer.
3232168404Spjd	 */
3233185029Spjd	if (!arc_evict_needed(type)) {
3234168404Spjd		if (type == ARC_BUFC_METADATA) {
3235168404Spjd			buf->b_data = zio_buf_alloc(size);
3236208373Smm			arc_space_consume(size, ARC_SPACE_DATA);
3237168404Spjd		} else {
3238168404Spjd			ASSERT(type == ARC_BUFC_DATA);
3239168404Spjd			buf->b_data = zio_data_buf_alloc(size);
3240208373Smm			ARCSTAT_INCR(arcstat_data_size, size);
3241185029Spjd			atomic_add_64(&arc_size, size);
3242168404Spjd		}
3243168404Spjd		goto out;
3244168404Spjd	}
3245168404Spjd
3246168404Spjd	/*
3247168404Spjd	 * If we are prefetching from the mfu ghost list, this buffer
3248168404Spjd	 * will end up on the mru list; so steal space from there.
3249168404Spjd	 */
3250168404Spjd	if (state == arc_mfu_ghost)
3251286570Smav		state = HDR_PREFETCH(buf->b_hdr) ? arc_mru : arc_mfu;
3252168404Spjd	else if (state == arc_mru_ghost)
3253168404Spjd		state = arc_mru;
3254168404Spjd
3255168404Spjd	if (state == arc_mru || state == arc_anon) {
3256168404Spjd		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
3257208373Smm		state = (arc_mfu->arcs_lsize[type] >= size &&
3258185029Spjd		    arc_p > mru_used) ? arc_mfu : arc_mru;
3259168404Spjd	} else {
3260168404Spjd		/* MFU cases */
3261168404Spjd		uint64_t mfu_space = arc_c - arc_p;
3262208373Smm		state =  (arc_mru->arcs_lsize[type] >= size &&
3263185029Spjd		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
3264168404Spjd	}
3265209962Smm	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
3266168404Spjd		if (type == ARC_BUFC_METADATA) {
3267168404Spjd			buf->b_data = zio_buf_alloc(size);
3268208373Smm			arc_space_consume(size, ARC_SPACE_DATA);
3269168404Spjd		} else {
3270168404Spjd			ASSERT(type == ARC_BUFC_DATA);
3271168404Spjd			buf->b_data = zio_data_buf_alloc(size);
3272208373Smm			ARCSTAT_INCR(arcstat_data_size, size);
3273185029Spjd			atomic_add_64(&arc_size, size);
3274168404Spjd		}
3275168404Spjd		ARCSTAT_BUMP(arcstat_recycle_miss);
3276168404Spjd	}
3277168404Spjd	ASSERT(buf->b_data != NULL);
3278168404Spjdout:
3279168404Spjd	/*
3280168404Spjd	 * Update the state size.  Note that ghost states have a
3281168404Spjd	 * "ghost size" and so don't need to be updated.
3282168404Spjd	 */
3283286570Smav	if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
3284168404Spjd		arc_buf_hdr_t *hdr = buf->b_hdr;
3285168404Spjd
3286286570Smav		atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size);
3287286570Smav		if (list_link_active(&hdr->b_l1hdr.b_arc_node)) {
3288286570Smav			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3289286570Smav			atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type],
3290286570Smav			    size);
3291168404Spjd		}
3292168404Spjd		/*
3293168404Spjd		 * If we are growing the cache, and we are adding anonymous
3294168404Spjd		 * data, and we have outgrown arc_p, update arc_p
3295168404Spjd		 */
3296286570Smav		if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
3297168404Spjd		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
3298168404Spjd			arc_p = MIN(arc_c, arc_p + size);
3299168404Spjd	}
3300205231Skmacy	ARCSTAT_BUMP(arcstat_allocated);
3301168404Spjd}
3302168404Spjd
3303168404Spjd/*
3304168404Spjd * This routine is called whenever a buffer is accessed.
3305168404Spjd * NOTE: the hash lock is dropped in this function.
3306168404Spjd */
3307168404Spjdstatic void
3308275811Sdelphijarc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
3309168404Spjd{
3310219089Spjd	clock_t now;
3311219089Spjd
3312168404Spjd	ASSERT(MUTEX_HELD(hash_lock));
3313286570Smav	ASSERT(HDR_HAS_L1HDR(hdr));
3314168404Spjd
3315286570Smav	if (hdr->b_l1hdr.b_state == arc_anon) {
3316168404Spjd		/*
3317168404Spjd		 * This buffer is not in the cache, and does not
3318168404Spjd		 * appear in our "ghost" list.  Add the new buffer
3319168404Spjd		 * to the MRU state.
3320168404Spjd		 */
3321168404Spjd
3322286570Smav		ASSERT0(hdr->b_l1hdr.b_arc_access);
3323286570Smav		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3324275811Sdelphij		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
3325275811Sdelphij		arc_change_state(arc_mru, hdr, hash_lock);
3326168404Spjd
3327286570Smav	} else if (hdr->b_l1hdr.b_state == arc_mru) {
3328219089Spjd		now = ddi_get_lbolt();
3329219089Spjd
3330168404Spjd		/*
3331168404Spjd		 * If this buffer is here because of a prefetch, then either:
3332168404Spjd		 * - clear the flag if this is a "referencing" read
3333168404Spjd		 *   (any subsequent access will bump this into the MFU state).
3334168404Spjd		 * or
3335168404Spjd		 * - move the buffer to the head of the list if this is
3336168404Spjd		 *   another prefetch (to make it less likely to be evicted).
3337168404Spjd		 */
3338286570Smav		if (HDR_PREFETCH(hdr)) {
3339286570Smav			if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
3340286570Smav				ASSERT(list_link_active(
3341286570Smav				    &hdr->b_l1hdr.b_arc_node));
3342168404Spjd			} else {
3343275811Sdelphij				hdr->b_flags &= ~ARC_FLAG_PREFETCH;
3344168404Spjd				ARCSTAT_BUMP(arcstat_mru_hits);
3345168404Spjd			}
3346286570Smav			hdr->b_l1hdr.b_arc_access = now;
3347168404Spjd			return;
3348168404Spjd		}
3349168404Spjd
3350168404Spjd		/*
3351168404Spjd		 * This buffer has been "accessed" only once so far,
3352168404Spjd		 * but it is still in the cache. Move it to the MFU
3353168404Spjd		 * state.
3354168404Spjd		 */
3355286570Smav		if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
3356168404Spjd			/*
3357168404Spjd			 * More than 125ms have passed since we
3358168404Spjd			 * instantiated this buffer.  Move it to the
3359168404Spjd			 * most frequently used state.
3360168404Spjd			 */
3361286570Smav			hdr->b_l1hdr.b_arc_access = now;
3362275811Sdelphij			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3363275811Sdelphij			arc_change_state(arc_mfu, hdr, hash_lock);
3364168404Spjd		}
3365168404Spjd		ARCSTAT_BUMP(arcstat_mru_hits);
3366286570Smav	} else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
3367168404Spjd		arc_state_t	*new_state;
3368168404Spjd		/*
3369168404Spjd		 * This buffer has been "accessed" recently, but
3370168404Spjd		 * was evicted from the cache.  Move it to the
3371168404Spjd		 * MFU state.
3372168404Spjd		 */
3373168404Spjd
3374286570Smav		if (HDR_PREFETCH(hdr)) {
3375168404Spjd			new_state = arc_mru;
3376286570Smav			if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
3377275811Sdelphij				hdr->b_flags &= ~ARC_FLAG_PREFETCH;
3378275811Sdelphij			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
3379168404Spjd		} else {
3380168404Spjd			new_state = arc_mfu;
3381275811Sdelphij			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3382168404Spjd		}
3383168404Spjd
3384286570Smav		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3385275811Sdelphij		arc_change_state(new_state, hdr, hash_lock);
3386168404Spjd
3387168404Spjd		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
3388286570Smav	} else if (hdr->b_l1hdr.b_state == arc_mfu) {
3389168404Spjd		/*
3390168404Spjd		 * This buffer has been accessed more than once and is
3391168404Spjd		 * still in the cache.  Keep it in the MFU state.
3392168404Spjd		 *
3393168404Spjd		 * NOTE: an add_reference() that occurred when we did
3394168404Spjd		 * the arc_read() will have kicked this off the list.
3395168404Spjd		 * If it was a prefetch, we will explicitly move it to
3396168404Spjd		 * the head of the list now.
3397168404Spjd		 */
3398286570Smav		if ((HDR_PREFETCH(hdr)) != 0) {
3399286570Smav			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3400286570Smav			ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
3401168404Spjd		}
3402168404Spjd		ARCSTAT_BUMP(arcstat_mfu_hits);
3403286570Smav		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3404286570Smav	} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
3405168404Spjd		arc_state_t	*new_state = arc_mfu;
3406168404Spjd		/*
3407168404Spjd		 * This buffer has been accessed more than once but has
3408168404Spjd		 * been evicted from the cache.  Move it back to the
3409168404Spjd		 * MFU state.
3410168404Spjd		 */
3411168404Spjd
3412286570Smav		if (HDR_PREFETCH(hdr)) {
3413168404Spjd			/*
3414168404Spjd			 * This is a prefetch access...
3415168404Spjd			 * move this block back to the MRU state.
3416168404Spjd			 */
3417286570Smav			ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
3418168404Spjd			new_state = arc_mru;
3419168404Spjd		}
3420168404Spjd
3421286570Smav		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3422275811Sdelphij		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3423275811Sdelphij		arc_change_state(new_state, hdr, hash_lock);
3424168404Spjd
3425168404Spjd		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
3426286570Smav	} else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
3427185029Spjd		/*
3428185029Spjd		 * This buffer is on the 2nd Level ARC.
3429185029Spjd		 */
3430185029Spjd
3431286570Smav		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3432275811Sdelphij		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3433275811Sdelphij		arc_change_state(arc_mfu, hdr, hash_lock);
3434168404Spjd	} else {
3435168404Spjd		ASSERT(!"invalid arc state");
3436168404Spjd	}
3437168404Spjd}
3438168404Spjd
3439168404Spjd/* a generic arc_done_func_t which you can use */
3440168404Spjd/* ARGSUSED */
3441168404Spjdvoid
3442168404Spjdarc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
3443168404Spjd{
3444219089Spjd	if (zio == NULL || zio->io_error == 0)
3445219089Spjd		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
3446248571Smm	VERIFY(arc_buf_remove_ref(buf, arg));
3447168404Spjd}
3448168404Spjd
3449185029Spjd/* a generic arc_done_func_t */
3450168404Spjdvoid
3451168404Spjdarc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
3452168404Spjd{
3453168404Spjd	arc_buf_t **bufp = arg;
3454168404Spjd	if (zio && zio->io_error) {
3455248571Smm		VERIFY(arc_buf_remove_ref(buf, arg));
3456168404Spjd		*bufp = NULL;
3457168404Spjd	} else {
3458168404Spjd		*bufp = buf;
3459219089Spjd		ASSERT(buf->b_data);
3460168404Spjd	}
3461168404Spjd}
3462168404Spjd
3463168404Spjdstatic void
3464168404Spjdarc_read_done(zio_t *zio)
3465168404Spjd{
3466268075Sdelphij	arc_buf_hdr_t	*hdr;
3467168404Spjd	arc_buf_t	*buf;
3468168404Spjd	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
3469268075Sdelphij	kmutex_t	*hash_lock = NULL;
3470168404Spjd	arc_callback_t	*callback_list, *acb;
3471168404Spjd	int		freeable = FALSE;
3472168404Spjd
3473168404Spjd	buf = zio->io_private;
3474168404Spjd	hdr = buf->b_hdr;
3475168404Spjd
3476168404Spjd	/*
3477168404Spjd	 * The hdr was inserted into the hash table and removed from lists
3478168404Spjd	 * prior to starting I/O.  We should find this header, since
3479168404Spjd	 * it's in the hash table, and it should be legit since it's
3480168404Spjd	 * not possible to evict it during the I/O.  The only possible
3481168404Spjd	 * reason for it not to be found is if we were freed during the
3482168404Spjd	 * read.
3483168404Spjd	 */
3484268075Sdelphij	if (HDR_IN_HASH_TABLE(hdr)) {
3485268075Sdelphij		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
3486268075Sdelphij		ASSERT3U(hdr->b_dva.dva_word[0], ==,
3487268075Sdelphij		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
3488268075Sdelphij		ASSERT3U(hdr->b_dva.dva_word[1], ==,
3489268075Sdelphij		    BP_IDENTITY(zio->io_bp)->dva_word[1]);
3490168404Spjd
3491268075Sdelphij		arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
3492268075Sdelphij		    &hash_lock);
3493168404Spjd
3494268075Sdelphij		ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
3495268075Sdelphij		    hash_lock == NULL) ||
3496268075Sdelphij		    (found == hdr &&
3497268075Sdelphij		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
3498268075Sdelphij		    (found == hdr && HDR_L2_READING(hdr)));
3499268075Sdelphij	}
3500268075Sdelphij
3501275811Sdelphij	hdr->b_flags &= ~ARC_FLAG_L2_EVICTED;
3502286570Smav	if (l2arc_noprefetch && HDR_PREFETCH(hdr))
3503275811Sdelphij		hdr->b_flags &= ~ARC_FLAG_L2CACHE;
3504206796Spjd
3505168404Spjd	/* byteswap if necessary */
3506286570Smav	callback_list = hdr->b_l1hdr.b_acb;
3507168404Spjd	ASSERT(callback_list != NULL);
3508209101Smm	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
3509236884Smm		dmu_object_byteswap_t bswap =
3510236884Smm		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
3511185029Spjd		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
3512185029Spjd		    byteswap_uint64_array :
3513236884Smm		    dmu_ot_byteswap[bswap].ob_func;
3514185029Spjd		func(buf->b_data, hdr->b_size);
3515185029Spjd	}
3516168404Spjd
3517185029Spjd	arc_cksum_compute(buf, B_FALSE);
3518240133Smm#ifdef illumos
3519240133Smm	arc_buf_watch(buf);
3520277300Ssmh#endif
3521168404Spjd
3522286570Smav	if (hash_lock && zio->io_error == 0 &&
3523286570Smav	    hdr->b_l1hdr.b_state == arc_anon) {
3524219089Spjd		/*
3525219089Spjd		 * Only call arc_access on anonymous buffers.  This is because
3526219089Spjd		 * if we've issued an I/O for an evicted buffer, we've already
3527219089Spjd		 * called arc_access (to prevent any simultaneous readers from
3528219089Spjd		 * getting confused).
3529219089Spjd		 */
3530219089Spjd		arc_access(hdr, hash_lock);
3531219089Spjd	}
3532219089Spjd
3533168404Spjd	/* create copies of the data buffer for the callers */
3534168404Spjd	abuf = buf;
3535168404Spjd	for (acb = callback_list; acb; acb = acb->acb_next) {
3536168404Spjd		if (acb->acb_done) {
3537242845Sdelphij			if (abuf == NULL) {
3538242845Sdelphij				ARCSTAT_BUMP(arcstat_duplicate_reads);
3539168404Spjd				abuf = arc_buf_clone(buf);
3540242845Sdelphij			}
3541168404Spjd			acb->acb_buf = abuf;
3542168404Spjd			abuf = NULL;
3543168404Spjd		}
3544168404Spjd	}
3545286570Smav	hdr->b_l1hdr.b_acb = NULL;
3546275811Sdelphij	hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
3547168404Spjd	ASSERT(!HDR_BUF_AVAILABLE(hdr));
3548219089Spjd	if (abuf == buf) {
3549219089Spjd		ASSERT(buf->b_efunc == NULL);
3550286570Smav		ASSERT(hdr->b_l1hdr.b_datacnt == 1);
3551275811Sdelphij		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
3552219089Spjd	}
3553168404Spjd
3554286570Smav	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
3555286570Smav	    callback_list != NULL);
3556168404Spjd
3557168404Spjd	if (zio->io_error != 0) {
3558275811Sdelphij		hdr->b_flags |= ARC_FLAG_IO_ERROR;
3559286570Smav		if (hdr->b_l1hdr.b_state != arc_anon)
3560168404Spjd			arc_change_state(arc_anon, hdr, hash_lock);
3561168404Spjd		if (HDR_IN_HASH_TABLE(hdr))
3562168404Spjd			buf_hash_remove(hdr);
3563286570Smav		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
3564168404Spjd	}
3565168404Spjd
3566168404Spjd	/*
3567168404Spjd	 * Broadcast before we drop the hash_lock to avoid the possibility
3568168404Spjd	 * that the hdr (and hence the cv) might be freed before we get to
3569168404Spjd	 * the cv_broadcast().
3570168404Spjd	 */
3571286570Smav	cv_broadcast(&hdr->b_l1hdr.b_cv);
3572168404Spjd
3573286570Smav	if (hash_lock != NULL) {
3574168404Spjd		mutex_exit(hash_lock);
3575168404Spjd	} else {
3576168404Spjd		/*
3577168404Spjd		 * This block was freed while we waited for the read to
3578168404Spjd		 * complete.  It has been removed from the hash table and
3579168404Spjd		 * moved to the anonymous state (so that it won't show up
3580168404Spjd		 * in the cache).
3581168404Spjd		 */
3582286570Smav		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3583286570Smav		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
3584168404Spjd	}
3585168404Spjd
3586168404Spjd	/* execute each callback and free its structure */
3587168404Spjd	while ((acb = callback_list) != NULL) {
3588168404Spjd		if (acb->acb_done)
3589168404Spjd			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3590168404Spjd
3591168404Spjd		if (acb->acb_zio_dummy != NULL) {
3592168404Spjd			acb->acb_zio_dummy->io_error = zio->io_error;
3593168404Spjd			zio_nowait(acb->acb_zio_dummy);
3594168404Spjd		}
3595168404Spjd
3596168404Spjd		callback_list = acb->acb_next;
3597168404Spjd		kmem_free(acb, sizeof (arc_callback_t));
3598168404Spjd	}
3599168404Spjd
3600168404Spjd	if (freeable)
3601168404Spjd		arc_hdr_destroy(hdr);
3602168404Spjd}
3603168404Spjd
3604168404Spjd/*
3605168404Spjd * "Read" the block at the specified DVA (in bp) via the
3606168404Spjd * cache.  If the block is found in the cache, invoke the provided
3607168404Spjd * callback immediately and return.  Note that the `zio' parameter
3608168404Spjd * in the callback will be NULL in this case, since no IO was
3609168404Spjd * required.  If the block is not in the cache pass the read request
3610168404Spjd * on to the spa with a substitute callback function, so that the
3611168404Spjd * requested block will be added to the cache.
3612168404Spjd *
3613168404Spjd * If a read request arrives for a block that has a read in-progress,
3614168404Spjd * either wait for the in-progress read to complete (and return the
3615168404Spjd * results); or, if this is a read with a "done" func, add a record
3616168404Spjd * to the read to invoke the "done" func when the read completes,
3617168404Spjd * and return; or just return.
3618168404Spjd *
3619168404Spjd * arc_read_done() will invoke all the requested "done" functions
3620168404Spjd * for readers of this block.
3621168404Spjd */
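/*
 * Illustrative (hypothetical) synchronous caller, matching the signature
 * below; the flag and priority values are examples, not a prescription:
 *
 *	arc_flags_t aflags = ARC_FLAG_WAIT;
 *	arc_buf_t *abuf;
 *
 *	error = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 *
 * With ARC_FLAG_WAIT the call blocks until the read completes; with
 * ARC_FLAG_NOWAIT the "done" callback runs asynchronously instead.
 */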
3622168404Spjdint
3623246666Smmarc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3624275811Sdelphij    void *private, zio_priority_t priority, int zio_flags,
3625275811Sdelphij    arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
3626168404Spjd{
3627268075Sdelphij	arc_buf_hdr_t *hdr = NULL;
3628247187Smm	arc_buf_t *buf = NULL;
3629268075Sdelphij	kmutex_t *hash_lock = NULL;
3630185029Spjd	zio_t *rzio;
3631228103Smm	uint64_t guid = spa_load_guid(spa);
3632168404Spjd
3633268075Sdelphij	ASSERT(!BP_IS_EMBEDDED(bp) ||
3634268075Sdelphij	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
3635268075Sdelphij
3636168404Spjdtop:
3637268075Sdelphij	if (!BP_IS_EMBEDDED(bp)) {
3638268075Sdelphij		/*
3639268075Sdelphij		 * Embedded BP's have no DVA and require no I/O to "read".
3640268075Sdelphij		 * Create an anonymous arc buf to back it.
3641268075Sdelphij		 */
3642268075Sdelphij		hdr = buf_hash_find(guid, bp, &hash_lock);
3643268075Sdelphij	}
3644168404Spjd
3645286570Smav	if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) {
3646268075Sdelphij
3647275811Sdelphij		*arc_flags |= ARC_FLAG_CACHED;
3648168404Spjd
3649168404Spjd		if (HDR_IO_IN_PROGRESS(hdr)) {
3650168404Spjd
3651275811Sdelphij			if (*arc_flags & ARC_FLAG_WAIT) {
3652286570Smav				cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
3653168404Spjd				mutex_exit(hash_lock);
3654168404Spjd				goto top;
3655168404Spjd			}
3656275811Sdelphij			ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
3657168404Spjd
3658168404Spjd			if (done) {
3659168404Spjd				arc_callback_t	*acb = NULL;
3660168404Spjd
3661168404Spjd				acb = kmem_zalloc(sizeof (arc_callback_t),
3662168404Spjd				    KM_SLEEP);
3663168404Spjd				acb->acb_done = done;
3664168404Spjd				acb->acb_private = private;
3665168404Spjd				if (pio != NULL)
3666168404Spjd					acb->acb_zio_dummy = zio_null(pio,
3667209962Smm					    spa, NULL, NULL, NULL, zio_flags);
3668168404Spjd
3669168404Spjd				ASSERT(acb->acb_done != NULL);
3670286570Smav				acb->acb_next = hdr->b_l1hdr.b_acb;
3671286570Smav				hdr->b_l1hdr.b_acb = acb;
3672168404Spjd				add_reference(hdr, hash_lock, private);
3673168404Spjd				mutex_exit(hash_lock);
3674168404Spjd				return (0);
3675168404Spjd			}
3676168404Spjd			mutex_exit(hash_lock);
3677168404Spjd			return (0);
3678168404Spjd		}
3679168404Spjd
3680286570Smav		ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
3681286570Smav		    hdr->b_l1hdr.b_state == arc_mfu);
3682168404Spjd
3683168404Spjd		if (done) {
3684168404Spjd			add_reference(hdr, hash_lock, private);
3685168404Spjd			/*
3686168404Spjd			 * If this block is already in use, create a new
3687168404Spjd			 * copy of the data so that we will be guaranteed
3688168404Spjd			 * that arc_release() will always succeed.
3689168404Spjd			 */
3690286570Smav			buf = hdr->b_l1hdr.b_buf;
3691168404Spjd			ASSERT(buf);
3692168404Spjd			ASSERT(buf->b_data);
3693168404Spjd			if (HDR_BUF_AVAILABLE(hdr)) {
3694168404Spjd				ASSERT(buf->b_efunc == NULL);
3695275811Sdelphij				hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
3696168404Spjd			} else {
3697168404Spjd				buf = arc_buf_clone(buf);
3698168404Spjd			}
3699219089Spjd
3700275811Sdelphij		} else if (*arc_flags & ARC_FLAG_PREFETCH &&
3701286570Smav		    refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
3702275811Sdelphij			hdr->b_flags |= ARC_FLAG_PREFETCH;
3703168404Spjd		}
3704168404Spjd		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3705168404Spjd		arc_access(hdr, hash_lock);
3706275811Sdelphij		if (*arc_flags & ARC_FLAG_L2CACHE)
3707275811Sdelphij			hdr->b_flags |= ARC_FLAG_L2CACHE;
3708275811Sdelphij		if (*arc_flags & ARC_FLAG_L2COMPRESS)
3709275811Sdelphij			hdr->b_flags |= ARC_FLAG_L2COMPRESS;
3710168404Spjd		mutex_exit(hash_lock);
3711168404Spjd		ARCSTAT_BUMP(arcstat_hits);
3712286570Smav		ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
3713286570Smav		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
3714168404Spjd		    data, metadata, hits);
3715168404Spjd
3716168404Spjd		if (done)
3717168404Spjd			done(NULL, buf, private);
3718168404Spjd	} else {
3719168404Spjd		uint64_t size = BP_GET_LSIZE(bp);
3720268075Sdelphij		arc_callback_t *acb;
3721185029Spjd		vdev_t *vd = NULL;
3722247187Smm		uint64_t addr = 0;
3723208373Smm		boolean_t devw = B_FALSE;
3724258389Savg		enum zio_compress b_compress = ZIO_COMPRESS_OFF;
3725286570Smav		int32_t b_asize = 0;
3726168404Spjd
3727168404Spjd		if (hdr == NULL) {
3728168404Spjd			/* this block is not in the cache */
3729268075Sdelphij			arc_buf_hdr_t *exists = NULL;
3730168404Spjd			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3731168404Spjd			buf = arc_buf_alloc(spa, size, private, type);
3732168404Spjd			hdr = buf->b_hdr;
3733268075Sdelphij			if (!BP_IS_EMBEDDED(bp)) {
3734268075Sdelphij				hdr->b_dva = *BP_IDENTITY(bp);
3735268075Sdelphij				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3736268075Sdelphij				exists = buf_hash_insert(hdr, &hash_lock);
3737268075Sdelphij			}
3738268075Sdelphij			if (exists != NULL) {
3739168404Spjd				/* somebody beat us to the hash insert */
3740168404Spjd				mutex_exit(hash_lock);
3741219089Spjd				buf_discard_identity(hdr);
3742168404Spjd				(void) arc_buf_remove_ref(buf, private);
3743168404Spjd				goto top; /* restart the IO request */
3744168404Spjd			}
3745275811Sdelphij
3746168404Spjd			/* if this is a prefetch, we don't have a reference */
3747275811Sdelphij			if (*arc_flags & ARC_FLAG_PREFETCH) {
3748168404Spjd				(void) remove_reference(hdr, hash_lock,
3749168404Spjd				    private);
3750275811Sdelphij				hdr->b_flags |= ARC_FLAG_PREFETCH;
3751168404Spjd			}
3752275811Sdelphij			if (*arc_flags & ARC_FLAG_L2CACHE)
3753275811Sdelphij				hdr->b_flags |= ARC_FLAG_L2CACHE;
3754275811Sdelphij			if (*arc_flags & ARC_FLAG_L2COMPRESS)
3755275811Sdelphij				hdr->b_flags |= ARC_FLAG_L2COMPRESS;
3756168404Spjd			if (BP_GET_LEVEL(bp) > 0)
3757275811Sdelphij				hdr->b_flags |= ARC_FLAG_INDIRECT;
3758168404Spjd		} else {
3759286570Smav			/*
3760286570Smav			 * This block is in the ghost cache. If it was L2-only
3761286570Smav			 * (and thus didn't have an L1 hdr), we realloc the
3762286570Smav			 * header to add an L1 hdr.
3763286570Smav			 */
3764286570Smav			if (!HDR_HAS_L1HDR(hdr)) {
3765286570Smav				hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
3766286570Smav				    hdr_full_cache);
3767286570Smav			}
3768286570Smav
3769286570Smav			ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
3770168404Spjd			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3771286570Smav			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3772286570Smav			ASSERT(hdr->b_l1hdr.b_buf == NULL);
3773168404Spjd
3774168404Spjd			/* if this is a prefetch, we don't have a reference */
3775275811Sdelphij			if (*arc_flags & ARC_FLAG_PREFETCH)
3776275811Sdelphij				hdr->b_flags |= ARC_FLAG_PREFETCH;
3777168404Spjd			else
3778168404Spjd				add_reference(hdr, hash_lock, private);
3779275811Sdelphij			if (*arc_flags & ARC_FLAG_L2CACHE)
3780275811Sdelphij				hdr->b_flags |= ARC_FLAG_L2CACHE;
3781275811Sdelphij			if (*arc_flags & ARC_FLAG_L2COMPRESS)
3782275811Sdelphij				hdr->b_flags |= ARC_FLAG_L2COMPRESS;
3783185029Spjd			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3784168404Spjd			buf->b_hdr = hdr;
3785168404Spjd			buf->b_data = NULL;
3786168404Spjd			buf->b_efunc = NULL;
3787168404Spjd			buf->b_private = NULL;
3788168404Spjd			buf->b_next = NULL;
3789286570Smav			hdr->b_l1hdr.b_buf = buf;
3790286570Smav			ASSERT0(hdr->b_l1hdr.b_datacnt);
3791286570Smav			hdr->b_l1hdr.b_datacnt = 1;
3792219089Spjd			arc_get_data_buf(buf);
3793219089Spjd			arc_access(hdr, hash_lock);
3794168404Spjd		}
3795168404Spjd
3796286570Smav		ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
3797219089Spjd
3798168404Spjd		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3799168404Spjd		acb->acb_done = done;
3800168404Spjd		acb->acb_private = private;
3801168404Spjd
3802286570Smav		ASSERT(hdr->b_l1hdr.b_acb == NULL);
3803286570Smav		hdr->b_l1hdr.b_acb = acb;
3804275811Sdelphij		hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
3805168404Spjd
3806286570Smav		if (HDR_HAS_L2HDR(hdr) &&
3807286570Smav		    (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
3808286570Smav			devw = hdr->b_l2hdr.b_dev->l2ad_writing;
3809286570Smav			addr = hdr->b_l2hdr.b_daddr;
3810286570Smav			b_compress = HDR_GET_COMPRESS(hdr);
3811286570Smav			b_asize = hdr->b_l2hdr.b_asize;
3812185029Spjd			/*
3813185029Spjd			 * Lock out device removal.
3814185029Spjd			 */
3815185029Spjd			if (vdev_is_dead(vd) ||
3816185029Spjd			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3817185029Spjd				vd = NULL;
3818185029Spjd		}
3819185029Spjd
3820268075Sdelphij		if (hash_lock != NULL)
3821268075Sdelphij			mutex_exit(hash_lock);
3822168404Spjd
3823251629Sdelphij		/*
3824251629Sdelphij		 * At this point, we have a level 1 cache miss.  Try again in
3825251629Sdelphij		 * L2ARC if possible.
3826251629Sdelphij		 */
3827168404Spjd		ASSERT3U(hdr->b_size, ==, size);
3828219089Spjd		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
3829268123Sdelphij		    uint64_t, size, zbookmark_phys_t *, zb);
3830168404Spjd		ARCSTAT_BUMP(arcstat_misses);
3831286570Smav		ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
3832286570Smav		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
3833168404Spjd		    data, metadata, misses);
3834228392Spjd#ifdef _KERNEL
3835228392Spjd		curthread->td_ru.ru_inblock++;
3836228392Spjd#endif
3837168404Spjd
3838208373Smm		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
3839185029Spjd			/*
3840185029Spjd			 * Read from the L2ARC if the following are true:
3841185029Spjd			 * 1. The L2ARC vdev was previously cached.
3842185029Spjd			 * 2. This buffer still has L2ARC metadata.
3843185029Spjd			 * 3. This buffer isn't currently writing to the L2ARC.
3844185029Spjd			 * 4. The L2ARC entry wasn't evicted, which may
3845185029Spjd			 *    also have invalidated the vdev.
3846208373Smm			 * 5. This isn't a prefetch with l2arc_noprefetch enabled.
3847185029Spjd			 */
3848286570Smav			if (HDR_HAS_L2HDR(hdr) &&
3849208373Smm			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
3850208373Smm			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
3851185029Spjd				l2arc_read_callback_t *cb;
3852185029Spjd
3853185029Spjd				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
3854185029Spjd				ARCSTAT_BUMP(arcstat_l2_hits);
3855185029Spjd
3856185029Spjd				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
3857185029Spjd				    KM_SLEEP);
3858185029Spjd				cb->l2rcb_buf = buf;
3859185029Spjd				cb->l2rcb_spa = spa;
3860185029Spjd				cb->l2rcb_bp = *bp;
3861185029Spjd				cb->l2rcb_zb = *zb;
3862185029Spjd				cb->l2rcb_flags = zio_flags;
3863258389Savg				cb->l2rcb_compress = b_compress;
3864185029Spjd
3865247187Smm				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
3866247187Smm				    addr + size < vd->vdev_psize -
3867247187Smm				    VDEV_LABEL_END_SIZE);
3868247187Smm
3869185029Spjd				/*
3870185029Spjd				 * l2arc read.  The SCL_L2ARC lock will be
3871185029Spjd				 * released by l2arc_read_done().
3872251478Sdelphij				 * Issue a null zio if the underlying buffer
3873251478Sdelphij				 * was squashed to zero size by compression.
3874185029Spjd				 */
3875258389Savg				if (b_compress == ZIO_COMPRESS_EMPTY) {
3876251478Sdelphij					rzio = zio_null(pio, spa, vd,
3877251478Sdelphij					    l2arc_read_done, cb,
3878251478Sdelphij					    zio_flags | ZIO_FLAG_DONT_CACHE |
3879251478Sdelphij					    ZIO_FLAG_CANFAIL |
3880251478Sdelphij					    ZIO_FLAG_DONT_PROPAGATE |
3881251478Sdelphij					    ZIO_FLAG_DONT_RETRY);
3882251478Sdelphij				} else {
3883251478Sdelphij					rzio = zio_read_phys(pio, vd, addr,
3884258389Savg					    b_asize, buf->b_data,
3885258389Savg					    ZIO_CHECKSUM_OFF,
3886251478Sdelphij					    l2arc_read_done, cb, priority,
3887251478Sdelphij					    zio_flags | ZIO_FLAG_DONT_CACHE |
3888251478Sdelphij					    ZIO_FLAG_CANFAIL |
3889251478Sdelphij					    ZIO_FLAG_DONT_PROPAGATE |
3890251478Sdelphij					    ZIO_FLAG_DONT_RETRY, B_FALSE);
3891251478Sdelphij				}
3892185029Spjd				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
3893185029Spjd				    zio_t *, rzio);
3894258389Savg				ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
3895185029Spjd
3896275811Sdelphij				if (*arc_flags & ARC_FLAG_NOWAIT) {
3897185029Spjd					zio_nowait(rzio);
3898185029Spjd					return (0);
3899185029Spjd				}
3900185029Spjd
3901275811Sdelphij				ASSERT(*arc_flags & ARC_FLAG_WAIT);
3902185029Spjd				if (zio_wait(rzio) == 0)
3903185029Spjd					return (0);
3904185029Spjd
3905185029Spjd				/* l2arc read error; goto zio_read() */
3906185029Spjd			} else {
3907185029Spjd				DTRACE_PROBE1(l2arc__miss,
3908185029Spjd				    arc_buf_hdr_t *, hdr);
3909185029Spjd				ARCSTAT_BUMP(arcstat_l2_misses);
3910185029Spjd				if (HDR_L2_WRITING(hdr))
3911185029Spjd					ARCSTAT_BUMP(arcstat_l2_rw_clash);
3912185029Spjd				spa_config_exit(spa, SCL_L2ARC, vd);
3913185029Spjd			}
3914208373Smm		} else {
3915208373Smm			if (vd != NULL)
3916208373Smm				spa_config_exit(spa, SCL_L2ARC, vd);
3917208373Smm			if (l2arc_ndev != 0) {
3918208373Smm				DTRACE_PROBE1(l2arc__miss,
3919208373Smm				    arc_buf_hdr_t *, hdr);
3920208373Smm				ARCSTAT_BUMP(arcstat_l2_misses);
3921208373Smm			}
3922185029Spjd		}
3923185029Spjd
3924168404Spjd		rzio = zio_read(pio, spa, bp, buf->b_data, size,
3925185029Spjd		    arc_read_done, buf, priority, zio_flags, zb);
3926168404Spjd
3927275811Sdelphij		if (*arc_flags & ARC_FLAG_WAIT)
3928168404Spjd			return (zio_wait(rzio));
3929168404Spjd
3930275811Sdelphij		ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
3931168404Spjd		zio_nowait(rzio);
3932168404Spjd	}
3933168404Spjd	return (0);
3934168404Spjd}
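
/*
 * For illustration only: the L2ARC hit gate above reduces to one predicate
 * over the header state.  The helper below is a hypothetical, self-contained
 * restatement of conditions 2-5 from the comment; it is not a ZFS interface.
 */
#if 0	/* illustrative sketch, not part of the build */
#include <stdbool.h>

static bool
l2arc_hit_possible(bool has_l2hdr, bool l2_writing, bool l2_evicted,
    bool is_prefetch, bool noprefetch_tunable)
{
	/*
	 * Header still has L2ARC metadata, is not being written to or
	 * evicted from the L2ARC, and is not a prefetch we were told to
	 * keep out of the L2ARC.
	 */
	return (has_l2hdr && !l2_writing && !l2_evicted &&
	    !(noprefetch_tunable && is_prefetch));
}
#endif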
3935168404Spjd
3936168404Spjdvoid
3937168404Spjdarc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
3938168404Spjd{
3939168404Spjd	ASSERT(buf->b_hdr != NULL);
3940286570Smav	ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon);
3941286570Smav	ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) ||
3942286570Smav	    func == NULL);
3943219089Spjd	ASSERT(buf->b_efunc == NULL);
3944219089Spjd	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
3945219089Spjd
3946168404Spjd	buf->b_efunc = func;
3947168404Spjd	buf->b_private = private;
3948168404Spjd}
3949168404Spjd
3950168404Spjd/*
3951251520Sdelphij * Notify the arc that a block was freed, and thus will never be used again.
3952251520Sdelphij */
3953251520Sdelphijvoid
3954251520Sdelphijarc_freed(spa_t *spa, const blkptr_t *bp)
3955251520Sdelphij{
3956251520Sdelphij	arc_buf_hdr_t *hdr;
3957251520Sdelphij	kmutex_t *hash_lock;
3958251520Sdelphij	uint64_t guid = spa_load_guid(spa);
3959251520Sdelphij
3960268075Sdelphij	ASSERT(!BP_IS_EMBEDDED(bp));
3961268075Sdelphij
3962268075Sdelphij	hdr = buf_hash_find(guid, bp, &hash_lock);
3963251520Sdelphij	if (hdr == NULL)
3964251520Sdelphij		return;
3965251520Sdelphij	if (HDR_BUF_AVAILABLE(hdr)) {
3966286570Smav		arc_buf_t *buf = hdr->b_l1hdr.b_buf;
3967251520Sdelphij		add_reference(hdr, hash_lock, FTAG);
3968275811Sdelphij		hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
3969251520Sdelphij		mutex_exit(hash_lock);
3970251520Sdelphij
3971251520Sdelphij		arc_release(buf, FTAG);
3972251520Sdelphij		(void) arc_buf_remove_ref(buf, FTAG);
3973251520Sdelphij	} else {
3974251520Sdelphij		mutex_exit(hash_lock);
3975251520Sdelphij	}
3976251520Sdelphij
3977251520Sdelphij}
3978251520Sdelphij
3979251520Sdelphij/*
3980268858Sdelphij * Clear the user eviction callback set by arc_set_callback(), first calling
3981268858Sdelphij * it if it exists.  Because the presence of a callback keeps an arc_buf cached,
3982268858Sdelphij * clearing the callback may result in the arc_buf being destroyed.  However,
3983268858Sdelphij * it will not result in the *last* arc_buf being destroyed, hence the data
3984268858Sdelphij * will remain cached in the ARC.  We take local copies of the callback and
3985268858Sdelphij * its argument here so that we can invoke the callback without holding locks.
3986268858Sdelphij *
3987268858Sdelphij * It's possible that the callback is already in the process of being cleared
3988268858Sdelphij * by another thread.  In this case we can not clear the callback.
3989268858Sdelphij *
3990268858Sdelphij * Returns B_TRUE if the callback was successfully called and cleared.
3991168404Spjd */
3992268858Sdelphijboolean_t
3993268858Sdelphijarc_clear_callback(arc_buf_t *buf)
3994168404Spjd{
3995168404Spjd	arc_buf_hdr_t *hdr;
3996168404Spjd	kmutex_t *hash_lock;
3997268858Sdelphij	arc_evict_func_t *efunc = buf->b_efunc;
3998268858Sdelphij	void *private = buf->b_private;
3999205231Skmacy	list_t *list, *evicted_list;
4000205231Skmacy	kmutex_t *lock, *evicted_lock;
4001206796Spjd
4002219089Spjd	mutex_enter(&buf->b_evict_lock);
4003168404Spjd	hdr = buf->b_hdr;
4004168404Spjd	if (hdr == NULL) {
4005168404Spjd		/*
4006168404Spjd		 * We are in arc_do_user_evicts().
4007168404Spjd		 */
4008168404Spjd		ASSERT(buf->b_data == NULL);
4009219089Spjd		mutex_exit(&buf->b_evict_lock);
4010268858Sdelphij		return (B_FALSE);
4011185029Spjd	} else if (buf->b_data == NULL) {
4012185029Spjd		/*
4013185029Spjd		 * We are on the eviction list; process this buffer now
4014185029Spjd		 * but let arc_do_user_evicts() do the reaping.
4015185029Spjd		 */
4016185029Spjd		buf->b_efunc = NULL;
4017219089Spjd		mutex_exit(&buf->b_evict_lock);
4018268858Sdelphij		VERIFY0(efunc(private));
4019268858Sdelphij		return (B_TRUE);
4020168404Spjd	}
4021168404Spjd	hash_lock = HDR_LOCK(hdr);
4022168404Spjd	mutex_enter(hash_lock);
4023219089Spjd	hdr = buf->b_hdr;
4024219089Spjd	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4025168404Spjd
4026286570Smav	ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <,
4027286570Smav	    hdr->b_l1hdr.b_datacnt);
4028286570Smav	ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
4029286570Smav	    hdr->b_l1hdr.b_state == arc_mfu);
4030168404Spjd
4031268858Sdelphij	buf->b_efunc = NULL;
4032268858Sdelphij	buf->b_private = NULL;
4033168404Spjd
4034286570Smav	if (hdr->b_l1hdr.b_datacnt > 1) {
4035268858Sdelphij		mutex_exit(&buf->b_evict_lock);
4036268858Sdelphij		arc_buf_destroy(buf, FALSE, TRUE);
4037268858Sdelphij	} else {
4038286570Smav		ASSERT(buf == hdr->b_l1hdr.b_buf);
4039275811Sdelphij		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
4040268858Sdelphij		mutex_exit(&buf->b_evict_lock);
4041268858Sdelphij	}
4042168404Spjd
4043168404Spjd	mutex_exit(hash_lock);
4044268858Sdelphij	VERIFY0(efunc(private));
4045268858Sdelphij	return (B_TRUE);
4046168404Spjd}
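
/*
 * For illustration only: a minimal, hypothetical consumer of the eviction
 * callback interface above.  The callback type is assumed to be
 * int (*)(void *) returning 0, based on the VERIFY0(efunc(private)) calls
 * in arc_clear_callback(); the my_* names are made up for this sketch.
 */
#if 0	/* illustrative sketch, not part of the build */
struct my_state {
	arc_buf_t	*cached_buf;
};

static int
my_evict_cb(void *private)
{
	struct my_state *ms = private;

	ms->cached_buf = NULL;		/* drop our cached pointer */
	return (0);			/* arc_clear_callback() VERIFY0()s this */
}

static void
my_cache_buf(arc_buf_t *buf, struct my_state *ms)
{
	ms->cached_buf = buf;
	/* Keep 'buf' cached until the callback runs or we clear it. */
	arc_set_callback(buf, my_evict_cb, ms);
}

static void
my_uncache_buf(arc_buf_t *buf)
{
	/*
	 * Invoke and clear the callback.  A B_FALSE return means another
	 * thread is already clearing it, so do not assume it ran here.
	 */
	(void) arc_clear_callback(buf);
}
#endif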
4047168404Spjd
4048168404Spjd/*
4049251629Sdelphij * Release this buffer from the cache, making it an anonymous buffer.  This
4050251629Sdelphij * must be done after a read and prior to modifying the buffer contents.
4051168404Spjd * If the buffer has more than one reference, we must make
4052185029Spjd * a new hdr for the buffer.
4053168404Spjd */
4054168404Spjdvoid
4055168404Spjdarc_release(arc_buf_t *buf, void *tag)
4056168404Spjd{
4057286570Smav	arc_buf_hdr_t *hdr = buf->b_hdr;
4058168404Spjd
4059219089Spjd	/*
4060219089Spjd	 * It would be nice to assert that if it's DMU metadata (level >
4061219089Spjd	 * 0 || it's the dnode file), then it must be syncing context.
4062219089Spjd	 * But we don't know that information at this level.
4063219089Spjd	 */
4064219089Spjd
4065219089Spjd	mutex_enter(&buf->b_evict_lock);
4066286570Smav	/*
4067286570Smav	 * We don't grab the hash lock prior to this check, because if
4068286570Smav	 * the buffer's header is in the arc_anon state, it won't be
4069286570Smav	 * linked into the hash table.
4070286570Smav	 */
4071286570Smav	if (hdr->b_l1hdr.b_state == arc_anon) {
4072286570Smav		mutex_exit(&buf->b_evict_lock);
4073286570Smav		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4074286570Smav		ASSERT(!HDR_IN_HASH_TABLE(hdr));
4075286570Smav		ASSERT(!HDR_HAS_L2HDR(hdr));
4076286570Smav		ASSERT(BUF_EMPTY(hdr));
4077286570Smav		ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1);
4078286570Smav		ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
4079286570Smav		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
4080185029Spjd
4081286570Smav		ASSERT3P(buf->b_efunc, ==, NULL);
4082286570Smav		ASSERT3P(buf->b_private, ==, NULL);
4083168404Spjd
4084286570Smav		hdr->b_l1hdr.b_arc_access = 0;
4085286570Smav		arc_buf_thaw(buf);
4086286570Smav
4087286570Smav		return;
4088168404Spjd	}
4089168404Spjd
4090286570Smav	kmutex_t *hash_lock = HDR_LOCK(hdr);
4091286570Smav	mutex_enter(hash_lock);
4092286570Smav
4093286570Smav	/*
4094286570Smav	 * This assignment is only valid as long as the hash_lock is
4095286570Smav	 * held, we must be careful not to reference state or the
4096286570Smav	 * b_state field after dropping the lock.
4097286570Smav	 */
4098286570Smav	arc_state_t *state = hdr->b_l1hdr.b_state;
4099286570Smav	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4100286570Smav	ASSERT3P(state, !=, arc_anon);
4101286570Smav
4102286570Smav	/* this buffer is not on any list */
4103286570Smav	ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
4104286570Smav
4105286570Smav	if (HDR_HAS_L2HDR(hdr)) {
4106286570Smav		ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
4107286570Smav		ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
4108286570Smav
4109286570Smav		mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
4110286570Smav		trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev,
4111286570Smav		    hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0);
4112286570Smav		list_remove(&hdr->b_l2hdr.b_dev->l2ad_buflist, hdr);
4113286570Smav
4114286570Smav		/*
4115286570Smav		 * We don't want to leak the b_tmp_cdata buffer that was
4116286570Smav		 * allocated in l2arc_write_buffers()
4117286570Smav		 */
4118274172Savg		arc_buf_l2_cdata_free(hdr);
4119286570Smav
4120286570Smav		mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
4121286570Smav
4122286570Smav		hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
4123185029Spjd	}
4124185029Spjd
4125168404Spjd	/*
4126168404Spjd	 * Do we have more than one buf?
4127168404Spjd	 */
4128286570Smav	if (hdr->b_l1hdr.b_datacnt > 1) {
4129168404Spjd		arc_buf_hdr_t *nhdr;
4130168404Spjd		arc_buf_t **bufp;
4131168404Spjd		uint64_t blksz = hdr->b_size;
4132209962Smm		uint64_t spa = hdr->b_spa;
4133286570Smav		arc_buf_contents_t type = arc_buf_type(hdr);
4134185029Spjd		uint32_t flags = hdr->b_flags;
4135168404Spjd
4136286570Smav		ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
4137168404Spjd		/*
4138219089Spjd		 * Pull the data off of this hdr and attach it to
4139219089Spjd		 * a new anonymous hdr.
4140168404Spjd		 */
4141168404Spjd		(void) remove_reference(hdr, hash_lock, tag);
4142286570Smav		bufp = &hdr->b_l1hdr.b_buf;
4143168404Spjd		while (*bufp != buf)
4144168404Spjd			bufp = &(*bufp)->b_next;
4145219089Spjd		*bufp = buf->b_next;
4146168404Spjd		buf->b_next = NULL;
4147168404Spjd
4148286570Smav		ASSERT3P(state, !=, arc_l2c_only);
4149286570Smav		ASSERT3U(state->arcs_size, >=, hdr->b_size);
4150286570Smav		atomic_add_64(&state->arcs_size, -hdr->b_size);
4151286570Smav		if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
4152286570Smav			ASSERT3P(state, !=, arc_l2c_only);
4153286570Smav			uint64_t *size = &state->arcs_lsize[type];
4154185029Spjd			ASSERT3U(*size, >=, hdr->b_size);
4155185029Spjd			atomic_add_64(size, -hdr->b_size);
4156168404Spjd		}
4157242845Sdelphij
4158242845Sdelphij		/*
4159242845Sdelphij		 * We're releasing a duplicate user data buffer, update
4160242845Sdelphij		 * our statistics accordingly.
4161242845Sdelphij		 */
4162286570Smav		if (HDR_ISTYPE_DATA(hdr)) {
4163242845Sdelphij			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
4164242845Sdelphij			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
4165242845Sdelphij			    -hdr->b_size);
4166242845Sdelphij		}
4167286570Smav		hdr->b_l1hdr.b_datacnt -= 1;
4168168404Spjd		arc_cksum_verify(buf);
4169240133Smm#ifdef illumos
4170240133Smm		arc_buf_unwatch(buf);
4171277300Ssmh#endif
4172168404Spjd
4173168404Spjd		mutex_exit(hash_lock);
4174168404Spjd
4175286570Smav		nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
4176168404Spjd		nhdr->b_size = blksz;
4177168404Spjd		nhdr->b_spa = spa;
4178286570Smav
4179275811Sdelphij		nhdr->b_flags = flags & ARC_FLAG_L2_WRITING;
4180286570Smav		nhdr->b_flags |= arc_bufc_to_flags(type);
4181286570Smav		nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
4182286570Smav
4183286570Smav		nhdr->b_l1hdr.b_buf = buf;
4184286570Smav		nhdr->b_l1hdr.b_datacnt = 1;
4185286570Smav		nhdr->b_l1hdr.b_state = arc_anon;
4186286570Smav		nhdr->b_l1hdr.b_arc_access = 0;
4187168404Spjd		nhdr->b_freeze_cksum = NULL;
4188286570Smav
4189286570Smav		(void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
4190168404Spjd		buf->b_hdr = nhdr;
4191219089Spjd		mutex_exit(&buf->b_evict_lock);
4192168404Spjd		atomic_add_64(&arc_anon->arcs_size, blksz);
4193168404Spjd	} else {
4194219089Spjd		mutex_exit(&buf->b_evict_lock);
4195286570Smav		ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
4196286570Smav		/* protected by hash lock */
4197286570Smav		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
4198168404Spjd		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4199286570Smav		arc_change_state(arc_anon, hdr, hash_lock);
4200286570Smav		hdr->b_l1hdr.b_arc_access = 0;
4201286570Smav		mutex_exit(hash_lock);
4202185029Spjd
4203219089Spjd		buf_discard_identity(hdr);
4204168404Spjd		arc_buf_thaw(buf);
4205168404Spjd	}
4206168404Spjd	buf->b_efunc = NULL;
4207168404Spjd	buf->b_private = NULL;
4208168404Spjd}
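
/*
 * For illustration only: the loop above that detaches 'buf' from the
 * header's singly linked b_buf chain is the classic pointer-to-pointer
 * unlink idiom.  A standalone restatement with made-up names follows.
 */
#if 0	/* illustrative sketch, not part of the build */
#include <stddef.h>

struct node {
	struct node	*next;
};

/* Remove 'victim', assumed present, from the list rooted at '*headp'. */
static void
unlink_node(struct node **headp, struct node *victim)
{
	struct node **npp = headp;

	while (*npp != victim)		/* walk until npp points at victim */
		npp = &(*npp)->next;
	*npp = victim->next;		/* splice it out; handles the head too */
	victim->next = NULL;
}
#endif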
4209168404Spjd
4210168404Spjdint
4211168404Spjdarc_released(arc_buf_t *buf)
4212168404Spjd{
4213185029Spjd	int released;
4214185029Spjd
4215219089Spjd	mutex_enter(&buf->b_evict_lock);
4216286570Smav	released = (buf->b_data != NULL &&
4217286570Smav	    buf->b_hdr->b_l1hdr.b_state == arc_anon);
4218219089Spjd	mutex_exit(&buf->b_evict_lock);
4219185029Spjd	return (released);
4220168404Spjd}
4221168404Spjd
4222168404Spjd#ifdef ZFS_DEBUG
4223168404Spjdint
4224168404Spjdarc_referenced(arc_buf_t *buf)
4225168404Spjd{
4226185029Spjd	int referenced;
4227185029Spjd
4228219089Spjd	mutex_enter(&buf->b_evict_lock);
4229286570Smav	referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
4230219089Spjd	mutex_exit(&buf->b_evict_lock);
4231185029Spjd	return (referenced);
4232168404Spjd}
4233168404Spjd#endif
4234168404Spjd
4235168404Spjdstatic void
4236168404Spjdarc_write_ready(zio_t *zio)
4237168404Spjd{
4238168404Spjd	arc_write_callback_t *callback = zio->io_private;
4239168404Spjd	arc_buf_t *buf = callback->awcb_buf;
4240185029Spjd	arc_buf_hdr_t *hdr = buf->b_hdr;
4241168404Spjd
4242286570Smav	ASSERT(HDR_HAS_L1HDR(hdr));
4243286570Smav	ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
4244286570Smav	ASSERT(hdr->b_l1hdr.b_datacnt > 0);
4245185029Spjd	callback->awcb_ready(zio, buf, callback->awcb_private);
4246185029Spjd
4247185029Spjd	/*
4248185029Spjd	 * If the IO is already in progress, then this is a re-write
4249185029Spjd	 * attempt, so we need to thaw and re-compute the cksum.
4250185029Spjd	 * It is the responsibility of the callback to handle the
4251185029Spjd	 * accounting for any re-write attempt.
4252185029Spjd	 */
4253185029Spjd	if (HDR_IO_IN_PROGRESS(hdr)) {
4254286570Smav		mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
4255185029Spjd		if (hdr->b_freeze_cksum != NULL) {
4256185029Spjd			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
4257185029Spjd			hdr->b_freeze_cksum = NULL;
4258185029Spjd		}
4259286570Smav		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
4260168404Spjd	}
4261185029Spjd	arc_cksum_compute(buf, B_FALSE);
4262275811Sdelphij	hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
4263168404Spjd}
4264168404Spjd
4265258632Savg/*
4266258632Savg * The SPA calls this callback for each physical write that happens on behalf
4267258632Savg * of a logical write.  See the comment in dbuf_write_physdone() for details.
4268258632Savg */
4269168404Spjdstatic void
4270258632Savgarc_write_physdone(zio_t *zio)
4271258632Savg{
4272258632Savg	arc_write_callback_t *cb = zio->io_private;
4273258632Savg	if (cb->awcb_physdone != NULL)
4274258632Savg		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
4275258632Savg}
4276258632Savg
4277258632Savgstatic void
4278168404Spjdarc_write_done(zio_t *zio)
4279168404Spjd{
4280168404Spjd	arc_write_callback_t *callback = zio->io_private;
4281168404Spjd	arc_buf_t *buf = callback->awcb_buf;
4282168404Spjd	arc_buf_hdr_t *hdr = buf->b_hdr;
4283168404Spjd
4284286570Smav	ASSERT(hdr->b_l1hdr.b_acb == NULL);
4285168404Spjd
4286219089Spjd	if (zio->io_error == 0) {
4287268075Sdelphij		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
4288260150Sdelphij			buf_discard_identity(hdr);
4289260150Sdelphij		} else {
4290260150Sdelphij			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
4291260150Sdelphij			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
4292260150Sdelphij		}
4293219089Spjd	} else {
4294219089Spjd		ASSERT(BUF_EMPTY(hdr));
4295219089Spjd	}
4296219089Spjd
4297168404Spjd	/*
4298268075Sdelphij	 * If the block to be written was all-zero or compressed enough to be
4299268075Sdelphij	 * embedded in the BP, no write was performed so there will be no
4300268075Sdelphij	 * dva/birth/checksum.  The buffer must therefore remain anonymous
4301268075Sdelphij	 * (and uncached).
4302168404Spjd	 */
4303168404Spjd	if (!BUF_EMPTY(hdr)) {
4304168404Spjd		arc_buf_hdr_t *exists;
4305168404Spjd		kmutex_t *hash_lock;
4306168404Spjd
4307219089Spjd		ASSERT(zio->io_error == 0);
4308219089Spjd
4309168404Spjd		arc_cksum_verify(buf);
4310168404Spjd
4311168404Spjd		exists = buf_hash_insert(hdr, &hash_lock);
4312286570Smav		if (exists != NULL) {
4313168404Spjd			/*
4314168404Spjd			 * This can only happen if we overwrite for
4315168404Spjd			 * sync-to-convergence, because we remove
4316168404Spjd			 * buffers from the hash table when we arc_free().
4317168404Spjd			 */
4318219089Spjd			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
4319219089Spjd				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
4320219089Spjd					panic("bad overwrite, hdr=%p exists=%p",
4321219089Spjd					    (void *)hdr, (void *)exists);
4322286570Smav				ASSERT(refcount_is_zero(
4323286570Smav				    &exists->b_l1hdr.b_refcnt));
4324219089Spjd				arc_change_state(arc_anon, exists, hash_lock);
4325219089Spjd				mutex_exit(hash_lock);
4326219089Spjd				arc_hdr_destroy(exists);
4327219089Spjd				exists = buf_hash_insert(hdr, &hash_lock);
4328219089Spjd				ASSERT3P(exists, ==, NULL);
4329243524Smm			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
4330243524Smm				/* nopwrite */
4331243524Smm				ASSERT(zio->io_prop.zp_nopwrite);
4332243524Smm				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
4333243524Smm					panic("bad nopwrite, hdr=%p exists=%p",
4334243524Smm					    (void *)hdr, (void *)exists);
4335219089Spjd			} else {
4336219089Spjd				/* Dedup */
4337286570Smav				ASSERT(hdr->b_l1hdr.b_datacnt == 1);
4338286570Smav				ASSERT(hdr->b_l1hdr.b_state == arc_anon);
4339219089Spjd				ASSERT(BP_GET_DEDUP(zio->io_bp));
4340219089Spjd				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
4341219089Spjd			}
4342168404Spjd		}
4343275811Sdelphij		hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
4344185029Spjd		/* if it's not anon, we are doing a scrub */
4345286570Smav		if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
4346185029Spjd			arc_access(hdr, hash_lock);
4347168404Spjd		mutex_exit(hash_lock);
4348168404Spjd	} else {
4349275811Sdelphij		hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
4350168404Spjd	}
4351168404Spjd
4352286570Smav	ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4353219089Spjd	callback->awcb_done(zio, buf, callback->awcb_private);
4354168404Spjd
4355168404Spjd	kmem_free(callback, sizeof (arc_write_callback_t));
4356168404Spjd}
4357168404Spjd
4358168404Spjdzio_t *
4359219089Spjdarc_write(zio_t *pio, spa_t *spa, uint64_t txg,
4360251478Sdelphij    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
4361258632Savg    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
4362258632Savg    arc_done_func_t *done, void *private, zio_priority_t priority,
4363268123Sdelphij    int zio_flags, const zbookmark_phys_t *zb)
4364168404Spjd{
4365168404Spjd	arc_buf_hdr_t *hdr = buf->b_hdr;
4366168404Spjd	arc_write_callback_t *callback;
4367185029Spjd	zio_t *zio;
4368168404Spjd
4369185029Spjd	ASSERT(ready != NULL);
4370219089Spjd	ASSERT(done != NULL);
4371168404Spjd	ASSERT(!HDR_IO_ERROR(hdr));
4372286570Smav	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4373286570Smav	ASSERT(hdr->b_l1hdr.b_acb == NULL);
4374286570Smav	ASSERT(hdr->b_l1hdr.b_datacnt > 0);
4375185029Spjd	if (l2arc)
4376275811Sdelphij		hdr->b_flags |= ARC_FLAG_L2CACHE;
4377251478Sdelphij	if (l2arc_compress)
4378275811Sdelphij		hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4379168404Spjd	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
4380168404Spjd	callback->awcb_ready = ready;
4381258632Savg	callback->awcb_physdone = physdone;
4382168404Spjd	callback->awcb_done = done;
4383168404Spjd	callback->awcb_private = private;
4384168404Spjd	callback->awcb_buf = buf;
4385168404Spjd
4386219089Spjd	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
4387258632Savg	    arc_write_ready, arc_write_physdone, arc_write_done, callback,
4388258632Savg	    priority, zio_flags, zb);
4389185029Spjd
4390168404Spjd	return (zio);
4391168404Spjd}
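
/*
 * For illustration only: a hypothetical arc_write() caller showing where the
 * three callbacks fire.  The (zio, buf, private) callback signature is taken
 * from the awcb_* invocations above; the my_* names, priority and flag
 * choices are assumptions for this sketch, not a prescribed usage.
 */
#if 0	/* illustrative sketch, not part of the build */
static void
my_write_ready(zio_t *zio, arc_buf_t *buf, void *private)
{
	/* Runs once the data and checksum are final, before the write. */
}

static void
my_write_physdone(zio_t *zio, arc_buf_t *buf, void *private)
{
	/* Runs once per physical write done on behalf of the logical write. */
}

static void
my_write_done(zio_t *zio, arc_buf_t *buf, void *private)
{
	/* Runs when the logical write completes; buf is hashed or stays anon. */
}

static zio_t *
my_issue_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    arc_buf_t *buf, const zio_prop_t *zp, const zbookmark_phys_t *zb)
{
	return (arc_write(pio, spa, txg, bp, buf,
	    B_TRUE,	/* l2arc: eligible for L2ARC caching */
	    B_FALSE,	/* l2arc_compress */
	    zp, my_write_ready, my_write_physdone, my_write_done,
	    NULL, ZIO_PRIORITY_ASYNC_WRITE, 0, zb));
}
#endif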
4392168404Spjd
4393185029Spjdstatic int
4394258632Savgarc_memory_throttle(uint64_t reserve, uint64_t txg)
4395185029Spjd{
4396185029Spjd#ifdef _KERNEL
4397272483Ssmh	uint64_t available_memory = ptob(freemem);
4398185029Spjd	static uint64_t page_load = 0;
4399185029Spjd	static uint64_t last_txg = 0;
4400185029Spjd
4401272483Ssmh#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
4402185029Spjd	available_memory =
4403272483Ssmh	    MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE)));
4404185029Spjd#endif
4405258632Savg
4406272483Ssmh	if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
4407185029Spjd		return (0);
4408185029Spjd
4409185029Spjd	if (txg > last_txg) {
4410185029Spjd		last_txg = txg;
4411185029Spjd		page_load = 0;
4412185029Spjd	}
4413185029Spjd	/*
4414185029Spjd	 * If we are in pageout, we know that memory is already tight,
4415185029Spjd	 * and the ARC is already evicting, so we just want to continue
4416185029Spjd	 * letting page writes occur as quickly as possible.
4417185029Spjd	 */
4418185029Spjd	if (curproc == pageproc) {
4419272483Ssmh		if (page_load > MAX(ptob(minfree), available_memory) / 4)
4420249195Smm			return (SET_ERROR(ERESTART));
4421185029Spjd		/* Note: reserve is inflated, so we deflate */
4422185029Spjd		page_load += reserve / 8;
4423185029Spjd		return (0);
4424185029Spjd	} else if (page_load > 0 && arc_reclaim_needed()) {
4425185029Spjd		/* memory is low, delay before restarting */
4426185029Spjd		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
4427249195Smm		return (SET_ERROR(EAGAIN));
4428185029Spjd	}
4429185029Spjd	page_load = 0;
4430185029Spjd#endif
4431185029Spjd	return (0);
4432185029Spjd}
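
/*
 * For illustration only: in the pageout branch above, the inflated reserve
 * is deflated by 8 and ERESTART is returned once the accumulated load passes
 * a quarter of the larger of the minfree floor and available memory.  A
 * standalone model with made-up names follows.
 */
#if 0	/* illustrative sketch, not part of the build */
#include <stdint.h>
#include <errno.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

static int
pageout_throttle(uint64_t *page_load, uint64_t reserve,
    uint64_t minfree_bytes, uint64_t available_memory)
{
	if (*page_load > MAX(minfree_bytes, available_memory) / 4)
		return (ERESTART);	/* too much queued; restart later */
	*page_load += reserve / 8;	/* reserve is inflated, so deflate */
	return (0);
}
#endif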
4433185029Spjd
4434168404Spjdvoid
4435185029Spjdarc_tempreserve_clear(uint64_t reserve)
4436168404Spjd{
4437185029Spjd	atomic_add_64(&arc_tempreserve, -reserve);
4438168404Spjd	ASSERT((int64_t)arc_tempreserve >= 0);
4439168404Spjd}
4440168404Spjd
4441168404Spjdint
4442185029Spjdarc_tempreserve_space(uint64_t reserve, uint64_t txg)
4443168404Spjd{
4444185029Spjd	int error;
4445209962Smm	uint64_t anon_size;
4446185029Spjd
4447272483Ssmh	if (reserve > arc_c/4 && !arc_no_grow) {
4448185029Spjd		arc_c = MIN(arc_c_max, reserve * 4);
4449272483Ssmh		DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
4450272483Ssmh	}
4451185029Spjd	if (reserve > arc_c)
4452249195Smm		return (SET_ERROR(ENOMEM));
4453168404Spjd
4454168404Spjd	/*
4455209962Smm	 * Don't count loaned bufs as in flight dirty data to prevent long
4456209962Smm	 * network delays from blocking transactions that are ready to be
4457209962Smm	 * assigned to a txg.
4458209962Smm	 */
4459209962Smm	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
4460209962Smm
4461209962Smm	/*
4462185029Spjd	 * Writes will, almost always, require additional memory allocations
4463251631Sdelphij	 * in order to compress/encrypt/etc the data.  We therefore need to
4464185029Spjd	 * make sure that there is sufficient available memory for this.
4465185029Spjd	 */
4466258632Savg	error = arc_memory_throttle(reserve, txg);
4467258632Savg	if (error != 0)
4468185029Spjd		return (error);
4469185029Spjd
4470185029Spjd	/*
4471168404Spjd	 * Throttle writes when the amount of dirty data in the cache
4472168404Spjd	 * gets too large.  We try to keep the cache less than half full
4473168404Spjd	 * of dirty blocks so that our sync times don't grow too large.
4474168404Spjd	 * Note: if two requests come in concurrently, we might let them
4475168404Spjd	 * both succeed, when one of them should fail.  Not a huge deal.
4476168404Spjd	 */
4477209962Smm
4478209962Smm	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
4479209962Smm	    anon_size > arc_c / 4) {
4480185029Spjd		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
4481185029Spjd		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
4482185029Spjd		    arc_tempreserve>>10,
4483185029Spjd		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
4484185029Spjd		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
4485185029Spjd		    reserve>>10, arc_c>>10);
4486249195Smm		return (SET_ERROR(ERESTART));
4487168404Spjd	}
4488185029Spjd	atomic_add_64(&arc_tempreserve, reserve);
4489168404Spjd	return (0);
4490168404Spjd}
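
/*
 * For illustration only: the throttle above fires only when both the
 * combined dirty load exceeds half of arc_c and the anonymous data alone
 * exceeds a quarter of it.  A standalone predicate with a worked example
 * follows; the name is made up.
 */
#if 0	/* illustrative sketch, not part of the build */
#include <stdint.h>
#include <stdbool.h>

/*
 * Example: arc_c = 1024 MB, anon_size = 300 MB, tempreserve = 150 MB,
 * reserve = 100 MB.  The combined 550 MB exceeds arc_c / 2 (512 MB) and
 * anon_size exceeds arc_c / 4 (256 MB), so the reservation is throttled.
 */
static bool
dirty_throttle(uint64_t reserve, uint64_t tempreserve, uint64_t anon_size,
    uint64_t arc_c_bytes)
{
	return (reserve + tempreserve + anon_size > arc_c_bytes / 2 &&
	    anon_size > arc_c_bytes / 4);
}
#endif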
4491168404Spjd
4492168582Spjdstatic kmutex_t arc_lowmem_lock;
4493168404Spjd#ifdef _KERNEL
4494168566Spjdstatic eventhandler_tag arc_event_lowmem = NULL;
4495168404Spjd
4496168404Spjdstatic void
4497168566Spjdarc_lowmem(void *arg __unused, int howto __unused)
4498168404Spjd{
4499168404Spjd
4500168566Spjd	/* Serialize access via arc_lowmem_lock. */
4501168566Spjd	mutex_enter(&arc_lowmem_lock);
4502219089Spjd	mutex_enter(&arc_reclaim_thr_lock);
4503185029Spjd	needfree = 1;
4504272483Ssmh	DTRACE_PROBE(arc__needfree);
4505168404Spjd	cv_signal(&arc_reclaim_thr_cv);
4506241773Savg
4507241773Savg	/*
4508241773Savg	 * It is unsafe to block here in arbitrary threads, because we can come
4509241773Savg	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
4510241773Savg	 * with ARC reclaim thread.
4511241773Savg	 */
4512241773Savg	if (curproc == pageproc) {
4513241773Savg		while (needfree)
4514241773Savg			msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
4515241773Savg	}
4516219089Spjd	mutex_exit(&arc_reclaim_thr_lock);
4517168566Spjd	mutex_exit(&arc_lowmem_lock);
4518168404Spjd}
4519168404Spjd#endif
4520168404Spjd
4521168404Spjdvoid
4522168404Spjdarc_init(void)
4523168404Spjd{
4524219089Spjd	int i, prefetch_tunable_set = 0;
4525205231Skmacy
4526168404Spjd	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4527168404Spjd	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
4528168566Spjd	mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
4529168404Spjd
4530168404Spjd	/* Convert seconds to clock ticks */
4531168404Spjd	arc_min_prefetch_lifespan = 1 * hz;
4532168404Spjd
4533168404Spjd	/* Start out with 1/8 of all memory */
4534168566Spjd	arc_c = kmem_size() / 8;
4535219089Spjd
4536277300Ssmh#ifdef illumos
4537192360Skmacy#ifdef _KERNEL
4538192360Skmacy	/*
4539192360Skmacy	 * On architectures where the physical memory can be larger
4540192360Skmacy	 * than the addressable space (intel in 32-bit mode), we may
4541192360Skmacy	 * need to limit the cache to 1/8 of VM size.
4542192360Skmacy	 */
4543192360Skmacy	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
4544192360Skmacy#endif
4545277300Ssmh#endif	/* illumos */
4546168566Spjd	/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
4547280822Smav	arc_c_min = MAX(arc_c / 4, 16 << 20);
4548168566Spjd	/* set max to 5/8 of all memory, or all but 1GB, whichever is more */
4549280822Smav	if (arc_c * 8 >= 1 << 30)
4550280822Smav		arc_c_max = (arc_c * 8) - (1 << 30);
4551168404Spjd	else
4552168404Spjd		arc_c_max = arc_c_min;
4553175633Spjd	arc_c_max = MAX(arc_c * 5, arc_c_max);
4554219089Spjd
4555168481Spjd#ifdef _KERNEL
4556168404Spjd	/*
4557168404Spjd	 * Allow the tunables to override our calculations if they are
4558168566Spjd	 * reasonable (i.e. over 16MB)
4559168404Spjd	 */
4560280822Smav	if (zfs_arc_max > 16 << 20 && zfs_arc_max < kmem_size())
4561168404Spjd		arc_c_max = zfs_arc_max;
4562280822Smav	if (zfs_arc_min > 16 << 20 && zfs_arc_min <= arc_c_max)
4563168404Spjd		arc_c_min = zfs_arc_min;
4564168481Spjd#endif
4565219089Spjd
4566168404Spjd	arc_c = arc_c_max;
4567168404Spjd	arc_p = (arc_c >> 1);
4568168404Spjd
4569185029Spjd	/* limit meta-data to 1/4 of the arc capacity */
4570185029Spjd	arc_meta_limit = arc_c_max / 4;
4571185029Spjd
4572185029Spjd	/* Allow the tunable to override if it is reasonable */
4573185029Spjd	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4574185029Spjd		arc_meta_limit = zfs_arc_meta_limit;
4575185029Spjd
4576185029Spjd	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
4577185029Spjd		arc_c_min = arc_meta_limit / 2;
4578185029Spjd
4579275780Sdelphij	if (zfs_arc_meta_min > 0) {
4580275780Sdelphij		arc_meta_min = zfs_arc_meta_min;
4581275780Sdelphij	} else {
4582275780Sdelphij		arc_meta_min = arc_c_min / 2;
4583275780Sdelphij	}
4584275780Sdelphij
4585208373Smm	if (zfs_arc_grow_retry > 0)
4586208373Smm		arc_grow_retry = zfs_arc_grow_retry;
4587208373Smm
4588208373Smm	if (zfs_arc_shrink_shift > 0)
4589208373Smm		arc_shrink_shift = zfs_arc_shrink_shift;
4590208373Smm
4591208373Smm	if (zfs_arc_p_min_shift > 0)
4592208373Smm		arc_p_min_shift = zfs_arc_p_min_shift;
4593208373Smm
4594168404Spjd	/* if kmem_flags are set, lets try to use less memory */
4595168404Spjd	if (kmem_debugging())
4596168404Spjd		arc_c = arc_c / 2;
4597168404Spjd	if (arc_c < arc_c_min)
4598168404Spjd		arc_c = arc_c_min;
4599168404Spjd
4600168473Spjd	zfs_arc_min = arc_c_min;
4601168473Spjd	zfs_arc_max = arc_c_max;
4602168473Spjd
4603168404Spjd	arc_anon = &ARC_anon;
4604168404Spjd	arc_mru = &ARC_mru;
4605168404Spjd	arc_mru_ghost = &ARC_mru_ghost;
4606168404Spjd	arc_mfu = &ARC_mfu;
4607168404Spjd	arc_mfu_ghost = &ARC_mfu_ghost;
4608185029Spjd	arc_l2c_only = &ARC_l2c_only;
4609168404Spjd	arc_size = 0;
4610168404Spjd
4611205231Skmacy	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4612205231Skmacy		mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
4613205231Skmacy		    NULL, MUTEX_DEFAULT, NULL);
4614205231Skmacy		mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
4615205231Skmacy		    NULL, MUTEX_DEFAULT, NULL);
4616205231Skmacy		mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
4617205231Skmacy		    NULL, MUTEX_DEFAULT, NULL);
4618205231Skmacy		mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
4619205231Skmacy		    NULL, MUTEX_DEFAULT, NULL);
4620205231Skmacy		mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
4621205231Skmacy		    NULL, MUTEX_DEFAULT, NULL);
4622205231Skmacy		mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
4623205231Skmacy		    NULL, MUTEX_DEFAULT, NULL);
4624206796Spjd
4625205231Skmacy		list_create(&arc_mru->arcs_lists[i],
4626286570Smav		    sizeof (arc_buf_hdr_t),
4627286570Smav		    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4628205231Skmacy		list_create(&arc_mru_ghost->arcs_lists[i],
4629286570Smav		    sizeof (arc_buf_hdr_t),
4630286570Smav		    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4631205231Skmacy		list_create(&arc_mfu->arcs_lists[i],
4632286570Smav		    sizeof (arc_buf_hdr_t),
4633286570Smav		    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4634205231Skmacy		list_create(&arc_mfu_ghost->arcs_lists[i],
4635286570Smav		    sizeof (arc_buf_hdr_t),
4636286570Smav		    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4640205231Skmacy		list_create(&arc_l2c_only->arcs_lists[i],
4641286570Smav		    sizeof (arc_buf_hdr_t),
4642286570Smav		    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4643205231Skmacy	}
4644168404Spjd
4645168404Spjd	buf_init();
4646168404Spjd
4647168404Spjd	arc_thread_exit = 0;
4648168404Spjd	arc_eviction_list = NULL;
4649168404Spjd	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4650168404Spjd	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4651168404Spjd
4652168404Spjd	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4653168404Spjd	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4654168404Spjd
4655168404Spjd	if (arc_ksp != NULL) {
4656168404Spjd		arc_ksp->ks_data = &arc_stats;
4657168404Spjd		kstat_install(arc_ksp);
4658168404Spjd	}
4659168404Spjd
4660168404Spjd	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4661168404Spjd	    TS_RUN, minclsyspri);
4662168404Spjd
4663168404Spjd#ifdef _KERNEL
4664168566Spjd	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
4665168404Spjd	    EVENTHANDLER_PRI_FIRST);
4666168404Spjd#endif
4667168404Spjd
4668168404Spjd	arc_dead = FALSE;
4669185029Spjd	arc_warm = B_FALSE;
4670168566Spjd
4671258632Savg	/*
4672258632Savg	 * Calculate maximum amount of dirty data per pool.
4673258632Savg	 *
4674258632Savg	 * If it has been set by /etc/system, take that.
4675258632Savg	 * Otherwise, use a percentage of physical memory defined by
4676258632Savg	 * zfs_dirty_data_max_percent (default 10%) with a cap at
4677258632Savg	 * zfs_dirty_data_max_max (default 4GB).
4678258632Savg	 */
4679258632Savg	if (zfs_dirty_data_max == 0) {
4680258632Savg		zfs_dirty_data_max = ptob(physmem) *
4681258632Savg		    zfs_dirty_data_max_percent / 100;
4682258632Savg		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
4683258632Savg		    zfs_dirty_data_max_max);
4684258632Savg	}
4685185029Spjd
4686168566Spjd#ifdef _KERNEL
4687194043Skmacy	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
4688193953Skmacy		prefetch_tunable_set = 1;
4689206796Spjd
4690193878Skmacy#ifdef __i386__
4691193953Skmacy	if (prefetch_tunable_set == 0) {
4692196863Strasz		printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
4693196863Strasz		    "-- to enable,\n");
4694196863Strasz		printf("            add \"vfs.zfs.prefetch_disable=0\" "
4695196863Strasz		    "to /boot/loader.conf.\n");
4696219089Spjd		zfs_prefetch_disable = 1;
4697193878Skmacy	}
4698206796Spjd#else
4699193878Skmacy	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
4700193953Skmacy	    prefetch_tunable_set == 0) {
4701196863Strasz		printf("ZFS NOTICE: Prefetch is disabled by default if less "
4702196941Strasz		    "than 4GB of RAM is present;\n"
4703196863Strasz		    "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
4704196863Strasz		    "to /boot/loader.conf.\n");
4705219089Spjd		zfs_prefetch_disable = 1;
4706193878Skmacy	}
4707206796Spjd#endif
4708175633Spjd	/* Warn about ZFS memory and address space requirements. */
4709168696Spjd	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
4710168987Sbmah		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
4711168987Sbmah		    "expect unstable behavior.\n");
4712175633Spjd	}
4713175633Spjd	if (kmem_size() < 512 * (1 << 20)) {
4714173419Spjd		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
4715168987Sbmah		    "expect unstable behavior.\n");
4716185029Spjd		printf("             Consider tuning vm.kmem_size and "
4717173419Spjd		    "vm.kmem_size_max\n");
4718185029Spjd		printf("             in /boot/loader.conf.\n");
4719168566Spjd	}
4720168566Spjd#endif
4721168404Spjd}
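
/*
 * For illustration only: the default sizing above starts arc_c at 1/8 of
 * kmem, floors arc_c_min at the larger of 1/32 of kmem and 16MB, and lets
 * arc_c_max grow to the larger of 5/8 of kmem and all-but-1GB (before the
 * zfs_arc_* tunables are applied).  A standalone restatement with made-up
 * names follows.
 */
#if 0	/* illustrative sketch, not part of the build */
#include <stdint.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

struct arc_defaults {
	uint64_t	c;	/* initial target */
	uint64_t	c_min;	/* floor */
	uint64_t	c_max;	/* ceiling before tunables apply */
};

/*
 * Example: with 16 GB of kmem, c = 2 GB, c_min = 512 MB and c_max = 15 GB
 * (all-but-1GB wins over 5/8 of kmem, which would be 10 GB).
 */
static struct arc_defaults
arc_default_sizes(uint64_t kmem_bytes)
{
	struct arc_defaults d;

	d.c = kmem_bytes / 8;				/* 1/8 of memory */
	d.c_min = MAX(d.c / 4, (uint64_t)16 << 20);	/* 1/32 or 16MB */
	if (d.c * 8 >= (uint64_t)1 << 30)
		d.c_max = (d.c * 8) - ((uint64_t)1 << 30);	/* all but 1GB */
	else
		d.c_max = d.c_min;
	d.c_max = MAX(d.c * 5, d.c_max);		/* at least 5/8 */
	return (d);
}
#endif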
4722168404Spjd
4723168404Spjdvoid
4724168404Spjdarc_fini(void)
4725168404Spjd{
4726205231Skmacy	int i;
4727206796Spjd
4728168404Spjd	mutex_enter(&arc_reclaim_thr_lock);
4729168404Spjd	arc_thread_exit = 1;
4730168404Spjd	cv_signal(&arc_reclaim_thr_cv);
4731168404Spjd	while (arc_thread_exit != 0)
4732168404Spjd		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4733168404Spjd	mutex_exit(&arc_reclaim_thr_lock);
4734168404Spjd
4735185029Spjd	arc_flush(NULL);
4736168404Spjd
4737168404Spjd	arc_dead = TRUE;
4738168404Spjd
4739168404Spjd	if (arc_ksp != NULL) {
4740168404Spjd		kstat_delete(arc_ksp);
4741168404Spjd		arc_ksp = NULL;
4742168404Spjd	}
4743168404Spjd
4744168404Spjd	mutex_destroy(&arc_eviction_mtx);
4745168404Spjd	mutex_destroy(&arc_reclaim_thr_lock);
4746168404Spjd	cv_destroy(&arc_reclaim_thr_cv);
4747168404Spjd
4748205231Skmacy	for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
4749205231Skmacy		list_destroy(&arc_mru->arcs_lists[i]);
4750205231Skmacy		list_destroy(&arc_mru_ghost->arcs_lists[i]);
4751205231Skmacy		list_destroy(&arc_mfu->arcs_lists[i]);
4752205231Skmacy		list_destroy(&arc_mfu_ghost->arcs_lists[i]);
4753206795Spjd		list_destroy(&arc_l2c_only->arcs_lists[i]);
4754168404Spjd
4755205231Skmacy		mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
4756205231Skmacy		mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
4757205231Skmacy		mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
4758205231Skmacy		mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
4759205231Skmacy		mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
4760206795Spjd		mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
4761205231Skmacy	}
4762206796Spjd
4763168404Spjd	buf_fini();
4764168404Spjd
4765286570Smav	ASSERT0(arc_loaned_bytes);
4766209962Smm
4767168582Spjd	mutex_destroy(&arc_lowmem_lock);
4768168404Spjd#ifdef _KERNEL
4769168566Spjd	if (arc_event_lowmem != NULL)
4770168566Spjd		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
4771168404Spjd#endif
4772168404Spjd}
4773185029Spjd
4774185029Spjd/*
4775185029Spjd * Level 2 ARC
4776185029Spjd *
4777185029Spjd * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
4778185029Spjd * It uses dedicated storage devices to hold cached data, which are populated
4779185029Spjd * using large infrequent writes.  The main role of this cache is to boost
4780185029Spjd * the performance of random read workloads.  The intended L2ARC devices
4781185029Spjd * include short-stroked disks, solid state disks, and other media with
4782185029Spjd * substantially faster read latency than disk.
4783185029Spjd *
4784185029Spjd *                 +-----------------------+
4785185029Spjd *                 |         ARC           |
4786185029Spjd *                 +-----------------------+
4787185029Spjd *                    |         ^     ^
4788185029Spjd *                    |         |     |
4789185029Spjd *      l2arc_feed_thread()    arc_read()
4790185029Spjd *                    |         |     |
4791185029Spjd *                    |  l2arc read   |
4792185029Spjd *                    V         |     |
4793185029Spjd *               +---------------+    |
4794185029Spjd *               |     L2ARC     |    |
4795185029Spjd *               +---------------+    |
4796185029Spjd *                   |    ^           |
4797185029Spjd *          l2arc_write() |           |
4798185029Spjd *                   |    |           |
4799185029Spjd *                   V    |           |
4800185029Spjd *                 +-------+      +-------+
4801185029Spjd *                 | vdev  |      | vdev  |
4802185029Spjd *                 | cache |      | cache |
4803185029Spjd *                 +-------+      +-------+
4804185029Spjd *                 +=========+     .-----.
4805185029Spjd *                 :  L2ARC  :    |-_____-|
4806185029Spjd *                 : devices :    | Disks |
4807185029Spjd *                 +=========+    `-_____-'
4808185029Spjd *
4809185029Spjd * Read requests are satisfied from the following sources, in order:
4810185029Spjd *
4811185029Spjd *	1) ARC
4812185029Spjd *	2) vdev cache of L2ARC devices
4813185029Spjd *	3) L2ARC devices
4814185029Spjd *	4) vdev cache of disks
4815185029Spjd *	5) disks
4816185029Spjd *
4817185029Spjd * Some L2ARC device types exhibit extremely slow write performance.
4818185029Spjd * To accommodate this, there are some significant differences between
4819185029Spjd * the L2ARC and traditional cache design:
4820185029Spjd *
4821185029Spjd * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
4822185029Spjd * the ARC behave as usual, freeing buffers and placing headers on ghost
4823185029Spjd * lists.  The ARC does not send buffers to the L2ARC during eviction as
4824185029Spjd * this would add inflated write latencies for all ARC memory pressure.
4825185029Spjd *
4826185029Spjd * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
4827185029Spjd * It does this by periodically scanning buffers from the eviction-end of
4828185029Spjd * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
4829251478Sdelphij * not already there. It scans until a headroom of buffers is satisfied,
4830251478Sdelphij * which itself is a buffer for ARC eviction. If a compressible buffer is
4831251478Sdelphij * found during scanning and selected for writing to an L2ARC device, we
4832251478Sdelphij * temporarily boost scanning headroom during the next scan cycle to make
4833251478Sdelphij * sure we adapt to compression effects (which might significantly reduce
4834251478Sdelphij * the data volume we write to L2ARC). The thread that does this is
4835185029Spjd * l2arc_feed_thread(), illustrated below; example sizes are included to
4836185029Spjd * provide a better sense of ratio than this diagram:
4837185029Spjd *
4838185029Spjd *	       head -->                        tail
4839185029Spjd *	        +---------------------+----------+
4840185029Spjd *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
4841185029Spjd *	        +---------------------+----------+   |   o L2ARC eligible
4842185029Spjd *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
4843185029Spjd *	        +---------------------+----------+   |
4844185029Spjd *	             15.9 Gbytes      ^ 32 Mbytes    |
4845185029Spjd *	                           headroom          |
4846185029Spjd *	                                      l2arc_feed_thread()
4847185029Spjd *	                                             |
4848185029Spjd *	                 l2arc write hand <--[oooo]--'
4849185029Spjd *	                         |           8 Mbyte
4850185029Spjd *	                         |          write max
4851185029Spjd *	                         V
4852185029Spjd *		  +==============================+
4853185029Spjd *	L2ARC dev |####|#|###|###|    |####| ... |
4854185029Spjd *	          +==============================+
4855185029Spjd *	                     32 Gbytes
4856185029Spjd *
4857185029Spjd * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
4858185029Spjd * evicted, then the L2ARC has cached a buffer much sooner than it probably
4859185029Spjd * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
4860185029Spjd * safe to say that this is an uncommon case, since buffers at the end of
4861185029Spjd * the ARC lists have moved there due to inactivity.
4862185029Spjd *
4863185029Spjd * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
4864185029Spjd * then the L2ARC simply misses copying some buffers.  This serves as a
4865185029Spjd * pressure valve to prevent heavy read workloads from both stalling the ARC
4866185029Spjd * with waits and clogging the L2ARC with writes.  This also helps prevent
4867185029Spjd * the potential for the L2ARC to churn if it attempts to cache content too
4868185029Spjd * quickly, such as during backups of the entire pool.
4869185029Spjd *
4870185029Spjd * 5. After system boot and before the ARC has filled main memory, there are
4871185029Spjd * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
4872185029Spjd * lists can remain mostly static.  Instead of searching from tail of these
4873185029Spjd * lists as pictured, the l2arc_feed_thread() will search from the list heads
4874185029Spjd * for eligible buffers, greatly increasing its chance of finding them.
4875185029Spjd *
4876185029Spjd * The L2ARC device write speed is also boosted during this time so that
4877185029Spjd * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
4878185029Spjd * there are no L2ARC reads, and no fear of degrading read performance
4879185029Spjd * through increased writes.
4880185029Spjd *
4881185029Spjd * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
4882185029Spjd * the vdev queue can aggregate them into larger and fewer writes.  Each
4883185029Spjd * device is written to in a rotor fashion, sweeping writes through
4884185029Spjd * available space then repeating.
4885185029Spjd *
4886185029Spjd * 7. The L2ARC does not store dirty content.  It never needs to flush
4887185029Spjd * write buffers back to disk based storage.
4888185029Spjd *
4889185029Spjd * 8. If an ARC buffer is written (and dirtied) which also exists in the
4890185029Spjd * L2ARC, the now stale L2ARC buffer is immediately dropped.
4891185029Spjd *
4892185029Spjd * The performance of the L2ARC can be tweaked by a number of tunables, which
4893185029Spjd * may be necessary for different workloads:
4894185029Spjd *
4895185029Spjd *	l2arc_write_max		max write bytes per interval
4896185029Spjd *	l2arc_write_boost	extra write bytes during device warmup
4897185029Spjd *	l2arc_noprefetch	skip caching prefetched buffers
4898185029Spjd *	l2arc_headroom		number of max device writes to precache
4899251478Sdelphij *	l2arc_headroom_boost	when we find compressed buffers during ARC
4900251478Sdelphij *				scanning, we multiply headroom by this
4901251478Sdelphij *				percentage factor for the next scan cycle,
4902251478Sdelphij *				since more compressed buffers are likely to
4903251478Sdelphij *				be present
4904185029Spjd *	l2arc_feed_secs		seconds between L2ARC writing
4905185029Spjd *
4906185029Spjd * Tunables may be removed or added as future performance improvements are
4907185029Spjd * integrated, and also may become zpool properties.
4908208373Smm *
4909208373Smm * There are three key functions that control how the L2ARC warms up:
4910208373Smm *
4911208373Smm *	l2arc_write_eligible()	check if a buffer is eligible to cache
4912208373Smm *	l2arc_write_size()	calculate how much to write
4913208373Smm *	l2arc_write_interval()	calculate sleep delay between writes
4914208373Smm *
4915208373Smm * These three functions determine what to write, how much, and how quickly
4916208373Smm * to send writes.
4917185029Spjd */
4918185029Spjd
4919208373Smmstatic boolean_t
4920275811Sdelphijl2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
4921208373Smm{
4922208373Smm	/*
4923208373Smm	 * A buffer is *not* eligible for the L2ARC if it:
4924208373Smm	 * 1. belongs to a different spa.
4925208373Smm	 * 2. is already cached on the L2ARC.
4926208373Smm	 * 3. has an I/O in progress (it may be an incomplete read).
4927208373Smm	 * 4. is flagged not eligible (zfs property).
4928208373Smm	 */
4929275811Sdelphij	if (hdr->b_spa != spa_guid) {
4930208373Smm		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
4931208373Smm		return (B_FALSE);
4932208373Smm	}
4933286570Smav	if (HDR_HAS_L2HDR(hdr)) {
4934208373Smm		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
4935208373Smm		return (B_FALSE);
4936208373Smm	}
4937275811Sdelphij	if (HDR_IO_IN_PROGRESS(hdr)) {
4938208373Smm		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
4939208373Smm		return (B_FALSE);
4940208373Smm	}
4941275811Sdelphij	if (!HDR_L2CACHE(hdr)) {
4942208373Smm		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
4943208373Smm		return (B_FALSE);
4944208373Smm	}
4945208373Smm
4946208373Smm	return (B_TRUE);
4947208373Smm}
4948208373Smm
4949208373Smmstatic uint64_t
4950251478Sdelphijl2arc_write_size(void)
4951208373Smm{
4952208373Smm	uint64_t size;
4953208373Smm
4954251478Sdelphij	/*
4955251478Sdelphij	 * Make sure our globals have meaningful values in case the user
4956251478Sdelphij	 * altered them.
4957251478Sdelphij	 */
4958251478Sdelphij	size = l2arc_write_max;
4959251478Sdelphij	if (size == 0) {
4960251478Sdelphij		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
4961251478Sdelphij		    "be greater than zero, resetting it to the default (%d)",
4962251478Sdelphij		    L2ARC_WRITE_SIZE);
4963251478Sdelphij		size = l2arc_write_max = L2ARC_WRITE_SIZE;
4964251478Sdelphij	}
4965208373Smm
4966208373Smm	if (arc_warm == B_FALSE)
4967251478Sdelphij		size += l2arc_write_boost;
4968208373Smm
4969208373Smm	return (size);
4970208373Smm
4971208373Smm}
4972208373Smm
4973208373Smmstatic clock_t
4974208373Smml2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
4975208373Smm{
4976219089Spjd	clock_t interval, next, now;
4977208373Smm
4978208373Smm	/*
4979208373Smm	 * If the ARC lists are busy, increase our write rate; if the
4980208373Smm	 * lists are stale, idle back.  This is achieved by checking
4981208373Smm	 * how much we previously wrote - if it was more than half of
4982208373Smm	 * what we wanted, schedule the next write much sooner.
4983208373Smm	 */
4984208373Smm	if (l2arc_feed_again && wrote > (wanted / 2))
4985208373Smm		interval = (hz * l2arc_feed_min_ms) / 1000;
4986208373Smm	else
4987208373Smm		interval = hz * l2arc_feed_secs;
4988208373Smm
4989219089Spjd	now = ddi_get_lbolt();
4990219089Spjd	next = MAX(now, MIN(now + interval, began + interval));
4991208373Smm
4992208373Smm	return (next);
4993208373Smm}
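
/*
 * For illustration only: the scheduling above keys off how much of the
 * wanted byte count the previous pass actually wrote.  A standalone
 * restatement follows; hz, feed_secs, feed_min_ms and feed_again stand in
 * for the real tunables and the names are made up.  Example with hz = 1000,
 * feed_secs = 1, feed_min_ms = 200: a pass that wrote more than half of
 * what it wanted reschedules after 200 ticks, otherwise after 1000 ticks,
 * and the wakeup never lands before 'now' nor after 'began + interval'.
 */
#if 0	/* illustrative sketch, not part of the build */
#include <stdint.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	MAX(a, b)	((a) > (b) ? (a) : (b))

static int64_t
feed_next_tick(int64_t began, int64_t now, uint64_t wanted, uint64_t wrote,
    int hz, int feed_secs, int feed_min_ms, int feed_again)
{
	int64_t interval;

	if (feed_again && wrote > (wanted / 2))
		interval = ((int64_t)hz * feed_min_ms) / 1000;
	else
		interval = (int64_t)hz * feed_secs;

	return (MAX(now, MIN(now + interval, began + interval)));
}
#endif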
4994208373Smm
4995185029Spjd/*
4996185029Spjd * Cycle through L2ARC devices.  This is how L2ARC load balances.
4997185029Spjd * If a device is returned, this also returns holding the spa config lock.
4998185029Spjd */
4999185029Spjdstatic l2arc_dev_t *
5000185029Spjdl2arc_dev_get_next(void)
5001185029Spjd{
5002185029Spjd	l2arc_dev_t *first, *next = NULL;
5003185029Spjd
5004185029Spjd	/*
5005185029Spjd	 * Lock out the removal of spas (spa_namespace_lock), then removal
5006185029Spjd	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
5007185029Spjd	 * both locks will be dropped and a spa config lock held instead.
5008185029Spjd	 */
5009185029Spjd	mutex_enter(&spa_namespace_lock);
5010185029Spjd	mutex_enter(&l2arc_dev_mtx);
5011185029Spjd
5012185029Spjd	/* if there are no vdevs, there is nothing to do */
5013185029Spjd	if (l2arc_ndev == 0)
5014185029Spjd		goto out;
5015185029Spjd
5016185029Spjd	first = NULL;
5017185029Spjd	next = l2arc_dev_last;
5018185029Spjd	do {
5019185029Spjd		/* loop around the list looking for a non-faulted vdev */
5020185029Spjd		if (next == NULL) {
5021185029Spjd			next = list_head(l2arc_dev_list);
5022185029Spjd		} else {
5023185029Spjd			next = list_next(l2arc_dev_list, next);
5024185029Spjd			if (next == NULL)
5025185029Spjd				next = list_head(l2arc_dev_list);
5026185029Spjd		}
5027185029Spjd
5028185029Spjd		/* if we have come back to the start, bail out */
5029185029Spjd		if (first == NULL)
5030185029Spjd			first = next;
5031185029Spjd		else if (next == first)
5032185029Spjd			break;
5033185029Spjd
5034185029Spjd	} while (vdev_is_dead(next->l2ad_vdev));
5035185029Spjd
5036185029Spjd	/* if we were unable to find any usable vdevs, return NULL */
5037185029Spjd	if (vdev_is_dead(next->l2ad_vdev))
5038185029Spjd		next = NULL;
5039185029Spjd
5040185029Spjd	l2arc_dev_last = next;
5041185029Spjd
5042185029Spjdout:
5043185029Spjd	mutex_exit(&l2arc_dev_mtx);
5044185029Spjd
5045185029Spjd	/*
5046185029Spjd	 * Grab the config lock to prevent the 'next' device from being
5047185029Spjd	 * removed while we are writing to it.
5048185029Spjd	 */
5049185029Spjd	if (next != NULL)
5050185029Spjd		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
5051185029Spjd	mutex_exit(&spa_namespace_lock);
5052185029Spjd
5053185029Spjd	return (next);
5054185029Spjd}
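
/*
 * For illustration only: the rotor above resumes after the last device
 * used, wraps around the list once, and gives up if it returns to its
 * starting point without finding a healthy vdev.  An array-based
 * restatement with made-up names follows.
 */
#if 0	/* illustrative sketch, not part of the build */
#include <stdbool.h>
#include <stddef.h>

struct cache_dev {
	bool	dead;
};

/*
 * Pick the next usable device after '*lastp', wrapping around the array.
 * Returns NULL and leaves *lastp untouched when no usable device exists.
 */
static struct cache_dev *
next_cache_dev(struct cache_dev *devs, size_t ndevs, size_t *lastp)
{
	size_t i, idx;

	for (i = 1; i <= ndevs; i++) {
		idx = (*lastp + i) % ndevs;
		if (!devs[idx].dead) {
			*lastp = idx;
			return (&devs[idx]);
		}
	}
	return (NULL);
}
#endif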
5055185029Spjd
5056185029Spjd/*
5057185029Spjd * Free buffers that were tagged for destruction.
5058185029Spjd */
5059185029Spjdstatic void
5060185029Spjdl2arc_do_free_on_write()
5061185029Spjd{
5062185029Spjd	list_t *buflist;
5063185029Spjd	l2arc_data_free_t *df, *df_prev;
5064185029Spjd
5065185029Spjd	mutex_enter(&l2arc_free_on_write_mtx);
5066185029Spjd	buflist = l2arc_free_on_write;
5067185029Spjd
5068185029Spjd	for (df = list_tail(buflist); df; df = df_prev) {
5069185029Spjd		df_prev = list_prev(buflist, df);
5070185029Spjd		ASSERT(df->l2df_data != NULL);
5071185029Spjd		ASSERT(df->l2df_func != NULL);
5072185029Spjd		df->l2df_func(df->l2df_data, df->l2df_size);
5073185029Spjd		list_remove(buflist, df);
5074185029Spjd		kmem_free(df, sizeof (l2arc_data_free_t));
5075185029Spjd	}
5076185029Spjd
5077185029Spjd	mutex_exit(&l2arc_free_on_write_mtx);
5078185029Spjd}
5079185029Spjd
5080185029Spjd/*
5081185029Spjd * A write to a cache device has completed.  Update all headers to allow
5082185029Spjd * reads from these buffers to begin.
5083185029Spjd */
5084185029Spjdstatic void
5085185029Spjdl2arc_write_done(zio_t *zio)
5086185029Spjd{
5087185029Spjd	l2arc_write_callback_t *cb;
5088185029Spjd	l2arc_dev_t *dev;
5089185029Spjd	list_t *buflist;
5090275811Sdelphij	arc_buf_hdr_t *head, *hdr, *hdr_prev;
5091185029Spjd	kmutex_t *hash_lock;
5092268085Sdelphij	int64_t bytes_dropped = 0;
5093185029Spjd
5094185029Spjd	cb = zio->io_private;
5095185029Spjd	ASSERT(cb != NULL);
5096185029Spjd	dev = cb->l2wcb_dev;
5097185029Spjd	ASSERT(dev != NULL);
5098185029Spjd	head = cb->l2wcb_head;
5099185029Spjd	ASSERT(head != NULL);
5100286570Smav	buflist = &dev->l2ad_buflist;
5101185029Spjd	ASSERT(buflist != NULL);
5102185029Spjd	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
5103185029Spjd	    l2arc_write_callback_t *, cb);
5104185029Spjd
5105185029Spjd	if (zio->io_error != 0)
5106185029Spjd		ARCSTAT_BUMP(arcstat_l2_writes_error);
5107185029Spjd
5108286570Smav	mutex_enter(&dev->l2ad_mtx);
5109185029Spjd
5110185029Spjd	/*
5111185029Spjd	 * All writes completed, or an error was hit.
5112185029Spjd	 */
5113275811Sdelphij	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
5114275811Sdelphij		hdr_prev = list_prev(buflist, hdr);
5115185029Spjd
5116275811Sdelphij		hash_lock = HDR_LOCK(hdr);
5117185029Spjd		if (!mutex_tryenter(hash_lock)) {
5118185029Spjd			/*
5119185029Spjd			 * This buffer misses out.  It may be in a stage
5120286570Smav			 * of eviction.  Its ARC_FLAG_L2_WRITING flag will be
5121185029Spjd			 * left set, denying reads to this buffer.
5122185029Spjd			 */
5123185029Spjd			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
5124185029Spjd			continue;
5125185029Spjd		}
5126185029Spjd
5127286570Smav		/*
5128286570Smav		 * It's possible that this buffer got evicted from the L1 cache
5129286570Smav		 * before we grabbed the vdev + hash locks, in which case
5130286570Smav		 * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated.
5131286570Smav		 * Only free the buffer if we still have an L1 hdr.
5132286570Smav		 */
5133286570Smav		if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL &&
5134286570Smav		    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
5135286570Smav			l2arc_release_cdata_buf(hdr);
5136286570Smav
5137185029Spjd		if (zio->io_error != 0) {
5138185029Spjd			/*
5139185029Spjd			 * Error - drop L2ARC entry.
5140185029Spjd			 */
5141286570Smav			trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev,
5142286570Smav			    hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0);
5143286570Smav			hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
5144286570Smav
5145286570Smav			ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
5146275811Sdelphij			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
5147185029Spjd		}
5148185029Spjd
5149185029Spjd		/*
5150185029Spjd		 * Allow ARC to begin reads to this L2ARC entry.
5151185029Spjd		 */
5152275811Sdelphij		hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
5153185029Spjd
5154185029Spjd		mutex_exit(hash_lock);
5155185029Spjd	}
5156185029Spjd
5157185029Spjd	atomic_inc_64(&l2arc_writes_done);
5158185029Spjd	list_remove(buflist, head);
5159286570Smav	ASSERT(!HDR_HAS_L1HDR(head));
5160286570Smav	kmem_cache_free(hdr_l2only_cache, head);
5161286570Smav	mutex_exit(&dev->l2ad_mtx);
5162185029Spjd
5163268085Sdelphij	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
5164268085Sdelphij
5165185029Spjd	l2arc_do_free_on_write();
5166185029Spjd
5167185029Spjd	kmem_free(cb, sizeof (l2arc_write_callback_t));
5168185029Spjd}
5169185029Spjd
5170185029Spjd/*
5171185029Spjd * A read to a cache device completed.  Validate buffer contents before
5172185029Spjd * handing over to the regular ARC routines.
5173185029Spjd */
5174185029Spjdstatic void
5175185029Spjdl2arc_read_done(zio_t *zio)
5176185029Spjd{
5177185029Spjd	l2arc_read_callback_t *cb;
5178185029Spjd	arc_buf_hdr_t *hdr;
5179185029Spjd	arc_buf_t *buf;
5180185029Spjd	kmutex_t *hash_lock;
5181185029Spjd	int equal;
5182185029Spjd
5183185029Spjd	ASSERT(zio->io_vd != NULL);
5184185029Spjd	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
5185185029Spjd
5186185029Spjd	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
5187185029Spjd
5188185029Spjd	cb = zio->io_private;
5189185029Spjd	ASSERT(cb != NULL);
5190185029Spjd	buf = cb->l2rcb_buf;
5191185029Spjd	ASSERT(buf != NULL);
5192185029Spjd
5193219089Spjd	hash_lock = HDR_LOCK(buf->b_hdr);
5194185029Spjd	mutex_enter(hash_lock);
5195219089Spjd	hdr = buf->b_hdr;
5196219089Spjd	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
5197185029Spjd
5198185029Spjd	/*
5199251478Sdelphij	 * If the buffer was compressed, decompress it first.
5200251478Sdelphij	 */
5201251478Sdelphij	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
5202251478Sdelphij		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
5203251478Sdelphij	ASSERT(zio->io_data != NULL);
5204251478Sdelphij
5205251478Sdelphij	/*
5206185029Spjd	 * Check this survived the L2ARC journey.
5207185029Spjd	 */
5208185029Spjd	equal = arc_cksum_equal(buf);
5209185029Spjd	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
5210185029Spjd		mutex_exit(hash_lock);
5211185029Spjd		zio->io_private = buf;
5212185029Spjd		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
5213185029Spjd		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
5214185029Spjd		arc_read_done(zio);
5215185029Spjd	} else {
5216185029Spjd		mutex_exit(hash_lock);
5217185029Spjd		/*
5218185029Spjd		 * Buffer didn't survive caching.  Increment stats and
5219185029Spjd		 * reissue to the original storage device.
5220185029Spjd		 */
5221185029Spjd		if (zio->io_error != 0) {
5222185029Spjd			ARCSTAT_BUMP(arcstat_l2_io_error);
5223185029Spjd		} else {
5224249195Smm			zio->io_error = SET_ERROR(EIO);
5225185029Spjd		}
5226185029Spjd		if (!equal)
5227185029Spjd			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
5228185029Spjd
5229185029Spjd		/*
5230185029Spjd		 * If there's no waiter, issue an async i/o to the primary
5231185029Spjd		 * storage now.  If there *is* a waiter, the caller must
5232185029Spjd		 * issue the i/o in a context where it's OK to block.
5233185029Spjd		 */
5234209962Smm		if (zio->io_waiter == NULL) {
5235209962Smm			zio_t *pio = zio_unique_parent(zio);
5236209962Smm
5237209962Smm			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
5238209962Smm
5239209962Smm			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
5240185029Spjd			    buf->b_data, zio->io_size, arc_read_done, buf,
5241185029Spjd			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
5242209962Smm		}
5243185029Spjd	}
5244185029Spjd
5245185029Spjd	kmem_free(cb, sizeof (l2arc_read_callback_t));
5246185029Spjd}
5247185029Spjd
5248185029Spjd/*
5249185029Spjd * This is the list priority from which the L2ARC will search for pages to
5250185029Spjd * cache.  This is used within loops (0 .. 2 * ARC_BUFC_NUMLISTS - 1) to
5251185029Spjd * cycle through lists in the desired order.  This order can have a
5252185029Spjd * significant effect on cache performance.
5253185029Spjd *
5254185029Spjd * Currently the metadata lists are hit first, MFU then MRU, followed by
5255185029Spjd * the data lists.  This function returns a locked list, and stores a
5256185029Spjd * pointer to that list's lock in *lock.
5257185029Spjd */
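/*
 * A rough sketch of how list_num maps onto the ARC lists below, derived
 * from the branches in l2arc_list_locked() and assuming the
 * ARC_BUFC_NUM*LISTS constants defined earlier in this file:
 *
 *	[0, NUMMETADATALISTS)				MFU metadata lists
 *	[NUMMETADATALISTS, 2 * NUMMETADATALISTS)	MRU metadata lists
 *	[2 * NUMMETADATALISTS,
 *	    2 * NUMMETADATALISTS + NUMDATALISTS)	MFU data lists
 *	remaining values				MRU data lists
 *
 * The list is returned with its lock held; the caller drops *lock with
 * mutex_exit() when it has finished scanning (as l2arc_write_buffers()
 * does below).
 */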
5258185029Spjdstatic list_t *
5259185029Spjdl2arc_list_locked(int list_num, kmutex_t **lock)
5260185029Spjd{
5261247187Smm	list_t *list = NULL;
5262205231Skmacy	int idx;
5263185029Spjd
5264206796Spjd	ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
5265206796Spjd
5266205231Skmacy	if (list_num < ARC_BUFC_NUMMETADATALISTS) {
5267205231Skmacy		idx = list_num;
5268205231Skmacy		list = &arc_mfu->arcs_lists[idx];
5269205231Skmacy		*lock = ARCS_LOCK(arc_mfu, idx);
5270206796Spjd	} else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
5271205231Skmacy		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
5272205231Skmacy		list = &arc_mru->arcs_lists[idx];
5273205231Skmacy		*lock = ARCS_LOCK(arc_mru, idx);
5274206796Spjd	} else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
5275205231Skmacy	    ARC_BUFC_NUMDATALISTS)) {
5276205231Skmacy		idx = list_num - ARC_BUFC_NUMMETADATALISTS;
5277205231Skmacy		list = &arc_mfu->arcs_lists[idx];
5278205231Skmacy		*lock = ARCS_LOCK(arc_mfu, idx);
5279205231Skmacy	} else {
5280205231Skmacy		idx = list_num - ARC_BUFC_NUMLISTS;
5281205231Skmacy		list = &arc_mru->arcs_lists[idx];
5282205231Skmacy		*lock = ARCS_LOCK(arc_mru, idx);
5283185029Spjd	}
5284185029Spjd
5285185029Spjd	ASSERT(!(MUTEX_HELD(*lock)));
5286185029Spjd	mutex_enter(*lock);
5287185029Spjd	return (list);
5288185029Spjd}
5289185029Spjd
5290185029Spjd/*
5291185029Spjd * Evict buffers from the device write hand to the distance specified in
5292185029Spjd * bytes.  This distance may span populated buffers, or it may span nothing.
5293185029Spjd * This is clearing a region on the L2ARC device ready for writing.
5294185029Spjd * If the 'all' boolean is set, every buffer is evicted.
5295185029Spjd */
5296185029Spjdstatic void
5297185029Spjdl2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
5298185029Spjd{
5299185029Spjd	list_t *buflist;
5300275811Sdelphij	arc_buf_hdr_t *hdr, *hdr_prev;
5301185029Spjd	kmutex_t *hash_lock;
5302185029Spjd	uint64_t taddr;
5303268085Sdelphij	int64_t bytes_evicted = 0;
5304185029Spjd
5305286570Smav	buflist = &dev->l2ad_buflist;
5306185029Spjd
5307185029Spjd	if (!all && dev->l2ad_first) {
5308185029Spjd		/*
5309185029Spjd		 * This is the first sweep through the device.  There is
5310185029Spjd		 * nothing to evict.
5311185029Spjd		 */
5312185029Spjd		return;
5313185029Spjd	}
5314185029Spjd
5315185029Spjd	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
5316185029Spjd		/*
5317185029Spjd		 * When nearing the end of the device, evict to the end
5318185029Spjd		 * before the device write hand jumps to the start.
5319185029Spjd		 */
5320185029Spjd		taddr = dev->l2ad_end;
5321185029Spjd	} else {
5322185029Spjd		taddr = dev->l2ad_hand + distance;
5323185029Spjd	}
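	/*
	 * Illustration with made-up numbers: if l2ad_hand is at 900 MB,
	 * l2ad_end at 1000 MB and distance is 64 MB, the hand is within
	 * 2 * distance of the end, so taddr is set to l2ad_end and we
	 * evict all the way to the end before the hand wraps; otherwise
	 * taddr would simply be 900 MB + 64 MB = 964 MB.
	 */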
5324185029Spjd	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
5325185029Spjd	    uint64_t, taddr, boolean_t, all);
5326185029Spjd
5327185029Spjdtop:
5328286570Smav	mutex_enter(&dev->l2ad_mtx);
5329275811Sdelphij	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
5330275811Sdelphij		hdr_prev = list_prev(buflist, hdr);
5331185029Spjd
5332275811Sdelphij		hash_lock = HDR_LOCK(hdr);
5333185029Spjd		if (!mutex_tryenter(hash_lock)) {
5334185029Spjd			/*
5335185029Spjd			 * Missed the hash lock.  Retry.
5336185029Spjd			 */
5337185029Spjd			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
5338286570Smav			mutex_exit(&dev->l2ad_mtx);
5339185029Spjd			mutex_enter(hash_lock);
5340185029Spjd			mutex_exit(hash_lock);
5341185029Spjd			goto top;
5342185029Spjd		}
5343185029Spjd
5344275811Sdelphij		if (HDR_L2_WRITE_HEAD(hdr)) {
5345185029Spjd			/*
5346185029Spjd			 * We hit a write head node.  Leave it for
5347185029Spjd			 * l2arc_write_done().
5348185029Spjd			 */
5349275811Sdelphij			list_remove(buflist, hdr);
5350185029Spjd			mutex_exit(hash_lock);
5351185029Spjd			continue;
5352185029Spjd		}
5353185029Spjd
5354286570Smav		if (!all && HDR_HAS_L2HDR(hdr) &&
5355286570Smav		    (hdr->b_l2hdr.b_daddr > taddr ||
5356286570Smav		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
5357185029Spjd			/*
5358185029Spjd			 * We've evicted to the target address,
5359185029Spjd			 * or the end of the device.
5360185029Spjd			 */
5361185029Spjd			mutex_exit(hash_lock);
5362185029Spjd			break;
5363185029Spjd		}
5364185029Spjd
5365286570Smav		ASSERT(HDR_HAS_L2HDR(hdr));
5366286570Smav		if (!HDR_HAS_L1HDR(hdr)) {
5367275811Sdelphij			ASSERT(!HDR_L2_READING(hdr));
5368185029Spjd			/*
5369185029Spjd			 * This doesn't exist in the ARC.  Destroy.
5370185029Spjd			 * arc_hdr_destroy() will call list_remove()
5371185029Spjd			 * and decrement arcstat_l2_size.
5372185029Spjd			 */
5373275811Sdelphij			arc_change_state(arc_anon, hdr, hash_lock);
5374275811Sdelphij			arc_hdr_destroy(hdr);
5375185029Spjd		} else {
5376286570Smav			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
5377286570Smav			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
5378185029Spjd			/*
5379185029Spjd			 * Invalidate issued or about to be issued
5380185029Spjd			 * reads, since we may be about to write
5381185029Spjd			 * over this location.
5382185029Spjd			 */
5383275811Sdelphij			if (HDR_L2_READING(hdr)) {
5384185029Spjd				ARCSTAT_BUMP(arcstat_l2_evict_reading);
5385275811Sdelphij				hdr->b_flags |= ARC_FLAG_L2_EVICTED;
5386185029Spjd			}
5387185029Spjd
5388286570Smav			/* Tell ARC this no longer exists in L2ARC. */
5389286570Smav			ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
5390286570Smav			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
5391286570Smav			hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
5392275811Sdelphij			list_remove(buflist, hdr);
5393185029Spjd
5394286570Smav			/* This may have been leftover after a failed write. */
5395275811Sdelphij			hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
5396185029Spjd		}
5397185029Spjd		mutex_exit(hash_lock);
5398185029Spjd	}
5399286570Smav	mutex_exit(&dev->l2ad_mtx);
5400185029Spjd
5401268085Sdelphij	vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0);
5402185029Spjd	dev->l2ad_evict = taddr;
5403185029Spjd}
5404185029Spjd
5405185029Spjd/*
5406185029Spjd * Find and write ARC buffers to the L2ARC device.
5407185029Spjd *
5408275811Sdelphij * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
5409185029Spjd * for reading until they have completed writing.
5410251478Sdelphij * The headroom_boost is an in-out parameter used to maintain headroom boost
5411251478Sdelphij * state between calls to this function.
5412251478Sdelphij *
5413251478Sdelphij * Returns the number of bytes actually written (which may be smaller than
5414251478Sdelphij * the delta by which the device hand has changed due to alignment).
5415185029Spjd */
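/*
 * Rough outline of the two passes below (a sketch, not a contract):
 * pass one walks the ARC MFU/MRU lists under their list locks, selects
 * eligible buffers up to target_sz (bounded by the per-list headroom),
 * marks them ARC_FLAG_L2_WRITING, stashes their data pointer in
 * b_l1hdr.b_tmp_cdata and links them onto dev->l2ad_buflist behind a
 * dummy "write head" header; pass two walks that buflist back from the
 * head, optionally compresses each buffer, and issues the physical
 * writes, advancing l2ad_hand by the device-aligned size of each one.
 */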
5416208373Smmstatic uint64_t
5417251478Sdelphijl2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
5418251478Sdelphij    boolean_t *headroom_boost)
5419185029Spjd{
5420275811Sdelphij	arc_buf_hdr_t *hdr, *hdr_prev, *head;
5421185029Spjd	list_t *list;
5422251478Sdelphij	uint64_t write_asize, write_psize, write_sz, headroom,
5423251478Sdelphij	    buf_compress_minsz;
5424185029Spjd	void *buf_data;
5425251478Sdelphij	kmutex_t *list_lock;
5426251478Sdelphij	boolean_t full;
5427185029Spjd	l2arc_write_callback_t *cb;
5428185029Spjd	zio_t *pio, *wzio;
5429228103Smm	uint64_t guid = spa_load_guid(spa);
5430251478Sdelphij	const boolean_t do_headroom_boost = *headroom_boost;
5431185029Spjd	int try;
5432185029Spjd
5433185029Spjd	ASSERT(dev->l2ad_vdev != NULL);
5434185029Spjd
5435251478Sdelphij	/* Lower the flag now, we might want to raise it again later. */
5436251478Sdelphij	*headroom_boost = B_FALSE;
5437251478Sdelphij
5438185029Spjd	pio = NULL;
5439251478Sdelphij	write_sz = write_asize = write_psize = 0;
5440185029Spjd	full = B_FALSE;
5441286570Smav	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
5442275811Sdelphij	head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
5443286570Smav	head->b_flags |= ARC_FLAG_HAS_L2HDR;
5444185029Spjd
5445205231Skmacy	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
5446185029Spjd	/*
5447251478Sdelphij	 * We will want to try to compress buffers that are at least 2x the
5448251478Sdelphij	 * device sector size.
5449251478Sdelphij	 */
5450251478Sdelphij	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
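	/*
	 * For example (illustrative values only): with ashift 9 (512-byte
	 * sectors) this is 2 << 9 = 1024 bytes, and with ashift 12 (4 KB
	 * sectors) it is 2 << 12 = 8192 bytes.
	 */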
5451251478Sdelphij
5452251478Sdelphij	/*
5453185029Spjd	 * Copy buffers for L2ARC writing.
5454185029Spjd	 */
5455286570Smav	mutex_enter(&dev->l2ad_mtx);
5456206796Spjd	for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
5457251478Sdelphij		uint64_t passed_sz = 0;
5458251478Sdelphij
5459185029Spjd		list = l2arc_list_locked(try, &list_lock);
5460205231Skmacy		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
5461185029Spjd
5462185029Spjd		/*
5463185029Spjd		 * L2ARC fast warmup.
5464185029Spjd		 *
5465185029Spjd		 * Until the ARC is warm and starts to evict, read from the
5466185029Spjd		 * head of the ARC lists rather than the tail.
5467185029Spjd		 */
5468185029Spjd		if (arc_warm == B_FALSE)
5469275811Sdelphij			hdr = list_head(list);
5470185029Spjd		else
5471275811Sdelphij			hdr = list_tail(list);
5472275811Sdelphij		if (hdr == NULL)
5473205231Skmacy			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
5474185029Spjd
5475272708Savg		headroom = target_sz * l2arc_headroom * 2 / ARC_BUFC_NUMLISTS;
5476251478Sdelphij		if (do_headroom_boost)
5477251478Sdelphij			headroom = (headroom * l2arc_headroom_boost) / 100;
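		/*
		 * Worked example with hypothetical numbers: target_sz of
		 * 8 MB, l2arc_headroom of 2 and ARC_BUFC_NUMLISTS of 32
		 * gives 8 MB * 2 * 2 / 32 = 1 MB of headroom per list;
		 * with an l2arc_headroom_boost of 200 that becomes
		 * (1 MB * 200) / 100 = 2 MB.
		 */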
5478251478Sdelphij
5479275811Sdelphij		for (; hdr; hdr = hdr_prev) {
5480251478Sdelphij			kmutex_t *hash_lock;
5481251478Sdelphij			uint64_t buf_sz;
5482251478Sdelphij
5483185029Spjd			if (arc_warm == B_FALSE)
5484275811Sdelphij				hdr_prev = list_next(list, hdr);
5485185029Spjd			else
5486275811Sdelphij				hdr_prev = list_prev(list, hdr);
5487275811Sdelphij			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size);
5488206796Spjd
5489275811Sdelphij			hash_lock = HDR_LOCK(hdr);
5490251478Sdelphij			if (!mutex_tryenter(hash_lock)) {
5491205231Skmacy				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
5492185029Spjd				/*
5493185029Spjd				 * Skip this buffer rather than waiting.
5494185029Spjd				 */
5495185029Spjd				continue;
5496185029Spjd			}
5497185029Spjd
5498275811Sdelphij			passed_sz += hdr->b_size;
5499185029Spjd			if (passed_sz > headroom) {
5500185029Spjd				/*
5501185029Spjd				 * Searched too far.
5502185029Spjd				 */
5503185029Spjd				mutex_exit(hash_lock);
5504205231Skmacy				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
5505185029Spjd				break;
5506185029Spjd			}
5507185029Spjd
5508275811Sdelphij			if (!l2arc_write_eligible(guid, hdr)) {
5509185029Spjd				mutex_exit(hash_lock);
5510185029Spjd				continue;
5511185029Spjd			}
5512185029Spjd
5513275811Sdelphij			if ((write_sz + hdr->b_size) > target_sz) {
5514185029Spjd				full = B_TRUE;
5515185029Spjd				mutex_exit(hash_lock);
5516205231Skmacy				ARCSTAT_BUMP(arcstat_l2_write_full);
5517185029Spjd				break;
5518185029Spjd			}
5519185029Spjd
5520185029Spjd			if (pio == NULL) {
5521185029Spjd				/*
5522185029Spjd				 * Insert a dummy header on the buflist so
5523185029Spjd				 * l2arc_write_done() can find where the
5524185029Spjd				 * write buffers begin without searching.
5525185029Spjd				 */
5526286570Smav				list_insert_head(&dev->l2ad_buflist, head);
5527185029Spjd
5528185029Spjd				cb = kmem_alloc(
5529185029Spjd				    sizeof (l2arc_write_callback_t), KM_SLEEP);
5530185029Spjd				cb->l2wcb_dev = dev;
5531185029Spjd				cb->l2wcb_head = head;
5532185029Spjd				pio = zio_root(spa, l2arc_write_done, cb,
5533185029Spjd				    ZIO_FLAG_CANFAIL);
5534205231Skmacy				ARCSTAT_BUMP(arcstat_l2_write_pios);
5535185029Spjd			}
5536185029Spjd
5537185029Spjd			/*
5538185029Spjd			 * Create and add a new L2ARC header.
5539185029Spjd			 */
5540286570Smav			hdr->b_l2hdr.b_dev = dev;
5541275811Sdelphij			hdr->b_flags |= ARC_FLAG_L2_WRITING;
5542251478Sdelphij			/*
5543251478Sdelphij			 * Temporarily stash the data buffer in b_tmp_cdata.
5544251478Sdelphij			 * The subsequent write step will pick it up from
5545286570Smav			 * there. This is because we can't access b_l1hdr.b_buf
5546251478Sdelphij			 * without holding the hash_lock, which we in turn
5547251478Sdelphij			 * can't access without holding the ARC list locks
5548251478Sdelphij			 * (which we want to avoid during compression/writing).
5549251478Sdelphij			 */
5550286570Smav			HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
5551286570Smav			hdr->b_l2hdr.b_asize = hdr->b_size;
5552286570Smav			hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
5553251478Sdelphij
5554275811Sdelphij			buf_sz = hdr->b_size;
5555286570Smav			hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
5556185029Spjd
5557286570Smav			list_insert_head(&dev->l2ad_buflist, hdr);
5558251478Sdelphij
5559185029Spjd			/*
5560185029Spjd			 * Compute and store the buffer cksum before
5561185029Spjd			 * writing.  On debug builds the cksum is verified first.
5562185029Spjd			 */
5563286570Smav			arc_cksum_verify(hdr->b_l1hdr.b_buf);
5564286570Smav			arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
5565185029Spjd
5566185029Spjd			mutex_exit(hash_lock);
5567185029Spjd
5568251478Sdelphij			write_sz += buf_sz;
5569251478Sdelphij		}
5570251478Sdelphij
5571251478Sdelphij		mutex_exit(list_lock);
5572251478Sdelphij
5573251478Sdelphij		if (full == B_TRUE)
5574251478Sdelphij			break;
5575251478Sdelphij	}
5576251478Sdelphij
5577251478Sdelphij	/* No buffers selected for writing? */
5578251478Sdelphij	if (pio == NULL) {
5579251478Sdelphij		ASSERT0(write_sz);
5580286570Smav		mutex_exit(&dev->l2ad_mtx);
5581286570Smav		ASSERT(!HDR_HAS_L1HDR(head));
5582286570Smav		kmem_cache_free(hdr_l2only_cache, head);
5583251478Sdelphij		return (0);
5584251478Sdelphij	}
5585251478Sdelphij
5586251478Sdelphij	/*
5587251478Sdelphij	 * Now start writing the buffers. We're starting at the write head
5588251478Sdelphij	 * and work backwards, retracing the course of the buffer selector
5589251478Sdelphij	 * loop above.
5590251478Sdelphij	 */
5591286570Smav	for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
5592286570Smav	    hdr = list_prev(&dev->l2ad_buflist, hdr)) {
5593251478Sdelphij		uint64_t buf_sz;
5594251478Sdelphij
5595251478Sdelphij		/*
5596251478Sdelphij		 * We shouldn't need to lock the buffer here, since we flagged
5597275811Sdelphij		 * it as ARC_FLAG_L2_WRITING in the previous step, but we must
5598275811Sdelphij		 * take care to only access its L2 cache parameters. In
5599286570Smav		 * particular, hdr->b_l1hdr.b_buf may be invalid by now due to
5600275811Sdelphij		 * ARC eviction.
5601251478Sdelphij		 */
5602286570Smav		hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
5603251478Sdelphij
5604286570Smav		if ((HDR_L2COMPRESS(hdr)) &&
5605286570Smav		    hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
5606286570Smav			if (l2arc_compress_buf(hdr)) {
5607251478Sdelphij				/*
5608251478Sdelphij				 * If compression succeeded, enable headroom
5609251478Sdelphij				 * boost on the next scan cycle.
5610251478Sdelphij				 */
5611251478Sdelphij				*headroom_boost = B_TRUE;
5612251478Sdelphij			}
5613251478Sdelphij		}
5614251478Sdelphij
5615251478Sdelphij		/*
5616251478Sdelphij		 * Pick up the buffer data we had previously stashed away
5617251478Sdelphij		 * (and now potentially also compressed).
5618251478Sdelphij		 */
5619286570Smav		buf_data = hdr->b_l1hdr.b_tmp_cdata;
5620286570Smav		buf_sz = hdr->b_l2hdr.b_asize;
5621251478Sdelphij
5622274172Savg		/*
5623274172Savg		 * If the data was not compressed, b_tmp_cdata still points at
5624274172Savg		 * the ARC buffer's data; clear it so that b_tmp_cdata only
5625274172Savg		 * ever refers to a temporary compression buffer.
5626274172Savg		 */
5627286570Smav		if (!L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)))
5628286570Smav			hdr->b_l1hdr.b_tmp_cdata = NULL;
5629274172Savg
5630251478Sdelphij		/* Compression may have squashed the buffer to zero length. */
5631251478Sdelphij		if (buf_sz != 0) {
5632251478Sdelphij			uint64_t buf_p_sz;
5633251478Sdelphij
5634185029Spjd			wzio = zio_write_phys(pio, dev->l2ad_vdev,
5635185029Spjd			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5636185029Spjd			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5637185029Spjd			    ZIO_FLAG_CANFAIL, B_FALSE);
5638185029Spjd
5639185029Spjd			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5640185029Spjd			    zio_t *, wzio);
5641185029Spjd			(void) zio_nowait(wzio);
5642185029Spjd
5643251478Sdelphij			write_asize += buf_sz;
5644185029Spjd			/*
5645185029Spjd			 * Keep the clock hand suitably device-aligned.
5646185029Spjd			 */
5647251478Sdelphij			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5648251478Sdelphij			write_psize += buf_p_sz;
5649251478Sdelphij			dev->l2ad_hand += buf_p_sz;
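			/*
			 * Illustration (hypothetical sizes): a 3000-byte
			 * compressed buffer on an ashift-9 cache device
			 * occupies 3072 bytes of allocated space, so the
			 * hand advances by 3072 rather than 3000.
			 */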
5650185029Spjd		}
5651251478Sdelphij	}
5652185029Spjd
5653286570Smav	mutex_exit(&dev->l2ad_mtx);
5654185029Spjd
5655251478Sdelphij	ASSERT3U(write_asize, <=, target_sz);
5656185029Spjd	ARCSTAT_BUMP(arcstat_l2_writes_sent);
5657251478Sdelphij	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5658185029Spjd	ARCSTAT_INCR(arcstat_l2_size, write_sz);
5659251478Sdelphij	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
5660275096Sdelphij	vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
5661185029Spjd
5662185029Spjd	/*
5663185029Spjd	 * Bump device hand to the device start if it is approaching the end.
5664185029Spjd	 * l2arc_evict() will already have evicted ahead for this case.
5665185029Spjd	 */
5666185029Spjd	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
5667185029Spjd		dev->l2ad_hand = dev->l2ad_start;
5668185029Spjd		dev->l2ad_evict = dev->l2ad_start;
5669185029Spjd		dev->l2ad_first = B_FALSE;
5670185029Spjd	}
5671185029Spjd
5672208373Smm	dev->l2ad_writing = B_TRUE;
5673185029Spjd	(void) zio_wait(pio);
5674208373Smm	dev->l2ad_writing = B_FALSE;
5675208373Smm
5676251478Sdelphij	return (write_asize);
5677185029Spjd}
5678185029Spjd
5679185029Spjd/*
5680251478Sdelphij * Compresses an L2ARC buffer.
5681286570Smav * The data to be compressed must be prefilled in b_l1hdr.b_tmp_cdata and its
5682251478Sdelphij * size in l2hdr->b_asize. This routine tries to compress the data and
5683251478Sdelphij * depending on the compression result there are three possible outcomes:
5684251478Sdelphij * *) The buffer was incompressible. The original l2hdr contents were left
5685251478Sdelphij *    untouched and are ready for writing to an L2 device.
5686251478Sdelphij * *) The buffer was all-zeros, so there is no need to write it to an L2
5687251478Sdelphij *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
5688251478Sdelphij *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
5689251478Sdelphij * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
5690251478Sdelphij *    data buffer which holds the compressed data to be written, and b_asize
5691251478Sdelphij *    tells us how much data there is. b_compress is set to the appropriate
5692251478Sdelphij *    compression algorithm. Once writing is done, invoke
5693251478Sdelphij *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
5694251478Sdelphij *
5695251478Sdelphij * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
5696251478Sdelphij * buffer was incompressible).
5697251478Sdelphij */
5698251478Sdelphijstatic boolean_t
5699286570Smavl2arc_compress_buf(arc_buf_hdr_t *hdr)
5700251478Sdelphij{
5701251478Sdelphij	void *cdata;
5702268075Sdelphij	size_t csize, len, rounded;
5703286570Smav	ASSERT(HDR_HAS_L2HDR(hdr));
5704286570Smav	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
5705251478Sdelphij
5706286570Smav	ASSERT(HDR_HAS_L1HDR(hdr));
5707286570Smav	ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF);
5708286570Smav	ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
5709251478Sdelphij
5710251478Sdelphij	len = l2hdr->b_asize;
5711251478Sdelphij	cdata = zio_data_buf_alloc(len);
5712286570Smav	ASSERT3P(cdata, !=, NULL);
5713286570Smav	csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
5714269086Sdelphij	    cdata, l2hdr->b_asize);
5715251478Sdelphij
5716251478Sdelphij	if (csize == 0) {
5717251478Sdelphij		/* zero block, indicate that there's nothing to write */
5718251478Sdelphij		zio_data_buf_free(cdata, len);
5719286570Smav		HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY);
5720251478Sdelphij		l2hdr->b_asize = 0;
5721286570Smav		hdr->b_l1hdr.b_tmp_cdata = NULL;
5722251478Sdelphij		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
5723251478Sdelphij		return (B_TRUE);
5724274628Savg	}
5725274628Savg
5726274628Savg	rounded = P2ROUNDUP(csize,
5727274628Savg	    (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift);
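	/*
	 * Illustration (hypothetical sizes): csize = 3000 on an ashift-9
	 * device rounds up to 3072; the compressed copy is kept only when
	 * this rounded size is still smaller than the original length.
	 */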
5728274628Savg	if (rounded < len) {
5729251478Sdelphij		/*
5730251478Sdelphij		 * Compression succeeded, we'll keep the cdata around for
5731251478Sdelphij		 * writing and release it afterwards.
5732251478Sdelphij		 */
5733274628Savg		if (rounded > csize) {
5734274628Savg			bzero((char *)cdata + csize, rounded - csize);
5735274628Savg			csize = rounded;
5736274628Savg		}
5737286570Smav		HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4);
5738251478Sdelphij		l2hdr->b_asize = csize;
5739286570Smav		hdr->b_l1hdr.b_tmp_cdata = cdata;
5740251478Sdelphij		ARCSTAT_BUMP(arcstat_l2_compress_successes);
5741251478Sdelphij		return (B_TRUE);
5742251478Sdelphij	} else {
5743251478Sdelphij		/*
5744251478Sdelphij		 * Compression failed, release the compressed buffer.
5745251478Sdelphij		 * l2hdr will be left unmodified.
5746251478Sdelphij		 */
5747251478Sdelphij		zio_data_buf_free(cdata, len);
5748251478Sdelphij		ARCSTAT_BUMP(arcstat_l2_compress_failures);
5749251478Sdelphij		return (B_FALSE);
5750251478Sdelphij	}
5751251478Sdelphij}
5752251478Sdelphij
5753251478Sdelphij/*
5754251478Sdelphij * Decompresses a zio read back from an l2arc device. On success, the
5755251478Sdelphij * underlying zio's io_data buffer is overwritten by the uncompressed
5756251478Sdelphij * version. On decompression error (corrupt compressed stream), the
5757251478Sdelphij * zio->io_error value is set to signal an I/O error.
5758251478Sdelphij *
5759251478Sdelphij * Note that the compressed data stream is not checksummed, so if the
5760251478Sdelphij * underlying device is experiencing data corruption we may feed corrupt
5761251478Sdelphij * data to the decompressor; the decompressor therefore needs to be able
5762251478Sdelphij * to handle this situation (LZ4 does).
5763251478Sdelphij */
5764251478Sdelphijstatic void
5765251478Sdelphijl2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
5766251478Sdelphij{
5767251478Sdelphij	ASSERT(L2ARC_IS_VALID_COMPRESS(c));
5768251478Sdelphij
5769251478Sdelphij	if (zio->io_error != 0) {
5770251478Sdelphij		/*
5771251478Sdelphij		 * An I/O error has occurred, so just restore the original I/O
5772251478Sdelphij		 * size in preparation for a main pool read.
5773251478Sdelphij		 */
5774251478Sdelphij		zio->io_orig_size = zio->io_size = hdr->b_size;
5775251478Sdelphij		return;
5776251478Sdelphij	}
5777251478Sdelphij
5778251478Sdelphij	if (c == ZIO_COMPRESS_EMPTY) {
5779251478Sdelphij		/*
5780251478Sdelphij		 * An empty buffer results in a null zio, which means we
5781251478Sdelphij		 * need to fill its io_data after we're done restoring the
5782251478Sdelphij		 * buffer's contents.
5783251478Sdelphij		 */
5784286570Smav		ASSERT(hdr->b_l1hdr.b_buf != NULL);
5785286570Smav		bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
5786286570Smav		zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
5787251478Sdelphij	} else {
5788251478Sdelphij		ASSERT(zio->io_data != NULL);
5789251478Sdelphij		/*
5790251478Sdelphij		 * We copy the compressed data from the start of the arc buffer
5791251478Sdelphij		 * (the zio_read will have pulled in only what we need, the
5792251478Sdelphij		 * rest is garbage which we will overwrite at decompression)
5793251478Sdelphij		 * and then decompress back to the ARC data buffer. This way we
5794251478Sdelphij		 * can minimize copying by simply decompressing back over the
5795251478Sdelphij		 * original compressed data (rather than decompressing to an
5796251478Sdelphij		 * aux buffer and then copying back the uncompressed buffer,
5797251478Sdelphij		 * which is likely to be much larger).
5798251478Sdelphij		 */
5799251478Sdelphij		uint64_t csize;
5800251478Sdelphij		void *cdata;
5801251478Sdelphij
5802251478Sdelphij		csize = zio->io_size;
5803251478Sdelphij		cdata = zio_data_buf_alloc(csize);
5804251478Sdelphij		bcopy(zio->io_data, cdata, csize);
5805251478Sdelphij		if (zio_decompress_data(c, cdata, zio->io_data, csize,
5806251478Sdelphij		    hdr->b_size) != 0)
5807251478Sdelphij			zio->io_error = EIO;
5808251478Sdelphij		zio_data_buf_free(cdata, csize);
5809251478Sdelphij	}
5810251478Sdelphij
5811251478Sdelphij	/* Restore the expected uncompressed IO size. */
5812251478Sdelphij	zio->io_orig_size = zio->io_size = hdr->b_size;
5813251478Sdelphij}
5814251478Sdelphij
5815251478Sdelphij/*
5816251478Sdelphij * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
5817251478Sdelphij * This buffer serves as a temporary holder of compressed data while
5818251478Sdelphij * the buffer entry is being written to an l2arc device. Once that is
5819251478Sdelphij * done, we can dispose of it.
5820251478Sdelphij */
5821251478Sdelphijstatic void
5822275811Sdelphijl2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
5823251478Sdelphij{
5824286570Smav	ASSERT(HDR_HAS_L1HDR(hdr));
5825286570Smav	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) {
5826251478Sdelphij		/*
5827251478Sdelphij		 * If the data was compressed, then we've allocated a
5828251478Sdelphij		 * temporary buffer for it, so now we need to release it.
5829251478Sdelphij		 */
5830286570Smav		ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
5831286570Smav		zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size);
5833286570Smav		hdr->b_l1hdr.b_tmp_cdata = NULL;
5834274172Savg	} else {
5835286570Smav		ASSERT(hdr->b_l1hdr.b_tmp_cdata == NULL);
5836251478Sdelphij	}
5837251478Sdelphij}
5838251478Sdelphij
5839251478Sdelphij/*
5840185029Spjd * This thread feeds the L2ARC at regular intervals.  This is the beating
5841185029Spjd * heart of the L2ARC.
5842185029Spjd */
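/*
 * In outline (a sketch of the loop below, not a contract): each pass
 * picks the next cache device via l2arc_dev_get_next(), skips read-only
 * pools and low-memory situations, computes the write size, evicts ahead
 * of the write hand with l2arc_evict(), writes buffers with
 * l2arc_write_buffers(), and then derives the next wakeup time from how
 * much was actually written via l2arc_write_interval().
 */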
5843185029Spjdstatic void
5844185029Spjdl2arc_feed_thread(void *dummy __unused)
5845185029Spjd{
5846185029Spjd	callb_cpr_t cpr;
5847185029Spjd	l2arc_dev_t *dev;
5848185029Spjd	spa_t *spa;
5849208373Smm	uint64_t size, wrote;
5850219089Spjd	clock_t begin, next = ddi_get_lbolt();
5851251478Sdelphij	boolean_t headroom_boost = B_FALSE;
5852185029Spjd
5853185029Spjd	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
5854185029Spjd
5855185029Spjd	mutex_enter(&l2arc_feed_thr_lock);
5856185029Spjd
5857185029Spjd	while (l2arc_thread_exit == 0) {
5858185029Spjd		CALLB_CPR_SAFE_BEGIN(&cpr);
5859185029Spjd		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
5860219089Spjd		    next - ddi_get_lbolt());
5861185029Spjd		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
5862219089Spjd		next = ddi_get_lbolt() + hz;
5863185029Spjd
5864185029Spjd		/*
5865185029Spjd		 * Quick check for L2ARC devices.
5866185029Spjd		 */
5867185029Spjd		mutex_enter(&l2arc_dev_mtx);
5868185029Spjd		if (l2arc_ndev == 0) {
5869185029Spjd			mutex_exit(&l2arc_dev_mtx);
5870185029Spjd			continue;
5871185029Spjd		}
5872185029Spjd		mutex_exit(&l2arc_dev_mtx);
5873219089Spjd		begin = ddi_get_lbolt();
5874185029Spjd
5875185029Spjd		/*
5876185029Spjd		 * This selects the next l2arc device to write to, and in
5877185029Spjd		 * doing so the next spa to feed from: dev->l2ad_spa.   This
5878185029Spjd		 * will return NULL if there are now no l2arc devices or if
5879185029Spjd		 * they are all faulted.
5880185029Spjd		 *
5881185029Spjd		 * If a device is returned, its spa's config lock is also
5882185029Spjd		 * held to prevent device removal.  l2arc_dev_get_next()
5883185029Spjd		 * will grab and release l2arc_dev_mtx.
5884185029Spjd		 */
5885185029Spjd		if ((dev = l2arc_dev_get_next()) == NULL)
5886185029Spjd			continue;
5887185029Spjd
5888185029Spjd		spa = dev->l2ad_spa;
5889185029Spjd		ASSERT(spa != NULL);
5890185029Spjd
5891185029Spjd		/*
5892219089Spjd		 * If the pool is read-only then force the feed thread to
5893219089Spjd		 * sleep a little longer.
5894219089Spjd		 */
5895219089Spjd		if (!spa_writeable(spa)) {
5896219089Spjd			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
5897219089Spjd			spa_config_exit(spa, SCL_L2ARC, dev);
5898219089Spjd			continue;
5899219089Spjd		}
5900219089Spjd
5901219089Spjd		/*
5902185029Spjd		 * Avoid contributing to memory pressure.
5903185029Spjd		 */
5904185029Spjd		if (arc_reclaim_needed()) {
5905185029Spjd			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
5906185029Spjd			spa_config_exit(spa, SCL_L2ARC, dev);
5907185029Spjd			continue;
5908185029Spjd		}
5909185029Spjd
5910185029Spjd		ARCSTAT_BUMP(arcstat_l2_feeds);
5911185029Spjd
5912251478Sdelphij		size = l2arc_write_size();
5913185029Spjd
5914185029Spjd		/*
5915185029Spjd		 * Evict L2ARC buffers that will be overwritten.
5916185029Spjd		 */
5917185029Spjd		l2arc_evict(dev, size, B_FALSE);
5918185029Spjd
5919185029Spjd		/*
5920185029Spjd		 * Write ARC buffers.
5921185029Spjd		 */
5922251478Sdelphij		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
5923208373Smm
5924208373Smm		/*
5925208373Smm		 * Calculate interval between writes.
5926208373Smm		 */
5927208373Smm		next = l2arc_write_interval(begin, size, wrote);
5928185029Spjd		spa_config_exit(spa, SCL_L2ARC, dev);
5929185029Spjd	}
5930185029Spjd
5931185029Spjd	l2arc_thread_exit = 0;
5932185029Spjd	cv_broadcast(&l2arc_feed_thr_cv);
5933185029Spjd	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
5934185029Spjd	thread_exit();
5935185029Spjd}
5936185029Spjd
5937185029Spjdboolean_t
5938185029Spjdl2arc_vdev_present(vdev_t *vd)
5939185029Spjd{
5940185029Spjd	l2arc_dev_t *dev;
5941185029Spjd
5942185029Spjd	mutex_enter(&l2arc_dev_mtx);
5943185029Spjd	for (dev = list_head(l2arc_dev_list); dev != NULL;
5944185029Spjd	    dev = list_next(l2arc_dev_list, dev)) {
5945185029Spjd		if (dev->l2ad_vdev == vd)
5946185029Spjd			break;
5947185029Spjd	}
5948185029Spjd	mutex_exit(&l2arc_dev_mtx);
5949185029Spjd
5950185029Spjd	return (dev != NULL);
5951185029Spjd}
5952185029Spjd
5953185029Spjd/*
5954185029Spjd * Add a vdev for use by the L2ARC.  By this point the spa has already
5955185029Spjd * validated the vdev and opened it.
5956185029Spjd */
5957185029Spjdvoid
5958219089Spjdl2arc_add_vdev(spa_t *spa, vdev_t *vd)
5959185029Spjd{
5960185029Spjd	l2arc_dev_t *adddev;
5961185029Spjd
5962185029Spjd	ASSERT(!l2arc_vdev_present(vd));
5963185029Spjd
5964255753Sgibbs	vdev_ashift_optimize(vd);
5965255753Sgibbs
5966185029Spjd	/*
5967185029Spjd	 * Create a new l2arc device entry.
5968185029Spjd	 */
5969185029Spjd	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
5970185029Spjd	adddev->l2ad_spa = spa;
5971185029Spjd	adddev->l2ad_vdev = vd;
5972219089Spjd	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
5973219089Spjd	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
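	/*
	 * Note: l2ad_start is placed after the front vdev labels, so the
	 * first VDEV_LABEL_START_SIZE bytes of the device are never
	 * overwritten by cache data.
	 */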
5974185029Spjd	adddev->l2ad_hand = adddev->l2ad_start;
5975185029Spjd	adddev->l2ad_evict = adddev->l2ad_start;
5976185029Spjd	adddev->l2ad_first = B_TRUE;
5977208373Smm	adddev->l2ad_writing = B_FALSE;
5978185029Spjd
5979286570Smav	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
5980185029Spjd	/*
5981185029Spjd	 * This is a list of all ARC buffers that are still valid on the
5982185029Spjd	 * device.
5983185029Spjd	 */
5984286570Smav	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
5985286570Smav	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
5986185029Spjd
5987219089Spjd	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
5988185029Spjd
5989185029Spjd	/*
5990185029Spjd	 * Add device to global list
5991185029Spjd	 */
5992185029Spjd	mutex_enter(&l2arc_dev_mtx);
5993185029Spjd	list_insert_head(l2arc_dev_list, adddev);
5994185029Spjd	atomic_inc_64(&l2arc_ndev);
5995185029Spjd	mutex_exit(&l2arc_dev_mtx);
5996185029Spjd}
5997185029Spjd
5998185029Spjd/*
5999185029Spjd * Remove a vdev from the L2ARC.
6000185029Spjd */
6001185029Spjdvoid
6002185029Spjdl2arc_remove_vdev(vdev_t *vd)
6003185029Spjd{
6004185029Spjd	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
6005185029Spjd
6006185029Spjd	/*
6007185029Spjd	 * Find the device by vdev
6008185029Spjd	 */
6009185029Spjd	mutex_enter(&l2arc_dev_mtx);
6010185029Spjd	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
6011185029Spjd		nextdev = list_next(l2arc_dev_list, dev);
6012185029Spjd		if (vd == dev->l2ad_vdev) {
6013185029Spjd			remdev = dev;
6014185029Spjd			break;
6015185029Spjd		}
6016185029Spjd	}
6017185029Spjd	ASSERT(remdev != NULL);
6018185029Spjd
6019185029Spjd	/*
6020185029Spjd	 * Remove device from global list
6021185029Spjd	 */
6022185029Spjd	list_remove(l2arc_dev_list, remdev);
6023185029Spjd	l2arc_dev_last = NULL;		/* may have been invalidated */
6024185029Spjd	atomic_dec_64(&l2arc_ndev);
6025185029Spjd	mutex_exit(&l2arc_dev_mtx);
6026185029Spjd
6027185029Spjd	/*
6028185029Spjd	 * Clear all buflists and ARC references.  L2ARC device flush.
6029185029Spjd	 */
6030185029Spjd	l2arc_evict(remdev, 0, B_TRUE);
6031286570Smav	list_destroy(&remdev->l2ad_buflist);
6032286570Smav	mutex_destroy(&remdev->l2ad_mtx);
6033185029Spjd	kmem_free(remdev, sizeof (l2arc_dev_t));
6034185029Spjd}
6035185029Spjd
6036185029Spjdvoid
6037185029Spjdl2arc_init(void)
6038185029Spjd{
6039185029Spjd	l2arc_thread_exit = 0;
6040185029Spjd	l2arc_ndev = 0;
6041185029Spjd	l2arc_writes_sent = 0;
6042185029Spjd	l2arc_writes_done = 0;
6043185029Spjd
6044185029Spjd	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
6045185029Spjd	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
6046185029Spjd	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
6047185029Spjd	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
6048185029Spjd
6049185029Spjd	l2arc_dev_list = &L2ARC_dev_list;
6050185029Spjd	l2arc_free_on_write = &L2ARC_free_on_write;
6051185029Spjd	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
6052185029Spjd	    offsetof(l2arc_dev_t, l2ad_node));
6053185029Spjd	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
6054185029Spjd	    offsetof(l2arc_data_free_t, l2df_list_node));
6055185029Spjd}
6056185029Spjd
6057185029Spjdvoid
6058185029Spjdl2arc_fini(void)
6059185029Spjd{
6060185029Spjd	/*
6061185029Spjd	 * This is called from dmu_fini(), which is called from spa_fini().
6062185029Spjd	 * Because of this, we can assume that all l2arc devices have
6063185029Spjd	 * already been removed when the pools themselves were removed.
6064185029Spjd	 */
6065185029Spjd
6066185029Spjd	l2arc_do_free_on_write();
6067185029Spjd
6068185029Spjd	mutex_destroy(&l2arc_feed_thr_lock);
6069185029Spjd	cv_destroy(&l2arc_feed_thr_cv);
6070185029Spjd	mutex_destroy(&l2arc_dev_mtx);
6071185029Spjd	mutex_destroy(&l2arc_free_on_write_mtx);
6072185029Spjd
6073185029Spjd	list_destroy(l2arc_dev_list);
6074185029Spjd	list_destroy(l2arc_free_on_write);
6075185029Spjd}
6076185029Spjd
6077185029Spjdvoid
6078185029Spjdl2arc_start(void)
6079185029Spjd{
6080209962Smm	if (!(spa_mode_global & FWRITE))
6081185029Spjd		return;
6082185029Spjd
6083185029Spjd	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
6084185029Spjd	    TS_RUN, minclsyspri);
6085185029Spjd}
6086185029Spjd
6087185029Spjdvoid
6088185029Spjdl2arc_stop(void)
6089185029Spjd{
6090209962Smm	if (!(spa_mode_global & FWRITE))
6091185029Spjd		return;
6092185029Spjd
6093185029Spjd	mutex_enter(&l2arc_feed_thr_lock);
6094185029Spjd	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
6095185029Spjd	l2arc_thread_exit = 1;
6096185029Spjd	while (l2arc_thread_exit != 0)
6097185029Spjd		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
6098185029Spjd	mutex_exit(&l2arc_feed_thr_lock);
6099185029Spjd}
6100