arc.c revision 286762
156746Sroberto/*
256746Sroberto * CDDL HEADER START
356746Sroberto *
456746Sroberto * The contents of this file are subject to the terms of the
556746Sroberto * Common Development and Distribution License (the "License").
656746Sroberto * You may not use this file except in compliance with the License.
756746Sroberto *
856746Sroberto * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
956746Sroberto * or http://www.opensolaris.org/os/licensing.
1056746Sroberto * See the License for the specific language governing permissions
1156746Sroberto * and limitations under the License.
1256746Sroberto *
1356746Sroberto * When distributing Covered Code, include this CDDL HEADER in each
1456746Sroberto * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
1556746Sroberto * If applicable, add the following below this CDDL HEADER, with the
1656746Sroberto * fields enclosed by brackets "[]" replaced with your own identifying
1756746Sroberto * information: Portions Copyright [yyyy] [name of copyright owner]
1856746Sroberto *
1956746Sroberto * CDDL HEADER END
2056746Sroberto */
2156746Sroberto/*
2256746Sroberto * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
2356746Sroberto * Copyright (c) 2012, Joyent, Inc. All rights reserved.
2456746Sroberto * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
2556746Sroberto * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
2656746Sroberto * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
2756746Sroberto */
2856746Sroberto
2956746Sroberto/*
3056746Sroberto * DVA-based Adjustable Replacement Cache
3156746Sroberto *
3256746Sroberto * While much of the theory of operation used here is
3356746Sroberto * based on the self-tuning, low overhead replacement cache
3456746Sroberto * presented by Megiddo and Modha at FAST 2003, there are some
3556746Sroberto * significant differences:
36285612Sdelphij *
37285612Sdelphij * 1. The Megiddo and Modha model assumes any page is evictable.
38285612Sdelphij * Pages in its cache cannot be "locked" into memory.  This makes
39285612Sdelphij * the eviction algorithm simple: evict the last page in the list.
40285612Sdelphij * This also make the performance characteristics easy to reason
4156746Sroberto * about.  Our cache is not so simple.  At any given moment, some
4256746Sroberto * subset of the blocks in the cache are un-evictable because we
4356746Sroberto * have handed out a reference to them.  Blocks are only evictable
4456746Sroberto * when there are no external references active.  This makes
4556746Sroberto * eviction far more problematic:  we choose to evict the evictable
4656746Sroberto * blocks that are the "lowest" in the list.
47285612Sdelphij *
48285612Sdelphij * There are times when it is not possible to evict the requested
49285612Sdelphij * space.  In these circumstances we are unable to adjust the cache
5056746Sroberto * size.  To prevent the cache growing unbounded at these times we
5156746Sroberto * implement a "cache throttle" that slows the flow of new data
5256746Sroberto * into the cache until we can make space available.
5356746Sroberto *
5456746Sroberto * 2. The Megiddo and Modha model assumes a fixed cache size.
5556746Sroberto * Pages are evicted when the cache is full and there is a cache
5656746Sroberto * miss.  Our model has a variable sized cache.  It grows with
5756746Sroberto * high use, but also tries to react to memory pressure from the
5856746Sroberto * operating system: decreasing its size when system memory is
5956746Sroberto * tight.
6056746Sroberto *
6156746Sroberto * 3. The Megiddo and Modha model assumes a fixed page size. All
6256746Sroberto * elements of the cache are therefore exactly the same size.  So
6356746Sroberto * when adjusting the cache size following a cache miss, its simply
64285612Sdelphij * a matter of choosing a single page to evict.  In our model, we
65285612Sdelphij * have variable sized cache blocks (rangeing from 512 bytes to
66285612Sdelphij * 128K bytes).  We therefore choose a set of blocks to evict to make
67285612Sdelphij * space for a cache miss that approximates as closely as possible
68285612Sdelphij * the space used by the new block.
69285612Sdelphij *
70285612Sdelphij * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
7156746Sroberto * by N. Megiddo & D. Modha, FAST 2003
7256746Sroberto */
7356746Sroberto
7456746Sroberto/*
7556746Sroberto * The locking model:
7656746Sroberto *
7756746Sroberto * A new reference to a cache buffer can be obtained in two
7856746Sroberto * ways: 1) via a hash table lookup using the DVA as a key,
79285612Sdelphij * or 2) via one of the ARC lists.  The arc_read() interface
80285612Sdelphij * uses method 1, while the internal arc algorithms for
8156746Sroberto * adjusting the cache use method 2.  We therefore provide two
8256746Sroberto * types of locks: 1) the hash table lock array, and 2) the
83285612Sdelphij * arc list locks.
8456746Sroberto *
85285612Sdelphij * Buffers do not have their own mutexs, rather they rely on the
8656746Sroberto * hash table mutexs for the bulk of their protection (i.e. most
8756746Sroberto * fields in the arc_buf_hdr_t are protected by these mutexs).
8856746Sroberto *
8956746Sroberto * buf_hash_find() returns the appropriate mutex (held) when it
9056746Sroberto * locates the requested buffer in the hash table.  It returns
9156746Sroberto * NULL for the mutex if the buffer was not in the table.
9256746Sroberto *
93285612Sdelphij * buf_hash_remove() expects the appropriate hash mutex to be
9456746Sroberto * already held before it is invoked.
9556746Sroberto *
9656746Sroberto * Each arc state also has a mutex which is used to protect the
9756746Sroberto * buffer list associated with the state.  When attempting to
9856746Sroberto * obtain a hash table lock while holding an arc list lock you
9956746Sroberto * must use: mutex_tryenter() to avoid deadlock.  Also note that
10056746Sroberto * the active state mutex must be held before the ghost state mutex.
10156746Sroberto *
10256746Sroberto * Arc buffers may have an associated eviction callback function.
10356746Sroberto * This function will be invoked prior to removing the buffer (e.g.
10456746Sroberto * in arc_do_user_evicts()).  Note however that the data associated
10556746Sroberto * with the buffer may be evicted prior to the callback.  The callback
106285612Sdelphij * must be made with *no locks held* (to prevent deadlock).  Additionally,
10756746Sroberto * the users of callbacks must ensure that their private data is
108285612Sdelphij * protected from simultaneous callbacks from arc_clear_callback()
109285612Sdelphij * and arc_do_user_evicts().
110285612Sdelphij *
111285612Sdelphij * Note that the majority of the performance stats are manipulated
112285612Sdelphij * with atomic operations.
11356746Sroberto *
114285612Sdelphij * The L2ARC uses the l2ad_mtx on each vdev for the following:
115285612Sdelphij *
116285612Sdelphij *	- L2ARC buflist creation
11756746Sroberto *	- L2ARC buflist eviction
118285612Sdelphij *	- L2ARC write completion, which walks L2ARC buflists
119285612Sdelphij *	- ARC header destruction, as it removes from L2ARC buflists
12056746Sroberto *	- ARC header release, as it removes from L2ARC buflists
121285612Sdelphij */
12256746Sroberto
123285612Sdelphij#include <sys/spa.h>
12456746Sroberto#include <sys/zio.h>
12556746Sroberto#include <sys/zio_compress.h>
12656746Sroberto#include <sys/zfs_context.h>
127285612Sdelphij#include <sys/arc.h>
128285612Sdelphij#include <sys/refcount.h>
129285612Sdelphij#include <sys/vdev.h>
130285612Sdelphij#include <sys/vdev_impl.h>
13156746Sroberto#include <sys/dsl_pool.h>
13256746Sroberto#ifdef _KERNEL
13356746Sroberto#include <sys/dnlc.h>
13456746Sroberto#endif
13556746Sroberto#include <sys/callb.h>
13656746Sroberto#include <sys/kstat.h>
13756746Sroberto#include <sys/trim_map.h>
138285612Sdelphij#include <zfs_fletcher.h>
13956746Sroberto#include <sys/sdt.h>
14056746Sroberto
14156746Sroberto#include <vm/vm_pageout.h>
14256746Sroberto#include <machine/vmparam.h>
14356746Sroberto
144285612Sdelphij#ifdef illumos
14556746Sroberto#ifndef _KERNEL
14656746Sroberto/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
14756746Srobertoboolean_t arc_watch = B_FALSE;
14856746Srobertoint arc_procfd;
14956746Sroberto#endif
15056746Sroberto#endif /* illumos */
15156746Sroberto
15256746Srobertostatic kmutex_t		arc_reclaim_thr_lock;
15356746Srobertostatic kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
15456746Srobertostatic uint8_t		arc_thread_exit;
15556746Sroberto
15656746Srobertouint_t arc_reduce_dnlc_percent = 3;
15756746Sroberto
15856746Sroberto/*
15956746Sroberto * The number of iterations through arc_evict_*() before we
16056746Sroberto * drop & reacquire the lock.
16156746Sroberto */
16256746Srobertoint arc_evict_iterations = 100;
16356746Sroberto
16456746Sroberto/* number of seconds before growing cache again */
165285612Sdelphijstatic int		arc_grow_retry = 60;
166285612Sdelphij
167285612Sdelphij/* shift of arc_c for calculating both min and max arc_p */
168285612Sdelphijstatic int		arc_p_min_shift = 4;
169285612Sdelphij
17056746Sroberto/* log2(fraction of arc to reclaim) */
17156746Srobertostatic int		arc_shrink_shift = 7;
17256746Sroberto
17356746Sroberto/*
17456746Sroberto * log2(fraction of ARC which must be free to allow growing).
17556746Sroberto * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
17656746Sroberto * when reading a new block into the ARC, we will evict an equal-sized block
17756746Sroberto * from the ARC.
17856746Sroberto *
17956746Sroberto * This must be less than arc_shrink_shift, so that when we shrink the ARC,
18056746Sroberto * we will still not allow it to grow.
18156746Sroberto */
18256746Srobertoint			arc_no_grow_shift = 5;
18356746Sroberto
18456746Sroberto
18556746Sroberto/*
186285612Sdelphij * minimum lifespan of a prefetch block in clock ticks
187285612Sdelphij * (initialized in arc_init())
188285612Sdelphij */
189285612Sdelphijstatic int		arc_min_prefetch_lifespan;
190285612Sdelphij
191285612Sdelphij/*
19256746Sroberto * If this percent of memory is free, don't throttle.
19356746Sroberto */
194285612Sdelphijint arc_lotsfree_percent = 10;
195285612Sdelphij
196285612Sdelphijstatic int arc_dead;
19756746Srobertoextern int zfs_prefetch_disable;
19856746Sroberto
199285612Sdelphij/*
200285612Sdelphij * The arc has filled available memory and has now warmed up.
201285612Sdelphij */
202285612Sdelphijstatic boolean_t arc_warm;
20356746Sroberto
20456746Sroberto/*
205285612Sdelphij * These tunables are for performance analysis.
20656746Sroberto */
20756746Srobertouint64_t zfs_arc_max;
20856746Srobertouint64_t zfs_arc_min;
20956746Srobertouint64_t zfs_arc_meta_limit = 0;
21056746Srobertouint64_t zfs_arc_meta_min = 0;
21156746Srobertoint zfs_arc_grow_retry = 0;
21256746Srobertoint zfs_arc_shrink_shift = 0;
21356746Srobertoint zfs_arc_p_min_shift = 0;
21456746Srobertoint zfs_disable_dup_eviction = 0;
21556746Srobertouint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
216285612Sdelphiju_int zfs_arc_free_target = 0;
217285612Sdelphij
21856746Srobertostatic int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
219285612Sdelphijstatic int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
22056746Sroberto
221285612Sdelphij#ifdef _KERNEL
22256746Srobertostatic void
22356746Srobertoarc_free_target_init(void *unused __unused)
224285612Sdelphij{
225285612Sdelphij
22656746Sroberto	zfs_arc_free_target = vm_pageout_wakeup_thresh;
22756746Sroberto}
228285612SdelphijSYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
229285612Sdelphij    arc_free_target_init, NULL);
230285612Sdelphij
23156746SrobertoTUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
23256746SrobertoTUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
233285612SdelphijTUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
234285612SdelphijSYSCTL_DECL(_vfs_zfs);
235285612SdelphijSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
23656746Sroberto    "Maximum ARC size");
23756746SrobertoSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
23856746Sroberto    "Minimum ARC size");
23956746SrobertoSYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
24056746Sroberto    &zfs_arc_average_blocksize, 0,
241285612Sdelphij    "ARC average blocksize");
24256746SrobertoSYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
243285612Sdelphij    &arc_shrink_shift, 0,
24456746Sroberto    "log2(fraction of arc to reclaim)");
24556746Sroberto
24656746Sroberto/*
24756746Sroberto * We don't have a tunable for arc_free_target due to the dependency on
24856746Sroberto * pagedaemon initialisation.
24956746Sroberto */
25056746SrobertoSYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
25156746Sroberto    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
252285612Sdelphij    sysctl_vfs_zfs_arc_free_target, "IU",
25356746Sroberto    "Desired number of free pages below which ARC triggers reclaim");
25456746Sroberto
25556746Srobertostatic int
25656746Srobertosysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
25756746Sroberto{
258285612Sdelphij	u_int val;
25956746Sroberto	int err;
260285612Sdelphij
26156746Sroberto	val = zfs_arc_free_target;
262285612Sdelphij	err = sysctl_handle_int(oidp, &val, 0, req);
26356746Sroberto	if (err != 0 || req->newptr == NULL)
264285612Sdelphij		return (err);
26556746Sroberto
26656746Sroberto	if (val < minfree)
26756746Sroberto		return (EINVAL);
26856746Sroberto	if (val > vm_cnt.v_page_count)
26956746Sroberto		return (EINVAL);
27056746Sroberto
271285612Sdelphij	zfs_arc_free_target = val;
27256746Sroberto
273285612Sdelphij	return (0);
274285612Sdelphij}
27556746Sroberto
27656746Sroberto/*
27756746Sroberto * Must be declared here, before the definition of corresponding kstat
27856746Sroberto * macro which uses the same names will confuse the compiler.
27956746Sroberto */
28056746SrobertoSYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
28156746Sroberto    CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
28256746Sroberto    sysctl_vfs_zfs_arc_meta_limit, "QU",
283285612Sdelphij    "ARC metadata limit");
284285612Sdelphij#endif
28556746Sroberto
286285612Sdelphij/*
28756746Sroberto * Note that buffers can be in one of 6 states:
288285612Sdelphij *	ARC_anon	- anonymous (discussed below)
289285612Sdelphij *	ARC_mru		- recently used, currently cached
290285612Sdelphij *	ARC_mru_ghost	- recentely used, no longer in cache
291285612Sdelphij *	ARC_mfu		- frequently used, currently cached
292285612Sdelphij *	ARC_mfu_ghost	- frequently used, no longer in cache
29356746Sroberto *	ARC_l2c_only	- exists in L2ARC but not other states
294285612Sdelphij * When there are no active references to the buffer, they are
295285612Sdelphij * are linked onto a list in one of these arc states.  These are
296285612Sdelphij * the only buffers that can be evicted or deleted.  Within each
297285612Sdelphij * state there are multiple lists, one for meta-data and one for
298285612Sdelphij * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
29956746Sroberto * etc.) is tracked separately so that it can be managed more
300285612Sdelphij * explicitly: favored over data, limited explicitly.
301285612Sdelphij *
30256746Sroberto * Anonymous buffers are buffers that are not associated with
30356746Sroberto * a DVA.  These are buffers that hold dirty block copies
30456746Sroberto * before they are written to stable storage.  By definition,
305285612Sdelphij * they are "ref'd" and are considered part of arc_mru
306285612Sdelphij * that cannot be freed.  Generally, they will aquire a DVA
307285612Sdelphij * as they are written and migrate onto the arc_mru list.
30856746Sroberto *
309285612Sdelphij * The ARC_l2c_only state is for buffers that are in the second
31056746Sroberto * level ARC but no longer in any of the ARC_m* lists.  The second
31156746Sroberto * level ARC itself may also contain buffers that are in any of
312285612Sdelphij * the ARC_m* states - meaning that a buffer can exist in two
313285612Sdelphij * places.  The reason for the ARC_l2c_only state is to keep the
314285612Sdelphij * buffer header in the hash table, so that reads that hit the
31556746Sroberto * second level ARC benefit from these fast lookups.
316285612Sdelphij */
317132451Sroberto
31856746Srobertotypedef struct arc_state {
31956746Sroberto	list_t	arcs_list[ARC_BUFC_NUMTYPES];	/* list of evictable buffers */
320285612Sdelphij	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
321285612Sdelphij	uint64_t arcs_size;	/* total amount of data in this state */
322285612Sdelphij	kmutex_t arcs_mtx;
323285612Sdelphij} arc_state_t;
32456746Sroberto
325285612Sdelphij/* The 6 states: */
326285612Sdelphijstatic arc_state_t ARC_anon;
327285612Sdelphijstatic arc_state_t ARC_mru;
32856746Srobertostatic arc_state_t ARC_mru_ghost;
32956746Srobertostatic arc_state_t ARC_mfu;
33056746Srobertostatic arc_state_t ARC_mfu_ghost;
33156746Srobertostatic arc_state_t ARC_l2c_only;
33256746Sroberto
33356746Srobertotypedef struct arc_stats {
33456746Sroberto	kstat_named_t arcstat_hits;
33556746Sroberto	kstat_named_t arcstat_misses;
336	kstat_named_t arcstat_demand_data_hits;
337	kstat_named_t arcstat_demand_data_misses;
338	kstat_named_t arcstat_demand_metadata_hits;
339	kstat_named_t arcstat_demand_metadata_misses;
340	kstat_named_t arcstat_prefetch_data_hits;
341	kstat_named_t arcstat_prefetch_data_misses;
342	kstat_named_t arcstat_prefetch_metadata_hits;
343	kstat_named_t arcstat_prefetch_metadata_misses;
344	kstat_named_t arcstat_mru_hits;
345	kstat_named_t arcstat_mru_ghost_hits;
346	kstat_named_t arcstat_mfu_hits;
347	kstat_named_t arcstat_mfu_ghost_hits;
348	kstat_named_t arcstat_allocated;
349	kstat_named_t arcstat_deleted;
350	kstat_named_t arcstat_recycle_miss;
351	/*
352	 * Number of buffers that could not be evicted because the hash lock
353	 * was held by another thread.  The lock may not necessarily be held
354	 * by something using the same buffer, since hash locks are shared
355	 * by multiple buffers.
356	 */
357	kstat_named_t arcstat_mutex_miss;
358	/*
359	 * Number of buffers skipped because they have I/O in progress, are
360	 * indrect prefetch buffers that have not lived long enough, or are
361	 * not from the spa we're trying to evict from.
362	 */
363	kstat_named_t arcstat_evict_skip;
364	kstat_named_t arcstat_evict_l2_cached;
365	kstat_named_t arcstat_evict_l2_eligible;
366	kstat_named_t arcstat_evict_l2_ineligible;
367	kstat_named_t arcstat_hash_elements;
368	kstat_named_t arcstat_hash_elements_max;
369	kstat_named_t arcstat_hash_collisions;
370	kstat_named_t arcstat_hash_chains;
371	kstat_named_t arcstat_hash_chain_max;
372	kstat_named_t arcstat_p;
373	kstat_named_t arcstat_c;
374	kstat_named_t arcstat_c_min;
375	kstat_named_t arcstat_c_max;
376	kstat_named_t arcstat_size;
377	/*
378	 * Number of bytes consumed by internal ARC structures necessary
379	 * for tracking purposes; these structures are not actually
380	 * backed by ARC buffers. This includes arc_buf_hdr_t structures
381	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
382	 * caches), and arc_buf_t structures (allocated via arc_buf_t
383	 * cache).
384	 */
385	kstat_named_t arcstat_hdr_size;
386	/*
387	 * Number of bytes consumed by ARC buffers of type equal to
388	 * ARC_BUFC_DATA. This is generally consumed by buffers backing
389	 * on disk user data (e.g. plain file contents).
390	 */
391	kstat_named_t arcstat_data_size;
392	/*
393	 * Number of bytes consumed by ARC buffers of type equal to
394	 * ARC_BUFC_METADATA. This is generally consumed by buffers
395	 * backing on disk data that is used for internal ZFS
396	 * structures (e.g. ZAP, dnode, indirect blocks, etc).
397	 */
398	kstat_named_t arcstat_metadata_size;
399	/*
400	 * Number of bytes consumed by various buffers and structures
401	 * not actually backed with ARC buffers. This includes bonus
402	 * buffers (allocated directly via zio_buf_* functions),
403	 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
404	 * cache), and dnode_t structures (allocated via dnode_t cache).
405	 */
406	kstat_named_t arcstat_other_size;
407	/*
408	 * Total number of bytes consumed by ARC buffers residing in the
409	 * arc_anon state. This includes *all* buffers in the arc_anon
410	 * state; e.g. data, metadata, evictable, and unevictable buffers
411	 * are all included in this value.
412	 */
413	kstat_named_t arcstat_anon_size;
414	/*
415	 * Number of bytes consumed by ARC buffers that meet the
416	 * following criteria: backing buffers of type ARC_BUFC_DATA,
417	 * residing in the arc_anon state, and are eligible for eviction
418	 * (e.g. have no outstanding holds on the buffer).
419	 */
420	kstat_named_t arcstat_anon_evictable_data;
421	/*
422	 * Number of bytes consumed by ARC buffers that meet the
423	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
424	 * residing in the arc_anon state, and are eligible for eviction
425	 * (e.g. have no outstanding holds on the buffer).
426	 */
427	kstat_named_t arcstat_anon_evictable_metadata;
428	/*
429	 * Total number of bytes consumed by ARC buffers residing in the
430	 * arc_mru state. This includes *all* buffers in the arc_mru
431	 * state; e.g. data, metadata, evictable, and unevictable buffers
432	 * are all included in this value.
433	 */
434	kstat_named_t arcstat_mru_size;
435	/*
436	 * Number of bytes consumed by ARC buffers that meet the
437	 * following criteria: backing buffers of type ARC_BUFC_DATA,
438	 * residing in the arc_mru state, and are eligible for eviction
439	 * (e.g. have no outstanding holds on the buffer).
440	 */
441	kstat_named_t arcstat_mru_evictable_data;
442	/*
443	 * Number of bytes consumed by ARC buffers that meet the
444	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
445	 * residing in the arc_mru state, and are eligible for eviction
446	 * (e.g. have no outstanding holds on the buffer).
447	 */
448	kstat_named_t arcstat_mru_evictable_metadata;
449	/*
450	 * Total number of bytes that *would have been* consumed by ARC
451	 * buffers in the arc_mru_ghost state. The key thing to note
452	 * here, is the fact that this size doesn't actually indicate
453	 * RAM consumption. The ghost lists only consist of headers and
454	 * don't actually have ARC buffers linked off of these headers.
455	 * Thus, *if* the headers had associated ARC buffers, these
456	 * buffers *would have* consumed this number of bytes.
457	 */
458	kstat_named_t arcstat_mru_ghost_size;
459	/*
460	 * Number of bytes that *would have been* consumed by ARC
461	 * buffers that are eligible for eviction, of type
462	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
463	 */
464	kstat_named_t arcstat_mru_ghost_evictable_data;
465	/*
466	 * Number of bytes that *would have been* consumed by ARC
467	 * buffers that are eligible for eviction, of type
468	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
469	 */
470	kstat_named_t arcstat_mru_ghost_evictable_metadata;
471	/*
472	 * Total number of bytes consumed by ARC buffers residing in the
473	 * arc_mfu state. This includes *all* buffers in the arc_mfu
474	 * state; e.g. data, metadata, evictable, and unevictable buffers
475	 * are all included in this value.
476	 */
477	kstat_named_t arcstat_mfu_size;
478	/*
479	 * Number of bytes consumed by ARC buffers that are eligible for
480	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
481	 * state.
482	 */
483	kstat_named_t arcstat_mfu_evictable_data;
484	/*
485	 * Number of bytes consumed by ARC buffers that are eligible for
486	 * eviction, of type ARC_BUFC_METADATA, and reside in the
487	 * arc_mfu state.
488	 */
489	kstat_named_t arcstat_mfu_evictable_metadata;
490	/*
491	 * Total number of bytes that *would have been* consumed by ARC
492	 * buffers in the arc_mfu_ghost state. See the comment above
493	 * arcstat_mru_ghost_size for more details.
494	 */
495	kstat_named_t arcstat_mfu_ghost_size;
496	/*
497	 * Number of bytes that *would have been* consumed by ARC
498	 * buffers that are eligible for eviction, of type
499	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
500	 */
501	kstat_named_t arcstat_mfu_ghost_evictable_data;
502	/*
503	 * Number of bytes that *would have been* consumed by ARC
504	 * buffers that are eligible for eviction, of type
505	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
506	 */
507	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
508	kstat_named_t arcstat_l2_hits;
509	kstat_named_t arcstat_l2_misses;
510	kstat_named_t arcstat_l2_feeds;
511	kstat_named_t arcstat_l2_rw_clash;
512	kstat_named_t arcstat_l2_read_bytes;
513	kstat_named_t arcstat_l2_write_bytes;
514	kstat_named_t arcstat_l2_writes_sent;
515	kstat_named_t arcstat_l2_writes_done;
516	kstat_named_t arcstat_l2_writes_error;
517	kstat_named_t arcstat_l2_writes_hdr_miss;
518	kstat_named_t arcstat_l2_evict_lock_retry;
519	kstat_named_t arcstat_l2_evict_reading;
520	kstat_named_t arcstat_l2_evict_l1cached;
521	kstat_named_t arcstat_l2_free_on_write;
522	kstat_named_t arcstat_l2_cdata_free_on_write;
523	kstat_named_t arcstat_l2_abort_lowmem;
524	kstat_named_t arcstat_l2_cksum_bad;
525	kstat_named_t arcstat_l2_io_error;
526	kstat_named_t arcstat_l2_size;
527	kstat_named_t arcstat_l2_asize;
528	kstat_named_t arcstat_l2_hdr_size;
529	kstat_named_t arcstat_l2_compress_successes;
530	kstat_named_t arcstat_l2_compress_zeros;
531	kstat_named_t arcstat_l2_compress_failures;
532	kstat_named_t arcstat_l2_write_trylock_fail;
533	kstat_named_t arcstat_l2_write_passed_headroom;
534	kstat_named_t arcstat_l2_write_spa_mismatch;
535	kstat_named_t arcstat_l2_write_in_l2;
536	kstat_named_t arcstat_l2_write_hdr_io_in_progress;
537	kstat_named_t arcstat_l2_write_not_cacheable;
538	kstat_named_t arcstat_l2_write_full;
539	kstat_named_t arcstat_l2_write_buffer_iter;
540	kstat_named_t arcstat_l2_write_pios;
541	kstat_named_t arcstat_l2_write_buffer_bytes_scanned;
542	kstat_named_t arcstat_l2_write_buffer_list_iter;
543	kstat_named_t arcstat_l2_write_buffer_list_null_iter;
544	kstat_named_t arcstat_memory_throttle_count;
545	kstat_named_t arcstat_duplicate_buffers;
546	kstat_named_t arcstat_duplicate_buffers_size;
547	kstat_named_t arcstat_duplicate_reads;
548	kstat_named_t arcstat_meta_used;
549	kstat_named_t arcstat_meta_limit;
550	kstat_named_t arcstat_meta_max;
551	kstat_named_t arcstat_meta_min;
552} arc_stats_t;
553
554static arc_stats_t arc_stats = {
555	{ "hits",			KSTAT_DATA_UINT64 },
556	{ "misses",			KSTAT_DATA_UINT64 },
557	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
558	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
559	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
560	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
561	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
562	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
563	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
564	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
565	{ "mru_hits",			KSTAT_DATA_UINT64 },
566	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
567	{ "mfu_hits",			KSTAT_DATA_UINT64 },
568	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
569	{ "allocated",			KSTAT_DATA_UINT64 },
570	{ "deleted",			KSTAT_DATA_UINT64 },
571	{ "recycle_miss",		KSTAT_DATA_UINT64 },
572	{ "mutex_miss",			KSTAT_DATA_UINT64 },
573	{ "evict_skip",			KSTAT_DATA_UINT64 },
574	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
575	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
576	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
577	{ "hash_elements",		KSTAT_DATA_UINT64 },
578	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
579	{ "hash_collisions",		KSTAT_DATA_UINT64 },
580	{ "hash_chains",		KSTAT_DATA_UINT64 },
581	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
582	{ "p",				KSTAT_DATA_UINT64 },
583	{ "c",				KSTAT_DATA_UINT64 },
584	{ "c_min",			KSTAT_DATA_UINT64 },
585	{ "c_max",			KSTAT_DATA_UINT64 },
586	{ "size",			KSTAT_DATA_UINT64 },
587	{ "hdr_size",			KSTAT_DATA_UINT64 },
588	{ "data_size",			KSTAT_DATA_UINT64 },
589	{ "metadata_size",		KSTAT_DATA_UINT64 },
590	{ "other_size",			KSTAT_DATA_UINT64 },
591	{ "anon_size",			KSTAT_DATA_UINT64 },
592	{ "anon_evictable_data",	KSTAT_DATA_UINT64 },
593	{ "anon_evictable_metadata",	KSTAT_DATA_UINT64 },
594	{ "mru_size",			KSTAT_DATA_UINT64 },
595	{ "mru_evictable_data",		KSTAT_DATA_UINT64 },
596	{ "mru_evictable_metadata",	KSTAT_DATA_UINT64 },
597	{ "mru_ghost_size",		KSTAT_DATA_UINT64 },
598	{ "mru_ghost_evictable_data",	KSTAT_DATA_UINT64 },
599	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
600	{ "mfu_size",			KSTAT_DATA_UINT64 },
601	{ "mfu_evictable_data",		KSTAT_DATA_UINT64 },
602	{ "mfu_evictable_metadata",	KSTAT_DATA_UINT64 },
603	{ "mfu_ghost_size",		KSTAT_DATA_UINT64 },
604	{ "mfu_ghost_evictable_data",	KSTAT_DATA_UINT64 },
605	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
606	{ "l2_hits",			KSTAT_DATA_UINT64 },
607	{ "l2_misses",			KSTAT_DATA_UINT64 },
608	{ "l2_feeds",			KSTAT_DATA_UINT64 },
609	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
610	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
611	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
612	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
613	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
614	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
615	{ "l2_writes_hdr_miss",		KSTAT_DATA_UINT64 },
616	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
617	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
618	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
619	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
620	{ "l2_cdata_free_on_write",	KSTAT_DATA_UINT64 },
621	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
622	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
623	{ "l2_io_error",		KSTAT_DATA_UINT64 },
624	{ "l2_size",			KSTAT_DATA_UINT64 },
625	{ "l2_asize",			KSTAT_DATA_UINT64 },
626	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
627	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
628	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
629	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
630	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
631	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
632	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
633	{ "l2_write_in_l2",		KSTAT_DATA_UINT64 },
634	{ "l2_write_io_in_progress",	KSTAT_DATA_UINT64 },
635	{ "l2_write_not_cacheable",	KSTAT_DATA_UINT64 },
636	{ "l2_write_full",		KSTAT_DATA_UINT64 },
637	{ "l2_write_buffer_iter",	KSTAT_DATA_UINT64 },
638	{ "l2_write_pios",		KSTAT_DATA_UINT64 },
639	{ "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 },
640	{ "l2_write_buffer_list_iter",	KSTAT_DATA_UINT64 },
641	{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
642	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
643	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
644	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
645	{ "duplicate_reads",		KSTAT_DATA_UINT64 },
646	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
647	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
648	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
649	{ "arc_meta_min",		KSTAT_DATA_UINT64 }
650};
651
652#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
653
654#define	ARCSTAT_INCR(stat, val) \
655	atomic_add_64(&arc_stats.stat.value.ui64, (val))
656
657#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
658#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
659
660#define	ARCSTAT_MAX(stat, val) {					\
661	uint64_t m;							\
662	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
663	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
664		continue;						\
665}
666
667#define	ARCSTAT_MAXSTAT(stat) \
668	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
669
670/*
671 * We define a macro to allow ARC hits/misses to be easily broken down by
672 * two separate conditions, giving a total of four different subtypes for
673 * each of hits and misses (so eight statistics total).
674 */
675#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
676	if (cond1) {							\
677		if (cond2) {						\
678			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
679		} else {						\
680			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
681		}							\
682	} else {							\
683		if (cond2) {						\
684			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
685		} else {						\
686			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
687		}							\
688	}
689
690kstat_t			*arc_ksp;
691static arc_state_t	*arc_anon;
692static arc_state_t	*arc_mru;
693static arc_state_t	*arc_mru_ghost;
694static arc_state_t	*arc_mfu;
695static arc_state_t	*arc_mfu_ghost;
696static arc_state_t	*arc_l2c_only;
697
698/*
699 * There are several ARC variables that are critical to export as kstats --
700 * but we don't want to have to grovel around in the kstat whenever we wish to
701 * manipulate them.  For these variables, we therefore define them to be in
702 * terms of the statistic variable.  This assures that we are not introducing
703 * the possibility of inconsistency by having shadow copies of the variables,
704 * while still allowing the code to be readable.
705 */
706#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
707#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
708#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
709#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
710#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
711#define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
712#define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
713#define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
714#define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */
715
716#define	L2ARC_IS_VALID_COMPRESS(_c_) \
717	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
718
719static int		arc_no_grow;	/* Don't try to grow cache size */
720static uint64_t		arc_tempreserve;
721static uint64_t		arc_loaned_bytes;
722
723typedef struct arc_callback arc_callback_t;
724
725struct arc_callback {
726	void			*acb_private;
727	arc_done_func_t		*acb_done;
728	arc_buf_t		*acb_buf;
729	zio_t			*acb_zio_dummy;
730	arc_callback_t		*acb_next;
731};
732
733typedef struct arc_write_callback arc_write_callback_t;
734
735struct arc_write_callback {
736	void		*awcb_private;
737	arc_done_func_t	*awcb_ready;
738	arc_done_func_t	*awcb_physdone;
739	arc_done_func_t	*awcb_done;
740	arc_buf_t	*awcb_buf;
741};
742
743/*
744 * ARC buffers are separated into multiple structs as a memory saving measure:
745 *   - Common fields struct, always defined, and embedded within it:
746 *       - L2-only fields, always allocated but undefined when not in L2ARC
747 *       - L1-only fields, only allocated when in L1ARC
748 *
749 *           Buffer in L1                     Buffer only in L2
750 *    +------------------------+          +------------------------+
751 *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
752 *    |                        |          |                        |
753 *    |                        |          |                        |
754 *    |                        |          |                        |
755 *    +------------------------+          +------------------------+
756 *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
757 *    | (undefined if L1-only) |          |                        |
758 *    +------------------------+          +------------------------+
759 *    | l1arc_buf_hdr_t        |
760 *    |                        |
761 *    |                        |
762 *    |                        |
763 *    |                        |
764 *    +------------------------+
765 *
766 * Because it's possible for the L2ARC to become extremely large, we can wind
767 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
768 * is minimized by only allocating the fields necessary for an L1-cached buffer
769 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
770 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
771 * words in pointers. arc_hdr_realloc() is used to switch a header between
772 * these two allocation states.
773 */
774typedef struct l1arc_buf_hdr {
775	kmutex_t		b_freeze_lock;
776#ifdef ZFS_DEBUG
777	/*
778	 * used for debugging wtih kmem_flags - by allocating and freeing
779	 * b_thawed when the buffer is thawed, we get a record of the stack
780	 * trace that thawed it.
781	 */
782	void			*b_thawed;
783#endif
784
785	arc_buf_t		*b_buf;
786	uint32_t		b_datacnt;
787	/* for waiting on writes to complete */
788	kcondvar_t		b_cv;
789
790	/* protected by arc state mutex */
791	arc_state_t		*b_state;
792	list_node_t		b_arc_node;
793
794	/* updated atomically */
795	clock_t			b_arc_access;
796
797	/* self protecting */
798	refcount_t		b_refcnt;
799
800	arc_callback_t		*b_acb;
801	/* temporary buffer holder for in-flight compressed data */
802	void			*b_tmp_cdata;
803} l1arc_buf_hdr_t;
804
805typedef struct l2arc_dev l2arc_dev_t;
806
807typedef struct l2arc_buf_hdr {
808	/* protected by arc_buf_hdr mutex */
809	l2arc_dev_t		*b_dev;		/* L2ARC device */
810	uint64_t		b_daddr;	/* disk address, offset byte */
811	/* real alloc'd buffer size depending on b_compress applied */
812	int32_t			b_asize;
813
814	list_node_t		b_l2node;
815} l2arc_buf_hdr_t;
816
817struct arc_buf_hdr {
818	/* protected by hash lock */
819	dva_t			b_dva;
820	uint64_t		b_birth;
821	/*
822	 * Even though this checksum is only set/verified when a buffer is in
823	 * the L1 cache, it needs to be in the set of common fields because it
824	 * must be preserved from the time before a buffer is written out to
825	 * L2ARC until after it is read back in.
826	 */
827	zio_cksum_t		*b_freeze_cksum;
828
829	arc_buf_hdr_t		*b_hash_next;
830	arc_flags_t		b_flags;
831
832	/* immutable */
833	int32_t			b_size;
834	uint64_t		b_spa;
835
836	/* L2ARC fields. Undefined when not in L2ARC. */
837	l2arc_buf_hdr_t		b_l2hdr;
838	/* L1ARC fields. Undefined when in l2arc_only state */
839	l1arc_buf_hdr_t		b_l1hdr;
840};
841
842#ifdef _KERNEL
843static int
844sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
845{
846	uint64_t val;
847	int err;
848
849	val = arc_meta_limit;
850	err = sysctl_handle_64(oidp, &val, 0, req);
851	if (err != 0 || req->newptr == NULL)
852		return (err);
853
854        if (val <= 0 || val > arc_c_max)
855		return (EINVAL);
856
857	arc_meta_limit = val;
858	return (0);
859}
860#endif
861
862static arc_buf_t *arc_eviction_list;
863static kmutex_t arc_eviction_mtx;
864static arc_buf_hdr_t arc_eviction_hdr;
865
866#define	GHOST_STATE(state)	\
867	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
868	(state) == arc_l2c_only)
869
870#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
871#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
872#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
873#define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
874#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
875#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
876
877#define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
878#define	HDR_L2COMPRESS(hdr)	((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
879#define	HDR_L2_READING(hdr)	\
880	    (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
881	    ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
882#define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
883#define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
884#define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
885
886#define	HDR_ISTYPE_METADATA(hdr)	\
887	    ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
888#define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))
889
890#define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
891#define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
892
893/* For storing compression mode in b_flags */
894#define	HDR_COMPRESS_OFFSET	24
895#define	HDR_COMPRESS_NBITS	7
896
897#define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET(hdr->b_flags, \
898	    HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS))
899#define	HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \
900	    HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp))
901
902/*
903 * Other sizes
904 */
905
906#define	HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
907#define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
908
909/*
910 * Hash table routines
911 */
912
913#define	HT_LOCK_PAD	CACHE_LINE_SIZE
914
915struct ht_lock {
916	kmutex_t	ht_lock;
917#ifdef _KERNEL
918	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
919#endif
920};
921
922#define	BUF_LOCKS 256
923typedef struct buf_hash_table {
924	uint64_t ht_mask;
925	arc_buf_hdr_t **ht_table;
926	struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE);
927} buf_hash_table_t;
928
929static buf_hash_table_t buf_hash_table;
930
931#define	BUF_HASH_INDEX(spa, dva, birth) \
932	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
933#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
934#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
935#define	HDR_LOCK(hdr) \
936	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
937
938uint64_t zfs_crc64_table[256];
939
940/*
941 * Level 2 ARC
942 */
943
944#define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
945#define	L2ARC_HEADROOM		2			/* num of writes */
946/*
947 * If we discover during ARC scan any buffers to be compressed, we boost
948 * our headroom for the next scanning cycle by this percentage multiple.
949 */
950#define	L2ARC_HEADROOM_BOOST	200
951#define	L2ARC_FEED_SECS		1		/* caching interval secs */
952#define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
953
954/*
955 * Used to distinguish headers that are being process by
956 * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk
957 * address. This can happen when the header is added to the l2arc's list
958 * of buffers to write in the first stage of l2arc_write_buffers(), but
959 * has not yet been written out which happens in the second stage of
960 * l2arc_write_buffers().
961 */
962#define	L2ARC_ADDR_UNSET	((uint64_t)(-1))
963
964#define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
965#define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
966
967/* L2ARC Performance Tunables */
968uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
969uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
970uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
971uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
972uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
973uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
974boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
975boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
976boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
977
978SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
979    &l2arc_write_max, 0, "max write size");
980SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
981    &l2arc_write_boost, 0, "extra write during warmup");
982SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
983    &l2arc_headroom, 0, "number of dev writes");
984SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
985    &l2arc_feed_secs, 0, "interval seconds");
986SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
987    &l2arc_feed_min_ms, 0, "min interval milliseconds");
988
989SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
990    &l2arc_noprefetch, 0, "don't cache prefetch bufs");
991SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
992    &l2arc_feed_again, 0, "turbo warmup");
993SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
994    &l2arc_norw, 0, "no reads during writes");
995
996SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
997    &ARC_anon.arcs_size, 0, "size of anonymous state");
998SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
999    &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
1000SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
1001    &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");
1002
1003SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
1004    &ARC_mru.arcs_size, 0, "size of mru state");
1005SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
1006    &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
1007SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
1008    &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
1009
1010SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
1011    &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
1012SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
1013    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
1014    "size of metadata in mru ghost state");
1015SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
1016    &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
1017    "size of data in mru ghost state");
1018
1019SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
1020    &ARC_mfu.arcs_size, 0, "size of mfu state");
1021SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
1022    &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
1023SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
1024    &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
1025
1026SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
1027    &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
1028SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
1029    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
1030    "size of metadata in mfu ghost state");
1031SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
1032    &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
1033    "size of data in mfu ghost state");
1034
1035SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
1036    &ARC_l2c_only.arcs_size, 0, "size of mru state");
1037
1038/*
1039 * L2ARC Internals
1040 */
1041struct l2arc_dev {
1042	vdev_t			*l2ad_vdev;	/* vdev */
1043	spa_t			*l2ad_spa;	/* spa */
1044	uint64_t		l2ad_hand;	/* next write location */
1045	uint64_t		l2ad_start;	/* first addr on device */
1046	uint64_t		l2ad_end;	/* last addr on device */
1047	boolean_t		l2ad_first;	/* first sweep through */
1048	boolean_t		l2ad_writing;	/* currently writing */
1049	kmutex_t		l2ad_mtx;	/* lock for buffer list */
1050	list_t			l2ad_buflist;	/* buffer list */
1051	list_node_t		l2ad_node;	/* device list node */
1052	refcount_t		l2ad_alloc;	/* allocated bytes */
1053};
1054
1055static list_t L2ARC_dev_list;			/* device list */
1056static list_t *l2arc_dev_list;			/* device list pointer */
1057static kmutex_t l2arc_dev_mtx;			/* device list mutex */
1058static l2arc_dev_t *l2arc_dev_last;		/* last device used */
1059static list_t L2ARC_free_on_write;		/* free after write buf list */
1060static list_t *l2arc_free_on_write;		/* free after write list ptr */
1061static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
1062static uint64_t l2arc_ndev;			/* number of devices */
1063
1064typedef struct l2arc_read_callback {
1065	arc_buf_t		*l2rcb_buf;		/* read buffer */
1066	spa_t			*l2rcb_spa;		/* spa */
1067	blkptr_t		l2rcb_bp;		/* original blkptr */
1068	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
1069	int			l2rcb_flags;		/* original flags */
1070	enum zio_compress	l2rcb_compress;		/* applied compress */
1071} l2arc_read_callback_t;
1072
1073typedef struct l2arc_write_callback {
1074	l2arc_dev_t	*l2wcb_dev;		/* device info */
1075	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
1076} l2arc_write_callback_t;
1077
1078typedef struct l2arc_data_free {
1079	/* protected by l2arc_free_on_write_mtx */
1080	void		*l2df_data;
1081	size_t		l2df_size;
1082	void		(*l2df_func)(void *, size_t);
1083	list_node_t	l2df_list_node;
1084} l2arc_data_free_t;
1085
1086static kmutex_t l2arc_feed_thr_lock;
1087static kcondvar_t l2arc_feed_thr_cv;
1088static uint8_t l2arc_thread_exit;
1089
1090static void arc_get_data_buf(arc_buf_t *);
1091static void arc_access(arc_buf_hdr_t *, kmutex_t *);
1092static int arc_evict_needed(arc_buf_contents_t);
1093static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t);
1094static void arc_buf_watch(arc_buf_t *);
1095
1096static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
1097static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
1098
1099static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
1100static void l2arc_read_done(zio_t *);
1101
1102static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
1103static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
1104static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
1105
1106static uint64_t
1107buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
1108{
1109	uint8_t *vdva = (uint8_t *)dva;
1110	uint64_t crc = -1ULL;
1111	int i;
1112
1113	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
1114
1115	for (i = 0; i < sizeof (dva_t); i++)
1116		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
1117
1118	crc ^= (spa>>8) ^ birth;
1119
1120	return (crc);
1121}
1122
1123#define	BUF_EMPTY(buf)						\
1124	((buf)->b_dva.dva_word[0] == 0 &&			\
1125	(buf)->b_dva.dva_word[1] == 0)
1126
1127#define	BUF_EQUAL(spa, dva, birth, buf)				\
1128	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
1129	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
1130	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
1131
1132static void
1133buf_discard_identity(arc_buf_hdr_t *hdr)
1134{
1135	hdr->b_dva.dva_word[0] = 0;
1136	hdr->b_dva.dva_word[1] = 0;
1137	hdr->b_birth = 0;
1138}
1139
1140static arc_buf_hdr_t *
1141buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
1142{
1143	const dva_t *dva = BP_IDENTITY(bp);
1144	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
1145	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
1146	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1147	arc_buf_hdr_t *hdr;
1148
1149	mutex_enter(hash_lock);
1150	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
1151	    hdr = hdr->b_hash_next) {
1152		if (BUF_EQUAL(spa, dva, birth, hdr)) {
1153			*lockp = hash_lock;
1154			return (hdr);
1155		}
1156	}
1157	mutex_exit(hash_lock);
1158	*lockp = NULL;
1159	return (NULL);
1160}
1161
1162/*
1163 * Insert an entry into the hash table.  If there is already an element
1164 * equal to elem in the hash table, then the already existing element
1165 * will be returned and the new element will not be inserted.
1166 * Otherwise returns NULL.
1167 * If lockp == NULL, the caller is assumed to already hold the hash lock.
1168 */
1169static arc_buf_hdr_t *
1170buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
1171{
1172	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1173	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1174	arc_buf_hdr_t *fhdr;
1175	uint32_t i;
1176
1177	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
1178	ASSERT(hdr->b_birth != 0);
1179	ASSERT(!HDR_IN_HASH_TABLE(hdr));
1180
1181	if (lockp != NULL) {
1182		*lockp = hash_lock;
1183		mutex_enter(hash_lock);
1184	} else {
1185		ASSERT(MUTEX_HELD(hash_lock));
1186	}
1187
1188	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
1189	    fhdr = fhdr->b_hash_next, i++) {
1190		if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
1191			return (fhdr);
1192	}
1193
1194	hdr->b_hash_next = buf_hash_table.ht_table[idx];
1195	buf_hash_table.ht_table[idx] = hdr;
1196	hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
1197
1198	/* collect some hash table performance data */
1199	if (i > 0) {
1200		ARCSTAT_BUMP(arcstat_hash_collisions);
1201		if (i == 1)
1202			ARCSTAT_BUMP(arcstat_hash_chains);
1203
1204		ARCSTAT_MAX(arcstat_hash_chain_max, i);
1205	}
1206
1207	ARCSTAT_BUMP(arcstat_hash_elements);
1208	ARCSTAT_MAXSTAT(arcstat_hash_elements);
1209
1210	return (NULL);
1211}
1212
1213static void
1214buf_hash_remove(arc_buf_hdr_t *hdr)
1215{
1216	arc_buf_hdr_t *fhdr, **hdrp;
1217	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1218
1219	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
1220	ASSERT(HDR_IN_HASH_TABLE(hdr));
1221
1222	hdrp = &buf_hash_table.ht_table[idx];
1223	while ((fhdr = *hdrp) != hdr) {
1224		ASSERT(fhdr != NULL);
1225		hdrp = &fhdr->b_hash_next;
1226	}
1227	*hdrp = hdr->b_hash_next;
1228	hdr->b_hash_next = NULL;
1229	hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;
1230
1231	/* collect some hash table performance data */
1232	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1233
1234	if (buf_hash_table.ht_table[idx] &&
1235	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1236		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1237}
1238
1239/*
1240 * Global data structures and functions for the buf kmem cache.
1241 */
1242static kmem_cache_t *hdr_full_cache;
1243static kmem_cache_t *hdr_l2only_cache;
1244static kmem_cache_t *buf_cache;
1245
1246static void
1247buf_fini(void)
1248{
1249	int i;
1250
1251	kmem_free(buf_hash_table.ht_table,
1252	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
1253	for (i = 0; i < BUF_LOCKS; i++)
1254		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1255	kmem_cache_destroy(hdr_full_cache);
1256	kmem_cache_destroy(hdr_l2only_cache);
1257	kmem_cache_destroy(buf_cache);
1258}
1259
1260/*
1261 * Constructor callback - called when the cache is empty
1262 * and a new buf is requested.
1263 */
1264/* ARGSUSED */
1265static int
1266hdr_full_cons(void *vbuf, void *unused, int kmflag)
1267{
1268	arc_buf_hdr_t *hdr = vbuf;
1269
1270	bzero(hdr, HDR_FULL_SIZE);
1271	cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
1272	refcount_create(&hdr->b_l1hdr.b_refcnt);
1273	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1274	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1275
1276	return (0);
1277}
1278
1279/* ARGSUSED */
1280static int
1281hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
1282{
1283	arc_buf_hdr_t *hdr = vbuf;
1284
1285	bzero(hdr, HDR_L2ONLY_SIZE);
1286	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1287
1288	return (0);
1289}
1290
1291/* ARGSUSED */
1292static int
1293buf_cons(void *vbuf, void *unused, int kmflag)
1294{
1295	arc_buf_t *buf = vbuf;
1296
1297	bzero(buf, sizeof (arc_buf_t));
1298	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1299	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1300
1301	return (0);
1302}
1303
1304/*
1305 * Destructor callback - called when a cached buf is
1306 * no longer required.
1307 */
1308/* ARGSUSED */
1309static void
1310hdr_full_dest(void *vbuf, void *unused)
1311{
1312	arc_buf_hdr_t *hdr = vbuf;
1313
1314	ASSERT(BUF_EMPTY(hdr));
1315	cv_destroy(&hdr->b_l1hdr.b_cv);
1316	refcount_destroy(&hdr->b_l1hdr.b_refcnt);
1317	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
1318	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1319}
1320
1321/* ARGSUSED */
1322static void
1323hdr_l2only_dest(void *vbuf, void *unused)
1324{
1325	arc_buf_hdr_t *hdr = vbuf;
1326
1327	ASSERT(BUF_EMPTY(hdr));
1328	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1329}
1330
1331/* ARGSUSED */
1332static void
1333buf_dest(void *vbuf, void *unused)
1334{
1335	arc_buf_t *buf = vbuf;
1336
1337	mutex_destroy(&buf->b_evict_lock);
1338	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1339}
1340
1341/*
1342 * Reclaim callback -- invoked when memory is low.
1343 */
1344/* ARGSUSED */
1345static void
1346hdr_recl(void *unused)
1347{
1348	dprintf("hdr_recl called\n");
1349	/*
1350	 * umem calls the reclaim func when we destroy the buf cache,
1351	 * which is after we do arc_fini().
1352	 */
1353	if (!arc_dead)
1354		cv_signal(&arc_reclaim_thr_cv);
1355}
1356
1357static void
1358buf_init(void)
1359{
1360	uint64_t *ct;
1361	uint64_t hsize = 1ULL << 12;
1362	int i, j;
1363
1364	/*
1365	 * The hash table is big enough to fill all of physical memory
1366	 * with an average block size of zfs_arc_average_blocksize (default 8K).
1367	 * By default, the table will take up
1368	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1369	 */
1370	while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
1371		hsize <<= 1;
1372retry:
1373	buf_hash_table.ht_mask = hsize - 1;
1374	buf_hash_table.ht_table =
1375	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1376	if (buf_hash_table.ht_table == NULL) {
1377		ASSERT(hsize > (1ULL << 8));
1378		hsize >>= 1;
1379		goto retry;
1380	}
1381
1382	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
1383	    0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
1384	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
1385	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
1386	    NULL, NULL, 0);
1387	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1388	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1389
1390	for (i = 0; i < 256; i++)
1391		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1392			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1393
1394	for (i = 0; i < BUF_LOCKS; i++) {
1395		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1396		    NULL, MUTEX_DEFAULT, NULL);
1397	}
1398}
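
/*
 * Illustrative sketch: the hash-table sizing loop in buf_init() above,
 * restated as a standalone user-space function.  example_hash_table_size()
 * and the sizes used in main() are hypothetical; the real values come from
 * physmem, PAGESIZE and zfs_arc_average_blocksize.
 */
#if 0	/* illustration only, never compiled */
#include <stdint.h>
#include <stdio.h>

static uint64_t
example_hash_table_size(uint64_t physmem_bytes, uint64_t avg_blocksize)
{
	uint64_t hsize = 1ULL << 12;	/* start at 4096 buckets */

	/* Double until there is one bucket per average-sized block. */
	while (hsize * avg_blocksize < physmem_bytes)
		hsize <<= 1;
	return (hsize);
}

int
main(void)
{
	/* 16 GiB of RAM, 8 KiB average block size (the default). */
	uint64_t hsize = example_hash_table_size(16ULL << 30, 8192);

	/* 2^21 buckets -> 16 MiB of bucket pointers with 8-byte pointers. */
	printf("buckets=%ju table=%ju bytes\n",
	    (uintmax_t)hsize, (uintmax_t)(hsize * sizeof (void *)));
	return (0);
}
#endif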
1399
1400/*
1401 * Transition between the two allocation states for the arc_buf_hdr struct.
1402 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
1403 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
1404 * version is used when a cache buffer is only in the L2ARC in order to reduce
1405 * memory usage.
1406 */
1407static arc_buf_hdr_t *
1408arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
1409{
1410	ASSERT(HDR_HAS_L2HDR(hdr));
1411
1412	arc_buf_hdr_t *nhdr;
1413	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
1414
1415	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
1416	    (old == hdr_l2only_cache && new == hdr_full_cache));
1417
1418	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
1419
1420	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
1421	buf_hash_remove(hdr);
1422
1423	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
1424
1425	if (new == hdr_full_cache) {
1426		nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
1427		/*
1428		 * arc_access and arc_change_state need to be aware that a
1429		 * header has just come out of L2ARC, so we set its state to
1430		 * l2c_only even though it's about to change.
1431		 */
1432		nhdr->b_l1hdr.b_state = arc_l2c_only;
1433	} else {
1434		ASSERT(hdr->b_l1hdr.b_buf == NULL);
1435		ASSERT0(hdr->b_l1hdr.b_datacnt);
1436		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
1437		/*
1438		 * We might be removing the L1hdr of a buffer which was just
1439		 * written out to L2ARC. If such a buffer is compressed then we
1440		 * need to free its b_tmp_cdata before destroying the header.
1441		 */
1442		if (hdr->b_l1hdr.b_tmp_cdata != NULL &&
1443		    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
1444			l2arc_release_cdata_buf(hdr);
1445		nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
1446	}
1447	/*
1448	 * The header has been reallocated so we need to re-insert it into any
1449	 * lists it was on.
1450	 */
1451	(void) buf_hash_insert(nhdr, NULL);
1452
1453	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
1454
1455	mutex_enter(&dev->l2ad_mtx);
1456
1457	/*
1458	 * We must place the realloc'ed header back into the list at
1459	 * the same spot. Otherwise, if it's placed earlier in the list,
1460	 * l2arc_write_buffers() could find it during the function's
1461	 * write phase, and try to write it out to the l2arc.
1462	 */
1463	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
1464	list_remove(&dev->l2ad_buflist, hdr);
1465
1466	mutex_exit(&dev->l2ad_mtx);
1467
1468	/*
1469	 * Since we're using the pointer address as the tag when
1470	 * incrementing and decrementing the l2ad_alloc refcount, we
1471	 * must remove the old pointer (that we're about to destroy) and
1472	 * add the new pointer to the refcount. Otherwise we'd remove
1473	 * the wrong pointer address when calling arc_hdr_destroy() later.
1474	 */
1475
1476	(void) refcount_remove_many(&dev->l2ad_alloc,
1477	    hdr->b_l2hdr.b_asize, hdr);
1478
1479	(void) refcount_add_many(&dev->l2ad_alloc,
1480	    nhdr->b_l2hdr.b_asize, nhdr);
1481
1482	buf_discard_identity(hdr);
1483	hdr->b_freeze_cksum = NULL;
1484	kmem_cache_free(old, hdr);
1485
1486	return (nhdr);
1487}
1488
1489
1490#define	ARC_MINTIME	(hz>>4) /* 62 ms */
1491
1492static void
1493arc_cksum_verify(arc_buf_t *buf)
1494{
1495	zio_cksum_t zc;
1496
1497	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1498		return;
1499
1500	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1501	if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
1502		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1503		return;
1504	}
1505	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1506	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1507		panic("buffer modified while frozen!");
1508	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1509}
1510
1511static int
1512arc_cksum_equal(arc_buf_t *buf)
1513{
1514	zio_cksum_t zc;
1515	int equal;
1516
1517	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1518	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
1519	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1520	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1521
1522	return (equal);
1523}
1524
1525static void
1526arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1527{
1528	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1529		return;
1530
1531	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1532	if (buf->b_hdr->b_freeze_cksum != NULL) {
1533		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1534		return;
1535	}
1536	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1537	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1538	    buf->b_hdr->b_freeze_cksum);
1539	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1540#ifdef illumos
1541	arc_buf_watch(buf);
1542#endif
1543}
1544
1545#ifdef illumos
1546#ifndef _KERNEL
1547typedef struct procctl {
1548	long cmd;
1549	prwatch_t prwatch;
1550} procctl_t;
1551#endif
1552
1553/* ARGSUSED */
1554static void
1555arc_buf_unwatch(arc_buf_t *buf)
1556{
1557#ifndef _KERNEL
1558	if (arc_watch) {
1559		int result;
1560		procctl_t ctl;
1561		ctl.cmd = PCWATCH;
1562		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1563		ctl.prwatch.pr_size = 0;
1564		ctl.prwatch.pr_wflags = 0;
1565		result = write(arc_procfd, &ctl, sizeof (ctl));
1566		ASSERT3U(result, ==, sizeof (ctl));
1567	}
1568#endif
1569}
1570
1571/* ARGSUSED */
1572static void
1573arc_buf_watch(arc_buf_t *buf)
1574{
1575#ifndef _KERNEL
1576	if (arc_watch) {
1577		int result;
1578		procctl_t ctl;
1579		ctl.cmd = PCWATCH;
1580		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1581		ctl.prwatch.pr_size = buf->b_hdr->b_size;
1582		ctl.prwatch.pr_wflags = WA_WRITE;
1583		result = write(arc_procfd, &ctl, sizeof (ctl));
1584		ASSERT3U(result, ==, sizeof (ctl));
1585	}
1586#endif
1587}
1588#endif /* illumos */
1589
1590static arc_buf_contents_t
1591arc_buf_type(arc_buf_hdr_t *hdr)
1592{
1593	if (HDR_ISTYPE_METADATA(hdr)) {
1594		return (ARC_BUFC_METADATA);
1595	} else {
1596		return (ARC_BUFC_DATA);
1597	}
1598}
1599
1600static uint32_t
1601arc_bufc_to_flags(arc_buf_contents_t type)
1602{
1603	switch (type) {
1604	case ARC_BUFC_DATA:
1605		/* metadata field is 0 if buffer contains normal data */
1606		return (0);
1607	case ARC_BUFC_METADATA:
1608		return (ARC_FLAG_BUFC_METADATA);
1609	default:
1610		break;
1611	}
1612	panic("undefined ARC buffer type!");
1613	return ((uint32_t)-1);
1614}
1615
1616void
1617arc_buf_thaw(arc_buf_t *buf)
1618{
1619	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1620		if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
1621			panic("modifying non-anon buffer!");
1622		if (HDR_IO_IN_PROGRESS(buf->b_hdr))
1623			panic("modifying buffer while i/o in progress!");
1624		arc_cksum_verify(buf);
1625	}
1626
1627	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1628	if (buf->b_hdr->b_freeze_cksum != NULL) {
1629		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1630		buf->b_hdr->b_freeze_cksum = NULL;
1631	}
1632
1633#ifdef ZFS_DEBUG
1634	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1635		if (buf->b_hdr->b_l1hdr.b_thawed != NULL)
1636			kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1);
1637		buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
1638	}
1639#endif
1640
1641	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1642
1643#ifdef illumos
1644	arc_buf_unwatch(buf);
1645#endif
1646}
1647
1648void
1649arc_buf_freeze(arc_buf_t *buf)
1650{
1651	kmutex_t *hash_lock;
1652
1653	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1654		return;
1655
1656	hash_lock = HDR_LOCK(buf->b_hdr);
1657	mutex_enter(hash_lock);
1658
1659	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1660	    buf->b_hdr->b_l1hdr.b_state == arc_anon);
1661	arc_cksum_compute(buf, B_FALSE);
1662	mutex_exit(hash_lock);
1663
1664}
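
/*
 * Illustrative sketch: the ZFS_DEBUG_MODIFY freeze/verify/thaw protocol
 * used by arc_cksum_compute(), arc_cksum_verify() and arc_buf_thaw() above,
 * modeled in user space.  A trivial xor sum stands in for
 * fletcher_2_native(); everything prefixed "example_" is hypothetical.
 */
#if 0	/* illustration only, never compiled */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

typedef struct example_buf {
	uint8_t		data[64];
	uint64_t	cksum;		/* stand-in for b_freeze_cksum */
	int		frozen;
} example_buf_t;

static uint64_t
example_sum(const example_buf_t *b)
{
	uint64_t s = 0;

	for (size_t i = 0; i < sizeof (b->data); i++)
		s ^= (uint64_t)b->data[i] << (8 * (i % 8));
	return (s);
}

/* arc_buf_freeze(): remember the checksum of the stable contents. */
static void
example_freeze(example_buf_t *b)
{
	b->cksum = example_sum(b);
	b->frozen = 1;
}

/* arc_cksum_verify(): any modification while frozen trips the assert. */
static void
example_verify(const example_buf_t *b)
{
	if (b->frozen)
		assert(b->cksum == example_sum(b));
}

/* arc_buf_thaw(): discard the checksum before a legitimate modification. */
static void
example_thaw(example_buf_t *b)
{
	b->frozen = 0;
}
#endif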
1665
1666static void
1667add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1668{
1669	ASSERT(HDR_HAS_L1HDR(hdr));
1670	ASSERT(MUTEX_HELD(hash_lock));
1671	arc_state_t *state = hdr->b_l1hdr.b_state;
1672
1673	if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
1674	    (state != arc_anon)) {
1675		/* We don't use the L2-only state list. */
1676		if (state != arc_l2c_only) {
1677			uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
1678			list_t *list = &state->arcs_list[arc_buf_type(hdr)];
1679			uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
1680
1681			ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1682			mutex_enter(&state->arcs_mtx);
1683			ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
1684			list_remove(list, hdr);
1685			if (GHOST_STATE(state)) {
1686				ASSERT0(hdr->b_l1hdr.b_datacnt);
1687				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
1688				delta = hdr->b_size;
1689			}
1690			ASSERT(delta > 0);
1691			ASSERT3U(*size, >=, delta);
1692			atomic_add_64(size, -delta);
1693			mutex_exit(&state->arcs_mtx);
1694		}
1695		/* remove the prefetch flag if we get a reference */
1696		hdr->b_flags &= ~ARC_FLAG_PREFETCH;
1697	}
1698}
1699
1700static int
1701remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1702{
1703	int cnt;
1704	arc_state_t *state = hdr->b_l1hdr.b_state;
1705
1706	ASSERT(HDR_HAS_L1HDR(hdr));
1707	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1708	ASSERT(!GHOST_STATE(state));
1709
1710	/*
1711	 * arc_l2c_only counts as a ghost state so we don't need to explicitly
1712	 * check to prevent usage of the arc_l2c_only list.
1713	 */
1714	if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
1715	    (state != arc_anon)) {
1716		uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)];
1717
1718		ASSERT(!MUTEX_HELD(&state->arcs_mtx));
1719		mutex_enter(&state->arcs_mtx);
1720		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
1721		list_insert_head(&state->arcs_list[arc_buf_type(hdr)], hdr);
1722		ASSERT(hdr->b_l1hdr.b_datacnt > 0);
1723		atomic_add_64(size, hdr->b_size *
1724		    hdr->b_l1hdr.b_datacnt);
1725		mutex_exit(&state->arcs_mtx);
1726	}
1727	return (cnt);
1728}
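
/*
 * Illustrative sketch: the invariant maintained by add_reference() and
 * remove_reference() above -- ignoring the anonymous and l2c_only special
 * cases, a header sits on its state's evictable list exactly when its
 * reference count is zero.  The "example_" model below is hypothetical.
 */
#if 0	/* illustration only, never compiled */
typedef struct example_hdr {
	int	refcnt;
	int	on_evictable_list;
} example_hdr_t;

static void
example_add_ref(example_hdr_t *h)
{
	if (++h->refcnt == 1)
		h->on_evictable_list = 0;	/* list_remove() in add_reference() */
}

static void
example_remove_ref(example_hdr_t *h)
{
	if (--h->refcnt == 0)
		h->on_evictable_list = 1;	/* list_insert_head() in remove_reference() */
}
#endif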
1729
1730/*
1731 * Move the supplied buffer to the indicated state.  The mutex
1732 * for the buffer must be held by the caller.
1733 */
1734static void
1735arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
1736    kmutex_t *hash_lock)
1737{
1738	arc_state_t *old_state;
1739	int64_t refcnt;
1740	uint32_t datacnt;
1741	uint64_t from_delta, to_delta;
1742	arc_buf_contents_t buftype = arc_buf_type(hdr);
1743
1744	/*
1745	 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
1746	 * in arc_read() when bringing a buffer out of the L2ARC.  However, the
1747	 * L1 hdr doesn't always exist when we change state to arc_anon before
1748	 * destroying a header, in which case reallocating to add the L1 hdr is
1749	 * pointless.
1750	 */
1751	if (HDR_HAS_L1HDR(hdr)) {
1752		old_state = hdr->b_l1hdr.b_state;
1753		refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
1754		datacnt = hdr->b_l1hdr.b_datacnt;
1755	} else {
1756		old_state = arc_l2c_only;
1757		refcnt = 0;
1758		datacnt = 0;
1759	}
1760
1761	ASSERT(MUTEX_HELD(hash_lock));
1762	ASSERT3P(new_state, !=, old_state);
1763	ASSERT(refcnt == 0 || datacnt > 0);
1764	ASSERT(!GHOST_STATE(new_state) || datacnt == 0);
1765	ASSERT(old_state != arc_anon || datacnt <= 1);
1766
1767	from_delta = to_delta = datacnt * hdr->b_size;
1768
1769	/*
1770	 * If this buffer is evictable, transfer it from the
1771	 * old state list to the new state list.
1772	 */
1773	if (refcnt == 0) {
1774		if (old_state != arc_anon && old_state != arc_l2c_only) {
1775			int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
1776			uint64_t *size = &old_state->arcs_lsize[buftype];
1777
1778			if (use_mutex)
1779				mutex_enter(&old_state->arcs_mtx);
1780
1781			ASSERT(HDR_HAS_L1HDR(hdr));
1782			ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
1783			list_remove(&old_state->arcs_list[buftype], hdr);
1784
1785			/*
1786			 * If prefetching out of the ghost cache,
1787			 * we will have a non-zero datacnt.
1788			 */
1789			if (GHOST_STATE(old_state) && datacnt == 0) {
1790				/* ghost elements have a ghost size */
1791				ASSERT(hdr->b_l1hdr.b_buf == NULL);
1792				from_delta = hdr->b_size;
1793			}
1794			ASSERT3U(*size, >=, from_delta);
1795			atomic_add_64(size, -from_delta);
1796
1797			if (use_mutex)
1798				mutex_exit(&old_state->arcs_mtx);
1799		}
1800		if (new_state != arc_anon && new_state != arc_l2c_only) {
1801			int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
1802			uint64_t *size = &new_state->arcs_lsize[buftype];
1803
1804			/*
1805			 * An L1 header always exists here, since if we're
1806			 * moving to some L1-cached state (i.e. not l2c_only or
1807			 * anonymous), we realloc the header to add an L1hdr
1808			 * beforehand.
1809			 */
1810			ASSERT(HDR_HAS_L1HDR(hdr));
1811			if (use_mutex)
1812				mutex_enter(&new_state->arcs_mtx);
1813
1814			list_insert_head(&new_state->arcs_list[buftype], hdr);
1815
1816			/* ghost elements have a ghost size */
1817			if (GHOST_STATE(new_state)) {
1818				ASSERT0(datacnt);
1819				ASSERT(hdr->b_l1hdr.b_buf == NULL);
1820				to_delta = hdr->b_size;
1821			}
1822			atomic_add_64(size, to_delta);
1823
1824			if (use_mutex)
1825				mutex_exit(&new_state->arcs_mtx);
1826		}
1827	}
1828
1829	ASSERT(!BUF_EMPTY(hdr));
1830	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
1831		buf_hash_remove(hdr);
1832
1833	/* adjust state sizes (ignore arc_l2c_only) */
1834	if (to_delta && new_state != arc_l2c_only)
1835		atomic_add_64(&new_state->arcs_size, to_delta);
1836	if (from_delta && old_state != arc_l2c_only) {
1837		ASSERT3U(old_state->arcs_size, >=, from_delta);
1838		atomic_add_64(&old_state->arcs_size, -from_delta);
1839	}
1840	if (HDR_HAS_L1HDR(hdr))
1841		hdr->b_l1hdr.b_state = new_state;
1842
1843	/*
1844	 * L2-only headers should never be on the l2c_only state lists since
1845	 * they don't have L1 headers allocated.
1846	 */
1847	ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
1848	    list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
1849}
1850
1851void
1852arc_space_consume(uint64_t space, arc_space_type_t type)
1853{
1854	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1855
1856	switch (type) {
1857	case ARC_SPACE_DATA:
1858		ARCSTAT_INCR(arcstat_data_size, space);
1859		break;
1860	case ARC_SPACE_META:
1861		ARCSTAT_INCR(arcstat_metadata_size, space);
1862		break;
1863	case ARC_SPACE_OTHER:
1864		ARCSTAT_INCR(arcstat_other_size, space);
1865		break;
1866	case ARC_SPACE_HDRS:
1867		ARCSTAT_INCR(arcstat_hdr_size, space);
1868		break;
1869	case ARC_SPACE_L2HDRS:
1870		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
1871		break;
1872	}
1873
1874	if (type != ARC_SPACE_DATA)
1875		ARCSTAT_INCR(arcstat_meta_used, space);
1876
1877	atomic_add_64(&arc_size, space);
1878}
1879
1880void
1881arc_space_return(uint64_t space, arc_space_type_t type)
1882{
1883	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
1884
1885	switch (type) {
1886	case ARC_SPACE_DATA:
1887		ARCSTAT_INCR(arcstat_data_size, -space);
1888		break;
1889	case ARC_SPACE_META:
1890		ARCSTAT_INCR(arcstat_metadata_size, -space);
1891		break;
1892	case ARC_SPACE_OTHER:
1893		ARCSTAT_INCR(arcstat_other_size, -space);
1894		break;
1895	case ARC_SPACE_HDRS:
1896		ARCSTAT_INCR(arcstat_hdr_size, -space);
1897		break;
1898	case ARC_SPACE_L2HDRS:
1899		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
1900		break;
1901	}
1902
1903	if (type != ARC_SPACE_DATA) {
1904		ASSERT(arc_meta_used >= space);
1905		if (arc_meta_max < arc_meta_used)
1906			arc_meta_max = arc_meta_used;
1907		ARCSTAT_INCR(arcstat_meta_used, -space);
1908	}
1909
1910	ASSERT(arc_size >= space);
1911	atomic_add_64(&arc_size, -space);
1912}
1913
1914arc_buf_t *
1915arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type)
1916{
1917	arc_buf_hdr_t *hdr;
1918	arc_buf_t *buf;
1919
1920	ASSERT3U(size, >, 0);
1921	hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
1922	ASSERT(BUF_EMPTY(hdr));
1923	ASSERT3P(hdr->b_freeze_cksum, ==, NULL);
1924	hdr->b_size = size;
1925	hdr->b_spa = spa_load_guid(spa);
1926
1927	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
1928	buf->b_hdr = hdr;
1929	buf->b_data = NULL;
1930	buf->b_efunc = NULL;
1931	buf->b_private = NULL;
1932	buf->b_next = NULL;
1933
1934	hdr->b_flags = arc_bufc_to_flags(type);
1935	hdr->b_flags |= ARC_FLAG_HAS_L1HDR;
1936
1937	hdr->b_l1hdr.b_buf = buf;
1938	hdr->b_l1hdr.b_state = arc_anon;
1939	hdr->b_l1hdr.b_arc_access = 0;
1940	hdr->b_l1hdr.b_datacnt = 1;
1941
1942	arc_get_data_buf(buf);
1943	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
1944	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
1945
1946	return (buf);
1947}
1948
1949static char *arc_onloan_tag = "onloan";
1950
1951/*
1952 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
1953 * flight data by arc_tempreserve_space() until they are "returned". Loaned
1954 * buffers must be returned to the arc before they can be used by the DMU or
1955 * freed.
1956 */
1957arc_buf_t *
1958arc_loan_buf(spa_t *spa, int size)
1959{
1960	arc_buf_t *buf;
1961
1962	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
1963
1964	atomic_add_64(&arc_loaned_bytes, size);
1965	return (buf);
1966}
1967
1968/*
1969 * Return a loaned arc buffer to the arc.
1970 */
1971void
1972arc_return_buf(arc_buf_t *buf, void *tag)
1973{
1974	arc_buf_hdr_t *hdr = buf->b_hdr;
1975
1976	ASSERT(buf->b_data != NULL);
1977	ASSERT(HDR_HAS_L1HDR(hdr));
1978	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
1979	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
1980
1981	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
1982}
1983
1984/* Detach an arc_buf from a dbuf (tag) */
1985void
1986arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
1987{
1988	arc_buf_hdr_t *hdr = buf->b_hdr;
1989
1990	ASSERT(buf->b_data != NULL);
1991	ASSERT(HDR_HAS_L1HDR(hdr));
1992	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
1993	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
1994	buf->b_efunc = NULL;
1995	buf->b_private = NULL;
1996
1997	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
1998}
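
/*
 * Illustrative sketch: the loaned-buffer lifecycle implemented by
 * arc_loan_buf(), arc_return_buf() and arc_loan_inuse_buf() above.  The
 * spa pointer and tag below are hypothetical placeholders supplied by a
 * caller such as the DMU.
 */
#if 0	/* illustration only, never compiled */
static void
example_loan_cycle(spa_t *spa, void *tag)
{
	/* Borrow an anonymous 128K buffer; it is charged to arc_loaned_bytes. */
	arc_buf_t *buf = arc_loan_buf(spa, 128 * 1024);

	/* ... fill buf->b_data with data destined for the DMU ... */

	/*
	 * Hand the buffer back: the loan tag is swapped for the caller's
	 * tag and arc_loaned_bytes is decremented again.
	 */
	arc_return_buf(buf, tag);
}
#endif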
1999
2000static arc_buf_t *
2001arc_buf_clone(arc_buf_t *from)
2002{
2003	arc_buf_t *buf;
2004	arc_buf_hdr_t *hdr = from->b_hdr;
2005	uint64_t size = hdr->b_size;
2006
2007	ASSERT(HDR_HAS_L1HDR(hdr));
2008	ASSERT(hdr->b_l1hdr.b_state != arc_anon);
2009
2010	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2011	buf->b_hdr = hdr;
2012	buf->b_data = NULL;
2013	buf->b_efunc = NULL;
2014	buf->b_private = NULL;
2015	buf->b_next = hdr->b_l1hdr.b_buf;
2016	hdr->b_l1hdr.b_buf = buf;
2017	arc_get_data_buf(buf);
2018	bcopy(from->b_data, buf->b_data, size);
2019
2020	/*
2021	 * This buffer already exists in the arc so create a duplicate
2022	 * copy for the caller.  If the buffer is associated with user data
2023	 * then track the size and number of duplicates.  These stats will be
2024	 * updated as duplicate buffers are created and destroyed.
2025	 */
2026	if (HDR_ISTYPE_DATA(hdr)) {
2027		ARCSTAT_BUMP(arcstat_duplicate_buffers);
2028		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
2029	}
2030	hdr->b_l1hdr.b_datacnt += 1;
2031	return (buf);
2032}
2033
2034void
2035arc_buf_add_ref(arc_buf_t *buf, void* tag)
2036{
2037	arc_buf_hdr_t *hdr;
2038	kmutex_t *hash_lock;
2039
2040	/*
2041	 * Check to see if this buffer is evicted.  Callers
2042	 * must verify b_data != NULL to know if the add_ref
2043	 * was successful.
2044	 */
2045	mutex_enter(&buf->b_evict_lock);
2046	if (buf->b_data == NULL) {
2047		mutex_exit(&buf->b_evict_lock);
2048		return;
2049	}
2050	hash_lock = HDR_LOCK(buf->b_hdr);
2051	mutex_enter(hash_lock);
2052	hdr = buf->b_hdr;
2053	ASSERT(HDR_HAS_L1HDR(hdr));
2054	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2055	mutex_exit(&buf->b_evict_lock);
2056
2057	ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
2058	    hdr->b_l1hdr.b_state == arc_mfu);
2059
2060	add_reference(hdr, hash_lock, tag);
2061	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2062	arc_access(hdr, hash_lock);
2063	mutex_exit(hash_lock);
2064	ARCSTAT_BUMP(arcstat_hits);
2065	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
2066	    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
2067	    data, metadata, hits);
2068}
2069
2070static void
2071arc_buf_free_on_write(void *data, size_t size,
2072    void (*free_func)(void *, size_t))
2073{
2074	l2arc_data_free_t *df;
2075
2076	df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
2077	df->l2df_data = data;
2078	df->l2df_size = size;
2079	df->l2df_func = free_func;
2080	mutex_enter(&l2arc_free_on_write_mtx);
2081	list_insert_head(l2arc_free_on_write, df);
2082	mutex_exit(&l2arc_free_on_write_mtx);
2083}
2084
2085/*
2086 * Free the arc data buffer.  If it is an l2arc write in progress,
2087 * the buffer is placed on l2arc_free_on_write to be freed later.
2088 */
2089static void
2090arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
2091{
2092	arc_buf_hdr_t *hdr = buf->b_hdr;
2093
2094	if (HDR_L2_WRITING(hdr)) {
2095		arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
2096		ARCSTAT_BUMP(arcstat_l2_free_on_write);
2097	} else {
2098		free_func(buf->b_data, hdr->b_size);
2099	}
2100}
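
/*
 * Illustrative sketch: the free-on-write pattern used by
 * arc_buf_data_free() above -- while an L2ARC write still references the
 * data, its free is queued and performed later rather than immediately.
 * The "example_" model below is hypothetical; error handling is omitted.
 */
#if 0	/* illustration only, never compiled */
#include <stdlib.h>

typedef struct example_deferred {
	void			*data;
	size_t			size;
	struct example_deferred	*next;
} example_deferred_t;

static example_deferred_t *example_free_list;

static void
example_data_free(void *data, size_t size, int write_in_progress,
    void (*free_func)(void *, size_t))
{
	if (write_in_progress) {
		/* Defer: the in-flight write still owns a reference. */
		example_deferred_t *d = malloc(sizeof (*d));
		d->data = data;
		d->size = size;
		d->next = example_free_list;
		example_free_list = d;
	} else {
		free_func(data, size);
	}
}

/* Later, once the write has completed, drain the deferred frees. */
static void
example_drain(void (*free_func)(void *, size_t))
{
	while (example_free_list != NULL) {
		example_deferred_t *d = example_free_list;
		example_free_list = d->next;
		free_func(d->data, d->size);
		free(d);
	}
}
#endif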
2101
2102/*
2103 * Free up buf->b_data and, if 'remove' is set, pull the
2104 * arc_buf_t off of the arc_buf_hdr_t's list and free it.
2105 */
2106static void
2107arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
2108{
2109	ASSERT(HDR_HAS_L2HDR(hdr));
2110	ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));
2111
2112	/*
2113	 * The b_tmp_cdata field is linked off of the b_l1hdr, so if
2114	 * that doesn't exist, the header is in the arc_l2c_only state,
2115	 * and there isn't anything to free (it's already been freed).
2116	 */
2117	if (!HDR_HAS_L1HDR(hdr))
2118		return;
2119
2120	if (hdr->b_l1hdr.b_tmp_cdata == NULL)
2121		return;
2122
2123	ASSERT(HDR_L2_WRITING(hdr));
2124	arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, hdr->b_size,
2125	    zio_data_buf_free);
2126
2127	ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
2128	hdr->b_l1hdr.b_tmp_cdata = NULL;
2129}
2130
2131static void
2132arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove)
2133{
2134	arc_buf_t **bufp;
2135
2136	/* free up data associated with the buf */
2137	if (buf->b_data != NULL) {
2138		arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
2139		uint64_t size = buf->b_hdr->b_size;
2140		arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
2141
2142		arc_cksum_verify(buf);
2143#ifdef illumos
2144		arc_buf_unwatch(buf);
2145#endif
2146
2147		if (!recycle) {
2148			if (type == ARC_BUFC_METADATA) {
2149				arc_buf_data_free(buf, zio_buf_free);
2150				arc_space_return(size, ARC_SPACE_META);
2151			} else {
2152				ASSERT(type == ARC_BUFC_DATA);
2153				arc_buf_data_free(buf, zio_data_buf_free);
2154				arc_space_return(size, ARC_SPACE_DATA);
2155			}
2156		}
2157		if (list_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
2158			uint64_t *cnt = &state->arcs_lsize[type];
2159
2160			ASSERT(refcount_is_zero(
2161			    &buf->b_hdr->b_l1hdr.b_refcnt));
2162			ASSERT(state != arc_anon && state != arc_l2c_only);
2163
2164			ASSERT3U(*cnt, >=, size);
2165			atomic_add_64(cnt, -size);
2166		}
2167		ASSERT3U(state->arcs_size, >=, size);
2168		atomic_add_64(&state->arcs_size, -size);
2169		buf->b_data = NULL;
2170
2171		/*
2172		 * If we're destroying a duplicate buffer, make sure
2173		 * that the appropriate statistics are updated.
2174		 */
2175		if (buf->b_hdr->b_l1hdr.b_datacnt > 1 &&
2176		    HDR_ISTYPE_DATA(buf->b_hdr)) {
2177			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
2178			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
2179		}
2180		ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0);
2181		buf->b_hdr->b_l1hdr.b_datacnt -= 1;
2182	}
2183
2184	/* only remove the buf if requested */
2185	if (!remove)
2186		return;
2187
2188	/* remove the buf from the hdr list */
2189	for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf;
2190	    bufp = &(*bufp)->b_next)
2191		continue;
2192	*bufp = buf->b_next;
2193	buf->b_next = NULL;
2194
2195	ASSERT(buf->b_efunc == NULL);
2196
2197	/* clean up the buf */
2198	buf->b_hdr = NULL;
2199	kmem_cache_free(buf_cache, buf);
2200}
2201
2202static void
2203arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
2204{
2205	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
2206	l2arc_dev_t *dev = l2hdr->b_dev;
2207
2208	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
2209	ASSERT(HDR_HAS_L2HDR(hdr));
2210
2211	list_remove(&dev->l2ad_buflist, hdr);
2212
2213	/*
2214	 * We don't want to leak the b_tmp_cdata buffer that was
2215	 * allocated in l2arc_write_buffers()
2216	 */
2217	arc_buf_l2_cdata_free(hdr);
2218
2219	/*
2220	 * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then
2221	 * this header is being processed by l2arc_write_buffers() (i.e.
2222	 * it's in the first stage of l2arc_write_buffers()).
2223	 * Re-affirming that truth here, just to serve as a reminder. If
2224	 * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or
2225	 * may not have its HDR_L2_WRITING flag set. (the write may have
2226	 * completed, in which case HDR_L2_WRITING will be false and the
2227	 * b_daddr field will point to the address of the buffer on disk).
2228	 */
2229	IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr));
2230
2231	/*
2232	 * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with
2233	 * l2arc_write_buffers(). Since we've just removed this header
2234	 * from the l2arc buffer list, this header will never reach the
2235	 * second stage of l2arc_write_buffers(), which increments the
2236	 * accounting stats for this header. Thus, we must be careful
2237	 * not to decrement them for this header either.
2238	 */
2239	if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) {
2240		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
2241		ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
2242
2243		vdev_space_update(dev->l2ad_vdev,
2244		    -l2hdr->b_asize, 0, 0);
2245
2246		(void) refcount_remove_many(&dev->l2ad_alloc,
2247		    l2hdr->b_asize, hdr);
2248	}
2249
2250	hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
2251}
2252
2253static void
2254arc_hdr_destroy(arc_buf_hdr_t *hdr)
2255{
2256	if (HDR_HAS_L1HDR(hdr)) {
2257		ASSERT(hdr->b_l1hdr.b_buf == NULL ||
2258		    hdr->b_l1hdr.b_datacnt > 0);
2259		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2260		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
2261	}
2262	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2263	ASSERT(!HDR_IN_HASH_TABLE(hdr));
2264
2265	if (HDR_HAS_L2HDR(hdr)) {
2266		l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
2267		boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
2268
2269		if (!buflist_held)
2270			mutex_enter(&dev->l2ad_mtx);
2271
2272		/*
2273		 * Even though we checked this conditional above, we
2274		 * need to check this again now that we have the
2275		 * l2ad_mtx. This is because we could be racing with
2276		 * another thread calling l2arc_evict() which might have
2277		 * destroyed this header's L2 portion as we were waiting
2278		 * to acquire the l2ad_mtx. If that happens, we don't
2279		 * want to re-destroy the header's L2 portion.
2280		 */
2281		if (HDR_HAS_L2HDR(hdr)) {
2282			if (hdr->b_l2hdr.b_daddr != L2ARC_ADDR_UNSET)
2283				trim_map_free(dev->l2ad_vdev,
2284				    hdr->b_l2hdr.b_daddr,
2285				    hdr->b_l2hdr.b_asize, 0);
2286			arc_hdr_l2hdr_destroy(hdr);
2287		}
2288
2289		if (!buflist_held)
2290			mutex_exit(&dev->l2ad_mtx);
2291	}
2292
2293	if (!BUF_EMPTY(hdr))
2294		buf_discard_identity(hdr);
2295	if (hdr->b_freeze_cksum != NULL) {
2296		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
2297		hdr->b_freeze_cksum = NULL;
2298	}
2299
2300	if (HDR_HAS_L1HDR(hdr)) {
2301		while (hdr->b_l1hdr.b_buf) {
2302			arc_buf_t *buf = hdr->b_l1hdr.b_buf;
2303
2304			if (buf->b_efunc != NULL) {
2305				mutex_enter(&arc_eviction_mtx);
2306				mutex_enter(&buf->b_evict_lock);
2307				ASSERT(buf->b_hdr != NULL);
2308				arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
2309				    FALSE);
2310				hdr->b_l1hdr.b_buf = buf->b_next;
2311				buf->b_hdr = &arc_eviction_hdr;
2312				buf->b_next = arc_eviction_list;
2313				arc_eviction_list = buf;
2314				mutex_exit(&buf->b_evict_lock);
2315				mutex_exit(&arc_eviction_mtx);
2316			} else {
2317				arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE,
2318				    TRUE);
2319			}
2320		}
2321#ifdef ZFS_DEBUG
2322		if (hdr->b_l1hdr.b_thawed != NULL) {
2323			kmem_free(hdr->b_l1hdr.b_thawed, 1);
2324			hdr->b_l1hdr.b_thawed = NULL;
2325		}
2326#endif
2327	}
2328
2329	ASSERT3P(hdr->b_hash_next, ==, NULL);
2330	if (HDR_HAS_L1HDR(hdr)) {
2331		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
2332		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
2333		kmem_cache_free(hdr_full_cache, hdr);
2334	} else {
2335		kmem_cache_free(hdr_l2only_cache, hdr);
2336	}
2337}
2338
2339void
2340arc_buf_free(arc_buf_t *buf, void *tag)
2341{
2342	arc_buf_hdr_t *hdr = buf->b_hdr;
2343	int hashed = hdr->b_l1hdr.b_state != arc_anon;
2344
2345	ASSERT(buf->b_efunc == NULL);
2346	ASSERT(buf->b_data != NULL);
2347
2348	if (hashed) {
2349		kmutex_t *hash_lock = HDR_LOCK(hdr);
2350
2351		mutex_enter(hash_lock);
2352		hdr = buf->b_hdr;
2353		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2354
2355		(void) remove_reference(hdr, hash_lock, tag);
2356		if (hdr->b_l1hdr.b_datacnt > 1) {
2357			arc_buf_destroy(buf, FALSE, TRUE);
2358		} else {
2359			ASSERT(buf == hdr->b_l1hdr.b_buf);
2360			ASSERT(buf->b_efunc == NULL);
2361			hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
2362		}
2363		mutex_exit(hash_lock);
2364	} else if (HDR_IO_IN_PROGRESS(hdr)) {
2365		int destroy_hdr;
2366		/*
2367		 * We are in the middle of an async write.  Don't destroy
2368		 * this buffer unless the write completes before we finish
2369		 * decrementing the reference count.
2370		 */
2371		mutex_enter(&arc_eviction_mtx);
2372		(void) remove_reference(hdr, NULL, tag);
2373		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2374		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
2375		mutex_exit(&arc_eviction_mtx);
2376		if (destroy_hdr)
2377			arc_hdr_destroy(hdr);
2378	} else {
2379		if (remove_reference(hdr, NULL, tag) > 0)
2380			arc_buf_destroy(buf, FALSE, TRUE);
2381		else
2382			arc_hdr_destroy(hdr);
2383	}
2384}
2385
2386boolean_t
2387arc_buf_remove_ref(arc_buf_t *buf, void* tag)
2388{
2389	arc_buf_hdr_t *hdr = buf->b_hdr;
2390	kmutex_t *hash_lock = HDR_LOCK(hdr);
2391	boolean_t no_callback = (buf->b_efunc == NULL);
2392
2393	if (hdr->b_l1hdr.b_state == arc_anon) {
2394		ASSERT(hdr->b_l1hdr.b_datacnt == 1);
2395		arc_buf_free(buf, tag);
2396		return (no_callback);
2397	}
2398
2399	mutex_enter(hash_lock);
2400	hdr = buf->b_hdr;
2401	ASSERT(hdr->b_l1hdr.b_datacnt > 0);
2402	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2403	ASSERT(hdr->b_l1hdr.b_state != arc_anon);
2404	ASSERT(buf->b_data != NULL);
2405
2406	(void) remove_reference(hdr, hash_lock, tag);
2407	if (hdr->b_l1hdr.b_datacnt > 1) {
2408		if (no_callback)
2409			arc_buf_destroy(buf, FALSE, TRUE);
2410	} else if (no_callback) {
2411		ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
2412		ASSERT(buf->b_efunc == NULL);
2413		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
2414	}
2415	ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 ||
2416	    refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2417	mutex_exit(hash_lock);
2418	return (no_callback);
2419}
2420
2421int32_t
2422arc_buf_size(arc_buf_t *buf)
2423{
2424	return (buf->b_hdr->b_size);
2425}
2426
2427/*
2428 * Called from the DMU to determine if the current buffer should be
2429 * evicted. In order to ensure proper locking, the eviction must be initiated
2430 * from the DMU. Return true if the buffer is associated with user data and
2431 * duplicate buffers still exist.
2432 */
2433boolean_t
2434arc_buf_eviction_needed(arc_buf_t *buf)
2435{
2436	arc_buf_hdr_t *hdr;
2437	boolean_t evict_needed = B_FALSE;
2438
2439	if (zfs_disable_dup_eviction)
2440		return (B_FALSE);
2441
2442	mutex_enter(&buf->b_evict_lock);
2443	hdr = buf->b_hdr;
2444	if (hdr == NULL) {
2445		/*
2446		 * We are in arc_do_user_evicts(); let that function
2447		 * perform the eviction.
2448		 */
2449		ASSERT(buf->b_data == NULL);
2450		mutex_exit(&buf->b_evict_lock);
2451		return (B_FALSE);
2452	} else if (buf->b_data == NULL) {
2453		/*
2454		 * We have already been added to the arc eviction list;
2455		 * recommend eviction.
2456		 */
2457		ASSERT3P(hdr, ==, &arc_eviction_hdr);
2458		mutex_exit(&buf->b_evict_lock);
2459		return (B_TRUE);
2460	}
2461
2462	if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
2463		evict_needed = B_TRUE;
2464
2465	mutex_exit(&buf->b_evict_lock);
2466	return (evict_needed);
2467}
2468
2469/*
2470 * Evict buffers from list until we've removed the specified number of
2471 * bytes.  Move the removed buffers to the appropriate evict state.
2472 * If the recycle flag is set, then attempt to "recycle" a buffer:
2473 * - look for a buffer to evict that is `bytes' long.
2474 * - return the data block from this buffer rather than freeing it.
2475 * This flag is used by callers that are trying to make space for a
2476 * new buffer in a full arc cache.
2477 *
2478 * This function makes a "best effort".  It skips over any buffers
2479 * it can't get a hash_lock on, and so may not catch all candidates.
2480 * It may also return without evicting as much space as requested.
2481 */
2482static void *
2483arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
2484    arc_buf_contents_t type)
2485{
2486	arc_state_t *evicted_state;
2487	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
2488	arc_buf_hdr_t *hdr, *hdr_prev = NULL;
2489	kmutex_t *hash_lock;
2490	boolean_t have_lock;
2491	void *stolen = NULL;
2492	arc_buf_hdr_t marker = { 0 };
2493	int count = 0;
2494
2495	ASSERT(state == arc_mru || state == arc_mfu);
2496
2497	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2498
2499	/*
2500	 * The ghost list lock must be acquired first in order to prevent
2501	 * a three-party deadlock:
2502	 *
2503	 *  - arc_evict_ghost acquires arc_*_ghost->arcs_mtx, followed by
2504	 *    l2ad_mtx in arc_hdr_realloc
2505	 *  - l2arc_write_buffers acquires l2ad_mtx, followed by arc_*->arcs_mtx
2506	 *  - arc_evict acquires arc_*->arcs_mtx, followed by
2507	 *    arc_*_ghost->arcs_mtx and forms a deadlock cycle.
2508	 *
2509	 * This situation is avoided by acquiring the ghost list lock first.
2510	 */
2511	mutex_enter(&evicted_state->arcs_mtx);
2512	mutex_enter(&state->arcs_mtx);
2513
2514	/*
2515	 * Decide which "type" (data vs metadata) to recycle from.
2516	 *
2517	 * If we are over the metadata limit, recycle from metadata.
2518	 * If we are under the metadata minimum, recycle from data.
2519	 * Otherwise, recycle from whichever type has the oldest (least
2520	 * recently accessed) header.
2521	 */
2522	if (recycle) {
2523		arc_buf_hdr_t *data_hdr =
2524		    list_tail(&state->arcs_list[ARC_BUFC_DATA]);
2525		arc_buf_hdr_t *metadata_hdr =
2526		    list_tail(&state->arcs_list[ARC_BUFC_METADATA]);
2527		arc_buf_contents_t realtype;
2528
2529		if (data_hdr == NULL) {
2530			realtype = ARC_BUFC_METADATA;
2531		} else if (metadata_hdr == NULL) {
2532			realtype = ARC_BUFC_DATA;
2533		} else if (arc_meta_used >= arc_meta_limit) {
2534			realtype = ARC_BUFC_METADATA;
2535		} else if (arc_meta_used <= arc_meta_min) {
2536			realtype = ARC_BUFC_DATA;
2537		} else if (HDR_HAS_L1HDR(data_hdr) &&
2538		    HDR_HAS_L1HDR(metadata_hdr) &&
2539		    data_hdr->b_l1hdr.b_arc_access <
2540		    metadata_hdr->b_l1hdr.b_arc_access) {
2541			realtype = ARC_BUFC_DATA;
2542		} else {
2543			realtype = ARC_BUFC_METADATA;
2544		}
2545		if (realtype != type) {
2546			/*
2547			 * If we want to evict from a different list,
2548			 * we cannot recycle, because DATA vs METADATA
2549			 * buffers are segregated into different kmem
2550			 * caches (and vmem arenas).
2551			 */
2552			type = realtype;
2553			recycle = B_FALSE;
2554		}
2555	}
2556
2557	list_t *list = &state->arcs_list[type];
2558
2559	for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
2560		hdr_prev = list_prev(list, hdr);
2561		/* prefetch buffers have a minimum lifespan */
2562		if (HDR_IO_IN_PROGRESS(hdr) ||
2563		    (spa && hdr->b_spa != spa) ||
2564		    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
2565		    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
2566		    arc_min_prefetch_lifespan)) {
2567			skipped++;
2568			continue;
2569		}
2570		/* "lookahead" for better eviction candidate */
2571		if (recycle && hdr->b_size != bytes &&
2572		    hdr_prev && hdr_prev->b_size == bytes)
2573			continue;
2574
2575		/* ignore markers */
2576		if (hdr->b_spa == 0)
2577			continue;
2578
2579		/*
2580		 * It may take a long time to evict all the bufs requested.
2581		 * To avoid blocking all arc activity, periodically drop
2582		 * the arcs_mtx and give other threads a chance to run
2583		 * before reacquiring the lock.
2584		 *
2585		 * If we are looking for a buffer to recycle, we are in
2586		 * the hot code path, so don't sleep.
2587		 */
2588		if (!recycle && count++ > arc_evict_iterations) {
2589			list_insert_after(list, hdr, &marker);
2590			mutex_exit(&state->arcs_mtx);
2591			mutex_exit(&evicted_state->arcs_mtx);
2592			kpreempt(KPREEMPT_SYNC);
2593			mutex_enter(&evicted_state->arcs_mtx);
2594			mutex_enter(&state->arcs_mtx);
2595			hdr_prev = list_prev(list, &marker);
2596			list_remove(list, &marker);
2597			count = 0;
2598			continue;
2599		}
2600
2601		hash_lock = HDR_LOCK(hdr);
2602		have_lock = MUTEX_HELD(hash_lock);
2603		if (have_lock || mutex_tryenter(hash_lock)) {
2604			ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
2605			ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
2606			while (hdr->b_l1hdr.b_buf) {
2607				arc_buf_t *buf = hdr->b_l1hdr.b_buf;
2608				if (!mutex_tryenter(&buf->b_evict_lock)) {
2609					missed += 1;
2610					break;
2611				}
2612				if (buf->b_data != NULL) {
2613					bytes_evicted += hdr->b_size;
2614					if (recycle &&
2615					    arc_buf_type(hdr) == type &&
2616					    hdr->b_size == bytes &&
2617					    !HDR_L2_WRITING(hdr)) {
2618						stolen = buf->b_data;
2619						recycle = FALSE;
2620					}
2621				}
2622				if (buf->b_efunc != NULL) {
2623					mutex_enter(&arc_eviction_mtx);
2624					arc_buf_destroy(buf,
2625					    buf->b_data == stolen, FALSE);
2626					hdr->b_l1hdr.b_buf = buf->b_next;
2627					buf->b_hdr = &arc_eviction_hdr;
2628					buf->b_next = arc_eviction_list;
2629					arc_eviction_list = buf;
2630					mutex_exit(&arc_eviction_mtx);
2631					mutex_exit(&buf->b_evict_lock);
2632				} else {
2633					mutex_exit(&buf->b_evict_lock);
2634					arc_buf_destroy(buf,
2635					    buf->b_data == stolen, TRUE);
2636				}
2637			}
2638
2639			if (HDR_HAS_L2HDR(hdr)) {
2640				ARCSTAT_INCR(arcstat_evict_l2_cached,
2641				    hdr->b_size);
2642			} else {
2643				if (l2arc_write_eligible(hdr->b_spa, hdr)) {
2644					ARCSTAT_INCR(arcstat_evict_l2_eligible,
2645					    hdr->b_size);
2646				} else {
2647					ARCSTAT_INCR(
2648					    arcstat_evict_l2_ineligible,
2649					    hdr->b_size);
2650				}
2651			}
2652
2653			if (hdr->b_l1hdr.b_datacnt == 0) {
2654				arc_change_state(evicted_state, hdr, hash_lock);
2655				ASSERT(HDR_IN_HASH_TABLE(hdr));
2656				hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
2657				hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
2658				DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
2659			}
2660			if (!have_lock)
2661				mutex_exit(hash_lock);
2662			if (bytes >= 0 && bytes_evicted >= bytes)
2663				break;
2664		} else {
2665			missed += 1;
2666		}
2667	}
2668
2669	mutex_exit(&state->arcs_mtx);
2670	mutex_exit(&evicted_state->arcs_mtx);
2671
2672	if (bytes_evicted < bytes)
2673		dprintf("only evicted %lld bytes from %p",
2674		    (longlong_t)bytes_evicted, state);
2675
2676	if (skipped)
2677		ARCSTAT_INCR(arcstat_evict_skip, skipped);
2678
2679	if (missed)
2680		ARCSTAT_INCR(arcstat_mutex_miss, missed);
2681
2682	/*
2683	 * Note: we have just evicted some data into the ghost state,
2684	 * potentially putting the ghost size over the desired size.  Rather
2685	 * than evicting from the ghost list in this hot code path, leave
2686	 * this chore to the arc_reclaim_thread().
2687	 */
2688
2689	return (stolen);
2690}
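
/*
 * Illustrative note on the lock ordering in arc_evict() above: if it took
 * the regular arcs_mtx before the ghost list's arcs_mtx, the three chains
 *
 *	arc_evict_ghost():	ghost arcs_mtx -> l2ad_mtx
 *	l2arc_write_buffers():	l2ad_mtx       -> arcs_mtx
 *	arc_evict():		arcs_mtx       -> ghost arcs_mtx
 *
 * would close a cycle; taking the ghost list lock first breaks it.
 */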
2691
2692/*
2693 * Remove buffers from list until we've removed the specified number of
2694 * bytes.  Destroy the buffers that are removed.
2695 */
2696static void
2697arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
2698{
2699	arc_buf_hdr_t *hdr, *hdr_prev;
2700	arc_buf_hdr_t marker = { 0 };
2701	list_t *list = &state->arcs_list[ARC_BUFC_DATA];
2702	kmutex_t *hash_lock;
2703	uint64_t bytes_deleted = 0;
2704	uint64_t bufs_skipped = 0;
2705	int count = 0;
2706
2707	ASSERT(GHOST_STATE(state));
2708top:
2709	mutex_enter(&state->arcs_mtx);
2710	for (hdr = list_tail(list); hdr; hdr = hdr_prev) {
2711		hdr_prev = list_prev(list, hdr);
2712		if (arc_buf_type(hdr) >= ARC_BUFC_NUMTYPES)
2713			panic("invalid hdr=%p", (void *)hdr);
2714		if (spa && hdr->b_spa != spa)
2715			continue;
2716
2717		/* ignore markers */
2718		if (hdr->b_spa == 0)
2719			continue;
2720
2721		hash_lock = HDR_LOCK(hdr);
2722		/* caller may be trying to modify this buffer, skip it */
2723		if (MUTEX_HELD(hash_lock))
2724			continue;
2725
2726		/*
2727		 * It may take a long time to evict all the bufs requested.
2728		 * To avoid blocking all arc activity, periodically drop
2729		 * the arcs_mtx and give other threads a chance to run
2730		 * before reacquiring the lock.
2731		 */
2732		if (count++ > arc_evict_iterations) {
2733			list_insert_after(list, hdr, &marker);
2734			mutex_exit(&state->arcs_mtx);
2735			kpreempt(KPREEMPT_SYNC);
2736			mutex_enter(&state->arcs_mtx);
2737			hdr_prev = list_prev(list, &marker);
2738			list_remove(list, &marker);
2739			count = 0;
2740			continue;
2741		}
2742		if (mutex_tryenter(hash_lock)) {
2743			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2744			ASSERT(!HDR_HAS_L1HDR(hdr) ||
2745			    hdr->b_l1hdr.b_buf == NULL);
2746			ARCSTAT_BUMP(arcstat_deleted);
2747			bytes_deleted += hdr->b_size;
2748
2749			if (HDR_HAS_L2HDR(hdr)) {
2750				/*
2751				 * This buffer is cached on the 2nd Level ARC;
2752				 * don't destroy the header.
2753				 */
2754				arc_change_state(arc_l2c_only, hdr, hash_lock);
2755				/*
2756				 * dropping from L1+L2 cached to L2-only,
2757				 * realloc to remove the L1 header.
2758				 */
2759				hdr = arc_hdr_realloc(hdr, hdr_full_cache,
2760				    hdr_l2only_cache);
2761				mutex_exit(hash_lock);
2762			} else {
2763				arc_change_state(arc_anon, hdr, hash_lock);
2764				mutex_exit(hash_lock);
2765				arc_hdr_destroy(hdr);
2766			}
2767
2768			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
2769			if (bytes >= 0 && bytes_deleted >= bytes)
2770				break;
2771		} else if (bytes < 0) {
2772			/*
2773			 * Insert a list marker and then wait for the
2774			 * hash lock to become available. Once it's
2775			 * available, restart from where we left off.
2776			 */
2777			list_insert_after(list, hdr, &marker);
2778			mutex_exit(&state->arcs_mtx);
2779			mutex_enter(hash_lock);
2780			mutex_exit(hash_lock);
2781			mutex_enter(&state->arcs_mtx);
2782			hdr_prev = list_prev(list, &marker);
2783			list_remove(list, &marker);
2784		} else {
2785			bufs_skipped += 1;
2786		}
2787
2788	}
2789	mutex_exit(&state->arcs_mtx);
2790
2791	if (list == &state->arcs_list[ARC_BUFC_DATA] &&
2792	    (bytes < 0 || bytes_deleted < bytes)) {
2793		list = &state->arcs_list[ARC_BUFC_METADATA];
2794		goto top;
2795	}
2796
2797	if (bufs_skipped) {
2798		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
2799		ASSERT(bytes >= 0);
2800	}
2801
2802	if (bytes_deleted < bytes)
2803		dprintf("only deleted %lld bytes from %p",
2804		    (longlong_t)bytes_deleted, state);
2805}
2806
2807static void
2808arc_adjust(void)
2809{
2810	int64_t adjustment, delta;
2811
2812	/*
2813	 * Adjust MRU size
2814	 */
2815
2816	adjustment = MIN((int64_t)(arc_size - arc_c),
2817	    (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
2818	    arc_p));
2819
2820	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
2821		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
2822		(void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
2823		adjustment -= delta;
2824	}
2825
2826	if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2827		delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
2828		(void) arc_evict(arc_mru, 0, delta, FALSE,
2829		    ARC_BUFC_METADATA);
2830	}
2831
2832	/*
2833	 * Adjust MFU size
2834	 */
2835
2836	adjustment = arc_size - arc_c;
2837
2838	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
2839		delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
2840		(void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
2841		adjustment -= delta;
2842	}
2843
2844	if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
2845		int64_t delta = MIN(adjustment,
2846		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
2847		(void) arc_evict(arc_mfu, 0, delta, FALSE,
2848		    ARC_BUFC_METADATA);
2849	}
2850
2851	/*
2852	 * Adjust ghost lists
2853	 */
2854
2855	adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
2856
2857	if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
2858		delta = MIN(arc_mru_ghost->arcs_size, adjustment);
2859		arc_evict_ghost(arc_mru_ghost, 0, delta);
2860	}
2861
2862	adjustment =
2863	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
2864
2865	if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
2866		delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
2867		arc_evict_ghost(arc_mfu_ghost, 0, delta);
2868	}
2869}
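
/*
 * Worked example for the MRU pass in arc_adjust() above, with hypothetical
 * sizes: if arc_size = 10 GB, arc_c = 8 GB, arc_anon + arc_mru +
 * arc_meta_used = 6 GB and arc_p = 4 GB, then
 *
 *	adjustment = MIN(10 GB - 8 GB, 6 GB - 4 GB) = 2 GB,
 *
 * so up to 2 GB is evicted from the MRU list, data first and then
 * metadata, before the MFU and ghost lists are considered.
 */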
2870
2871static void
2872arc_do_user_evicts(void)
2873{
2874	mutex_enter(&arc_eviction_mtx);
2875	while (arc_eviction_list != NULL) {
2876		arc_buf_t *buf = arc_eviction_list;
2877		arc_eviction_list = buf->b_next;
2878		mutex_enter(&buf->b_evict_lock);
2879		buf->b_hdr = NULL;
2880		mutex_exit(&buf->b_evict_lock);
2881		mutex_exit(&arc_eviction_mtx);
2882
2883		if (buf->b_efunc != NULL)
2884			VERIFY0(buf->b_efunc(buf->b_private));
2885
2886		buf->b_efunc = NULL;
2887		buf->b_private = NULL;
2888		kmem_cache_free(buf_cache, buf);
2889		mutex_enter(&arc_eviction_mtx);
2890	}
2891	mutex_exit(&arc_eviction_mtx);
2892}
2893
2894/*
2895 * Flush all *evictable* data from the cache for the given spa.
2896 * NOTE: this will not touch "active" (i.e. referenced) data.
2897 */
2898void
2899arc_flush(spa_t *spa)
2900{
2901	uint64_t guid = 0;
2902
2903	if (spa != NULL)
2904		guid = spa_load_guid(spa);
2905
2906	while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
2907		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
2908		if (spa != NULL)
2909			break;
2910	}
2911	while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
2912		(void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
2913		if (spa != NULL)
2914			break;
2915	}
2916	while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
2917		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
2918		if (spa != NULL)
2919			break;
2920	}
2921	while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
2922		(void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
2923		if (spa != NULL)
2924			break;
2925	}
2926
2927	arc_evict_ghost(arc_mru_ghost, guid, -1);
2928	arc_evict_ghost(arc_mfu_ghost, guid, -1);
2929
2930	mutex_enter(&arc_reclaim_thr_lock);
2931	arc_do_user_evicts();
2932	mutex_exit(&arc_reclaim_thr_lock);
2933	ASSERT(spa || arc_eviction_list == NULL);
2934}
2935
2936void
2937arc_shrink(int64_t to_free)
2938{
2939	if (arc_c > arc_c_min) {
2940		DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
2941			arc_c_min, uint64_t, arc_p, uint64_t, to_free);
2942		if (arc_c > arc_c_min + to_free)
2943			atomic_add_64(&arc_c, -to_free);
2944		else
2945			arc_c = arc_c_min;
2946
2947		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
2948		if (arc_c > arc_size)
2949			arc_c = MAX(arc_size, arc_c_min);
2950		if (arc_p > arc_c)
2951			arc_p = (arc_c >> 1);
2952
2953		DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
2954			arc_p);
2955
2956		ASSERT(arc_c >= arc_c_min);
2957		ASSERT((int64_t)arc_p >= 0);
2958	}
2959
2960	if (arc_size > arc_c) {
2961		DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
2962			uint64_t, arc_c);
2963		arc_adjust();
2964	}
2965}
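
/*
 * Worked example for arc_shrink() above, with hypothetical values: if
 * arc_c = 8 GB, arc_c_min = 1 GB, arc_p = 4 GB, arc_shrink_shift = 7 and
 * to_free = 1 GB, then arc_c drops to 7 GB and arc_p is reduced by
 * arc_p >> 7 (32 MB); if arc_size still exceeds the new arc_c,
 * arc_adjust() evicts the excess.
 */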
2966
2967static long needfree = 0;
2968
2969typedef enum free_memory_reason_t {
2970	FMR_UNKNOWN,
2971	FMR_NEEDFREE,
2972	FMR_LOTSFREE,
2973	FMR_SWAPFS_MINFREE,
2974	FMR_PAGES_PP_MAXIMUM,
2975	FMR_HEAP_ARENA,
2976	FMR_ZIO_ARENA,
2977	FMR_ZIO_FRAG,
2978} free_memory_reason_t;
2979
2980int64_t last_free_memory;
2981free_memory_reason_t last_free_reason;
2982
2983/*
2984 * Additional reserve of pages for pp_reserve.
2985 */
2986int64_t arc_pages_pp_reserve = 64;
2987
2988/*
2989 * Additional reserve of pages for swapfs.
2990 */
2991int64_t arc_swapfs_reserve = 64;
2992
2993/*
2994 * Return the amount of memory that can be consumed before reclaim will be
2995 * needed.  Positive if there is sufficient free memory, negative indicates
2996 * the amount of memory that needs to be freed up.
2997 */
2998static int64_t
2999arc_available_memory(void)
3000{
3001	int64_t lowest = INT64_MAX;
3002	int64_t n;
3003	free_memory_reason_t r = FMR_UNKNOWN;
3004
3005#ifdef _KERNEL
3006	if (needfree > 0) {
3007		n = PAGESIZE * (-needfree);
3008		if (n < lowest) {
3009			lowest = n;
3010			r = FMR_NEEDFREE;
3011		}
3012	}
3013
3014	/*
3015	 * Cooperate with pagedaemon when it's time for it to scan
3016	 * and reclaim some pages.
3017	 */
3018	n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
3019	if (n < lowest) {
3020		lowest = n;
3021		r = FMR_LOTSFREE;
3022	}
3023
3024#ifdef illumos
3025	/*
3026	 * check that we're out of range of the pageout scanner.  It starts to
3027	 * schedule paging if freemem is less than lotsfree and needfree.
3028	 * lotsfree is the high-water mark for pageout, and needfree is the
3029	 * number of needed free pages.  We add extra pages here to make sure
3030	 * the scanner doesn't start up while we're freeing memory.
3031	 */
3032	n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
3033	if (n < lowest) {
3034		lowest = n;
3035		r = FMR_LOTSFREE;
3036	}
3037
3038	/*
3039	 * check to make sure that swapfs has enough space so that anon
3040	 * reservations can still succeed. anon_resvmem() checks that the
3041	 * availrmem is greater than swapfs_minfree, and the number of reserved
3042	 * swap pages.  We also add a bit of extra here just to prevent
3043	 * circumstances from getting really dire.
3044	 */
3045	n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
3046	    desfree - arc_swapfs_reserve);
3047	if (n < lowest) {
3048		lowest = n;
3049		r = FMR_SWAPFS_MINFREE;
3050	}
3051
3052
3053	/*
3054	 * Check that we have enough availrmem that memory locking (e.g., via
3055	 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
3056	 * stores the number of pages that cannot be locked; when availrmem
3057	 * drops below pages_pp_maximum, page locking mechanisms such as
3058	 * page_pp_lock() will fail.)
3059	 */
3060	n = PAGESIZE * (availrmem - pages_pp_maximum -
3061	    arc_pages_pp_reserve);
3062	if (n < lowest) {
3063		lowest = n;
3064		r = FMR_PAGES_PP_MAXIMUM;
3065	}
3066
3067#endif	/* illumos */
3068#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
3069	/*
3070	 * If we're on an i386 platform, it's possible that we'll exhaust the
3071	 * kernel heap space before we ever run out of available physical
3072	 * memory.  Most checks of the size of the heap_area compare against
3073	 * tune.t_minarmem, which is the minimum available real memory that we
3074	 * can have in the system.  However, this is generally fixed at 25 pages
3075	 * which is so low that it's useless.  In this comparison, we seek to
3076	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
3077	 * heap is allocated (or, equivalently, if less than 1/4th of the heap
3078	 * is free).
3079	 */
3080	n = (int64_t)vmem_size(heap_arena, VMEM_FREE) -
3081	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
3082	if (n < lowest) {
3083		lowest = n;
3084		r = FMR_HEAP_ARENA;
3085	}
3086#define	zio_arena	NULL
3087#else
3088#define	zio_arena	heap_arena
3089#endif
3090
3091	/*
3092	 * If zio data pages are being allocated out of a separate heap segment,
3093	 * then enforce that the size of available vmem for this arena remains
3094	 * above about 1/16th free.
3095	 *
3096	 * Note: The 1/16th arena free requirement was put in place
3097	 * to aggressively evict memory from the arc in order to avoid
3098	 * memory fragmentation issues.
3099	 */
3100	if (zio_arena != NULL) {
3101		n = (int64_t)vmem_size(zio_arena, VMEM_FREE) -
3102		    (vmem_size(zio_arena, VMEM_ALLOC) >> 4);
3103		if (n < lowest) {
3104			lowest = n;
3105			r = FMR_ZIO_ARENA;
3106		}
3107	}
3108
3109	/*
3110	 * The limits above know nothing about the real level of KVA fragmentation.
3111	 * Start aggressive reclamation if too little contiguous KVA is left.
3112	 */
3113	if (lowest > 0) {
3114		n = (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) ?
3115		    -((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) :
3116		    INT64_MAX;
3117		if (n < lowest) {
3118			lowest = n;
3119			r = FMR_ZIO_FRAG;
3120		}
3121	}
3122
3123#else	/* _KERNEL */
3124	/* Every 100 calls, free a small amount */
3125	if (spa_get_random(100) == 0)
3126		lowest = -1024;
3127#endif	/* _KERNEL */
3128
3129	last_free_memory = lowest;
3130	last_free_reason = r;
3131	DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
3132	return (lowest);
3133}
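
/*
 * Worked example for the zio arena check in arc_available_memory() above,
 * with hypothetical numbers: with 2 GB allocated and 100 MB free in the
 * arena,
 *
 *	n = 100 MB - (2 GB >> 4) = 100 MB - 128 MB = -28 MB,
 *
 * so the reclaim thread sees a negative value (FMR_ZIO_ARENA) and shrinks
 * the ARC even though plenty of physical memory may still be free.
 */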
3134
3135
3136/*
3137 * Determine if the system is under memory pressure and is asking
3138 * to reclaim memory. A return value of TRUE indicates that the system
3139 * is under memory pressure and that the arc should adjust accordingly.
3140 */
3141static boolean_t
3142arc_reclaim_needed(void)
3143{
3144	return (arc_available_memory() < 0);
3145}
3146
3147extern kmem_cache_t	*zio_buf_cache[];
3148extern kmem_cache_t	*zio_data_buf_cache[];
3149extern kmem_cache_t	*range_seg_cache;
3150
3151static __noinline void
3152arc_kmem_reap_now(void)
3153{
3154	size_t			i;
3155	kmem_cache_t		*prev_cache = NULL;
3156	kmem_cache_t		*prev_data_cache = NULL;
3157
3158	DTRACE_PROBE(arc__kmem_reap_start);
3159#ifdef _KERNEL
3160	if (arc_meta_used >= arc_meta_limit) {
3161		/*
3162		 * We are exceeding our meta-data cache limit.
3163		 * Purge some DNLC entries to release holds on meta-data.
3164		 */
3165		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
3166	}
3167#if defined(__i386)
3168	/*
3169	 * Reclaim unused memory from all kmem caches.
3170	 */
3171	kmem_reap();
3172#endif
3173#endif
3174
3175	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
3176		if (zio_buf_cache[i] != prev_cache) {
3177			prev_cache = zio_buf_cache[i];
3178			kmem_cache_reap_now(zio_buf_cache[i]);
3179		}
3180		if (zio_data_buf_cache[i] != prev_data_cache) {
3181			prev_data_cache = zio_data_buf_cache[i];
3182			kmem_cache_reap_now(zio_data_buf_cache[i]);
3183		}
3184	}
3185	kmem_cache_reap_now(buf_cache);
3186	kmem_cache_reap_now(hdr_full_cache);
3187	kmem_cache_reap_now(hdr_l2only_cache);
3188	kmem_cache_reap_now(range_seg_cache);
3189
3190#ifdef illumos
3191	if (zio_arena != NULL) {
3192		/*
3193		 * Ask the vmem arena to reclaim unused memory from its
3194		 * quantum caches.
3195		 */
3196		vmem_qcache_reap(zio_arena);
3197	}
3198#endif
3199	DTRACE_PROBE(arc__kmem_reap_end);
3200}
3201
3202static void
3203arc_reclaim_thread(void *dummy __unused)
3204{
3205	clock_t			growtime = 0;
3206	callb_cpr_t		cpr;
3207
3208	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
3209
3210	mutex_enter(&arc_reclaim_thr_lock);
3211	while (arc_thread_exit == 0) {
3212		int64_t free_memory = arc_available_memory();
3213		if (free_memory < 0) {
3214
3215			arc_no_grow = B_TRUE;
3216			arc_warm = B_TRUE;
3217
3218			/*
3219			 * Wait at least arc_grow_retry (default 60) seconds
3220			 * before considering growing.
3221			 */
3222			growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
3223
3224			arc_kmem_reap_now();
3225
3226			/*
3227			 * If we are still low on memory, shrink the ARC
3228			 * so that roughly arc_c >> arc_shrink_shift bytes are free.
3229			 */
3230			free_memory = arc_available_memory();
3231
3232			int64_t to_free =
3233			    (arc_c >> arc_shrink_shift) - free_memory;
3234			if (to_free > 0) {
3235#ifdef _KERNEL
3236				to_free = MAX(to_free, ptob(needfree));
3237#endif
3238				arc_shrink(to_free);
3239			}
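			/*
			 * Illustrative note (assumed values, not from
			 * the original source): with arc_c = 4GB and
			 * the default arc_shrink_shift of 7, the
			 * target above is 32MB of free memory; if only
			 * 8MB is free after reaping, arc_shrink() is
			 * asked for 24MB, or more if the pagedaemon
			 * posted a larger needfree value.
			 */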
3240		} else if (free_memory < arc_c >> arc_no_grow_shift) {
3241			arc_no_grow = B_TRUE;
3242		} else if (ddi_get_lbolt() >= growtime) {
3243			arc_no_grow = B_FALSE;
3244		}
3245
3246		arc_adjust();
3247
3248		if (arc_eviction_list != NULL)
3249			arc_do_user_evicts();
3250
3251#ifdef _KERNEL
3252		if (needfree) {
3253			needfree = 0;
3254			wakeup(&needfree);
3255		}
3256#endif
3257
3258		/*
3259		 * This is necessary in order for the mdb ::arc dcmd to
3260		 * show up-to-date information. Since the ::arc command
3261		 * does not call the kstat's update function, without
3262		 * this call, the command may show stale stats for the
3263		 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
3264		 * with this change, the data might be up to 1 second
3265		 * out of date; but that should suffice. The arc_state_t
3266		 * structures can be queried directly if more accurate
3267		 * information is needed.
3268		 */
3269		if (arc_ksp != NULL)
3270			arc_ksp->ks_update(arc_ksp, KSTAT_READ);
3271
3272		/* block until needed, or one second, whichever is shorter */
3273		CALLB_CPR_SAFE_BEGIN(&cpr);
3274		(void) cv_timedwait(&arc_reclaim_thr_cv,
3275		    &arc_reclaim_thr_lock, hz);
3276		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
3277	}
3278
3279	arc_thread_exit = 0;
3280	cv_broadcast(&arc_reclaim_thr_cv);
3281	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
3282	thread_exit();
3283}
3284
3285/*
3286 * Adapt arc info given the number of bytes we are trying to add and
3287 * the state that we are coming from.  This function is only called
3288 * when we are adding new content to the cache.
3289 */
3290static void
3291arc_adapt(int bytes, arc_state_t *state)
3292{
3293	int mult;
3294	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
3295
3296	if (state == arc_l2c_only)
3297		return;
3298
3299	ASSERT(bytes > 0);
3300	/*
3301	 * Adapt the target size of the MRU list:
3302	 *	- if we just hit in the MRU ghost list, then increase
3303	 *	  the target size of the MRU list.
3304	 *	- if we just hit in the MFU ghost list, then increase
3305	 *	  the target size of the MFU list by decreasing the
3306	 *	  target size of the MRU list.
3307	 */
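	/*
	 * Illustrative note (assumed sizes, not from the original source):
	 * if arc_mru_ghost holds 100MB and arc_mfu_ghost holds 300MB, a hit
	 * in the MRU ghost list for an 8K buffer uses mult = 300 / 100 = 3
	 * and grows arc_p by 24K (never past arc_c - arc_p_min).  A hit in
	 * the MFU ghost list applies the symmetric adjustment, shrinking
	 * arc_p by bytes * mult but never below arc_p_min.
	 */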
3308	if (state == arc_mru_ghost) {
3309		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
3310		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
3311		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
3312
3313		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
3314	} else if (state == arc_mfu_ghost) {
3315		uint64_t delta;
3316
3317		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
3318		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
3319		mult = MIN(mult, 10);
3320
3321		delta = MIN(bytes * mult, arc_p);
3322		arc_p = MAX(arc_p_min, arc_p - delta);
3323	}
3324	ASSERT((int64_t)arc_p >= 0);
3325
3326	if (arc_reclaim_needed()) {
3327		cv_signal(&arc_reclaim_thr_cv);
3328		return;
3329	}
3330
3331	if (arc_no_grow)
3332		return;
3333
3334	if (arc_c >= arc_c_max)
3335		return;
3336
3337	/*
3338	 * If we're within (2 * maxblocksize) bytes of the target
3339	 * cache size, increment the target cache size
3340	 */
3341	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
3342		DTRACE_PROBE1(arc__inc_adapt, int, bytes);
3343		atomic_add_64(&arc_c, (int64_t)bytes);
3344		if (arc_c > arc_c_max)
3345			arc_c = arc_c_max;
3346		else if (state == arc_anon)
3347			atomic_add_64(&arc_p, (int64_t)bytes);
3348		if (arc_p > arc_c)
3349			arc_p = arc_c;
3350	}
3351	ASSERT((int64_t)arc_p >= 0);
3352}
3353
3354/*
3355 * Check if the cache has reached its limits and eviction is required
3356 * prior to insert.
3357 */
3358static int
3359arc_evict_needed(arc_buf_contents_t type)
3360{
3361	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
3362		return (1);
3363
3364	if (arc_reclaim_needed())
3365		return (1);
3366
3367	return (arc_size > arc_c);
3368}
3369
3370/*
3371 * The buffer, supplied as the first argument, needs a data block.
3372 * So, if we are at cache max, determine which cache should be victimized.
3373 * We have the following cases:
3374 *
3375 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
3376 * In this situation if we're out of space, but the resident size of the MFU is
3377 * under the limit, victimize the MFU cache to satisfy this insertion request.
3378 *
3379 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
3380 * Here, we've used up all of the available space for the MRU, so we need to
3381 * evict from our own cache instead.  Evict from the set of resident MRU
3382 * entries.
3383 *
3384 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
3385 * c minus p represents the MFU space in the cache, since p is the size of the
3386 * cache that is dedicated to the MRU.  In this situation there's still space on
3387 * the MFU side, so the MRU side needs to be victimized.
3388 *
3389 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
3390 * MFU's resident set is consuming more space than it has been allotted.  In
3391 * this situation, we must victimize our own cache, the MFU, for this insertion.
3392 */
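/*
 * Worked example of the cases above (illustrative numbers, not from the
 * original source): suppose arc_c = 1GB and arc_p = 600MB.  An insert headed
 * for the MRU side while arc_anon + arc_mru hold 500MB falls under case 1,
 * so the MFU side is victimized (provided it has enough evictable data of
 * the right type); if they instead hold 700MB, case 2 applies and the MRU
 * side evicts from itself.  For an MFU insert, the MRU side is victimized
 * only while the MFU resident set stays below c - p = 400MB (case 3);
 * otherwise the MFU evicts from itself (case 4).
 */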
3393static void
3394arc_get_data_buf(arc_buf_t *buf)
3395{
3396	arc_state_t		*state = buf->b_hdr->b_l1hdr.b_state;
3397	uint64_t		size = buf->b_hdr->b_size;
3398	arc_buf_contents_t	type = arc_buf_type(buf->b_hdr);
3399
3400	arc_adapt(size, state);
3401
3402	/*
3403	 * We have not yet reached cache maximum size,
3404	 * just allocate a new buffer.
3405	 */
3406	if (!arc_evict_needed(type)) {
3407		if (type == ARC_BUFC_METADATA) {
3408			buf->b_data = zio_buf_alloc(size);
3409			arc_space_consume(size, ARC_SPACE_META);
3410		} else {
3411			ASSERT(type == ARC_BUFC_DATA);
3412			buf->b_data = zio_data_buf_alloc(size);
3413			arc_space_consume(size, ARC_SPACE_DATA);
3414		}
3415		goto out;
3416	}
3417
3418	/*
3419	 * If we are prefetching from the mfu ghost list, this buffer
3420	 * will end up on the mru list; so steal space from there.
3421	 */
3422	if (state == arc_mfu_ghost)
3423		state = HDR_PREFETCH(buf->b_hdr) ? arc_mru : arc_mfu;
3424	else if (state == arc_mru_ghost)
3425		state = arc_mru;
3426
3427	if (state == arc_mru || state == arc_anon) {
3428		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
3429		state = (arc_mfu->arcs_lsize[type] >= size &&
3430		    arc_p > mru_used) ? arc_mfu : arc_mru;
3431	} else {
3432		/* MFU cases */
3433		uint64_t mfu_space = arc_c - arc_p;
3434		state = (arc_mru->arcs_lsize[type] >= size &&
3435		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
3436	}
3437	if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
3438		if (type == ARC_BUFC_METADATA) {
3439			buf->b_data = zio_buf_alloc(size);
3440			arc_space_consume(size, ARC_SPACE_META);
3441		} else {
3442			ASSERT(type == ARC_BUFC_DATA);
3443			buf->b_data = zio_data_buf_alloc(size);
3444			arc_space_consume(size, ARC_SPACE_DATA);
3445		}
3446		ARCSTAT_BUMP(arcstat_recycle_miss);
3447	}
3448	ASSERT(buf->b_data != NULL);
3449out:
3450	/*
3451	 * Update the state size.  Note that ghost states have a
3452	 * "ghost size" and so don't need to be updated.
3453	 */
3454	if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
3455		arc_buf_hdr_t *hdr = buf->b_hdr;
3456
3457		atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size);
3458		if (list_link_active(&hdr->b_l1hdr.b_arc_node)) {
3459			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3460			atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type],
3461			    size);
3462		}
3463		/*
3464		 * If we are growing the cache, and we are adding anonymous
3465		 * data, and we have outgrown arc_p, update arc_p
3466		 */
3467		if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
3468		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
3469			arc_p = MIN(arc_c, arc_p + size);
3470	}
3471	ARCSTAT_BUMP(arcstat_allocated);
3472}
3473
3474/*
3475 * This routine is called whenever a buffer is accessed.
3476 * NOTE: the hash lock is dropped in this function.
3477 */
3478static void
3479arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
3480{
3481	clock_t now;
3482
3483	ASSERT(MUTEX_HELD(hash_lock));
3484	ASSERT(HDR_HAS_L1HDR(hdr));
3485
3486	if (hdr->b_l1hdr.b_state == arc_anon) {
3487		/*
3488		 * This buffer is not in the cache, and does not
3489		 * appear in our "ghost" list.  Add the new buffer
3490		 * to the MRU state.
3491		 */
3492
3493		ASSERT0(hdr->b_l1hdr.b_arc_access);
3494		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3495		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
3496		arc_change_state(arc_mru, hdr, hash_lock);
3497
3498	} else if (hdr->b_l1hdr.b_state == arc_mru) {
3499		now = ddi_get_lbolt();
3500
3501		/*
3502		 * If this buffer is here because of a prefetch, then either:
3503		 * - clear the flag if this is a "referencing" read
3504		 *   (any subsequent access will bump this into the MFU state).
3505		 * or
3506		 * - move the buffer to the head of the list if this is
3507		 *   another prefetch (to make it less likely to be evicted).
3508		 */
3509		if (HDR_PREFETCH(hdr)) {
3510			if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
3511				ASSERT(list_link_active(
3512				    &hdr->b_l1hdr.b_arc_node));
3513			} else {
3514				hdr->b_flags &= ~ARC_FLAG_PREFETCH;
3515				ARCSTAT_BUMP(arcstat_mru_hits);
3516			}
3517			hdr->b_l1hdr.b_arc_access = now;
3518			return;
3519		}
3520
3521		/*
3522		 * This buffer has been "accessed" only once so far,
3523		 * but it is still in the cache.  If enough time has
3524		 * passed, move it to the MFU state.
3525		 */
3526		if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
3527			/*
3528			 * More than ARC_MINTIME has passed since we
3529			 * instantiated this buffer.  Move it to the
3530			 * most frequently used state.
3531			 */
3532			hdr->b_l1hdr.b_arc_access = now;
3533			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3534			arc_change_state(arc_mfu, hdr, hash_lock);
3535		}
3536		ARCSTAT_BUMP(arcstat_mru_hits);
3537	} else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
3538		arc_state_t	*new_state;
3539		/*
3540		 * This buffer has been "accessed" recently, but
3541		 * was evicted from the cache.  Move it to the MFU
3542		 * state (or back to MRU if it was a prefetch).
3543		 */
3544
3545		if (HDR_PREFETCH(hdr)) {
3546			new_state = arc_mru;
3547			if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
3548				hdr->b_flags &= ~ARC_FLAG_PREFETCH;
3549			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
3550		} else {
3551			new_state = arc_mfu;
3552			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3553		}
3554
3555		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3556		arc_change_state(new_state, hdr, hash_lock);
3557
3558		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
3559	} else if (hdr->b_l1hdr.b_state == arc_mfu) {
3560		/*
3561		 * This buffer has been accessed more than once and is
3562		 * still in the cache.  Keep it in the MFU state.
3563		 *
3564		 * NOTE: an add_reference() that occurred when we did
3565		 * the arc_read() will have kicked this off the list.
3566		 * If it was a prefetch, we will explicitly move it to
3567		 * the head of the list now.
3568		 */
3569		if ((HDR_PREFETCH(hdr)) != 0) {
3570			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3571			ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node));
3572		}
3573		ARCSTAT_BUMP(arcstat_mfu_hits);
3574		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3575	} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
3576		arc_state_t	*new_state = arc_mfu;
3577		/*
3578		 * This buffer has been accessed more than once but has
3579		 * been evicted from the cache.  Move it back to the
3580		 * MFU state.
3581		 */
3582
3583		if (HDR_PREFETCH(hdr)) {
3584			/*
3585			 * This is a prefetch access...
3586			 * move this block back to the MRU state.
3587			 */
3588			ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
3589			new_state = arc_mru;
3590		}
3591
3592		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3593		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3594		arc_change_state(new_state, hdr, hash_lock);
3595
3596		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
3597	} else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
3598		/*
3599		 * This buffer is on the 2nd Level ARC.
3600		 */
3601
3602		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
3603		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
3604		arc_change_state(arc_mfu, hdr, hash_lock);
3605	} else {
3606		ASSERT(!"invalid arc state");
3607	}
3608}
3609
3610/* a generic arc_done_func_t which you can use */
3611/* ARGSUSED */
3612void
3613arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
3614{
3615	if (zio == NULL || zio->io_error == 0)
3616		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
3617	VERIFY(arc_buf_remove_ref(buf, arg));
3618}
3619
3620/* a generic arc_done_func_t */
3621void
3622arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
3623{
3624	arc_buf_t **bufp = arg;
3625	if (zio && zio->io_error) {
3626		VERIFY(arc_buf_remove_ref(buf, arg));
3627		*bufp = NULL;
3628	} else {
3629		*bufp = buf;
3630		ASSERT(buf->b_data);
3631	}
3632}
3633
3634static void
3635arc_read_done(zio_t *zio)
3636{
3637	arc_buf_hdr_t	*hdr;
3638	arc_buf_t	*buf;
3639	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
3640	kmutex_t	*hash_lock = NULL;
3641	arc_callback_t	*callback_list, *acb;
3642	int		freeable = FALSE;
3643
3644	buf = zio->io_private;
3645	hdr = buf->b_hdr;
3646
3647	/*
3648	 * The hdr was inserted into hash-table and removed from lists
3649	 * prior to starting I/O.  We should find this header, since
3650	 * it's in the hash table, and it should be legit since it's
3651	 * not possible to evict it during the I/O.  The only possible
3652	 * reason for it not to be found is if we were freed during the
3653	 * read.
3654	 */
3655	if (HDR_IN_HASH_TABLE(hdr)) {
3656		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
3657		ASSERT3U(hdr->b_dva.dva_word[0], ==,
3658		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
3659		ASSERT3U(hdr->b_dva.dva_word[1], ==,
3660		    BP_IDENTITY(zio->io_bp)->dva_word[1]);
3661
3662		arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
3663		    &hash_lock);
3664
3665		ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
3666		    hash_lock == NULL) ||
3667		    (found == hdr &&
3668		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
3669		    (found == hdr && HDR_L2_READING(hdr)));
3670	}
3671
3672	hdr->b_flags &= ~ARC_FLAG_L2_EVICTED;
3673	if (l2arc_noprefetch && HDR_PREFETCH(hdr))
3674		hdr->b_flags &= ~ARC_FLAG_L2CACHE;
3675
3676	/* byteswap if necessary */
3677	callback_list = hdr->b_l1hdr.b_acb;
3678	ASSERT(callback_list != NULL);
3679	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
3680		dmu_object_byteswap_t bswap =
3681		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
3682		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
3683		    byteswap_uint64_array :
3684		    dmu_ot_byteswap[bswap].ob_func;
3685		func(buf->b_data, hdr->b_size);
3686	}
3687
3688	arc_cksum_compute(buf, B_FALSE);
3689#ifdef illumos
3690	arc_buf_watch(buf);
3691#endif
3692
3693	if (hash_lock && zio->io_error == 0 &&
3694	    hdr->b_l1hdr.b_state == arc_anon) {
3695		/*
3696		 * Only call arc_access on anonymous buffers.  This is because
3697		 * if we've issued an I/O for an evicted buffer, we've already
3698		 * called arc_access (to prevent any simultaneous readers from
3699		 * getting confused).
3700		 */
3701		arc_access(hdr, hash_lock);
3702	}
3703
3704	/* create copies of the data buffer for the callers */
3705	abuf = buf;
3706	for (acb = callback_list; acb; acb = acb->acb_next) {
3707		if (acb->acb_done) {
3708			if (abuf == NULL) {
3709				ARCSTAT_BUMP(arcstat_duplicate_reads);
3710				abuf = arc_buf_clone(buf);
3711			}
3712			acb->acb_buf = abuf;
3713			abuf = NULL;
3714		}
3715	}
3716	hdr->b_l1hdr.b_acb = NULL;
3717	hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
3718	ASSERT(!HDR_BUF_AVAILABLE(hdr));
3719	if (abuf == buf) {
3720		ASSERT(buf->b_efunc == NULL);
3721		ASSERT(hdr->b_l1hdr.b_datacnt == 1);
3722		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
3723	}
3724
3725	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
3726	    callback_list != NULL);
3727
3728	if (zio->io_error != 0) {
3729		hdr->b_flags |= ARC_FLAG_IO_ERROR;
3730		if (hdr->b_l1hdr.b_state != arc_anon)
3731			arc_change_state(arc_anon, hdr, hash_lock);
3732		if (HDR_IN_HASH_TABLE(hdr))
3733			buf_hash_remove(hdr);
3734		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
3735	}
3736
3737	/*
3738	 * Broadcast before we drop the hash_lock to avoid the possibility
3739	 * that the hdr (and hence the cv) might be freed before we get to
3740	 * the cv_broadcast().
3741	 */
3742	cv_broadcast(&hdr->b_l1hdr.b_cv);
3743
3744	if (hash_lock != NULL) {
3745		mutex_exit(hash_lock);
3746	} else {
3747		/*
3748		 * This block was freed while we waited for the read to
3749		 * complete.  It has been removed from the hash table and
3750		 * moved to the anonymous state (so that it won't show up
3751		 * in the cache).
3752		 */
3753		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3754		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
3755	}
3756
3757	/* execute each callback and free its structure */
3758	while ((acb = callback_list) != NULL) {
3759		if (acb->acb_done)
3760			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
3761
3762		if (acb->acb_zio_dummy != NULL) {
3763			acb->acb_zio_dummy->io_error = zio->io_error;
3764			zio_nowait(acb->acb_zio_dummy);
3765		}
3766
3767		callback_list = acb->acb_next;
3768		kmem_free(acb, sizeof (arc_callback_t));
3769	}
3770
3771	if (freeable)
3772		arc_hdr_destroy(hdr);
3773}
3774
3775/*
3776 * "Read" the block at the specified DVA (in bp) via the
3777 * cache.  If the block is found in the cache, invoke the provided
3778 * callback immediately and return.  Note that the `zio' parameter
3779 * in the callback will be NULL in this case, since no IO was
3780 * required.  If the block is not in the cache pass the read request
3781 * on to the spa with a substitute callback function, so that the
3782 * requested block will be added to the cache.
3783 *
3784 * If a read request arrives for a block that has a read in-progress,
3785 * either wait for the in-progress read to complete (and return the
3786 * results); or, if this is a read with a "done" func, add a record
3787 * to the read to invoke the "done" func when the read completes,
3788 * and return; or just return.
3789 *
3790 * arc_read_done() will invoke all the requested "done" functions
3791 * for readers of this block.
3792 */
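/*
 * Illustrative caller sketch (not part of the original code): a synchronous,
 * cached read using the generic arc_getbuf_func() callback defined above.
 * The spa, bp and zb values are assumed to be supplied by the caller.
 */
#if 0
	arc_buf_t *abuf = NULL;
	arc_flags_t aflags = ARC_FLAG_WAIT;
	int err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
	if (err == 0 && abuf != NULL) {
		/* ... consume abuf->b_data ... */
		(void) arc_buf_remove_ref(abuf, &abuf);
	}
#endif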
3793int
3794arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
3795    void *private, zio_priority_t priority, int zio_flags,
3796    arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
3797{
3798	arc_buf_hdr_t *hdr = NULL;
3799	arc_buf_t *buf = NULL;
3800	kmutex_t *hash_lock = NULL;
3801	zio_t *rzio;
3802	uint64_t guid = spa_load_guid(spa);
3803
3804	ASSERT(!BP_IS_EMBEDDED(bp) ||
3805	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
3806
3807top:
3808	if (!BP_IS_EMBEDDED(bp)) {
3809		/*
3810		 * Embedded BPs have no DVA and require no I/O to "read";
3811		 * an anonymous arc buf is created below to back them.
3812		 */
3813		hdr = buf_hash_find(guid, bp, &hash_lock);
3814	}
3815
3816	if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) {
3817
3818		*arc_flags |= ARC_FLAG_CACHED;
3819
3820		if (HDR_IO_IN_PROGRESS(hdr)) {
3821
3822			if (*arc_flags & ARC_FLAG_WAIT) {
3823				cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
3824				mutex_exit(hash_lock);
3825				goto top;
3826			}
3827			ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
3828
3829			if (done) {
3830				arc_callback_t	*acb = NULL;
3831
3832				acb = kmem_zalloc(sizeof (arc_callback_t),
3833				    KM_SLEEP);
3834				acb->acb_done = done;
3835				acb->acb_private = private;
3836				if (pio != NULL)
3837					acb->acb_zio_dummy = zio_null(pio,
3838					    spa, NULL, NULL, NULL, zio_flags);
3839
3840				ASSERT(acb->acb_done != NULL);
3841				acb->acb_next = hdr->b_l1hdr.b_acb;
3842				hdr->b_l1hdr.b_acb = acb;
3843				add_reference(hdr, hash_lock, private);
3844				mutex_exit(hash_lock);
3845				return (0);
3846			}
3847			mutex_exit(hash_lock);
3848			return (0);
3849		}
3850
3851		ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
3852		    hdr->b_l1hdr.b_state == arc_mfu);
3853
3854		if (done) {
3855			add_reference(hdr, hash_lock, private);
3856			/*
3857			 * If this block is already in use, create a new
3858			 * copy of the data so that we will be guaranteed
3859			 * that arc_release() will always succeed.
3860			 */
3861			buf = hdr->b_l1hdr.b_buf;
3862			ASSERT(buf);
3863			ASSERT(buf->b_data);
3864			if (HDR_BUF_AVAILABLE(hdr)) {
3865				ASSERT(buf->b_efunc == NULL);
3866				hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
3867			} else {
3868				buf = arc_buf_clone(buf);
3869			}
3870
3871		} else if (*arc_flags & ARC_FLAG_PREFETCH &&
3872		    refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
3873			hdr->b_flags |= ARC_FLAG_PREFETCH;
3874		}
3875		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
3876		arc_access(hdr, hash_lock);
3877		if (*arc_flags & ARC_FLAG_L2CACHE)
3878			hdr->b_flags |= ARC_FLAG_L2CACHE;
3879		if (*arc_flags & ARC_FLAG_L2COMPRESS)
3880			hdr->b_flags |= ARC_FLAG_L2COMPRESS;
3881		mutex_exit(hash_lock);
3882		ARCSTAT_BUMP(arcstat_hits);
3883		ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
3884		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
3885		    data, metadata, hits);
3886
3887		if (done)
3888			done(NULL, buf, private);
3889	} else {
3890		uint64_t size = BP_GET_LSIZE(bp);
3891		arc_callback_t *acb;
3892		vdev_t *vd = NULL;
3893		uint64_t addr = 0;
3894		boolean_t devw = B_FALSE;
3895		enum zio_compress b_compress = ZIO_COMPRESS_OFF;
3896		int32_t b_asize = 0;
3897
3898		if (hdr == NULL) {
3899			/* this block is not in the cache */
3900			arc_buf_hdr_t *exists = NULL;
3901			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
3902			buf = arc_buf_alloc(spa, size, private, type);
3903			hdr = buf->b_hdr;
3904			if (!BP_IS_EMBEDDED(bp)) {
3905				hdr->b_dva = *BP_IDENTITY(bp);
3906				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
3907				exists = buf_hash_insert(hdr, &hash_lock);
3908			}
3909			if (exists != NULL) {
3910				/* somebody beat us to the hash insert */
3911				mutex_exit(hash_lock);
3912				buf_discard_identity(hdr);
3913				(void) arc_buf_remove_ref(buf, private);
3914				goto top; /* restart the IO request */
3915			}
3916
3917			/* if this is a prefetch, we don't have a reference */
3918			if (*arc_flags & ARC_FLAG_PREFETCH) {
3919				(void) remove_reference(hdr, hash_lock,
3920				    private);
3921				hdr->b_flags |= ARC_FLAG_PREFETCH;
3922			}
3923			if (*arc_flags & ARC_FLAG_L2CACHE)
3924				hdr->b_flags |= ARC_FLAG_L2CACHE;
3925			if (*arc_flags & ARC_FLAG_L2COMPRESS)
3926				hdr->b_flags |= ARC_FLAG_L2COMPRESS;
3927			if (BP_GET_LEVEL(bp) > 0)
3928				hdr->b_flags |= ARC_FLAG_INDIRECT;
3929		} else {
3930			/*
3931			 * This block is in the ghost cache. If it was L2-only
3932			 * (and thus didn't have an L1 hdr), we realloc the
3933			 * header to add an L1 hdr.
3934			 */
3935			if (!HDR_HAS_L1HDR(hdr)) {
3936				hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
3937				    hdr_full_cache);
3938			}
3939
3940			ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
3941			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3942			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3943			ASSERT(hdr->b_l1hdr.b_buf == NULL);
3944
3945			/* if this is a prefetch, we don't have a reference */
3946			if (*arc_flags & ARC_FLAG_PREFETCH)
3947				hdr->b_flags |= ARC_FLAG_PREFETCH;
3948			else
3949				add_reference(hdr, hash_lock, private);
3950			if (*arc_flags & ARC_FLAG_L2CACHE)
3951				hdr->b_flags |= ARC_FLAG_L2CACHE;
3952			if (*arc_flags & ARC_FLAG_L2COMPRESS)
3953				hdr->b_flags |= ARC_FLAG_L2COMPRESS;
3954			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
3955			buf->b_hdr = hdr;
3956			buf->b_data = NULL;
3957			buf->b_efunc = NULL;
3958			buf->b_private = NULL;
3959			buf->b_next = NULL;
3960			hdr->b_l1hdr.b_buf = buf;
3961			ASSERT0(hdr->b_l1hdr.b_datacnt);
3962			hdr->b_l1hdr.b_datacnt = 1;
3963			arc_get_data_buf(buf);
3964			arc_access(hdr, hash_lock);
3965		}
3966
3967		ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
3968
3969		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
3970		acb->acb_done = done;
3971		acb->acb_private = private;
3972
3973		ASSERT(hdr->b_l1hdr.b_acb == NULL);
3974		hdr->b_l1hdr.b_acb = acb;
3975		hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
3976
3977		if (HDR_HAS_L2HDR(hdr) &&
3978		    (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
3979			devw = hdr->b_l2hdr.b_dev->l2ad_writing;
3980			addr = hdr->b_l2hdr.b_daddr;
3981			b_compress = HDR_GET_COMPRESS(hdr);
3982			b_asize = hdr->b_l2hdr.b_asize;
3983			/*
3984			 * Lock out device removal.
3985			 */
3986			if (vdev_is_dead(vd) ||
3987			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
3988				vd = NULL;
3989		}
3990
3991		if (hash_lock != NULL)
3992			mutex_exit(hash_lock);
3993
3994		/*
3995		 * At this point, we have a level 1 cache miss.  Try again in
3996		 * L2ARC if possible.
3997		 */
3998		ASSERT3U(hdr->b_size, ==, size);
3999		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
4000		    uint64_t, size, zbookmark_phys_t *, zb);
4001		ARCSTAT_BUMP(arcstat_misses);
4002		ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
4003		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
4004		    data, metadata, misses);
4005#ifdef _KERNEL
4006		curthread->td_ru.ru_inblock++;
4007#endif
4008
4009		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
4010			/*
4011			 * Read from the L2ARC if the following are true:
4012			 * 1. The L2ARC vdev was previously cached.
4013			 * 2. This buffer still has L2ARC metadata.
4014			 * 3. This buffer isn't currently writing to the L2ARC.
4015			 * 4. The L2ARC entry wasn't evicted, which may
4016			 *    also have invalidated the vdev.
4017			 * 5. This isn't a prefetch skipped due to l2arc_noprefetch.
4018			 */
4019			if (HDR_HAS_L2HDR(hdr) &&
4020			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
4021			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
4022				l2arc_read_callback_t *cb;
4023
4024				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
4025				ARCSTAT_BUMP(arcstat_l2_hits);
4026
4027				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
4028				    KM_SLEEP);
4029				cb->l2rcb_buf = buf;
4030				cb->l2rcb_spa = spa;
4031				cb->l2rcb_bp = *bp;
4032				cb->l2rcb_zb = *zb;
4033				cb->l2rcb_flags = zio_flags;
4034				cb->l2rcb_compress = b_compress;
4035
4036				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
4037				    addr + size < vd->vdev_psize -
4038				    VDEV_LABEL_END_SIZE);
4039
4040				/*
4041				 * l2arc read.  The SCL_L2ARC lock will be
4042				 * released by l2arc_read_done().
4043				 * Issue a null zio if the underlying buffer
4044				 * was squashed to zero size by compression.
4045				 */
4046				if (b_compress == ZIO_COMPRESS_EMPTY) {
4047					rzio = zio_null(pio, spa, vd,
4048					    l2arc_read_done, cb,
4049					    zio_flags | ZIO_FLAG_DONT_CACHE |
4050					    ZIO_FLAG_CANFAIL |
4051					    ZIO_FLAG_DONT_PROPAGATE |
4052					    ZIO_FLAG_DONT_RETRY);
4053				} else {
4054					rzio = zio_read_phys(pio, vd, addr,
4055					    b_asize, buf->b_data,
4056					    ZIO_CHECKSUM_OFF,
4057					    l2arc_read_done, cb, priority,
4058					    zio_flags | ZIO_FLAG_DONT_CACHE |
4059					    ZIO_FLAG_CANFAIL |
4060					    ZIO_FLAG_DONT_PROPAGATE |
4061					    ZIO_FLAG_DONT_RETRY, B_FALSE);
4062				}
4063				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
4064				    zio_t *, rzio);
4065				ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
4066
4067				if (*arc_flags & ARC_FLAG_NOWAIT) {
4068					zio_nowait(rzio);
4069					return (0);
4070				}
4071
4072				ASSERT(*arc_flags & ARC_FLAG_WAIT);
4073				if (zio_wait(rzio) == 0)
4074					return (0);
4075
4076				/* l2arc read error; goto zio_read() */
4077			} else {
4078				DTRACE_PROBE1(l2arc__miss,
4079				    arc_buf_hdr_t *, hdr);
4080				ARCSTAT_BUMP(arcstat_l2_misses);
4081				if (HDR_L2_WRITING(hdr))
4082					ARCSTAT_BUMP(arcstat_l2_rw_clash);
4083				spa_config_exit(spa, SCL_L2ARC, vd);
4084			}
4085		} else {
4086			if (vd != NULL)
4087				spa_config_exit(spa, SCL_L2ARC, vd);
4088			if (l2arc_ndev != 0) {
4089				DTRACE_PROBE1(l2arc__miss,
4090				    arc_buf_hdr_t *, hdr);
4091				ARCSTAT_BUMP(arcstat_l2_misses);
4092			}
4093		}
4094
4095		rzio = zio_read(pio, spa, bp, buf->b_data, size,
4096		    arc_read_done, buf, priority, zio_flags, zb);
4097
4098		if (*arc_flags & ARC_FLAG_WAIT)
4099			return (zio_wait(rzio));
4100
4101		ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
4102		zio_nowait(rzio);
4103	}
4104	return (0);
4105}
4106
4107void
4108arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
4109{
4110	ASSERT(buf->b_hdr != NULL);
4111	ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon);
4112	ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) ||
4113	    func == NULL);
4114	ASSERT(buf->b_efunc == NULL);
4115	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
4116
4117	buf->b_efunc = func;
4118	buf->b_private = private;
4119}
4120
4121/*
4122 * Notify the arc that a block was freed, and thus will never be used again.
4123 */
4124void
4125arc_freed(spa_t *spa, const blkptr_t *bp)
4126{
4127	arc_buf_hdr_t *hdr;
4128	kmutex_t *hash_lock;
4129	uint64_t guid = spa_load_guid(spa);
4130
4131	ASSERT(!BP_IS_EMBEDDED(bp));
4132
4133	hdr = buf_hash_find(guid, bp, &hash_lock);
4134	if (hdr == NULL)
4135		return;
4136	if (HDR_BUF_AVAILABLE(hdr)) {
4137		arc_buf_t *buf = hdr->b_l1hdr.b_buf;
4138		add_reference(hdr, hash_lock, FTAG);
4139		hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
4140		mutex_exit(hash_lock);
4141
4142		arc_release(buf, FTAG);
4143		(void) arc_buf_remove_ref(buf, FTAG);
4144	} else {
4145		mutex_exit(hash_lock);
4146	}
4147
4148}
4149
4150/*
4151 * Clear the user eviction callback set by arc_set_callback(), first calling
4152 * it if it exists.  Because the presence of a callback keeps an arc_buf cached,
4153 * clearing the callback may result in the arc_buf being destroyed.  However,
4154 * it will not result in the *last* arc_buf being destroyed, hence the data
4155 * will remain cached in the ARC. We make a copy of the arc buffer here so
4156 * that we can process the callback without holding any locks.
4157 *
4158 * It's possible that the callback is already in the process of being cleared
4159 * by another thread.  In this case we can not clear the callback.
4160 *
4161 * Returns B_TRUE if the callback was successfully called and cleared.
4162 */
4163boolean_t
4164arc_clear_callback(arc_buf_t *buf)
4165{
4166	arc_buf_hdr_t *hdr;
4167	kmutex_t *hash_lock;
4168	arc_evict_func_t *efunc = buf->b_efunc;
4169	void *private = buf->b_private;
4170
4171	mutex_enter(&buf->b_evict_lock);
4172	hdr = buf->b_hdr;
4173	if (hdr == NULL) {
4174		/*
4175		 * We are in arc_do_user_evicts().
4176		 */
4177		ASSERT(buf->b_data == NULL);
4178		mutex_exit(&buf->b_evict_lock);
4179		return (B_FALSE);
4180	} else if (buf->b_data == NULL) {
4181		/*
4182		 * We are on the eviction list; process this buffer now
4183		 * but let arc_do_user_evicts() do the reaping.
4184		 */
4185		buf->b_efunc = NULL;
4186		mutex_exit(&buf->b_evict_lock);
4187		VERIFY0(efunc(private));
4188		return (B_TRUE);
4189	}
4190	hash_lock = HDR_LOCK(hdr);
4191	mutex_enter(hash_lock);
4192	hdr = buf->b_hdr;
4193	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4194
4195	ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <,
4196	    hdr->b_l1hdr.b_datacnt);
4197	ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
4198	    hdr->b_l1hdr.b_state == arc_mfu);
4199
4200	buf->b_efunc = NULL;
4201	buf->b_private = NULL;
4202
4203	if (hdr->b_l1hdr.b_datacnt > 1) {
4204		mutex_exit(&buf->b_evict_lock);
4205		arc_buf_destroy(buf, FALSE, TRUE);
4206	} else {
4207		ASSERT(buf == hdr->b_l1hdr.b_buf);
4208		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
4209		mutex_exit(&buf->b_evict_lock);
4210	}
4211
4212	mutex_exit(hash_lock);
4213	VERIFY0(efunc(private));
4214	return (B_TRUE);
4215}
4216
4217/*
4218 * Release this buffer from the cache, making it an anonymous buffer.  This
4219 * must be done after a read and prior to modifying the buffer contents.
4220 * If the buffer has more than one reference, we must make
4221 * a new hdr for the buffer.
4222 */
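/*
 * Illustrative usage sketch (not part of the original code): a caller that
 * already holds a reference on "buf" under "tag" anonymizes the buffer
 * before modifying its contents in place, so other cached readers keep
 * their own copy.
 */
#if 0
	arc_release(buf, tag);
	ASSERT(arc_released(buf));
	bzero(buf->b_data, buf->b_hdr->b_size);	/* now safe to modify */
#endif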
4223void
4224arc_release(arc_buf_t *buf, void *tag)
4225{
4226	arc_buf_hdr_t *hdr = buf->b_hdr;
4227
4228	/*
4229	 * It would be nice to assert that if it's DMU metadata (level >
4230	 * 0 || it's the dnode file), then it must be syncing context.
4231	 * But we don't know that information at this level.
4232	 */
4233
4234	mutex_enter(&buf->b_evict_lock);
4235	/*
4236	 * We don't grab the hash lock prior to this check, because if
4237	 * the buffer's header is in the arc_anon state, it won't be
4238	 * linked into the hash table.
4239	 */
4240	if (hdr->b_l1hdr.b_state == arc_anon) {
4241		mutex_exit(&buf->b_evict_lock);
4242		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4243		ASSERT(!HDR_IN_HASH_TABLE(hdr));
4244		ASSERT(!HDR_HAS_L2HDR(hdr));
4245		ASSERT(BUF_EMPTY(hdr));
4246		ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1);
4247		ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
4248		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
4249
4250		ASSERT3P(buf->b_efunc, ==, NULL);
4251		ASSERT3P(buf->b_private, ==, NULL);
4252
4253		hdr->b_l1hdr.b_arc_access = 0;
4254		arc_buf_thaw(buf);
4255
4256		return;
4257	}
4258
4259	kmutex_t *hash_lock = HDR_LOCK(hdr);
4260	mutex_enter(hash_lock);
4261
4262	/*
4263	 * This assignment is only valid as long as the hash_lock is
4264	 * held; we must be careful not to reference state or the
4265	 * b_state field after dropping the lock.
4266	 */
4267	arc_state_t *state = hdr->b_l1hdr.b_state;
4268	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4269	ASSERT3P(state, !=, arc_anon);
4270
4271	/* this buffer is not on any list */
4272	ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
4273
4274	if (HDR_HAS_L2HDR(hdr)) {
4275		mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
4276
4277		/*
4278		 * We have to recheck this conditional again now that
4279		 * we're holding the l2ad_mtx to prevent a race with
4280		 * another thread which might be concurrently calling
4281		 * l2arc_evict(). In that case, l2arc_evict() might have
4282		 * destroyed the header's L2 portion as we were waiting
4283		 * to acquire the l2ad_mtx.
4284		 */
4285		if (HDR_HAS_L2HDR(hdr)) {
4286			if (hdr->b_l2hdr.b_daddr != L2ARC_ADDR_UNSET)
4287				trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev,
4288				    hdr->b_l2hdr.b_daddr,
4289				    hdr->b_l2hdr.b_asize, 0);
4290			arc_hdr_l2hdr_destroy(hdr);
4291		}
4292
4293		mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
4294	}
4295
4296	/*
4297	 * Do we have more than one buf?
4298	 */
4299	if (hdr->b_l1hdr.b_datacnt > 1) {
4300		arc_buf_hdr_t *nhdr;
4301		arc_buf_t **bufp;
4302		uint64_t blksz = hdr->b_size;
4303		uint64_t spa = hdr->b_spa;
4304		arc_buf_contents_t type = arc_buf_type(hdr);
4305		uint32_t flags = hdr->b_flags;
4306
4307		ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
4308		/*
4309		 * Pull the data off of this hdr and attach it to
4310		 * a new anonymous hdr.
4311		 */
4312		(void) remove_reference(hdr, hash_lock, tag);
4313		bufp = &hdr->b_l1hdr.b_buf;
4314		while (*bufp != buf)
4315			bufp = &(*bufp)->b_next;
4316		*bufp = buf->b_next;
4317		buf->b_next = NULL;
4318
4319		ASSERT3P(state, !=, arc_l2c_only);
4320		ASSERT3U(state->arcs_size, >=, hdr->b_size);
4321		atomic_add_64(&state->arcs_size, -hdr->b_size);
4322		if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
4323			ASSERT3P(state, !=, arc_l2c_only);
4324			uint64_t *size = &state->arcs_lsize[type];
4325			ASSERT3U(*size, >=, hdr->b_size);
4326			atomic_add_64(size, -hdr->b_size);
4327		}
4328
4329		/*
4330		 * We're releasing a duplicate user data buffer; update
4331		 * our statistics accordingly.
4332		 */
4333		if (HDR_ISTYPE_DATA(hdr)) {
4334			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
4335			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
4336			    -hdr->b_size);
4337		}
4338		hdr->b_l1hdr.b_datacnt -= 1;
4339		arc_cksum_verify(buf);
4340#ifdef illumos
4341		arc_buf_unwatch(buf);
4342#endif
4343
4344		mutex_exit(hash_lock);
4345
4346		nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
4347		nhdr->b_size = blksz;
4348		nhdr->b_spa = spa;
4349
4350		nhdr->b_flags = flags & ARC_FLAG_L2_WRITING;
4351		nhdr->b_flags |= arc_bufc_to_flags(type);
4352		nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
4353
4354		nhdr->b_l1hdr.b_buf = buf;
4355		nhdr->b_l1hdr.b_datacnt = 1;
4356		nhdr->b_l1hdr.b_state = arc_anon;
4357		nhdr->b_l1hdr.b_arc_access = 0;
4358		nhdr->b_freeze_cksum = NULL;
4359
4360		(void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
4361		buf->b_hdr = nhdr;
4362		mutex_exit(&buf->b_evict_lock);
4363		atomic_add_64(&arc_anon->arcs_size, blksz);
4364	} else {
4365		mutex_exit(&buf->b_evict_lock);
4366		ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
4367		/* protected by hash lock */
4368		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
4369		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4370		arc_change_state(arc_anon, hdr, hash_lock);
4371		hdr->b_l1hdr.b_arc_access = 0;
4372		mutex_exit(hash_lock);
4373
4374		buf_discard_identity(hdr);
4375		arc_buf_thaw(buf);
4376	}
4377	buf->b_efunc = NULL;
4378	buf->b_private = NULL;
4379}
4380
4381int
4382arc_released(arc_buf_t *buf)
4383{
4384	int released;
4385
4386	mutex_enter(&buf->b_evict_lock);
4387	released = (buf->b_data != NULL &&
4388	    buf->b_hdr->b_l1hdr.b_state == arc_anon);
4389	mutex_exit(&buf->b_evict_lock);
4390	return (released);
4391}
4392
4393#ifdef ZFS_DEBUG
4394int
4395arc_referenced(arc_buf_t *buf)
4396{
4397	int referenced;
4398
4399	mutex_enter(&buf->b_evict_lock);
4400	referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
4401	mutex_exit(&buf->b_evict_lock);
4402	return (referenced);
4403}
4404#endif
4405
4406static void
4407arc_write_ready(zio_t *zio)
4408{
4409	arc_write_callback_t *callback = zio->io_private;
4410	arc_buf_t *buf = callback->awcb_buf;
4411	arc_buf_hdr_t *hdr = buf->b_hdr;
4412
4413	ASSERT(HDR_HAS_L1HDR(hdr));
4414	ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
4415	ASSERT(hdr->b_l1hdr.b_datacnt > 0);
4416	callback->awcb_ready(zio, buf, callback->awcb_private);
4417
4418	/*
4419	 * If the IO is already in progress, then this is a re-write
4420	 * attempt, so we need to thaw and re-compute the cksum.
4421	 * It is the responsibility of the callback to handle the
4422	 * accounting for any re-write attempt.
4423	 */
4424	if (HDR_IO_IN_PROGRESS(hdr)) {
4425		mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
4426		if (hdr->b_freeze_cksum != NULL) {
4427			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
4428			hdr->b_freeze_cksum = NULL;
4429		}
4430		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
4431	}
4432	arc_cksum_compute(buf, B_FALSE);
4433	hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
4434}
4435
4436/*
4437 * The SPA calls this callback for each physical write that happens on behalf
4438 * of a logical write.  See the comment in dbuf_write_physdone() for details.
4439 */
4440static void
4441arc_write_physdone(zio_t *zio)
4442{
4443	arc_write_callback_t *cb = zio->io_private;
4444	if (cb->awcb_physdone != NULL)
4445		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
4446}
4447
4448static void
4449arc_write_done(zio_t *zio)
4450{
4451	arc_write_callback_t *callback = zio->io_private;
4452	arc_buf_t *buf = callback->awcb_buf;
4453	arc_buf_hdr_t *hdr = buf->b_hdr;
4454
4455	ASSERT(hdr->b_l1hdr.b_acb == NULL);
4456
4457	if (zio->io_error == 0) {
4458		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
4459			buf_discard_identity(hdr);
4460		} else {
4461			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
4462			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
4463		}
4464	} else {
4465		ASSERT(BUF_EMPTY(hdr));
4466	}
4467
4468	/*
4469	 * If the block to be written was all-zero or compressed enough to be
4470	 * embedded in the BP, no write was performed so there will be no
4471	 * dva/birth/checksum.  The buffer must therefore remain anonymous
4472	 * (and uncached).
4473	 */
4474	if (!BUF_EMPTY(hdr)) {
4475		arc_buf_hdr_t *exists;
4476		kmutex_t *hash_lock;
4477
4478		ASSERT(zio->io_error == 0);
4479
4480		arc_cksum_verify(buf);
4481
4482		exists = buf_hash_insert(hdr, &hash_lock);
4483		if (exists != NULL) {
4484			/*
4485			 * This can only happen if we overwrite for
4486			 * sync-to-convergence, because we remove
4487			 * buffers from the hash table when we arc_free().
4488			 */
4489			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
4490				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
4491					panic("bad overwrite, hdr=%p exists=%p",
4492					    (void *)hdr, (void *)exists);
4493				ASSERT(refcount_is_zero(
4494				    &exists->b_l1hdr.b_refcnt));
4495				arc_change_state(arc_anon, exists, hash_lock);
4496				mutex_exit(hash_lock);
4497				arc_hdr_destroy(exists);
4498				exists = buf_hash_insert(hdr, &hash_lock);
4499				ASSERT3P(exists, ==, NULL);
4500			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
4501				/* nopwrite */
4502				ASSERT(zio->io_prop.zp_nopwrite);
4503				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
4504					panic("bad nopwrite, hdr=%p exists=%p",
4505					    (void *)hdr, (void *)exists);
4506			} else {
4507				/* Dedup */
4508				ASSERT(hdr->b_l1hdr.b_datacnt == 1);
4509				ASSERT(hdr->b_l1hdr.b_state == arc_anon);
4510				ASSERT(BP_GET_DEDUP(zio->io_bp));
4511				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
4512			}
4513		}
4514		hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
4515		/* if it's not anon, we are doing a scrub */
4516		if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
4517			arc_access(hdr, hash_lock);
4518		mutex_exit(hash_lock);
4519	} else {
4520		hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
4521	}
4522
4523	ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4524	callback->awcb_done(zio, buf, callback->awcb_private);
4525
4526	kmem_free(callback, sizeof (arc_write_callback_t));
4527}
4528
4529zio_t *
4530arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
4531    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
4532    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
4533    arc_done_func_t *done, void *private, zio_priority_t priority,
4534    int zio_flags, const zbookmark_phys_t *zb)
4535{
4536	arc_buf_hdr_t *hdr = buf->b_hdr;
4537	arc_write_callback_t *callback;
4538	zio_t *zio;
4539
4540	ASSERT(ready != NULL);
4541	ASSERT(done != NULL);
4542	ASSERT(!HDR_IO_ERROR(hdr));
4543	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4544	ASSERT(hdr->b_l1hdr.b_acb == NULL);
4545	ASSERT(hdr->b_l1hdr.b_datacnt > 0);
4546	if (l2arc)
4547		hdr->b_flags |= ARC_FLAG_L2CACHE;
4548	if (l2arc_compress)
4549		hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4550	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
4551	callback->awcb_ready = ready;
4552	callback->awcb_physdone = physdone;
4553	callback->awcb_done = done;
4554	callback->awcb_private = private;
4555	callback->awcb_buf = buf;
4556
4557	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
4558	    arc_write_ready, arc_write_physdone, arc_write_done, callback,
4559	    priority, zio_flags, zb);
4560
4561	return (zio);
4562}
4563
4564static int
4565arc_memory_throttle(uint64_t reserve, uint64_t txg)
4566{
4567#ifdef _KERNEL
4568	uint64_t available_memory = ptob(freemem);
4569	static uint64_t page_load = 0;
4570	static uint64_t last_txg = 0;
4571
4572#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
4573	available_memory =
4574	    MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE)));
4575#endif
4576
4577	if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
4578		return (0);
4579
4580	if (txg > last_txg) {
4581		last_txg = txg;
4582		page_load = 0;
4583	}
4584	/*
4585	 * If we are in pageout, we know that memory is already tight and
4586	 * the ARC is already going to be evicting, so we just want to
4587	 * continue to let page writes occur as quickly as possible.
4588	 */
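	/*
	 * Illustrative note (assumed values, not from the original source):
	 * if ptob(minfree) is 64MB and 400MB of memory is available, pageout
	 * may accumulate deflated reserves (reserve / 8 per call) up to
	 * MAX(64MB, 400MB) / 4 = 100MB within a txg before ERESTART is
	 * returned to throttle it.
	 */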
4589	if (curproc == pageproc) {
4590		if (page_load > MAX(ptob(minfree), available_memory) / 4)
4591			return (SET_ERROR(ERESTART));
4592		/* Note: reserve is inflated, so we deflate */
4593		page_load += reserve / 8;
4594		return (0);
4595	} else if (page_load > 0 && arc_reclaim_needed()) {
4596		/* memory is low, delay before restarting */
4597		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
4598		return (SET_ERROR(EAGAIN));
4599	}
4600	page_load = 0;
4601#endif
4602	return (0);
4603}
4604
4605void
4606arc_tempreserve_clear(uint64_t reserve)
4607{
4608	atomic_add_64(&arc_tempreserve, -reserve);
4609	ASSERT((int64_t)arc_tempreserve >= 0);
4610}
4611
4612int
4613arc_tempreserve_space(uint64_t reserve, uint64_t txg)
4614{
4615	int error;
4616	uint64_t anon_size;
4617
4618	if (reserve > arc_c/4 && !arc_no_grow) {
4619		arc_c = MIN(arc_c_max, reserve * 4);
4620		DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
4621	}
4622	if (reserve > arc_c)
4623		return (SET_ERROR(ENOMEM));
4624
4625	/*
4626	 * Don't count loaned bufs as in flight dirty data to prevent long
4627	 * network delays from blocking transactions that are ready to be
4628	 * assigned to a txg.
4629	 */
4630	anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
4631
4632	/*
4633	 * Writes will almost always require additional memory allocations
4634	 * in order to compress/encrypt/etc. the data.  We therefore need to
4635	 * make sure that there is sufficient available memory for this.
4636	 */
4637	error = arc_memory_throttle(reserve, txg);
4638	if (error != 0)
4639		return (error);
4640
4641	/*
4642	 * Throttle writes when the amount of dirty data in the cache
4643	 * gets too large.  We try to keep the cache less than half full
4644	 * of dirty blocks so that our sync times don't grow too large.
4645	 * Note: if two requests come in concurrently, we might let them
4646	 * both succeed, when one of them should fail.  Not a huge deal.
4647	 */
4648
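	/*
	 * Illustrative note (assumed values, not from the original source):
	 * with arc_c = 1GB, reservations start failing with ERESTART once
	 * reserve + arc_tempreserve + anon_size would exceed 512MB while
	 * anonymous (dirty) data alone already exceeds 256MB.
	 */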
4649	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
4650	    anon_size > arc_c / 4) {
4651		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
4652		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
4653		    arc_tempreserve>>10,
4654		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
4655		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
4656		    reserve>>10, arc_c>>10);
4657		return (SET_ERROR(ERESTART));
4658	}
4659	atomic_add_64(&arc_tempreserve, reserve);
4660	return (0);
4661}
4662
4663static void
4664arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
4665    kstat_named_t *evict_data, kstat_named_t *evict_metadata)
4666{
4667	size->value.ui64 = state->arcs_size;
4668	evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
4669	evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
4670}
4671
4672static int
4673arc_kstat_update(kstat_t *ksp, int rw)
4674{
4675	arc_stats_t *as = ksp->ks_data;
4676
4677	if (rw == KSTAT_WRITE) {
4678		return (EACCES);
4679	} else {
4680		arc_kstat_update_state(arc_anon,
4681		    &as->arcstat_anon_size,
4682		    &as->arcstat_anon_evictable_data,
4683		    &as->arcstat_anon_evictable_metadata);
4684		arc_kstat_update_state(arc_mru,
4685		    &as->arcstat_mru_size,
4686		    &as->arcstat_mru_evictable_data,
4687		    &as->arcstat_mru_evictable_metadata);
4688		arc_kstat_update_state(arc_mru_ghost,
4689		    &as->arcstat_mru_ghost_size,
4690		    &as->arcstat_mru_ghost_evictable_data,
4691		    &as->arcstat_mru_ghost_evictable_metadata);
4692		arc_kstat_update_state(arc_mfu,
4693		    &as->arcstat_mfu_size,
4694		    &as->arcstat_mfu_evictable_data,
4695		    &as->arcstat_mfu_evictable_metadata);
4696		arc_kstat_update_state(arc_mfu_ghost,
4697		    &as->arcstat_mfu_ghost_size,
4698		    &as->arcstat_mfu_ghost_evictable_data,
4699		    &as->arcstat_mfu_ghost_evictable_metadata);
4700	}
4701
4702	return (0);
4703}
4704
4705#ifdef _KERNEL
4706static eventhandler_tag arc_event_lowmem = NULL;
4707
4708static void
4709arc_lowmem(void *arg __unused, int howto __unused)
4710{
4711
4712	mutex_enter(&arc_reclaim_thr_lock);
4713	/* XXX: Memory deficit should be passed as argument. */
4714	needfree = btoc(arc_c >> arc_shrink_shift);
4715	DTRACE_PROBE(arc__needfree);
4716	cv_signal(&arc_reclaim_thr_cv);
4717
4718	/*
4719	 * It is unsafe to block here in arbitrary threads, because we can come
4720	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
4721	 * with the ARC reclaim thread.
4722	 */
4723	if (curproc == pageproc)
4724		msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
4725	mutex_exit(&arc_reclaim_thr_lock);
4726}
4727#endif
4728
4729void
4730arc_init(void)
4731{
4732	int i, prefetch_tunable_set = 0;
4733
4734	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4735	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
4736
4737	/* Convert seconds to clock ticks */
4738	arc_min_prefetch_lifespan = 1 * hz;
4739
4740	/* Start out with 1/8 of all memory */
4741	arc_c = kmem_size() / 8;
4742
4743#ifdef illumos
4744#ifdef _KERNEL
4745	/*
4746	 * On architectures where the physical memory can be larger
4747	 * than the addressable space (Intel in 32-bit mode), we may
4748	 * need to limit the cache to 1/8 of VM size.
4749	 */
4750	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
4751#endif
4752#endif	/* illumos */
4753	/* set min cache to 1/32 of all memory, or 16MB, whichever is more */
4754	arc_c_min = MAX(arc_c / 4, 16 << 20);
4755	/* set max to 5/8 of all memory, or all but 1GB, whichever is more */
4756	if (arc_c * 8 >= 1 << 30)
4757		arc_c_max = (arc_c * 8) - (1 << 30);
4758	else
4759		arc_c_max = arc_c_min;
4760	arc_c_max = MAX(arc_c * 5, arc_c_max);
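	/*
	 * Illustrative note (assumed values, not from the original source):
	 * with kmem_size() = 16GB, arc_c starts at 2GB, arc_c_min becomes
	 * MAX(2GB / 4, 16MB) = 512MB and arc_c_max becomes
	 * MAX(2GB * 5, 16GB - 1GB) = 15GB, before any tunables are applied.
	 */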
4761
4762#ifdef _KERNEL
4763	/*
4764	 * Allow the tunables to override our calculations if they are
4765	 * reasonable (i.e. over 16MB)
4766	 */
4767	if (zfs_arc_max > 16 << 20 && zfs_arc_max < kmem_size())
4768		arc_c_max = zfs_arc_max;
4769	if (zfs_arc_min > 16 << 20 && zfs_arc_min <= arc_c_max)
4770		arc_c_min = zfs_arc_min;
4771#endif
4772
4773	arc_c = arc_c_max;
4774	arc_p = (arc_c >> 1);
4775
4776	/* limit meta-data to 1/4 of the arc capacity */
4777	arc_meta_limit = arc_c_max / 4;
4778
4779	/* Allow the tunable to override if it is reasonable */
4780	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
4781		arc_meta_limit = zfs_arc_meta_limit;
4782
4783	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
4784		arc_c_min = arc_meta_limit / 2;
4785
4786	if (zfs_arc_meta_min > 0) {
4787		arc_meta_min = zfs_arc_meta_min;
4788	} else {
4789		arc_meta_min = arc_c_min / 2;
4790	}
4791
4792	if (zfs_arc_grow_retry > 0)
4793		arc_grow_retry = zfs_arc_grow_retry;
4794
4795	if (zfs_arc_shrink_shift > 0)
4796		arc_shrink_shift = zfs_arc_shrink_shift;
4797
4798	/*
4799	 * Ensure that arc_no_grow_shift is less than arc_shrink_shift.
4800	 */
4801	if (arc_no_grow_shift >= arc_shrink_shift)
4802		arc_no_grow_shift = arc_shrink_shift - 1;
4803
4804	if (zfs_arc_p_min_shift > 0)
4805		arc_p_min_shift = zfs_arc_p_min_shift;
4806
4807	/* if kmem_flags are set, let's try to use less memory */
4808	if (kmem_debugging())
4809		arc_c = arc_c / 2;
4810	if (arc_c < arc_c_min)
4811		arc_c = arc_c_min;
4812
4813	zfs_arc_min = arc_c_min;
4814	zfs_arc_max = arc_c_max;
4815
4816	arc_anon = &ARC_anon;
4817	arc_mru = &ARC_mru;
4818	arc_mru_ghost = &ARC_mru_ghost;
4819	arc_mfu = &ARC_mfu;
4820	arc_mfu_ghost = &ARC_mfu_ghost;
4821	arc_l2c_only = &ARC_l2c_only;
4822	arc_size = 0;
4823
4824	mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4825	mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4826	mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4827	mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4828	mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4829	mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
4830
4831	list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
4832	    sizeof (arc_buf_hdr_t),
4833	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4834	list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
4835	    sizeof (arc_buf_hdr_t),
4836	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4837	list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
4838	    sizeof (arc_buf_hdr_t),
4839	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4840	list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
4841	    sizeof (arc_buf_hdr_t),
4842	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4843	list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
4844	    sizeof (arc_buf_hdr_t),
4845	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4846	list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
4847	    sizeof (arc_buf_hdr_t),
4848	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4849	list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
4850	    sizeof (arc_buf_hdr_t),
4851	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4852	list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
4853	    sizeof (arc_buf_hdr_t),
4854	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4855	list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
4856	    sizeof (arc_buf_hdr_t),
4857	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4858	list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
4859	    sizeof (arc_buf_hdr_t),
4860	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node));
4861
4862	buf_init();
4863
4864	arc_thread_exit = 0;
4865	arc_eviction_list = NULL;
4866	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
4867	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
4868
4869	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
4870	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4871
4872	if (arc_ksp != NULL) {
4873		arc_ksp->ks_data = &arc_stats;
4874		arc_ksp->ks_update = arc_kstat_update;
4875		kstat_install(arc_ksp);
4876	}
4877
4878	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
4879	    TS_RUN, minclsyspri);
4880
4881#ifdef _KERNEL
4882	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
4883	    EVENTHANDLER_PRI_FIRST);
4884#endif
4885
4886	arc_dead = FALSE;
4887	arc_warm = B_FALSE;
4888
4889	/*
4890	 * Calculate maximum amount of dirty data per pool.
4891	 *
4892	 * If it has been set by /etc/system, take that.
4893	 * Otherwise, use a percentage of physical memory defined by
4894	 * zfs_dirty_data_max_percent (default 10%) with a cap at
4895	 * zfs_dirty_data_max_max (default 4GB).
4896	 */
4897	if (zfs_dirty_data_max == 0) {
4898		zfs_dirty_data_max = ptob(physmem) *
4899		    zfs_dirty_data_max_percent / 100;
4900		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
4901		    zfs_dirty_data_max_max);
4902	}
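	/*
	 * Illustrative note (assumed values, not from the original source):
	 * with the default zfs_dirty_data_max_percent of 10%, a machine with
	 * 32GB of physical memory gets zfs_dirty_data_max of about 3.2GB,
	 * while a 64GB machine is clamped to the 4GB zfs_dirty_data_max_max
	 * default.
	 */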
4903
4904#ifdef _KERNEL
4905	if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
4906		prefetch_tunable_set = 1;
4907
4908#ifdef __i386__
4909	if (prefetch_tunable_set == 0) {
4910		printf("ZFS NOTICE: Prefetch is disabled by default on i386 "
4911		    "-- to enable,\n");
4912		printf("            add \"vfs.zfs.prefetch_disable=0\" "
4913		    "to /boot/loader.conf.\n");
4914		zfs_prefetch_disable = 1;
4915	}
4916#else
4917	if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) &&
4918	    prefetch_tunable_set == 0) {
4919		printf("ZFS NOTICE: Prefetch is disabled by default if less "
4920		    "than 4GB of RAM is present;\n"
4921		    "            to enable, add \"vfs.zfs.prefetch_disable=0\" "
4922		    "to /boot/loader.conf.\n");
4923		zfs_prefetch_disable = 1;
4924	}
4925#endif
4926	/* Warn about ZFS memory and address space requirements. */
4927	if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) {
4928		printf("ZFS WARNING: Recommended minimum RAM size is 512MB; "
4929		    "expect unstable behavior.\n");
4930	}
4931	if (kmem_size() < 512 * (1 << 20)) {
4932		printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; "
4933		    "expect unstable behavior.\n");
4934		printf("             Consider tuning vm.kmem_size and "
4935		    "vm.kmem_size_max\n");
4936		printf("             in /boot/loader.conf.\n");
4937	}
4938#endif
4939}
4940
4941void
4942arc_fini(void)
4943{
4944	mutex_enter(&arc_reclaim_thr_lock);
4945	arc_thread_exit = 1;
4946	cv_signal(&arc_reclaim_thr_cv);
4947	while (arc_thread_exit != 0)
4948		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
4949	mutex_exit(&arc_reclaim_thr_lock);
4950
4951	arc_flush(NULL);
4952
4953	arc_dead = TRUE;
4954
4955	if (arc_ksp != NULL) {
4956		kstat_delete(arc_ksp);
4957		arc_ksp = NULL;
4958	}
4959
4960	mutex_destroy(&arc_eviction_mtx);
4961	mutex_destroy(&arc_reclaim_thr_lock);
4962	cv_destroy(&arc_reclaim_thr_cv);
4963
4964	list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
4965	list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
4966	list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
4967	list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
4968	list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
4969	list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
4970	list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
4971	list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
4972
4973	mutex_destroy(&arc_anon->arcs_mtx);
4974	mutex_destroy(&arc_mru->arcs_mtx);
4975	mutex_destroy(&arc_mru_ghost->arcs_mtx);
4976	mutex_destroy(&arc_mfu->arcs_mtx);
4977	mutex_destroy(&arc_mfu_ghost->arcs_mtx);
4978	mutex_destroy(&arc_l2c_only->arcs_mtx);
4979
4980	buf_fini();
4981
4982	ASSERT0(arc_loaned_bytes);
4983
4984#ifdef _KERNEL
4985	if (arc_event_lowmem != NULL)
4986		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
4987#endif
4988}
4989
4990/*
4991 * Level 2 ARC
4992 *
4993 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
4994 * It uses dedicated storage devices to hold cached data, which are populated
4995 * using large infrequent writes.  The main role of this cache is to boost
4996 * the performance of random read workloads.  The intended L2ARC devices
4997 * include short-stroked disks, solid state disks, and other media with
4998 * substantially faster read latency than disk.
4999 *
5000 *                 +-----------------------+
5001 *                 |         ARC           |
5002 *                 +-----------------------+
5003 *                    |         ^     ^
5004 *                    |         |     |
5005 *      l2arc_feed_thread()    arc_read()
5006 *                    |         |     |
5007 *                    |  l2arc read   |
5008 *                    V         |     |
5009 *               +---------------+    |
5010 *               |     L2ARC     |    |
5011 *               +---------------+    |
5012 *                   |    ^           |
5013 *          l2arc_write() |           |
5014 *                   |    |           |
5015 *                   V    |           |
5016 *                 +-------+      +-------+
5017 *                 | vdev  |      | vdev  |
5018 *                 | cache |      | cache |
5019 *                 +-------+      +-------+
5020 *                 +=========+     .-----.
5021 *                 :  L2ARC  :    |-_____-|
5022 *                 : devices :    | Disks |
5023 *                 +=========+    `-_____-'
5024 *
5025 * Read requests are satisfied from the following sources, in order:
5026 *
5027 *	1) ARC
5028 *	2) vdev cache of L2ARC devices
5029 *	3) L2ARC devices
5030 *	4) vdev cache of disks
5031 *	5) disks
5032 *
5033 * Some L2ARC device types exhibit extremely slow write performance.
5034 * To accommodate this, there are some significant differences between
5035 * the L2ARC and traditional cache design:
5036 *
5037 * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
5038 * the ARC behave as usual, freeing buffers and placing headers on ghost
5039 * lists.  The ARC does not send buffers to the L2ARC during eviction as
5040 * this would add inflated write latencies for all ARC memory pressure.
5041 *
5042 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
5043 * It does this by periodically scanning buffers from the eviction-end of
5044 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
5045 * not already there. It scans until a headroom of buffers is satisfied;
5046 * this headroom is a cushion against ARC eviction. If a compressible buffer is
5047 * found during scanning and selected for writing to an L2ARC device, we
5048 * temporarily boost scanning headroom during the next scan cycle to make
5049 * sure we adapt to compression effects (which might significantly reduce
5050 * the data volume we write to L2ARC). The thread that does this is
5051 * l2arc_feed_thread(), illustrated below; example sizes are included to
5052 * provide a better sense of ratio than this diagram:
5053 *
5054 *	       head -->                        tail
5055 *	        +---------------------+----------+
5056 *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
5057 *	        +---------------------+----------+   |   o L2ARC eligible
5058 *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
5059 *	        +---------------------+----------+   |
5060 *	             15.9 Gbytes      ^ 32 Mbytes    |
5061 *	                           headroom          |
5062 *	                                      l2arc_feed_thread()
5063 *	                                             |
5064 *	                 l2arc write hand <--[oooo]--'
5065 *	                         |           8 Mbyte
5066 *	                         |          write max
5067 *	                         V
5068 *		  +==============================+
5069 *	L2ARC dev |####|#|###|###|    |####| ... |
5070 *	          +==============================+
5071 *	                     32 Gbytes
5072 *
5073 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
5074 * evicted, then the L2ARC has cached a buffer much sooner than it probably
5075 * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
5076 * safe to say that this is an uncommon case, since buffers at the end of
5077 * the ARC lists have moved there due to inactivity.
5078 *
5079 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
5080 * then the L2ARC simply misses copying some buffers.  This serves as a
5081 * pressure valve to prevent heavy read workloads from both stalling the ARC
5082 * with waits and clogging the L2ARC with writes.  This also helps prevent
5083 * the potential for the L2ARC to churn if it attempts to cache content too
5084 * quickly, such as during backups of the entire pool.
5085 *
5086 * 5. After system boot and before the ARC has filled main memory, there are
5087 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
5088 * lists can remain mostly static.  Instead of searching from the tail of these
5089 * lists as pictured, the l2arc_feed_thread() will search from the list heads
5090 * for eligible buffers, greatly increasing its chance of finding them.
5091 *
5092 * The L2ARC device write speed is also boosted during this time so that
5093 * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
5094 * there are no L2ARC reads, and no fear of degrading read performance
5095 * through increased writes.
5096 *
5097 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
5098 * the vdev queue can aggregate them into larger and fewer writes.  Each
5099 * device is written to in a rotor fashion, sweeping writes through
5100 * available space then repeating.
5101 *
5102 * 7. The L2ARC does not store dirty content.  It never needs to flush
5103 * write buffers back to disk based storage.
5104 *
5105 * 8. If an ARC buffer is written (and dirtied) which also exists in the
5106 * L2ARC, the now stale L2ARC buffer is immediately dropped.
5107 *
5108 * The performance of the L2ARC can be tweaked by a number of tunables, which
5109 * may be necessary for different workloads:
5110 *
5111 *	l2arc_write_max		max write bytes per interval
5112 *	l2arc_write_boost	extra write bytes during device warmup
5113 *	l2arc_noprefetch	skip caching prefetched buffers
5114 *	l2arc_headroom		number of max device writes to precache
5115 *	l2arc_headroom_boost	when we find compressed buffers during ARC
5116 *				scanning, we multiply headroom by this
5117 *				percentage factor for the next scan cycle,
5118 *				since more compressed buffers are likely to
5119 *				be present
5120 *	l2arc_feed_secs		seconds between L2ARC writing
5121 *
5122 * Tunables may be removed or added as future performance improvements are
5123 * integrated, and also may become zpool properties.
5124 *
5125 * There are three key functions that control how the L2ARC warms up:
5126 *
5127 *	l2arc_write_eligible()	check if a buffer is eligible to cache
5128 *	l2arc_write_size()	calculate how much to write
5129 *	l2arc_write_interval()	calculate sleep delay between writes
5130 *
5131 * These three functions determine what to write, how much, and how quickly
5132 * to send writes.
5133 */
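/*
 * Roughly, each pass of l2arc_feed_thread() (defined below) combines these
 * pieces as follows (simplified sketch of the actual loop):
 *
 *	size = l2arc_write_size();
 *	l2arc_evict(dev, size, B_FALSE);
 *	wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
 *	next = l2arc_write_interval(begin, size, wrote);
 *
 * with l2arc_write_eligible() consulted per buffer inside
 * l2arc_write_buffers().
 */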
5134
5135static boolean_t
5136l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
5137{
5138	/*
5139	 * A buffer is *not* eligible for the L2ARC if it:
5140	 * 1. belongs to a different spa.
5141	 * 2. is already cached on the L2ARC.
5142	 * 3. has an I/O in progress (it may be an incomplete read).
5143	 * 4. is flagged not eligible (zfs property).
5144	 */
5145	if (hdr->b_spa != spa_guid) {
5146		ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
5147		return (B_FALSE);
5148	}
5149	if (HDR_HAS_L2HDR(hdr)) {
5150		ARCSTAT_BUMP(arcstat_l2_write_in_l2);
5151		return (B_FALSE);
5152	}
5153	if (HDR_IO_IN_PROGRESS(hdr)) {
5154		ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
5155		return (B_FALSE);
5156	}
5157	if (!HDR_L2CACHE(hdr)) {
5158		ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
5159		return (B_FALSE);
5160	}
5161
5162	return (B_TRUE);
5163}
5164
5165static uint64_t
5166l2arc_write_size(void)
5167{
5168	uint64_t size;
5169
5170	/*
5171	 * Make sure our globals have meaningful values in case the user
5172	 * altered them.
5173	 */
5174	size = l2arc_write_max;
5175	if (size == 0) {
5176		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
5177		    "be greater than zero, resetting it to the default (%d)",
5178		    L2ARC_WRITE_SIZE);
5179		size = l2arc_write_max = L2ARC_WRITE_SIZE;
5180	}
5181
5182	if (arc_warm == B_FALSE)
5183		size += l2arc_write_boost;
5184
5185	return (size);
5186
5187}
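/*
 * Example (illustrative, assuming the common 8MB defaults for both
 * l2arc_write_max and l2arc_write_boost): while arc_warm is B_FALSE this
 * returns 16MB per feed interval, dropping back to 8MB once the ARC is warm.
 */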
5188
5189static clock_t
5190l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
5191{
5192	clock_t interval, next, now;
5193
5194	/*
5195	 * If the ARC lists are busy, increase our write rate; if the
5196	 * lists are stale, idle back.  This is achieved by checking
5197	 * how much we previously wrote - if it was more than half of
5198	 * what we wanted, schedule the next write much sooner.
5199	 */
5200	if (l2arc_feed_again && wrote > (wanted / 2))
5201		interval = (hz * l2arc_feed_min_ms) / 1000;
5202	else
5203		interval = hz * l2arc_feed_secs;
5204
5205	now = ddi_get_lbolt();
5206	next = MAX(now, MIN(now + interval, began + interval));
5207
5208	return (next);
5209}
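/*
 * Example (illustrative, assuming hz = 1000, l2arc_feed_secs = 1 and
 * l2arc_feed_min_ms = 200): if the previous pass wrote more than half of
 * what it wanted, the next pass is scheduled 200 ticks (200ms) after
 * 'began'; otherwise it is scheduled a full second after 'began'.  The
 * MAX(now, ...) clamp ensures the wakeup is never scheduled in the past.
 */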
5210
5211/*
5212 * Cycle through L2ARC devices.  This is how L2ARC load balances.
5213 * If a device is returned, this also returns holding the spa config lock.
5214 */
5215static l2arc_dev_t *
5216l2arc_dev_get_next(void)
5217{
5218	l2arc_dev_t *first, *next = NULL;
5219
5220	/*
5221	 * Lock out the removal of spas (spa_namespace_lock), then removal
5222	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
5223	 * both locks will be dropped and a spa config lock held instead.
5224	 */
5225	mutex_enter(&spa_namespace_lock);
5226	mutex_enter(&l2arc_dev_mtx);
5227
5228	/* if there are no vdevs, there is nothing to do */
5229	if (l2arc_ndev == 0)
5230		goto out;
5231
5232	first = NULL;
5233	next = l2arc_dev_last;
5234	do {
5235		/* loop around the list looking for a non-faulted vdev */
5236		if (next == NULL) {
5237			next = list_head(l2arc_dev_list);
5238		} else {
5239			next = list_next(l2arc_dev_list, next);
5240			if (next == NULL)
5241				next = list_head(l2arc_dev_list);
5242		}
5243
5244		/* if we have come back to the start, bail out */
5245		if (first == NULL)
5246			first = next;
5247		else if (next == first)
5248			break;
5249
5250	} while (vdev_is_dead(next->l2ad_vdev));
5251
5252	/* if we were unable to find any usable vdevs, return NULL */
5253	if (vdev_is_dead(next->l2ad_vdev))
5254		next = NULL;
5255
5256	l2arc_dev_last = next;
5257
5258out:
5259	mutex_exit(&l2arc_dev_mtx);
5260
5261	/*
5262	 * Grab the config lock to prevent the 'next' device from being
5263	 * removed while we are writing to it.
5264	 */
5265	if (next != NULL)
5266		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
5267	mutex_exit(&spa_namespace_lock);
5268
5269	return (next);
5270}
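/*
 * Usage note: l2arc_feed_thread() pairs a device returned here with a
 * matching spa_config_exit(spa, SCL_L2ARC, dev) once its write pass (or
 * early bail-out) for that device is finished.
 */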
5271
5272/*
5273 * Free buffers that were tagged for destruction.
5274 */
5275static void
5276l2arc_do_free_on_write()
5277{
5278	list_t *buflist;
5279	l2arc_data_free_t *df, *df_prev;
5280
5281	mutex_enter(&l2arc_free_on_write_mtx);
5282	buflist = l2arc_free_on_write;
5283
5284	for (df = list_tail(buflist); df; df = df_prev) {
5285		df_prev = list_prev(buflist, df);
5286		ASSERT(df->l2df_data != NULL);
5287		ASSERT(df->l2df_func != NULL);
5288		df->l2df_func(df->l2df_data, df->l2df_size);
5289		list_remove(buflist, df);
5290		kmem_free(df, sizeof (l2arc_data_free_t));
5291	}
5292
5293	mutex_exit(&l2arc_free_on_write_mtx);
5294}
5295
5296/*
5297 * A write to a cache device has completed.  Update all headers to allow
5298 * reads from these buffers to begin.
5299 */
5300static void
5301l2arc_write_done(zio_t *zio)
5302{
5303	l2arc_write_callback_t *cb;
5304	l2arc_dev_t *dev;
5305	list_t *buflist;
5306	arc_buf_hdr_t *head, *hdr, *hdr_prev;
5307	kmutex_t *hash_lock;
5308	int64_t bytes_dropped = 0;
5309
5310	cb = zio->io_private;
5311	ASSERT(cb != NULL);
5312	dev = cb->l2wcb_dev;
5313	ASSERT(dev != NULL);
5314	head = cb->l2wcb_head;
5315	ASSERT(head != NULL);
5316	buflist = &dev->l2ad_buflist;
5317	ASSERT(buflist != NULL);
5318	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
5319	    l2arc_write_callback_t *, cb);
5320
5321	if (zio->io_error != 0)
5322		ARCSTAT_BUMP(arcstat_l2_writes_error);
5323
5324	mutex_enter(&dev->l2ad_mtx);
5325
5326	/*
5327	 * All writes completed, or an error was hit.
5328	 */
5329	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
5330		hdr_prev = list_prev(buflist, hdr);
5331
5332		hash_lock = HDR_LOCK(hdr);
5333		if (!mutex_tryenter(hash_lock)) {
5334			/*
5335			 * This buffer misses out.  It may be in a stage
5336			 * of eviction.  Its ARC_FLAG_L2_WRITING flag will be
5337			 * left set, denying reads to this buffer.
5338			 */
5339			ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
5340			continue;
5341		}
5342
5343		/*
5344		 * It's possible that this buffer got evicted from the L1 cache
5345		 * before we grabbed the vdev + hash locks, in which case
5346		 * arc_hdr_realloc freed b_tmp_cdata for us if it was allocated.
5347		 * Only free the buffer if we still have an L1 hdr.
5348		 */
5349		if (HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_tmp_cdata != NULL &&
5350		    HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF)
5351			l2arc_release_cdata_buf(hdr);
5352
5353		if (zio->io_error != 0) {
5354			/*
5355			 * Error - drop L2ARC entry.
5356			 */
5357			trim_map_free(hdr->b_l2hdr.b_dev->l2ad_vdev,
5358			    hdr->b_l2hdr.b_daddr, hdr->b_l2hdr.b_asize, 0);
5359			hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
5360
5361			ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
5362			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
5363
5364			bytes_dropped += hdr->b_l2hdr.b_asize;
5365			(void) refcount_remove_many(&dev->l2ad_alloc,
5366			    hdr->b_l2hdr.b_asize, hdr);
5367		}
5368
5369		/*
5370		 * Allow ARC to begin reads to this L2ARC entry.
5371		 */
5372		hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
5373
5374		mutex_exit(hash_lock);
5375	}
5376
5377	atomic_inc_64(&l2arc_writes_done);
5378	list_remove(buflist, head);
5379	ASSERT(!HDR_HAS_L1HDR(head));
5380	kmem_cache_free(hdr_l2only_cache, head);
5381	mutex_exit(&dev->l2ad_mtx);
5382
5383	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
5384
5385	l2arc_do_free_on_write();
5386
5387	kmem_free(cb, sizeof (l2arc_write_callback_t));
5388}
5389
5390/*
5391 * A read to a cache device completed.  Validate buffer contents before
5392 * handing over to the regular ARC routines.
5393 */
5394static void
5395l2arc_read_done(zio_t *zio)
5396{
5397	l2arc_read_callback_t *cb;
5398	arc_buf_hdr_t *hdr;
5399	arc_buf_t *buf;
5400	kmutex_t *hash_lock;
5401	int equal;
5402
5403	ASSERT(zio->io_vd != NULL);
5404	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
5405
5406	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
5407
5408	cb = zio->io_private;
5409	ASSERT(cb != NULL);
5410	buf = cb->l2rcb_buf;
5411	ASSERT(buf != NULL);
5412
5413	hash_lock = HDR_LOCK(buf->b_hdr);
5414	mutex_enter(hash_lock);
5415	hdr = buf->b_hdr;
5416	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
5417
5418	/*
5419	 * If the buffer was compressed, decompress it first.
5420	 */
5421	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
5422		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
5423	ASSERT(zio->io_data != NULL);
5424
5425	/*
5426	 * Check this survived the L2ARC journey.
5427	 */
5428	equal = arc_cksum_equal(buf);
5429	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
5430		mutex_exit(hash_lock);
5431		zio->io_private = buf;
5432		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
5433		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
5434		arc_read_done(zio);
5435	} else {
5436		mutex_exit(hash_lock);
5437		/*
5438		 * Buffer didn't survive caching.  Increment stats and
5439		 * reissue to the original storage device.
5440		 */
5441		if (zio->io_error != 0) {
5442			ARCSTAT_BUMP(arcstat_l2_io_error);
5443		} else {
5444			zio->io_error = SET_ERROR(EIO);
5445		}
5446		if (!equal)
5447			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
5448
5449		/*
5450		 * If there's no waiter, issue an async i/o to the primary
5451		 * storage now.  If there *is* a waiter, the caller must
5452		 * issue the i/o in a context where it's OK to block.
5453		 */
5454		if (zio->io_waiter == NULL) {
5455			zio_t *pio = zio_unique_parent(zio);
5456
5457			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
5458
5459			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
5460			    buf->b_data, zio->io_size, arc_read_done, buf,
5461			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
5462		}
5463	}
5464
5465	kmem_free(cb, sizeof (l2arc_read_callback_t));
5466}
5467
5468/*
5469 * This is the list priority from which the L2ARC will search for pages to
5470 * cache.  This is used within loops (0..3) to cycle through lists in the
5471 * desired order.  This order can have a significant effect on cache
5472 * performance.
5473 *
5474 * Currently the metadata lists are hit first, MFU then MRU, followed by
5475 * the data lists.  This function returns a locked list, and also returns
5476 * the lock pointer.
5477 */
5478static list_t *
5479l2arc_list_locked(int list_num, kmutex_t **lock)
5480{
5481	list_t *list = NULL;
5482
5483	ASSERT(list_num >= 0 && list_num <= 3);
5484
5485	switch (list_num) {
5486	case 0:
5487		list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
5488		*lock = &arc_mfu->arcs_mtx;
5489		break;
5490	case 1:
5491		list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
5492		*lock = &arc_mru->arcs_mtx;
5493		break;
5494	case 2:
5495		list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
5496		*lock = &arc_mfu->arcs_mtx;
5497		break;
5498	case 3:
5499		list = &arc_mru->arcs_list[ARC_BUFC_DATA];
5500		*lock = &arc_mru->arcs_mtx;
5501		break;
5502	}
5503
5504	ASSERT(!(MUTEX_HELD(*lock)));
5505	mutex_enter(*lock);
5506	return (list);
5507}
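/*
 * Caller pattern (sketch of what l2arc_write_buffers() does below):
 *
 *	list = l2arc_list_locked(try, &list_lock);
 *	... walk the list, taking per-header hash locks as needed ...
 *	mutex_exit(list_lock);
 */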
5508
5509/*
5510 * Evict buffers from the device write hand to the distance specified in
5511 * bytes.  This distance may span populated buffers, or it may span nothing.
5512 * This is clearing a region on the L2ARC device ready for writing.
5513 * If the 'all' boolean is set, every buffer is evicted.
5514 */
5515static void
5516l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
5517{
5518	list_t *buflist;
5519	arc_buf_hdr_t *hdr, *hdr_prev;
5520	kmutex_t *hash_lock;
5521	uint64_t taddr;
5522
5523	buflist = &dev->l2ad_buflist;
5524
5525	if (!all && dev->l2ad_first) {
5526		/*
5527		 * This is the first sweep through the device.  There is
5528		 * nothing to evict.
5529		 */
5530		return;
5531	}
5532
5533	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
5534		/*
5535		 * When nearing the end of the device, evict to the end
5536		 * before the device write hand jumps to the start.
5537		 */
5538		taddr = dev->l2ad_end;
5539	} else {
5540		taddr = dev->l2ad_hand + distance;
5541	}
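	/*
	 * Example (illustrative): with the write hand at 30GB on a 32GB
	 * device and a 1GB distance, the hand is within 2 * distance of
	 * l2ad_end, so taddr becomes l2ad_end and everything from the hand
	 * to the end of the device is cleared before the hand later wraps
	 * back to l2ad_start.
	 */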
5542	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
5543	    uint64_t, taddr, boolean_t, all);
5544
5545top:
5546	mutex_enter(&dev->l2ad_mtx);
5547	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
5548		hdr_prev = list_prev(buflist, hdr);
5549
5550		hash_lock = HDR_LOCK(hdr);
5551		if (!mutex_tryenter(hash_lock)) {
5552			/*
5553			 * Missed the hash lock.  Retry.
5554			 */
5555			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
5556			mutex_exit(&dev->l2ad_mtx);
5557			mutex_enter(hash_lock);
5558			mutex_exit(hash_lock);
5559			goto top;
5560		}
5561
5562		if (HDR_L2_WRITE_HEAD(hdr)) {
5563			/*
5564			 * We hit a write head node.  Leave it for
5565			 * l2arc_write_done().
5566			 */
5567			list_remove(buflist, hdr);
5568			mutex_exit(hash_lock);
5569			continue;
5570		}
5571
5572		if (!all && HDR_HAS_L2HDR(hdr) &&
5573		    (hdr->b_l2hdr.b_daddr > taddr ||
5574		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
5575			/*
5576			 * We've evicted to the target address,
5577			 * or the end of the device.
5578			 */
5579			mutex_exit(hash_lock);
5580			break;
5581		}
5582
5583		ASSERT(HDR_HAS_L2HDR(hdr));
5584		if (!HDR_HAS_L1HDR(hdr)) {
5585			ASSERT(!HDR_L2_READING(hdr));
5586			/*
5587			 * This doesn't exist in the ARC.  Destroy.
5588			 * arc_hdr_destroy() will call list_remove()
5589			 * and decrement arcstat_l2_size.
5590			 */
5591			arc_change_state(arc_anon, hdr, hash_lock);
5592			arc_hdr_destroy(hdr);
5593		} else {
5594			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
5595			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
5596			/*
5597			 * Invalidate issued or about to be issued
5598			 * reads, since we may be about to write
5599			 * over this location.
5600			 */
5601			if (HDR_L2_READING(hdr)) {
5602				ARCSTAT_BUMP(arcstat_l2_evict_reading);
5603				hdr->b_flags |= ARC_FLAG_L2_EVICTED;
5604			}
5605
5606			arc_hdr_l2hdr_destroy(hdr);
5607		}
5608		mutex_exit(hash_lock);
5609	}
5610	mutex_exit(&dev->l2ad_mtx);
5611}
5612
5613/*
5614 * Find and write ARC buffers to the L2ARC device.
5615 *
5616 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
5617 * for reading until they have completed writing.
5618 * The headroom_boost is an in-out parameter used to maintain headroom boost
5619 * state between calls to this function.
5620 *
5621 * Returns the number of bytes actually written (which may be smaller than
5622 * the delta by which the device hand has changed due to alignment).
5623 */
5624static uint64_t
5625l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
5626    boolean_t *headroom_boost)
5627{
5628	arc_buf_hdr_t *hdr, *hdr_prev, *head;
5629	list_t *list;
5630	uint64_t write_asize, write_psize, write_sz, headroom,
5631	    buf_compress_minsz;
5632	void *buf_data;
5633	kmutex_t *list_lock;
5634	boolean_t full;
5635	l2arc_write_callback_t *cb;
5636	zio_t *pio, *wzio;
5637	uint64_t guid = spa_load_guid(spa);
5638	const boolean_t do_headroom_boost = *headroom_boost;
5639	int try;
5640
5641	ASSERT(dev->l2ad_vdev != NULL);
5642
5643	/* Lower the flag now, we might want to raise it again later. */
5644	*headroom_boost = B_FALSE;
5645
5646	pio = NULL;
5647	write_sz = write_asize = write_psize = 0;
5648	full = B_FALSE;
5649	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
5650	head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
5651	head->b_flags |= ARC_FLAG_HAS_L2HDR;
5652
5653	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
5654	/*
5655	 * We will want to try to compress buffers that are at least 2x the
5656	 * device sector size.
5657	 */
5658	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
5659
5660	/*
5661	 * Copy buffers for L2ARC writing.
5662	 */
5663	mutex_enter(&dev->l2ad_mtx);
5664	for (try = 0; try <= 3; try++) {
5665		uint64_t passed_sz = 0;
5666
5667		list = l2arc_list_locked(try, &list_lock);
5668		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
5669
5670		/*
5671		 * L2ARC fast warmup.
5672		 *
5673		 * Until the ARC is warm and starts to evict, read from the
5674		 * head of the ARC lists rather than the tail.
5675		 */
5676		if (arc_warm == B_FALSE)
5677			hdr = list_head(list);
5678		else
5679			hdr = list_tail(list);
5680		if (hdr == NULL)
5681			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
5682
5683		headroom = target_sz * l2arc_headroom;
5684		if (do_headroom_boost)
5685			headroom = (headroom * l2arc_headroom_boost) / 100;
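		/*
		 * Example (illustrative, assuming the common defaults
		 * l2arc_headroom = 2 and l2arc_headroom_boost = 200): an
		 * 8MB target gives a 16MB scan headroom, doubled to 32MB
		 * on the cycle after compressible buffers were selected.
		 */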
5686
5687		for (; hdr; hdr = hdr_prev) {
5688			kmutex_t *hash_lock;
5689			uint64_t buf_sz;
5690
5691			if (arc_warm == B_FALSE)
5692				hdr_prev = list_next(list, hdr);
5693			else
5694				hdr_prev = list_prev(list, hdr);
5695			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, hdr->b_size);
5696
5697			hash_lock = HDR_LOCK(hdr);
5698			if (!mutex_tryenter(hash_lock)) {
5699				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
5700				/*
5701				 * Skip this buffer rather than waiting.
5702				 */
5703				continue;
5704			}
5705
5706			passed_sz += hdr->b_size;
5707			if (passed_sz > headroom) {
5708				/*
5709				 * Searched too far.
5710				 */
5711				mutex_exit(hash_lock);
5712				ARCSTAT_BUMP(arcstat_l2_write_passed_headroom);
5713				break;
5714			}
5715
5716			if (!l2arc_write_eligible(guid, hdr)) {
5717				mutex_exit(hash_lock);
5718				continue;
5719			}
5720
5721			if ((write_sz + hdr->b_size) > target_sz) {
5722				full = B_TRUE;
5723				mutex_exit(hash_lock);
5724				ARCSTAT_BUMP(arcstat_l2_write_full);
5725				break;
5726			}
5727
5728			if (pio == NULL) {
5729				/*
5730				 * Insert a dummy header on the buflist so
5731				 * l2arc_write_done() can find where the
5732				 * write buffers begin without searching.
5733				 */
5734				list_insert_head(&dev->l2ad_buflist, head);
5735
5736				cb = kmem_alloc(
5737				    sizeof (l2arc_write_callback_t), KM_SLEEP);
5738				cb->l2wcb_dev = dev;
5739				cb->l2wcb_head = head;
5740				pio = zio_root(spa, l2arc_write_done, cb,
5741				    ZIO_FLAG_CANFAIL);
5742				ARCSTAT_BUMP(arcstat_l2_write_pios);
5743			}
5744
5745			/*
5746			 * Create and add a new L2ARC header.
5747			 */
5748			hdr->b_l2hdr.b_dev = dev;
5749			hdr->b_flags |= ARC_FLAG_L2_WRITING;
5750			/*
5751			 * Temporarily stash the data buffer in b_tmp_cdata.
5752			 * The subsequent write step will pick it up from
5753			 * there. This is because we can't access b_l1hdr.b_buf
5754			 * without holding the hash_lock, which in turn we
5755			 * can't acquire without holding the ARC list locks
5756			 * (which we want to avoid during compression/writing).
5757			 */
5758			HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
5759			hdr->b_l2hdr.b_asize = hdr->b_size;
5760			hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
5761
5762			/*
5763			 * Explicitly set the b_daddr field to a known
5764			 * value which means "invalid address". This
5765			 * enables us to differentiate which stage of
5766			 * l2arc_write_buffers() the particular header
5767			 * is in (e.g. this loop, or the one below).
5768			 * ARC_FLAG_L2_WRITING is not enough to make
5769			 * this distinction, and we need to know in
5770			 * order to do proper l2arc vdev accounting in
5771			 * arc_release() and arc_hdr_destroy().
5772			 *
5773			 * Note, we can't use a new flag to distinguish
5774			 * the two stages because we don't hold the
5775			 * header's hash_lock below, in the second stage
5776			 * of this function. Thus, we can't simply
5777			 * change the b_flags field to denote that the
5778			 * IO has been sent. We can change the b_daddr
5779			 * field of the L2 portion, though, since we'll
5780			 * be holding the l2ad_mtx; which is why we're
5781			 * using it to denote the header's state change.
5782			 */
5783			hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
5784
5785			buf_sz = hdr->b_size;
5786			hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
5787
5788			list_insert_head(&dev->l2ad_buflist, hdr);
5789
5790			/*
5791			 * Compute and store the buffer cksum before
5792			 * writing.  On debug the cksum is verified first.
5793			 */
5794			arc_cksum_verify(hdr->b_l1hdr.b_buf);
5795			arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
5796
5797			mutex_exit(hash_lock);
5798
5799			write_sz += buf_sz;
5800		}
5801
5802		mutex_exit(list_lock);
5803
5804		if (full == B_TRUE)
5805			break;
5806	}
5807
5808	/* No buffers selected for writing? */
5809	if (pio == NULL) {
5810		ASSERT0(write_sz);
5811		mutex_exit(&dev->l2ad_mtx);
5812		ASSERT(!HDR_HAS_L1HDR(head));
5813		kmem_cache_free(hdr_l2only_cache, head);
5814		return (0);
5815	}
5816
5817	/*
5818	 * Now start writing the buffers. We're starting at the write head
5819	 * and work backwards, retracing the course of the buffer selector
5820	 * loop above.
5821	 */
5822	for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
5823	    hdr = list_prev(&dev->l2ad_buflist, hdr)) {
5824		uint64_t buf_sz;
5825
5826		/*
5827		 * We shouldn't need to lock the buffer here, since we flagged
5828		 * it as ARC_FLAG_L2_WRITING in the previous step, but we must
5829		 * take care to only access its L2 cache parameters. In
5830		 * particular, hdr->b_l1hdr.b_buf may be invalid by now due to
5831		 * ARC eviction.
5832		 */
5833		hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
5834
5835		if ((HDR_L2COMPRESS(hdr)) &&
5836		    hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
5837			if (l2arc_compress_buf(hdr)) {
5838				/*
5839				 * If compression succeeded, enable headroom
5840				 * boost on the next scan cycle.
5841				 */
5842				*headroom_boost = B_TRUE;
5843			}
5844		}
5845
5846		/*
5847		 * Pick up the buffer data we had previously stashed away
5848		 * (and now potentially also compressed).
5849		 */
5850		buf_data = hdr->b_l1hdr.b_tmp_cdata;
5851		buf_sz = hdr->b_l2hdr.b_asize;
5852
5853		/*
5854		 * If the data has not been compressed, then clear b_tmp_cdata
5855		 * to make sure that it points only to a temporary compression
5856		 * buffer.
5857		 */
5858		if (!L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)))
5859			hdr->b_l1hdr.b_tmp_cdata = NULL;
5860
5861		/*
5862		 * We need to do this regardless of whether buf_sz is
5863		 * zero; otherwise, when this l2hdr is evicted we'll
5864		 * remove a reference that was never added.
5865		 */
5866		(void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);
5867
5868		/* Compression may have squashed the buffer to zero length. */
5869		if (buf_sz != 0) {
5870			uint64_t buf_p_sz;
5871
5872			wzio = zio_write_phys(pio, dev->l2ad_vdev,
5873			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
5874			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
5875			    ZIO_FLAG_CANFAIL, B_FALSE);
5876
5877			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
5878			    zio_t *, wzio);
5879			(void) zio_nowait(wzio);
5880
5881			write_asize += buf_sz;
5882
5883			/*
5884			 * Keep the clock hand suitably device-aligned.
5885			 */
5886			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
5887			write_psize += buf_p_sz;
5888			dev->l2ad_hand += buf_p_sz;
5889		}
5890	}
5891
5892	mutex_exit(&dev->l2ad_mtx);
5893
5894	ASSERT3U(write_asize, <=, target_sz);
5895	ARCSTAT_BUMP(arcstat_l2_writes_sent);
5896	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
5897	ARCSTAT_INCR(arcstat_l2_size, write_sz);
5898	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
5899	vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
5900
5901	/*
5902	 * Bump device hand to the device start if it is approaching the end.
5903	 * l2arc_evict() will already have evicted ahead for this case.
5904	 */
5905	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
5906		dev->l2ad_hand = dev->l2ad_start;
5907		dev->l2ad_first = B_FALSE;
5908	}
5909
5910	dev->l2ad_writing = B_TRUE;
5911	(void) zio_wait(pio);
5912	dev->l2ad_writing = B_FALSE;
5913
5914	return (write_asize);
5915}
5916
5917/*
5918 * Compresses an L2ARC buffer.
5919 * The data to be compressed must be prefilled in b_l1hdr.b_tmp_cdata and its
5920 * size in l2hdr->b_asize. This routine tries to compress the data and
5921 * depending on the compression result there are three possible outcomes:
5922 * *) The buffer was incompressible. The original l2hdr contents were left
5923 *    untouched and are ready for writing to an L2 device.
5924 * *) The buffer was all-zeros, so there is no need to write it to an L2
5925 *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
5926 *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
5927 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
5928 *    data buffer which holds the compressed data to be written, and b_asize
5929 *    tells us how much data there is. b_compress is set to the appropriate
5930 *    compression algorithm. Once writing is done, invoke
5931 *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
5932 *
5933 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
5934 * buffer was incompressible).
5935 */
5936static boolean_t
5937l2arc_compress_buf(arc_buf_hdr_t *hdr)
5938{
5939	void *cdata;
5940	size_t csize, len, rounded;
5941	ASSERT(HDR_HAS_L2HDR(hdr));
5942	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
5943
5944	ASSERT(HDR_HAS_L1HDR(hdr));
5945	ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF);
5946	ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
5947
5948	len = l2hdr->b_asize;
5949	cdata = zio_data_buf_alloc(len);
5950	ASSERT3P(cdata, !=, NULL);
5951	csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
5952	    cdata, l2hdr->b_asize);
5953
5954	if (csize == 0) {
5955		/* zero block, indicate that there's nothing to write */
5956		zio_data_buf_free(cdata, len);
5957		HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY);
5958		l2hdr->b_asize = 0;
5959		hdr->b_l1hdr.b_tmp_cdata = NULL;
5960		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
5961		return (B_TRUE);
5962	}
5963
5964	rounded = P2ROUNDUP(csize,
5965	    (size_t)1 << l2hdr->b_dev->l2ad_vdev->vdev_ashift);
5966	if (rounded < len) {
5967		/*
5968		 * Compression succeeded, we'll keep the cdata around for
5969		 * writing and release it afterwards.
5970		 */
5971		if (rounded > csize) {
5972			bzero((char *)cdata + csize, rounded - csize);
5973			csize = rounded;
5974		}
5975		HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4);
5976		l2hdr->b_asize = csize;
5977		hdr->b_l1hdr.b_tmp_cdata = cdata;
5978		ARCSTAT_BUMP(arcstat_l2_compress_successes);
5979		return (B_TRUE);
5980	} else {
5981		/*
5982		 * Compression failed, release the compressed buffer.
5983		 * l2hdr will be left unmodified.
5984		 */
5985		zio_data_buf_free(cdata, len);
5986		ARCSTAT_BUMP(arcstat_l2_compress_failures);
5987		return (B_FALSE);
5988	}
5989}
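/*
 * Rounding example (illustrative): on a device with 4KB sectors
 * (vdev_ashift = 12), a 128KB buffer that compresses to 5000 bytes is
 * rounded up to 8192 bytes by P2ROUNDUP, the pad bytes are zeroed, and
 * since 8192 is still well under the original 128KB the compressed copy
 * is kept with b_asize = 8192.
 */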
5990
5991/*
5992 * Decompresses a zio read back from an l2arc device. On success, the
5993 * underlying zio's io_data buffer is overwritten by the uncompressed
5994 * version. On decompression error (corrupt compressed stream), the
5995 * zio->io_error value is set to signal an I/O error.
5996 *
5997 * Please note that the compressed data stream is not checksummed, so
5998 * if the underlying device is experiencing data corruption, we may feed
5999 * corrupt data to the decompressor, so the decompressor needs to be
6000 * able to handle this situation (LZ4 does).
6001 */
6002static void
6003l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
6004{
6005	ASSERT(L2ARC_IS_VALID_COMPRESS(c));
6006
6007	if (zio->io_error != 0) {
6008		/*
6009		 * An i/o error has occurred; just restore the original i/o
6010		 * size in preparation for a main pool read.
6011		 */
6012		zio->io_orig_size = zio->io_size = hdr->b_size;
6013		return;
6014	}
6015
6016	if (c == ZIO_COMPRESS_EMPTY) {
6017		/*
6018		 * An empty buffer results in a null zio, which means we
6019		 * need to fill its io_data after we're done restoring the
6020		 * buffer's contents.
6021		 */
6022		ASSERT(hdr->b_l1hdr.b_buf != NULL);
6023		bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
6024		zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
6025	} else {
6026		ASSERT(zio->io_data != NULL);
6027		/*
6028		 * We copy the compressed data from the start of the arc buffer
6029		 * (the zio_read will have pulled in only what we need, the
6030		 * rest is garbage which we will overwrite at decompression)
6031		 * and then decompress back to the ARC data buffer. This way we
6032		 * can minimize copying by simply decompressing back over the
6033		 * original compressed data (rather than decompressing to an
6034		 * aux buffer and then copying back the uncompressed buffer,
6035		 * which is likely to be much larger).
6036		 */
6037		uint64_t csize;
6038		void *cdata;
6039
6040		csize = zio->io_size;
6041		cdata = zio_data_buf_alloc(csize);
6042		bcopy(zio->io_data, cdata, csize);
6043		if (zio_decompress_data(c, cdata, zio->io_data, csize,
6044		    hdr->b_size) != 0)
6045			zio->io_error = EIO;
6046		zio_data_buf_free(cdata, csize);
6047	}
6048
6049	/* Restore the expected uncompressed IO size. */
6050	zio->io_orig_size = zio->io_size = hdr->b_size;
6051}
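/*
 * Example (illustrative): for a 16KB block that was stored compressed in a
 * single 4KB L2ARC write, the read zio returns with io_size == 4KB; the
 * compressed bytes are copied to a scratch buffer, decompressed in place
 * over io_data, and io_size/io_orig_size are restored to the 16KB b_size.
 */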
6052
6053/*
6054 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
6055 * This buffer serves as a temporary holder of compressed data while
6056 * the buffer entry is being written to an l2arc device. Once that is
6057 * done, we can dispose of it.
6058 */
6059static void
6060l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
6061{
6062	ASSERT(HDR_HAS_L1HDR(hdr));
6063	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_EMPTY) {
6064		/*
6065		 * If the data was compressed, then we've allocated a
6066		 * temporary buffer for it, so now we need to release it.
6067		 */
6068		ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6069		zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
6070		    hdr->b_size);
6071		hdr->b_l1hdr.b_tmp_cdata = NULL;
6072	} else {
6073		ASSERT(hdr->b_l1hdr.b_tmp_cdata == NULL);
6074	}
6075}
6076
6077/*
6078 * This thread feeds the L2ARC at regular intervals.  This is the beating
6079 * heart of the L2ARC.
6080 */
6081static void
6082l2arc_feed_thread(void *dummy __unused)
6083{
6084	callb_cpr_t cpr;
6085	l2arc_dev_t *dev;
6086	spa_t *spa;
6087	uint64_t size, wrote;
6088	clock_t begin, next = ddi_get_lbolt();
6089	boolean_t headroom_boost = B_FALSE;
6090
6091	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
6092
6093	mutex_enter(&l2arc_feed_thr_lock);
6094
6095	while (l2arc_thread_exit == 0) {
6096		CALLB_CPR_SAFE_BEGIN(&cpr);
6097		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
6098		    next - ddi_get_lbolt());
6099		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
6100		next = ddi_get_lbolt() + hz;
6101
6102		/*
6103		 * Quick check for L2ARC devices.
6104		 */
6105		mutex_enter(&l2arc_dev_mtx);
6106		if (l2arc_ndev == 0) {
6107			mutex_exit(&l2arc_dev_mtx);
6108			continue;
6109		}
6110		mutex_exit(&l2arc_dev_mtx);
6111		begin = ddi_get_lbolt();
6112
6113		/*
6114		 * This selects the next l2arc device to write to, and in
6115		 * doing so the next spa to feed from: dev->l2ad_spa.   This
6116		 * will return NULL if there are now no l2arc devices or if
6117		 * they are all faulted.
6118		 *
6119		 * If a device is returned, its spa's config lock is also
6120		 * held to prevent device removal.  l2arc_dev_get_next()
6121		 * will grab and release l2arc_dev_mtx.
6122		 */
6123		if ((dev = l2arc_dev_get_next()) == NULL)
6124			continue;
6125
6126		spa = dev->l2ad_spa;
6127		ASSERT(spa != NULL);
6128
6129		/*
6130		 * If the pool is read-only then force the feed thread to
6131		 * sleep a little longer.
6132		 */
6133		if (!spa_writeable(spa)) {
6134			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
6135			spa_config_exit(spa, SCL_L2ARC, dev);
6136			continue;
6137		}
6138
6139		/*
6140		 * Avoid contributing to memory pressure.
6141		 */
6142		if (arc_reclaim_needed()) {
6143			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
6144			spa_config_exit(spa, SCL_L2ARC, dev);
6145			continue;
6146		}
6147
6148		ARCSTAT_BUMP(arcstat_l2_feeds);
6149
6150		size = l2arc_write_size();
6151
6152		/*
6153		 * Evict L2ARC buffers that will be overwritten.
6154		 */
6155		l2arc_evict(dev, size, B_FALSE);
6156
6157		/*
6158		 * Write ARC buffers.
6159		 */
6160		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
6161
6162		/*
6163		 * Calculate interval between writes.
6164		 */
6165		next = l2arc_write_interval(begin, size, wrote);
6166		spa_config_exit(spa, SCL_L2ARC, dev);
6167	}
6168
6169	l2arc_thread_exit = 0;
6170	cv_broadcast(&l2arc_feed_thr_cv);
6171	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
6172	thread_exit();
6173}
6174
6175boolean_t
6176l2arc_vdev_present(vdev_t *vd)
6177{
6178	l2arc_dev_t *dev;
6179
6180	mutex_enter(&l2arc_dev_mtx);
6181	for (dev = list_head(l2arc_dev_list); dev != NULL;
6182	    dev = list_next(l2arc_dev_list, dev)) {
6183		if (dev->l2ad_vdev == vd)
6184			break;
6185	}
6186	mutex_exit(&l2arc_dev_mtx);
6187
6188	return (dev != NULL);
6189}
6190
6191/*
6192 * Add a vdev for use by the L2ARC.  By this point the spa has already
6193 * validated the vdev and opened it.
6194 */
6195void
6196l2arc_add_vdev(spa_t *spa, vdev_t *vd)
6197{
6198	l2arc_dev_t *adddev;
6199
6200	ASSERT(!l2arc_vdev_present(vd));
6201
6202	vdev_ashift_optimize(vd);
6203
6204	/*
6205	 * Create a new l2arc device entry.
6206	 */
6207	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
6208	adddev->l2ad_spa = spa;
6209	adddev->l2ad_vdev = vd;
6210	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
6211	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
6212	adddev->l2ad_hand = adddev->l2ad_start;
6213	adddev->l2ad_first = B_TRUE;
6214	adddev->l2ad_writing = B_FALSE;
6215
6216	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
6217	/*
6218	 * This is a list of all ARC buffers that are still valid on the
6219	 * device.
6220	 */
6221	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
6222	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
6223
6224	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
6225	refcount_create(&adddev->l2ad_alloc);
6226
6227	/*
6228	 * Add device to global list
6229	 */
6230	mutex_enter(&l2arc_dev_mtx);
6231	list_insert_head(l2arc_dev_list, adddev);
6232	atomic_inc_64(&l2arc_ndev);
6233	mutex_exit(&l2arc_dev_mtx);
6234}
6235
6236/*
6237 * Remove a vdev from the L2ARC.
6238 */
6239void
6240l2arc_remove_vdev(vdev_t *vd)
6241{
6242	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
6243
6244	/*
6245	 * Find the device by vdev
6246	 */
6247	mutex_enter(&l2arc_dev_mtx);
6248	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
6249		nextdev = list_next(l2arc_dev_list, dev);
6250		if (vd == dev->l2ad_vdev) {
6251			remdev = dev;
6252			break;
6253		}
6254	}
6255	ASSERT(remdev != NULL);
6256
6257	/*
6258	 * Remove device from global list
6259	 */
6260	list_remove(l2arc_dev_list, remdev);
6261	l2arc_dev_last = NULL;		/* may have been invalidated */
6262	atomic_dec_64(&l2arc_ndev);
6263	mutex_exit(&l2arc_dev_mtx);
6264
6265	/*
6266	 * Clear all buflists and ARC references.  L2ARC device flush.
6267	 */
6268	l2arc_evict(remdev, 0, B_TRUE);
6269	list_destroy(&remdev->l2ad_buflist);
6270	mutex_destroy(&remdev->l2ad_mtx);
6271	refcount_destroy(&remdev->l2ad_alloc);
6272	kmem_free(remdev, sizeof (l2arc_dev_t));
6273}
6274
6275void
6276l2arc_init(void)
6277{
6278	l2arc_thread_exit = 0;
6279	l2arc_ndev = 0;
6280	l2arc_writes_sent = 0;
6281	l2arc_writes_done = 0;
6282
6283	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
6284	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
6285	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
6286	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
6287
6288	l2arc_dev_list = &L2ARC_dev_list;
6289	l2arc_free_on_write = &L2ARC_free_on_write;
6290	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
6291	    offsetof(l2arc_dev_t, l2ad_node));
6292	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
6293	    offsetof(l2arc_data_free_t, l2df_list_node));
6294}
6295
6296void
6297l2arc_fini(void)
6298{
6299	/*
6300	 * This is called from dmu_fini(), which is called from spa_fini();
6301	 * Because of this, we can assume that all l2arc devices have
6302	 * already been removed when the pools themselves were removed.
6303	 */
6304
6305	l2arc_do_free_on_write();
6306
6307	mutex_destroy(&l2arc_feed_thr_lock);
6308	cv_destroy(&l2arc_feed_thr_cv);
6309	mutex_destroy(&l2arc_dev_mtx);
6310	mutex_destroy(&l2arc_free_on_write_mtx);
6311
6312	list_destroy(l2arc_dev_list);
6313	list_destroy(l2arc_free_on_write);
6314}
6315
6316void
6317l2arc_start(void)
6318{
6319	if (!(spa_mode_global & FWRITE))
6320		return;
6321
6322	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
6323	    TS_RUN, minclsyspri);
6324}
6325
6326void
6327l2arc_stop(void)
6328{
6329	if (!(spa_mode_global & FWRITE))
6330		return;
6331
6332	mutex_enter(&l2arc_feed_thr_lock);
6333	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
6334	l2arc_thread_exit = 1;
6335	while (l2arc_thread_exit != 0)
6336		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
6337	mutex_exit(&l2arc_feed_thr_lock);
6338}
6339