--- arc.c	(revision 346684)
+++ arc.c	(revision 346686)
@@ -1,8 +1,8 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE

--- 259 unchanged lines hidden ---

@@ -268,65 +268,82 @@
 #include <sys/abd.h>
 #ifdef _KERNEL
 #include <sys/dnlc.h>
 #include <sys/racct.h>
 #endif
 #include <sys/callb.h>
 #include <sys/kstat.h>
 #include <sys/trim_map.h>
+#include <sys/zthr.h>
 #include <zfs_fletcher.h>
 #include <sys/sdt.h>
 #include <sys/aggsum.h>
 #include <sys/cityhash.h>
 
 #include <machine/vmparam.h>
 
 #ifdef illumos
 #ifndef _KERNEL
 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 boolean_t arc_watch = B_FALSE;
 int arc_procfd;
 #endif
 #endif /* illumos */
 
-static kmutex_t arc_reclaim_lock;
-static kcondvar_t arc_reclaim_thread_cv;
-static boolean_t arc_reclaim_thread_exit;
-static kcondvar_t arc_reclaim_waiters_cv;
+/*
+ * This thread's job is to keep enough free memory in the system, by
+ * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves
+ * arc_available_memory().
+ */
+static zthr_t *arc_reap_zthr;
 
+/*
+ * This thread's job is to keep arc_size under arc_c, by calling
+ * arc_adjust(), which improves arc_is_overflowing().
+ */
+static zthr_t *arc_adjust_zthr;
+
+static kmutex_t arc_adjust_lock;
+static kcondvar_t arc_adjust_waiters_cv;
+static boolean_t arc_adjust_needed = B_FALSE;
+
 static kmutex_t arc_dnlc_evicts_lock;
 static kcondvar_t arc_dnlc_evicts_cv;
 static boolean_t arc_dnlc_evicts_thread_exit;
 
 uint_t arc_reduce_dnlc_percent = 3;
 
 /*
  * The number of headers to evict in arc_evict_state_impl() before
  * dropping the sublist lock and evicting from another sublist. A lower
  * value means we're more likely to evict the "correct" header (i.e. the
  * oldest header in the arc state), but comes with higher overhead
  * (i.e. more invocations of arc_evict_state_impl()).
  */
 int zfs_arc_evict_batch_limit = 10;
 
 /* number of seconds before growing cache again */
-static int arc_grow_retry = 60;
+int arc_grow_retry = 60;
 
-/* number of milliseconds before attempting a kmem-cache-reap */
-static int arc_kmem_cache_reap_retry_ms = 0;
+/*
+ * Minimum time between calls to arc_kmem_reap_soon(). Note that this will
+ * be converted to ticks, so with the default hz=100, a setting of 15 ms
+ * will actually wait 2 ticks, or 20ms.
+ */
+int arc_kmem_cache_reap_retry_ms = 1000;
 
 /* shift of arc_c for calculating overflow limit in arc_get_data_impl */
 int zfs_arc_overflow_shift = 8;
 
 /* shift of arc_c for calculating both min and max arc_p */
-static int arc_p_min_shift = 4;
+int arc_p_min_shift = 4;
 
 /* log2(fraction of arc to reclaim) */
-static int arc_shrink_shift = 7;
+int arc_shrink_shift = 7;
 
 /*
  * log2(fraction of ARC which must be free to allow growing).
  * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
  * when reading a new block into the ARC, we will evict an equal-sized block
  * from the ARC.
  *
  * This must be less than arc_shrink_shift, so that when we shrink the ARC,

--- 9 unchanged lines hidden ---
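Note: the tick rounding described in the new arc_kmem_cache_reap_retry_ms comment can be checked in isolation. The userland sketch below is not part of the patch; it just applies the same round-up arithmetic that arc_reap_cb() later passes to delay(), with hz=100 as the default the comment assumes.

    #include <stdio.h>

    int
    main(void)
    {
        int hz = 100;               /* default tick rate assumed above */
        int ms_values[] = { 15, 1000 };

        for (int i = 0; i < 2; i++) {
            int ms = ms_values[i];
            /* same round-up as delay((hz * ms + 999) / 1000) below */
            int ticks = (hz * ms + 999) / 1000;
            printf("%4d ms -> %3d ticks (%d ms actual)\n",
                ms, ticks, ticks * (1000 / hz));
        }
        /* prints: 15 ms -> 2 ticks (20 ms), 1000 ms -> 100 ticks (1000 ms) */
        return (0);
    }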

@@ -342,17 +359,17 @@
 static int zfs_arc_min_prefetch_ms = 1;
 static int zfs_arc_min_prescient_prefetch_ms = 6;
 
 /*
  * If this percent of memory is free, don't throttle.
  */
 int arc_lotsfree_percent = 10;
 
-static int arc_dead;
+static boolean_t arc_initialized;
 extern boolean_t zfs_prefetch_disable;
 
 /*
  * The arc has filled available memory and has now warmed up.
  */
 static boolean_t arc_warm;
 
 /*

--- 617 unchanged lines hidden ---

@@ -976,16 +993,17 @@
 aggsum_t arc_meta_used;
 aggsum_t astat_data_size;
 aggsum_t astat_metadata_size;
 aggsum_t astat_hdr_size;
 aggsum_t astat_other_size;
 aggsum_t astat_l2_hdr_size;
 
 static int arc_no_grow;	/* Don't try to grow cache size */
+static hrtime_t arc_growtime;
 static uint64_t arc_tempreserve;
 static uint64_t arc_loaned_bytes;
 
 typedef struct arc_callback arc_callback_t;
 
 struct arc_callback {
     void *acb_private;
     arc_read_done_func_t *acb_done;

--- 744 unchanged lines hidden ---

@@ -1736,18 +1754,18 @@
 static void
 hdr_recl(void *unused)
 {
     dprintf("hdr_recl called\n");
     /*
      * umem calls the reclaim func when we destroy the buf cache,
      * which is after we do arc_fini().
      */
-    if (!arc_dead)
-        cv_signal(&arc_reclaim_thread_cv);
+    if (arc_initialized)
+        zthr_wakeup(arc_reap_zthr);
 }
 
 static void
 buf_init(void)
 {
     uint64_t *ct;
     uint64_t hsize = 1ULL << 12;
     int i, j;

--- 2007 unchanged lines hidden ---

@@ -3761,23 +3779,24 @@
              * to significantly overflow arc_c; since
              * arc_get_data_impl() doesn't check for overflow
              * when it's woken up (it doesn't because it's
              * possible for the ARC to be overflowing while
              * full of un-evictable buffers, and the
              * function should proceed in this case).
              *
              * If threads are left sleeping, due to not
-             * using cv_broadcast, they will be woken up
-             * just before arc_reclaim_thread() sleeps.
+             * using cv_broadcast here, they will be woken
+             * up via cv_broadcast in arc_adjust_cb() just
+             * before arc_adjust_zthr sleeps.
              */
-            mutex_enter(&arc_reclaim_lock);
+            mutex_enter(&arc_adjust_lock);
             if (!arc_is_overflowing())
-                cv_signal(&arc_reclaim_waiters_cv);
-            mutex_exit(&arc_reclaim_lock);
+                cv_signal(&arc_adjust_waiters_cv);
+            mutex_exit(&arc_adjust_lock);
         } else {
             ARCSTAT_BUMP(arcstat_mutex_miss);
         }
     }
 
     multilist_sublist_unlock(mls);
 
     return (bytes_evicted);

--- 462 unchanged lines hidden ---
@@ -4246,18 +4265,18 @@
 
     (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
     (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
 
     (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
     (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
 }
 
-uint64_t
-arc_shrink(int64_t to_free)
+static void
+arc_reduce_target_size(int64_t to_free)
 {
     uint64_t asize = aggsum_value(&arc_size);
     if (arc_c > arc_c_min) {
         DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
             arc_c_min, uint64_t, arc_p, uint64_t, to_free);
         if (arc_c > arc_c_min + to_free)
             atomic_add_64(&arc_c, -to_free);
         else

--- 10 unchanged lines hidden ---

@@ -4274,19 +4293,22 @@
 
         ASSERT(arc_c >= arc_c_min);
         ASSERT((int64_t)arc_p >= 0);
     }
 
     if (asize > arc_c) {
         DTRACE_PROBE2(arc__shrink_adjust, uint64_t, asize,
             uint64_t, arc_c);
-        return (arc_adjust());
+        /* See comment in arc_adjust_cb_check() on why lock+flag */
+        mutex_enter(&arc_adjust_lock);
+        arc_adjust_needed = B_TRUE;
+        mutex_exit(&arc_adjust_lock);
+        zthr_wakeup(arc_adjust_zthr);
     }
-    return (0);
 }
 
 typedef enum free_memory_reason_t {
     FMR_UNKNOWN,
     FMR_NEEDFREE,
     FMR_LOTSFREE,
     FMR_SWAPFS_MINFREE,
     FMR_PAGES_PP_MAXIMUM,

--- 172 unchanged lines hidden ---
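arc_reduce_target_size() now hands eviction work to the adjust thread through the lock-protected arc_adjust_needed flag instead of evicting inline. A minimal userland analogue of that handoff is sketched below, with POSIX threads standing in for kmutex_t/kcondvar_t and for zthr_wakeup(); the function names here are illustrative, not part of the patch.

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t adjust_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t adjust_waiters_cv = PTHREAD_COND_INITIALIZER;
    static bool adjust_needed = false;

    /* arc_get_data_impl() side: request an adjust pass and wait for it */
    static void
    request_and_wait(void)
    {
        pthread_mutex_lock(&adjust_lock);
        adjust_needed = true;   /* set under the lock, so it can't be missed */
        /* the kernel code calls zthr_wakeup(arc_adjust_zthr) here */
        while (adjust_needed)   /* the ARC uses a single wait; see its comment */
            pthread_cond_wait(&adjust_waiters_cv, &adjust_lock);
        pthread_mutex_unlock(&adjust_lock);
    }

    /* arc_adjust_cb() side: clear the flag and wake all waiters */
    static void
    adjust_pass_done(void)
    {
        pthread_mutex_lock(&adjust_lock);
        adjust_needed = false;
        pthread_cond_broadcast(&adjust_waiters_cv);
        pthread_mutex_unlock(&adjust_lock);
    }

Because the flag only flips under the same lock the waiters sleep on, a wakeup issued between the check and the sleep cannot be lost; that is the property the "lock+flag" comment refers to.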
@@ -4465,17 +4487,17 @@
 }
 
 extern kmem_cache_t *zio_buf_cache[];
 extern kmem_cache_t *zio_data_buf_cache[];
 extern kmem_cache_t *range_seg_cache;
 extern kmem_cache_t *abd_chunk_cache;
 
 static __noinline void
-arc_kmem_reap_now(void)
+arc_kmem_reap_soon(void)
 {
     size_t i;
     kmem_cache_t *prev_cache = NULL;
     kmem_cache_t *prev_data_cache = NULL;
 
     DTRACE_PROBE(arc__kmem_reap_start);
 #ifdef _KERNEL
     if (aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) {

--- 6 unchanged lines hidden ---

@@ -4488,26 +4510,16 @@
 #if defined(__i386)
     /*
      * Reclaim unused memory from all kmem caches.
      */
     kmem_reap();
 #endif
 #endif
 
-    /*
-     * If a kmem reap is already active, don't schedule more.  We must
-     * check for this because kmem_cache_reap_soon() won't actually
-     * block on the cache being reaped (this is to prevent callers from
-     * becoming implicitly blocked by a system-wide kmem reap -- which,
-     * on a system with many, many full magazines, can take minutes).
-     */
-    if (kmem_cache_reap_active())
-        return;
-
     for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
         if (zio_buf_cache[i] != prev_cache) {
             prev_cache = zio_buf_cache[i];
             kmem_cache_reap_soon(zio_buf_cache[i]);
         }
         if (zio_data_buf_cache[i] != prev_data_cache) {
             prev_data_cache = zio_data_buf_cache[i];
             kmem_cache_reap_soon(zio_data_buf_cache[i]);

--- 12 unchanged lines hidden ---
@@ -4526,151 +4538,173 @@
      * quantum caches.
      */
         vmem_qcache_reap(zio_arena);
     }
 #endif
     DTRACE_PROBE(arc__kmem_reap_end);
 }
 
-/*
- * Threads can block in arc_get_data_impl() waiting for this thread to evict
- * enough data and signal them to proceed. When this happens, the threads in
- * arc_get_data_impl() are sleeping while holding the hash lock for their
- * particular arc header. Thus, we must be careful to never sleep on a
- * hash lock in this thread. This is to prevent the following deadlock:
- *
- * - Thread A sleeps on CV in arc_get_data_impl() holding hash lock "L",
- *   waiting for the reclaim thread to signal it.
- *
- * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
- *   fails, and goes to sleep forever.
- *
- * This possible deadlock is avoided by always acquiring a hash lock
- * using mutex_tryenter() from arc_reclaim_thread().
- */
-/* ARGSUSED */
-static void
-arc_reclaim_thread(void *unused __unused)
-{
-    hrtime_t growtime = 0;
-    hrtime_t kmem_reap_time = 0;
-    callb_cpr_t cpr;
-
-    CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
-
-    mutex_enter(&arc_reclaim_lock);
-    while (!arc_reclaim_thread_exit) {
-        uint64_t evicted = 0;
-
-        /*
-         * This is necessary in order for the mdb ::arc dcmd to
-         * show up to date information. Since the ::arc command
-         * does not call the kstat's update function, without
-         * this call, the command may show stale stats for the
-         * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
-         * with this change, the data might be up to 1 second
-         * out of date; but that should suffice. The arc_state_t
-         * structures can be queried directly if more accurate
-         * information is needed.
-         */
-        if (arc_ksp != NULL)
-            arc_ksp->ks_update(arc_ksp, KSTAT_READ);
-
-        mutex_exit(&arc_reclaim_lock);
-
-        /*
-         * We call arc_adjust() before (possibly) calling
-         * arc_kmem_reap_now(), so that we can wake up
-         * arc_get_data_impl() sooner.
-         */
-        evicted = arc_adjust();
-
-        int64_t free_memory = arc_available_memory();
-        if (free_memory < 0) {
-            hrtime_t curtime = gethrtime();
-            arc_no_grow = B_TRUE;
-            arc_warm = B_TRUE;
-
-            /*
-             * Wait at least zfs_grow_retry (default 60) seconds
-             * before considering growing.
-             */
-            growtime = curtime + SEC2NSEC(arc_grow_retry);
-
-            /*
-             * Wait at least arc_kmem_cache_reap_retry_ms
-             * between arc_kmem_reap_now() calls. Without
-             * this check it is possible to end up in a
-             * situation where we spend lots of time
-             * reaping caches, while we're near arc_c_min.
-             */
-            if (curtime >= kmem_reap_time) {
-                arc_kmem_reap_now();
-                kmem_reap_time = gethrtime() +
-                    MSEC2NSEC(arc_kmem_cache_reap_retry_ms);
-            }
-
-            /*
-             * If we are still low on memory, shrink the ARC
-             * so that we have arc_shrink_min free space.
-             */
-            free_memory = arc_available_memory();
-
-            int64_t to_free =
-                (arc_c >> arc_shrink_shift) - free_memory;
-            if (to_free > 0) {
-#ifdef _KERNEL
-#ifdef illumos
-                to_free = MAX(to_free, ptob(needfree));
-#endif
-#endif
-                evicted += arc_shrink(to_free);
-            }
-        } else if (free_memory < arc_c >> arc_no_grow_shift) {
-            arc_no_grow = B_TRUE;
-        } else if (gethrtime() >= growtime) {
-            arc_no_grow = B_FALSE;
-        }
-
-        mutex_enter(&arc_reclaim_lock);
-
-        /*
-         * If evicted is zero, we couldn't evict anything via
-         * arc_adjust(). This could be due to hash lock
-         * collisions, but more likely due to the majority of
-         * arc buffers being unevictable. Therefore, even if
-         * arc_size is above arc_c, another pass is unlikely to
-         * be helpful and could potentially cause us to enter an
-         * infinite loop.
-         */
-        if (aggsum_compare(&arc_size, arc_c) <= 0 || evicted == 0) {
-            /*
-             * We're either no longer overflowing, or we
-             * can't evict anything more, so we should wake
-             * up any threads before we go to sleep.
-             */
-            cv_broadcast(&arc_reclaim_waiters_cv);
-
-            /*
-             * Block until signaled, or after one second (we
-             * might need to perform arc_kmem_reap_now()
-             * even if we aren't being signalled)
-             */
-            CALLB_CPR_SAFE_BEGIN(&cpr);
-            (void) cv_timedwait_hires(&arc_reclaim_thread_cv,
-                &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
-            CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
-        }
-    }
-
-    arc_reclaim_thread_exit = B_FALSE;
-    cv_broadcast(&arc_reclaim_thread_cv);
-    CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_lock */
-    thread_exit();
-}
+/* ARGSUSED */
+static boolean_t
+arc_adjust_cb_check(void *arg, zthr_t *zthr)
+{
+    /*
+     * This is necessary in order for the mdb ::arc dcmd to
+     * show up to date information. Since the ::arc command
+     * does not call the kstat's update function, without
+     * this call, the command may show stale stats for the
+     * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
+     * with this change, the data might be up to 1 second
+     * out of date (the arc_adjust_zthr has a maximum sleep
+     * time of 1 second); but that should suffice.  The
+     * arc_state_t structures can be queried directly if more
+     * accurate information is needed.
+     */
+    if (arc_ksp != NULL)
+        arc_ksp->ks_update(arc_ksp, KSTAT_READ);
+
+    /*
+     * We have to rely on arc_get_data_impl() to tell us when to adjust,
+     * rather than checking if we are overflowing here, so that we are
+     * sure to not leave arc_get_data_impl() waiting on
+     * arc_adjust_waiters_cv.  If we have become "not overflowing" since
+     * arc_get_data_impl() checked, we need to wake it up.  We could
+     * broadcast the CV here, but arc_get_data_impl() may have not yet
+     * gone to sleep.  We would need to use a mutex to ensure that this
+     * function doesn't broadcast until arc_get_data_impl() has gone to
+     * sleep (e.g. the arc_adjust_lock).  However, the lock ordering of
+     * such a lock would necessarily be incorrect with respect to the
+     * zthr_lock, which is held before this function is called, and is
+     * held by arc_get_data_impl() when it calls zthr_wakeup().
+     */
+    return (arc_adjust_needed);
+}
+
+/*
+ * Keep arc_size under arc_c by running arc_adjust which evicts data
+ * from the ARC. */
+/* ARGSUSED */
+static int
+arc_adjust_cb(void *arg, zthr_t *zthr)
+{
+    uint64_t evicted = 0;
+
+    /* Evict from cache */
+    evicted = arc_adjust();
+
+    /*
+     * If evicted is zero, we couldn't evict anything
+     * via arc_adjust(). This could be due to hash lock
+     * collisions, but more likely due to the majority of
+     * arc buffers being unevictable. Therefore, even if
+     * arc_size is above arc_c, another pass is unlikely to
+     * be helpful and could potentially cause us to enter an
+     * infinite loop.  Additionally, zthr_iscancelled() is
+     * checked here so that if the arc is shutting down, the
+     * broadcast will wake any remaining arc adjust waiters.
+     */
+    mutex_enter(&arc_adjust_lock);
+    arc_adjust_needed = !zthr_iscancelled(arc_adjust_zthr) &&
+        evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0;
+    if (!arc_adjust_needed) {
+        /*
+         * We're either no longer overflowing, or we
+         * can't evict anything more, so we should wake
+         * up any waiters.
+         */
+        cv_broadcast(&arc_adjust_waiters_cv);
+    }
+    mutex_exit(&arc_adjust_lock);
+
+    return (0);
+}
+
+/* ARGSUSED */
+static boolean_t
+arc_reap_cb_check(void *arg, zthr_t *zthr)
+{
+    int64_t free_memory = arc_available_memory();
+
+    /*
+     * If a kmem reap is already active, don't schedule more.  We must
+     * check for this because kmem_cache_reap_soon() won't actually
+     * block on the cache being reaped (this is to prevent callers from
+     * becoming implicitly blocked by a system-wide kmem reap -- which,
+     * on a system with many, many full magazines, can take minutes).
+     */
+    if (!kmem_cache_reap_active() &&
+        free_memory < 0) {
+        arc_no_grow = B_TRUE;
+        arc_warm = B_TRUE;
+        /*
+         * Wait at least zfs_grow_retry (default 60) seconds
+         * before considering growing.
+         */
+        arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+        return (B_TRUE);
+    } else if (free_memory < arc_c >> arc_no_grow_shift) {
+        arc_no_grow = B_TRUE;
+    } else if (gethrtime() >= arc_growtime) {
+        arc_no_grow = B_FALSE;
+    }
+
+    return (B_FALSE);
+}
+
+/*
+ * Keep enough free memory in the system by reaping the ARC's kmem
+ * caches.  To cause more slabs to be reapable, we may reduce the
+ * target size of the cache (arc_c), causing the arc_adjust_cb()
+ * to free more buffers.
+ */
+/* ARGSUSED */
+static int
+arc_reap_cb(void *arg, zthr_t *zthr)
+{
+    int64_t free_memory;
+
+    /*
+     * Kick off asynchronous kmem_reap()'s of all our caches.
+     */
+    arc_kmem_reap_soon();
+
+    /*
+     * Wait at least arc_kmem_cache_reap_retry_ms between
+     * arc_kmem_reap_soon() calls. Without this check it is possible to
+     * end up in a situation where we spend lots of time reaping
+     * caches, while we're near arc_c_min.  Waiting here also gives the
+     * subsequent free memory check a chance of finding that the
+     * asynchronous reap has already freed enough memory, and we don't
+     * need to call arc_reduce_target_size().
+     */
+    delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000);
+
+    /*
+     * Reduce the target size as needed to maintain the amount of free
+     * memory in the system at a fraction of the arc_size (1/128th by
+     * default).  If oversubscribed (free_memory < 0) then reduce the
+     * target arc_size by the deficit amount plus the fractional
+     * amount.  If free memory is positive but less than the fractional
+     * amount, reduce by what is needed to hit the fractional amount.
+     */
+    free_memory = arc_available_memory();
+
+    int64_t to_free =
+        (arc_c >> arc_shrink_shift) - free_memory;
+    if (to_free > 0) {
+#ifdef _KERNEL
+#ifdef illumos
+        to_free = MAX(to_free, ptob(needfree));
+#endif
+#endif
+        arc_reduce_target_size(to_free);
+    }
+
+    return (0);
+}
 
 static u_int arc_dnlc_evicts_arg;
 extern struct vfsops zfs_vfsops;
 
 static void
 arc_dnlc_evicts_thread(void *dummy __unused)
 {

--- 78 unchanged lines hidden ---
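The four callbacks above follow the zthr contract: a cheap predicate that says whether there is work, and a body that performs one pass of it. The service loop behind zthr_create_timer() is roughly the sketch below, reconstructed only from how the callbacks are used in this diff; the struct layout and the zthr_sleep_or_timeout() helper are guesses for illustration, and zthr.c holds the real implementation.

    typedef int boolean_t;

    typedef struct zthr zthr_t;
    struct zthr {                       /* field names are hypothetical */
        boolean_t (*zthr_checkfunc)(void *, zthr_t *);
        int (*zthr_func)(void *, zthr_t *);
        void *zthr_arg;
    };

    extern void zthr_sleep_or_timeout(zthr_t *);    /* hypothetical helper */
    extern boolean_t zthr_iscancelled(zthr_t *);

    static void
    zthr_loop_sketch(zthr_t *t)
    {
        for (;;) {
            /*
             * Block until zthr_wakeup(), or until the timer interval
             * passed to zthr_create_timer() (SEC2NSEC(1) here) expires.
             */
            zthr_sleep_or_timeout(t);
            if (zthr_iscancelled(t))
                break;
            /* predicate: arc_adjust_cb_check() / arc_reap_cb_check() */
            if (t->zthr_checkfunc(t->zthr_arg, t))
                /* one pass of work: arc_adjust_cb() / arc_reap_cb() */
                (void) t->zthr_func(t->zthr_arg, t);
        }
    }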

@@ -4755,18 +4789,21 @@
         mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
         mult = MIN(mult, 10);
 
         delta = MIN(bytes * mult, arc_p);
         arc_p = MAX(arc_p_min, arc_p - delta);
     }
     ASSERT((int64_t)arc_p >= 0);
 
+    /*
+     * Wake reap thread if we do not have any available memory
+     */
     if (arc_reclaim_needed()) {
-        cv_signal(&arc_reclaim_thread_cv);
+        zthr_wakeup(arc_reap_zthr);
         return;
     }
 
     if (arc_no_grow)
         return;
 
     if (arc_c >= arc_c_max)
         return;

--- 91 unchanged lines hidden ---

@@ -4864,35 +4901,36 @@
      *
      * It's also possible that the reclaim thread is unable to evict
      * enough buffers to get arc_size below the overflow limit (e.g.
      * due to buffers being un-evictable, or hash lock collisions).
      * In this case, we want to proceed regardless if we're
      * overflowing; thus we don't use a while loop here.
      */
     if (arc_is_overflowing()) {
-        mutex_enter(&arc_reclaim_lock);
+        mutex_enter(&arc_adjust_lock);
 
         /*
         * Now that we've acquired the lock, we may no longer be
         * over the overflow limit, let's check.
         *
         * We're ignoring the case of spurious wake ups. If that
         * were to happen, it'd let this thread consume an ARC
         * buffer before it should have (i.e. before we're under
         * the overflow limit and were signalled by the reclaim
         * thread). As long as that is a rare occurrence, it
         * shouldn't cause any harm.
         */
         if (arc_is_overflowing()) {
-            cv_signal(&arc_reclaim_thread_cv);
-            cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
+            arc_adjust_needed = B_TRUE;
+            zthr_wakeup(arc_adjust_zthr);
+            (void) cv_wait(&arc_adjust_waiters_cv,
+                &arc_adjust_lock);
         }
-
-        mutex_exit(&arc_reclaim_lock);
+        mutex_exit(&arc_adjust_lock);
     }
 
     VERIFY3U(hdr->b_type, ==, type);
     if (type == ARC_BUFC_METADATA) {
         arc_space_consume(size, ARC_SPACE_META);
     } else {
         arc_space_consume(size, ARC_SPACE_DATA);
     }

--- 1651 unchanged lines hidden ---
@@ -6550,29 +6588,38 @@
 }
 
 #ifdef _KERNEL
 static eventhandler_tag arc_event_lowmem = NULL;
 
 static void
 arc_lowmem(void *arg __unused, int howto __unused)
 {
+    int64_t free_memory, to_free;
 
-    mutex_enter(&arc_reclaim_lock);
-    DTRACE_PROBE1(arc__needfree, int64_t, ((int64_t)freemem - zfs_arc_free_target) * PAGESIZE);
-    cv_signal(&arc_reclaim_thread_cv);
+    arc_no_grow = B_TRUE;
+    arc_warm = B_TRUE;
+    arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+    free_memory = arc_available_memory();
+    to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0);
+    DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
+    arc_reduce_target_size(to_free);
 
+    mutex_enter(&arc_adjust_lock);
+    arc_adjust_needed = B_TRUE;
+    zthr_wakeup(arc_adjust_zthr);
+
     /*
      * It is unsafe to block here in arbitrary threads, because we can come
      * here from ARC itself and may hold ARC locks and thus risk a deadlock
      * with ARC reclaim thread.
      */
     if (curproc == pageproc)
-        (void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
-    mutex_exit(&arc_reclaim_lock);
+        (void) cv_wait(&arc_adjust_waiters_cv, &arc_adjust_lock);
+    mutex_exit(&arc_adjust_lock);
 }
 #endif
 
 static void
 arc_state_init(void)
 {
     arc_anon = &ARC_anon;
     arc_mru = &ARC_mru;

--- 123 unchanged lines hidden ---
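The to_free computation that arc_lowmem() now shares with arc_reap_cb() combines the 1/128th fraction of arc_c with any memory deficit. The standalone sketch below (not part of the patch; the sizes are made up for illustration) checks the arithmetic.

    #include <stdio.h>
    #include <stdint.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int
    main(void)
    {
        uint64_t arc_c = 4ULL << 30;            /* 4 GiB target size */
        int arc_shrink_shift = 7;               /* default: 1/128th */
        int64_t free_memory = -(64LL << 20);    /* 64 MiB oversubscribed */

        /* fraction of arc_c, plus the deficit when free_memory < 0 */
        int64_t to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0);
        printf("to_free = %lld MiB\n", (long long)(to_free >> 20));
        /* prints 96: 32 MiB (4 GiB / 128) plus the 64 MiB deficit */
        return (0);
    }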

@@ -6702,22 +6749,19 @@
 #ifdef _KERNEL
     uint64_t allmem = ptob(physmem - swapfs_minfree);
 #else
     uint64_t allmem = (physmem * PAGESIZE) / 2;
 #endif
 #else
     uint64_t allmem = kmem_size();
 #endif
+    mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL);
+    cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL);
 
-
-    mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
-    cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
-    cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
-
     mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
     cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL);
 
     /* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */
     arc_c_min = MAX(allmem / 32, arc_abs_min);
     /* set max to 5/8 of all memory, or all but 1GB, whichever is more */
     if (allmem >= 1 << 30)
         arc_c_max = allmem - (1 << 30);

--- 74 unchanged lines hidden ---
@@ -6798,42 +6842,50 @@
     arc_c = arc_c / 2;
     if (arc_c < arc_c_min)
         arc_c = arc_c_min;
 
     zfs_arc_min = arc_c_min;
     zfs_arc_max = arc_c_max;
 
     arc_state_init();
+
+    /*
+     * The arc must be "uninitialized", so that hdr_recl() (which is
+     * registered by buf_init()) will not access arc_reap_zthr before
+     * it is created.
+     */
+    ASSERT(!arc_initialized);
     buf_init();
 
-    arc_reclaim_thread_exit = B_FALSE;
     arc_dnlc_evicts_thread_exit = FALSE;
 
     arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
         sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 
     if (arc_ksp != NULL) {
         arc_ksp->ks_data = &arc_stats;
         arc_ksp->ks_update = arc_kstat_update;
         kstat_install(arc_ksp);
     }
 
-    (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
-        TS_RUN, minclsyspri);
+    arc_adjust_zthr = zthr_create_timer(arc_adjust_cb_check,
+        arc_adjust_cb, NULL, SEC2NSEC(1));
+    arc_reap_zthr = zthr_create_timer(arc_reap_cb_check,
+        arc_reap_cb, NULL, SEC2NSEC(1));
 
 #ifdef _KERNEL
     arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
         EVENTHANDLER_PRI_FIRST);
 #endif
 
     (void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0,
         TS_RUN, minclsyspri);
 
-    arc_dead = B_FALSE;
+    arc_initialized = B_TRUE;
     arc_warm = B_FALSE;
 
     /*
      * Calculate maximum amount of dirty data per pool.
      *
      * If it has been set by /etc/system, take that.
      * Otherwise, use a percentage of physical memory defined by
      * zfs_dirty_data_max_percent (default 10%) with a cap at
 
--- 46 unchanged lines hidden ---
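The ASSERT added before buf_init() encodes an ordering requirement: hdr_recl() can fire as soon as buf_init() registers it, but arc_reap_zthr does not exist until the zthr_create_timer() calls below. Condensed from the surrounding diff, the guard reduces to this pattern (arc_fini() clears the flag before cancelling the zthrs for the same reason, mirroring arc_init() setting it only after both are created):

    static boolean_t arc_initialized;   /* B_FALSE until both zthrs exist */

    static void
    hdr_recl(void *unused)
    {
        /* reclaim may run before arc_init() finishes or after arc_fini() */
        if (arc_initialized)
            zthr_wakeup(arc_reap_zthr);
    }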

@@ -6886,57 +6938,51 @@
 void
 arc_fini(void)
 {
 #ifdef _KERNEL
     if (arc_event_lowmem != NULL)
         EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
 #endif
 
-    mutex_enter(&arc_reclaim_lock);
-    arc_reclaim_thread_exit = B_TRUE;
-    /*
-     * The reclaim thread will set arc_reclaim_thread_exit back to
-     * B_FALSE when it is finished exiting; we're waiting for that.
-     */
-    while (arc_reclaim_thread_exit) {
-        cv_signal(&arc_reclaim_thread_cv);
-        cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
-    }
-    mutex_exit(&arc_reclaim_lock);
-
     /* Use B_TRUE to ensure *all* buffers are evicted */
     arc_flush(NULL, B_TRUE);
 
     mutex_enter(&arc_dnlc_evicts_lock);
     arc_dnlc_evicts_thread_exit = TRUE;
     /*
      * The user evicts thread will set arc_user_evicts_thread_exit
      * to FALSE when it is finished exiting; we're waiting for that.
      */
     while (arc_dnlc_evicts_thread_exit) {
         cv_signal(&arc_dnlc_evicts_cv);
         cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock);
     }
     mutex_exit(&arc_dnlc_evicts_lock);
 
-    arc_dead = B_TRUE;
+    arc_initialized = B_FALSE;
 
     if (arc_ksp != NULL) {
         kstat_delete(arc_ksp);
         arc_ksp = NULL;
     }
 
-    mutex_destroy(&arc_reclaim_lock);
-    cv_destroy(&arc_reclaim_thread_cv);
-    cv_destroy(&arc_reclaim_waiters_cv);
+    (void) zthr_cancel(arc_adjust_zthr);
+    zthr_destroy(arc_adjust_zthr);
 
     mutex_destroy(&arc_dnlc_evicts_lock);
     cv_destroy(&arc_dnlc_evicts_cv);
 
+    (void) zthr_cancel(arc_reap_zthr);
+    zthr_destroy(arc_reap_zthr);
+
+    mutex_destroy(&arc_adjust_lock);
+    cv_destroy(&arc_adjust_waiters_cv);
+
     arc_state_fini();
     buf_fini();
 
     ASSERT0(arc_loaned_bytes);
 }
 
 /*
  * Level 2 ARC

--- 1160 unchanged lines hidden ---