1/*
2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <vm/vm_page.h>
30#include <vm/vm_object.h>
31#include <vm/vm_kern.h>
32#include <vm/vm_pageout.h>
33#include <vm/vm_phantom_cache.h>
34#include <vm/vm_compressor.h>
35
36
/* Length of one sampling/evaluation interval. */
uint32_t phantom_cache_eval_period_in_msecs = 250;
/*
 * Per-interval re-read counts at or above which is_thrashing() may
 * report thrashing.  The tolerable rate depends on the read latency
 * of the backing store, hence separate SSD and spinning-disk values.
 */
uint32_t phantom_cache_thrashing_threshold_ssd = 1000;
uint32_t phantom_cache_thrashing_threshold = 100;

/*
 * Number of consecutive thrashing periods required before
 * vm_phantom_cache_check_pressure() returns true.
 */
unsigned phantom_cache_contiguous_periods = 2;

/* Start time of the current evaluation interval. */
clock_sec_t	pc_start_of_eval_period_sec = 0;
clock_nsec_t	pc_start_of_eval_period_nsec = 0;
/* When TRUE, the next vm_phantom_cache_check_pressure() call starts a fresh interval. */
boolean_t	pc_need_eval_reset = FALSE;

/* One bit per recent sampling period. Bit 0 = current period. */
uint32_t	pc_history = 0;

/*
 * Counters accumulated over the current sampling interval:
 * "added" = pages entered into the phantom cache,
 * "found" = re-reads of pages still held in the phantom cache.
 * Kept separately for SSD- and non-SSD-backed objects.
 */
uint32_t	sample_period_ghost_added_count = 0;
uint32_t	sample_period_ghost_added_count_ssd = 0;
uint32_t	sample_period_ghost_found_count = 0;
uint32_t	sample_period_ghost_found_count_ssd = 0;

/* Next phantom object id to hand out; id 0 means "no id assigned". */
uint32_t	vm_phantom_object_id = 1;
#define		VM_PHANTOM_OBJECT_ID_AFTER_WRAP	1000000

/*
 * The ghost-entry ring.  Slots are consumed in order starting at
 * index 1; index 0 is never used so it can serve as the "empty /
 * end of chain" sentinel in the hash table and g_next_index links.
 */
vm_ghost_t	vm_phantom_cache;
uint32_t	vm_phantom_cache_nindx = 1;
uint32_t	vm_phantom_cache_num_entries = 0;
uint32_t	vm_phantom_cache_size;

typedef	uint32_t	vm_phantom_hash_entry_t;
/* Hash table mapping (obj_id, offset) to a ring index (0 = empty bucket). */
vm_phantom_hash_entry_t	*vm_phantom_cache_hash;
uint32_t	vm_phantom_cache_hash_size;
uint32_t	vm_ghost_hash_mask;		/* Mask for hash function */
uint32_t	vm_ghost_bucket_hash;		/* Basic bucket hash */


/* Bit masks for the 4 pages tracked by a single ghost entry. */
int pg_masks[4] = {
	0x1, 0x2, 0x4, 0x8
};
77
78
/*
 * Hash a (ghost object id, ghost offset) pair into an index into
 * vm_phantom_cache_hash[].  vm_ghost_bucket_hash is a multiplicative
 * constant derived from the table size in vm_phantom_cache_init();
 * vm_ghost_hash_mask reduces the result modulo the (power of 2)
 * table size.
 */
#define vm_phantom_hash(obj_id, offset) (\
		( (natural_t)((uintptr_t)obj_id * vm_ghost_bucket_hash) + (offset ^ vm_ghost_bucket_hash)) & vm_ghost_hash_mask)


/*
 * Event counters for debugging/inspection; they are only incremented,
 * never consulted, by the code in this file.
 */
struct phantom_cache_stats {
	uint32_t	pcs_wrapped;			/* ghost ring wrapped around */
	uint32_t	pcs_added_page_to_entry;	/* page's bit set in an existing entry */
	uint32_t	pcs_added_new_entry;		/* fresh (unused) ring slot consumed */
	uint32_t	pcs_replaced_entry;		/* in-use ring slot evicted and reused */

	uint32_t	pcs_lookup_found_page_in_cache;
	uint32_t	pcs_lookup_entry_not_in_cache;
	uint32_t	pcs_lookup_page_not_in_entry;

	uint32_t	pcs_updated_phantom_state;	/* ghost page re-read; its bit cleared */
} phantom_cache_stats;
95
96
97void
98vm_phantom_cache_init()
99{
100	unsigned int	num_entries;
101	unsigned int	log1;
102	unsigned int	size;
103
104	num_entries = (uint32_t)(((max_mem / PAGE_SIZE) / 4) / VM_GHOST_PAGES_PER_ENTRY);
105	vm_phantom_cache_num_entries = 1;
106
107	while (vm_phantom_cache_num_entries < num_entries)
108		vm_phantom_cache_num_entries <<= 1;
109
110	vm_phantom_cache_size = sizeof(struct vm_ghost) * vm_phantom_cache_num_entries;
111	vm_phantom_cache_hash_size = sizeof(vm_phantom_hash_entry_t) * vm_phantom_cache_num_entries;
112
113	if (kernel_memory_allocate(kernel_map, (vm_offset_t *)(&vm_phantom_cache), vm_phantom_cache_size, 0, KMA_KOBJECT) != KERN_SUCCESS)
114		panic("vm_phantom_cache_init: kernel_memory_allocate failed\n");
115	bzero(vm_phantom_cache, vm_phantom_cache_size);
116
117	if (kernel_memory_allocate(kernel_map, (vm_offset_t *)(&vm_phantom_cache_hash), vm_phantom_cache_hash_size, 0, KMA_KOBJECT) != KERN_SUCCESS)
118		panic("vm_phantom_cache_init: kernel_memory_allocate failed\n");
119	bzero(vm_phantom_cache_hash, vm_phantom_cache_hash_size);
120
121
122	vm_ghost_hash_mask = vm_phantom_cache_num_entries - 1;
123
124	/*
125	 *	Calculate object_id shift value for hashing algorithm:
126	 *		O = log2(sizeof(struct vm_object))
127	 *		B = log2(vm_page_bucket_count)
128	 *	        hash shifts the object_id left by
129	 *		B/2 - O
130	 */
131	size = vm_phantom_cache_num_entries;
132	for (log1 = 0; size > 1; log1++)
133		size /= 2;
134
135	vm_ghost_bucket_hash = 1 << ((log1 + 1) >> 1);		/* Get (ceiling of sqrt of table size) */
136	vm_ghost_bucket_hash |= 1 << ((log1 + 1) >> 2);		/* Get (ceiling of quadroot of table size) */
137	vm_ghost_bucket_hash |= 1;				/* Set bit and add 1 - always must be 1 to insure unique series */
138
139	if (vm_ghost_hash_mask & vm_phantom_cache_num_entries)
140		printf("vm_phantom_cache_init: WARNING -- strange page hash\n");
141}
142
143
/*
 * Enter page 'm' into the phantom (ghost) cache as it leaves the file
 * cache, so a later re-read of the same page can be detected by
 * vm_phantom_cache_update() and counted as a potential sign of
 * thrashing.
 *
 * NOTE(review): the asserted locks (vm_page queue lock + exclusive
 * object lock) appear to be what serializes updates to the ring and
 * hash table — confirm against the callers.
 */
void
vm_phantom_cache_add_ghost(vm_page_t m)
{
	vm_ghost_t	vpce;
	int		ghost_index;
	int		pg_mask;
	boolean_t	isSSD = FALSE;
	vm_phantom_hash_entry_t ghost_hash_index;

#if MACH_ASSERT || DEBUG
	lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
	vm_object_lock_assert_exclusive(m->object);
#endif

	/* phantom cache disabled / not initialized */
	if (vm_phantom_cache_num_entries == 0)
		return;

	/* bit identifying this page within its (multi-page) ghost entry */
	pg_mask = pg_masks[(m->offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];

	if (m->object->phantom_object_id == 0) {
		/*
		 * First page of this object to reach the phantom cache:
		 * assign an id, and remember whether the backing store is
		 * an SSD since the thrashing thresholds differ by medium.
		 */
		vnode_pager_get_isSSD(m->object->pager, &isSSD);

		if (isSSD == TRUE)
			m->object->phantom_isssd = TRUE;

		m->object->phantom_object_id = vm_phantom_object_id++;

		/*
		 * The 32-bit id counter wrapped; restart well above 0 so
		 * that id 0 keeps meaning "no id assigned".
		 */
		if (vm_phantom_object_id == 0)
			vm_phantom_object_id = VM_PHANTOM_OBJECT_ID_AFTER_WRAP;
	} else {
		if ( (vpce = vm_phantom_cache_lookup_ghost(m, 0)) ) {
			/* an entry for this (object, offset) exists - just set this page's bit */
			vpce->g_pages_held |= pg_mask;

			phantom_cache_stats.pcs_added_page_to_entry++;
			goto done;
		}
	}
	/*
	 * if we're here then the vm_ghost_t of this vm_page_t
	 * is not present in the phantom cache... take the next
	 * available entry in the LRU first evicting the existing
	 * entry if we've wrapped the ring
	 */
	ghost_index = vm_phantom_cache_nindx++;

	/*
	 * Wrap back to 1, not 0: index 0 is reserved as the "empty /
	 * end of chain" sentinel for the hash table and g_next_index.
	 */
	if (vm_phantom_cache_nindx == vm_phantom_cache_num_entries) {
		vm_phantom_cache_nindx = 1;

		phantom_cache_stats.pcs_wrapped++;
	}
	vpce = &vm_phantom_cache[ghost_index];

	if (vpce->g_obj_id) {
		/*
		 * we're going to replace an existing entry
		 * so first remove it from the hash
		 */
		vm_ghost_t	nvpce;

		ghost_hash_index = vm_phantom_hash(vpce->g_obj_id, vpce->g_obj_offset);

		nvpce = &vm_phantom_cache[vm_phantom_cache_hash[ghost_hash_index]];

		if (nvpce == vpce) {
			/* victim is at the head of its hash chain */
			vm_phantom_cache_hash[ghost_hash_index] = vpce->g_next_index;
		} else {
			/* walk the chain until we find and unlink the victim */
			for (;;) {
				if (nvpce->g_next_index == 0)
					panic("didn't find ghost in hash\n");

				if (&vm_phantom_cache[nvpce->g_next_index] == vpce) {
					nvpce->g_next_index = vpce->g_next_index;
					break;
				}
				nvpce = &vm_phantom_cache[nvpce->g_next_index];
			}
		}
		phantom_cache_stats.pcs_replaced_entry++;
	} else
		phantom_cache_stats.pcs_added_new_entry++;

	/* initialize the entry and link it at the head of its hash chain */
	vpce->g_pages_held = pg_mask;
	vpce->g_obj_offset = (m->offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK;
	vpce->g_obj_id = m->object->phantom_object_id;

	ghost_hash_index = vm_phantom_hash(vpce->g_obj_id, vpce->g_obj_offset);
	vpce->g_next_index = vm_phantom_cache_hash[ghost_hash_index];
	vm_phantom_cache_hash[ghost_hash_index] = ghost_index;

done:
	if (m->object->phantom_isssd)
		OSAddAtomic(1, &sample_period_ghost_added_count_ssd);
	else
		OSAddAtomic(1, &sample_period_ghost_added_count);
}
240
241
242vm_ghost_t
243vm_phantom_cache_lookup_ghost(vm_page_t m, uint32_t pg_mask)
244{
245	uint64_t	g_obj_offset;
246	uint32_t	g_obj_id;
247	uint32_t	ghost_index;
248
249	if ((g_obj_id = m->object->phantom_object_id) == 0) {
250		/*
251		 * no entries in phantom cache for this object
252		 */
253		return (NULL);
254	}
255	g_obj_offset = (m->offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK;
256
257	ghost_index = vm_phantom_cache_hash[vm_phantom_hash(g_obj_id, g_obj_offset)];
258
259	while (ghost_index) {
260		vm_ghost_t      vpce;
261
262		vpce = &vm_phantom_cache[ghost_index];
263
264		if (vpce->g_obj_id == g_obj_id && vpce->g_obj_offset == g_obj_offset) {
265
266			if (pg_mask == 0 || (vpce->g_pages_held & pg_mask)) {
267				phantom_cache_stats.pcs_lookup_found_page_in_cache++;
268
269				return (vpce);
270			}
271			phantom_cache_stats.pcs_lookup_page_not_in_entry++;
272
273			return (NULL);
274		}
275		ghost_index = vpce->g_next_index;
276	}
277	phantom_cache_stats.pcs_lookup_entry_not_in_cache++;
278
279	return (NULL);
280}
281
282
283
284void
285vm_phantom_cache_update(vm_page_t m)
286{
287	int		pg_mask;
288	vm_ghost_t      vpce;
289
290#if MACH_ASSERT || DEBUG
291	lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
292	vm_object_lock_assert_exclusive(m->object);
293#endif
294
295	if (vm_phantom_cache_num_entries == 0)
296		return;
297
298	pg_mask = pg_masks[(m->offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];
299
300	if ( (vpce = vm_phantom_cache_lookup_ghost(m, pg_mask)) ) {
301
302		vpce->g_pages_held &= ~pg_mask;
303
304		phantom_cache_stats.pcs_updated_phantom_state++;
305
306		if (m->object->phantom_isssd)
307			OSAddAtomic(1, &sample_period_ghost_found_count_ssd);
308		else
309			OSAddAtomic(1, &sample_period_ghost_found_count);
310	}
311}
312
313
#define	PHANTOM_CACHE_DEBUG	1

#if	PHANTOM_CACHE_DEBUG

/* Next slot to fill in the sample_period_ghost_counts ring below. */
int	sample_period_ghost_counts_indx = 0;

/*
 * Debug-only ring of snapshots of the per-interval counters for the
 * last 256 sample periods, recorded by vm_phantom_cache_check_pressure().
 */
struct {
	uint32_t	added;
	uint32_t	found;
	uint32_t	added_ssd;
	uint32_t	found_ssd;
	uint32_t	elapsed_ms;
	boolean_t	pressure_detected;
} sample_period_ghost_counts[256];

#endif
330
331/*
332 * Determine if the file cache is thrashing from sampling interval statistics.
333 *
334 * Pages added to the phantom cache = pages evicted from the file cache.
335 * Pages found in the phantom cache = reads of pages that were recently evicted.
336 * Threshold is the latency-dependent number of reads we consider thrashing.
337 */
338static boolean_t
339is_thrashing(uint32_t added, uint32_t found, uint32_t threshold)
340{
341	/* Ignore normal activity below the threshold. */
342	if (added < threshold || found < threshold)
343		return FALSE;
344
345	/*
346	 * When thrashing in a way that we can mitigate, most of the pages read
347	 * into the file cache were recently evicted, and 'found' will be close
348	 * to 'added'.
349	 *
350	 * When replacing the current working set because a new app is
351	 * launched, we see very high read traffic with sporadic phantom cache
352	 * hits.
353	 *
354	 * This is not thrashing, or freeing up memory wouldn't help much
355	 * anyway.
356	 */
357	if (found < added / 2)
358		return FALSE;
359
360	return TRUE;
361}
362
363/*
364 * the following function is never called
365 * from multiple threads simultaneously due
366 * to a condition variable used to serialize
367 * at the compressor level... thus no need
368 * to provide locking for the sample processing
369 */
370boolean_t
371vm_phantom_cache_check_pressure()
372{
373        clock_sec_t	cur_ts_sec;
374        clock_nsec_t	cur_ts_nsec;
375	uint64_t	elapsed_msecs_in_eval;
376	boolean_t	pressure_detected = FALSE;
377
378	clock_get_system_nanotime(&cur_ts_sec, &cur_ts_nsec);
379
380	elapsed_msecs_in_eval = vm_compressor_compute_elapsed_msecs(cur_ts_sec, cur_ts_nsec, pc_start_of_eval_period_sec, pc_start_of_eval_period_nsec);
381
382	/*
383	 * Reset evaluation period after phantom_cache_eval_period_in_msecs or
384	 * whenever vm_phantom_cache_restart_sample has been called.
385	 */
386	if (elapsed_msecs_in_eval >= phantom_cache_eval_period_in_msecs) {
387		pc_need_eval_reset = TRUE;
388	}
389
390	if (pc_need_eval_reset == TRUE) {
391
392#if PHANTOM_CACHE_DEBUG
393		/*
394		 * maintain some info about the last 256 sample periods
395		 */
396		sample_period_ghost_counts[sample_period_ghost_counts_indx].added = sample_period_ghost_added_count;
397		sample_period_ghost_counts[sample_period_ghost_counts_indx].found = sample_period_ghost_found_count;
398		sample_period_ghost_counts[sample_period_ghost_counts_indx].added_ssd = sample_period_ghost_added_count_ssd;
399		sample_period_ghost_counts[sample_period_ghost_counts_indx].found_ssd = sample_period_ghost_found_count_ssd;
400		sample_period_ghost_counts[sample_period_ghost_counts_indx].elapsed_ms = (uint32_t)elapsed_msecs_in_eval;
401
402		sample_period_ghost_counts_indx++;
403
404		if (sample_period_ghost_counts_indx >= 256)
405			sample_period_ghost_counts_indx = 0;
406#endif
407		sample_period_ghost_added_count = 0;
408		sample_period_ghost_found_count = 0;
409		sample_period_ghost_added_count_ssd = 0;
410		sample_period_ghost_found_count_ssd = 0;
411
412		pc_start_of_eval_period_sec = cur_ts_sec;
413		pc_start_of_eval_period_nsec = cur_ts_nsec;
414		pc_history <<= 1;
415		pc_need_eval_reset = FALSE;
416	} else {
417		/*
418		 * Since the trashing rate is really a function of the read latency of the disk
419		 * we have to consider both the SSD and spinning disk case since the file cache
420		 * could be backed by either or even both flavors.  When the object is first
421		 * assigned a phantom_object_id, we query the pager to determine if the backing
422		 * backing media is an SSD and remember that answer in the vm_object.  We use
423		 * that info to maintains counts for both the SSD and spinning disk cases.
424		 */
425		if (is_thrashing(sample_period_ghost_added_count,
426				 sample_period_ghost_found_count,
427				 phantom_cache_thrashing_threshold) ||
428		    is_thrashing(sample_period_ghost_added_count_ssd,
429				 sample_period_ghost_found_count_ssd,
430				 phantom_cache_thrashing_threshold_ssd)) {
431			/* Thrashing in the current period: Set bit 0. */
432			pc_history |= 1;
433		}
434	}
435
436	/*
437	 * Declare pressure_detected after phantom_cache_contiguous_periods.
438	 *
439	 * Create a bitmask with the N low bits set. These bits must all be set
440	 * in pc_history. The high bits of pc_history are ignored.
441	 */
442	uint32_t bitmask = (1u << phantom_cache_contiguous_periods) - 1;
443	if ((pc_history & bitmask) == bitmask)
444		pressure_detected = TRUE;
445
446	if (vm_page_external_count > ((AVAILABLE_MEMORY) * 50) / 100)
447		pressure_detected = FALSE;
448
449#if PHANTOM_CACHE_DEBUG
450	sample_period_ghost_counts[sample_period_ghost_counts_indx].pressure_detected = pressure_detected;
451#endif
452	return (pressure_detected);
453}
454
455/*
456 * Restart the current sampling because conditions have changed significantly,
457 * and we don't want to react to old data.
458 *
459 * This function can be called from any thread.
460 */
461void
462vm_phantom_cache_restart_sample(void)
463{
464	pc_need_eval_reset = TRUE;
465}
466