1/* 2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29#include <vm/vm_page.h> 30#include <vm/vm_object.h> 31#include <vm/vm_kern.h> 32#include <vm/vm_pageout.h> 33#include <vm/vm_phantom_cache.h> 34#include <vm/vm_compressor.h> 35 36 37uint32_t phantom_cache_eval_period_in_msecs = 250; 38uint32_t phantom_cache_thrashing_threshold_ssd = 1000; 39uint32_t phantom_cache_thrashing_threshold = 100; 40 41/* 42 * Number of consecutive thrashing periods required before 43 * vm_phantom_cache_check_pressure() returns true. 44 */ 45unsigned phantom_cache_contiguous_periods = 2; 46 47clock_sec_t pc_start_of_eval_period_sec = 0; 48clock_nsec_t pc_start_of_eval_period_nsec = 0; 49boolean_t pc_need_eval_reset = FALSE; 50 51/* One bit per recent sampling period. Bit 0 = current period. */ 52uint32_t pc_history = 0; 53 54uint32_t sample_period_ghost_added_count = 0; 55uint32_t sample_period_ghost_added_count_ssd = 0; 56uint32_t sample_period_ghost_found_count = 0; 57uint32_t sample_period_ghost_found_count_ssd = 0; 58 59uint32_t vm_phantom_object_id = 1; 60#define VM_PHANTOM_OBJECT_ID_AFTER_WRAP 1000000 61 62vm_ghost_t vm_phantom_cache; 63uint32_t vm_phantom_cache_nindx = 1; 64uint32_t vm_phantom_cache_num_entries = 0; 65uint32_t vm_phantom_cache_size; 66 67typedef uint32_t vm_phantom_hash_entry_t; 68vm_phantom_hash_entry_t *vm_phantom_cache_hash; 69uint32_t vm_phantom_cache_hash_size; 70uint32_t vm_ghost_hash_mask; /* Mask for hash function */ 71uint32_t vm_ghost_bucket_hash; /* Basic bucket hash */ 72 73 74int pg_masks[4] = { 75 0x1, 0x2, 0x4, 0x8 76}; 77 78 79#define vm_phantom_hash(obj_id, offset) (\ 80 ( (natural_t)((uintptr_t)obj_id * vm_ghost_bucket_hash) + (offset ^ vm_ghost_bucket_hash)) & vm_ghost_hash_mask) 81 82 83struct phantom_cache_stats { 84 uint32_t pcs_wrapped; 85 uint32_t pcs_added_page_to_entry; 86 uint32_t pcs_added_new_entry; 87 uint32_t pcs_replaced_entry; 88 89 uint32_t pcs_lookup_found_page_in_cache; 90 uint32_t pcs_lookup_entry_not_in_cache; 91 uint32_t pcs_lookup_page_not_in_entry; 92 93 uint32_t pcs_updated_phantom_state; 94} phantom_cache_stats; 95 96 97void 98vm_phantom_cache_init() 99{ 100 unsigned int num_entries; 101 unsigned int log1; 102 unsigned int size; 103 104 num_entries = (uint32_t)(((max_mem / PAGE_SIZE) / 4) / VM_GHOST_PAGES_PER_ENTRY); 105 vm_phantom_cache_num_entries = 1; 106 107 while (vm_phantom_cache_num_entries < num_entries) 108 vm_phantom_cache_num_entries <<= 1; 109 110 vm_phantom_cache_size = sizeof(struct vm_ghost) * vm_phantom_cache_num_entries; 111 vm_phantom_cache_hash_size = sizeof(vm_phantom_hash_entry_t) * vm_phantom_cache_num_entries; 112 113 if (kernel_memory_allocate(kernel_map, (vm_offset_t *)(&vm_phantom_cache), vm_phantom_cache_size, 0, KMA_KOBJECT) != KERN_SUCCESS) 114 panic("vm_phantom_cache_init: kernel_memory_allocate failed\n"); 115 bzero(vm_phantom_cache, vm_phantom_cache_size); 116 117 if (kernel_memory_allocate(kernel_map, (vm_offset_t *)(&vm_phantom_cache_hash), vm_phantom_cache_hash_size, 0, KMA_KOBJECT) != KERN_SUCCESS) 118 panic("vm_phantom_cache_init: kernel_memory_allocate failed\n"); 119 bzero(vm_phantom_cache_hash, vm_phantom_cache_hash_size); 120 121 122 vm_ghost_hash_mask = vm_phantom_cache_num_entries - 1; 123 124 /* 125 * Calculate object_id shift value for hashing algorithm: 126 * O = log2(sizeof(struct vm_object)) 127 * B = log2(vm_page_bucket_count) 128 * hash shifts the object_id left by 129 * B/2 - O 130 */ 131 size = vm_phantom_cache_num_entries; 132 for (log1 = 0; size > 1; log1++) 133 size /= 2; 134 135 vm_ghost_bucket_hash = 1 << ((log1 + 1) >> 1); /* Get (ceiling of sqrt of table size) */ 136 vm_ghost_bucket_hash |= 1 << ((log1 + 1) >> 2); /* Get (ceiling of quadroot of table size) */ 137 vm_ghost_bucket_hash |= 1; /* Set bit and add 1 - always must be 1 to insure unique series */ 138 139 if (vm_ghost_hash_mask & vm_phantom_cache_num_entries) 140 printf("vm_phantom_cache_init: WARNING -- strange page hash\n"); 141} 142 143 144void 145vm_phantom_cache_add_ghost(vm_page_t m) 146{ 147 vm_ghost_t vpce; 148 int ghost_index; 149 int pg_mask; 150 boolean_t isSSD = FALSE; 151 vm_phantom_hash_entry_t ghost_hash_index; 152 153#if MACH_ASSERT || DEBUG 154 lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); 155 vm_object_lock_assert_exclusive(m->object); 156#endif 157 158 if (vm_phantom_cache_num_entries == 0) 159 return; 160 161 pg_mask = pg_masks[(m->offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK]; 162 163 if (m->object->phantom_object_id == 0) { 164 165 vnode_pager_get_isSSD(m->object->pager, &isSSD); 166 167 if (isSSD == TRUE) 168 m->object->phantom_isssd = TRUE; 169 170 m->object->phantom_object_id = vm_phantom_object_id++; 171 172 if (vm_phantom_object_id == 0) 173 vm_phantom_object_id = VM_PHANTOM_OBJECT_ID_AFTER_WRAP; 174 } else { 175 if ( (vpce = vm_phantom_cache_lookup_ghost(m, 0)) ) { 176 vpce->g_pages_held |= pg_mask; 177 178 phantom_cache_stats.pcs_added_page_to_entry++; 179 goto done; 180 } 181 } 182 /* 183 * if we're here then the vm_ghost_t of this vm_page_t 184 * is not present in the phantom cache... take the next 185 * available entry in the LRU first evicting the existing 186 * entry if we've wrapped the ring 187 */ 188 ghost_index = vm_phantom_cache_nindx++; 189 190 if (vm_phantom_cache_nindx == vm_phantom_cache_num_entries) { 191 vm_phantom_cache_nindx = 1; 192 193 phantom_cache_stats.pcs_wrapped++; 194 } 195 vpce = &vm_phantom_cache[ghost_index]; 196 197 if (vpce->g_obj_id) { 198 /* 199 * we're going to replace an existing entry 200 * so first remove it from the hash 201 */ 202 vm_ghost_t nvpce; 203 204 ghost_hash_index = vm_phantom_hash(vpce->g_obj_id, vpce->g_obj_offset); 205 206 nvpce = &vm_phantom_cache[vm_phantom_cache_hash[ghost_hash_index]]; 207 208 if (nvpce == vpce) { 209 vm_phantom_cache_hash[ghost_hash_index] = vpce->g_next_index; 210 } else { 211 for (;;) { 212 if (nvpce->g_next_index == 0) 213 panic("didn't find ghost in hash\n"); 214 215 if (&vm_phantom_cache[nvpce->g_next_index] == vpce) { 216 nvpce->g_next_index = vpce->g_next_index; 217 break; 218 } 219 nvpce = &vm_phantom_cache[nvpce->g_next_index]; 220 } 221 } 222 phantom_cache_stats.pcs_replaced_entry++; 223 } else 224 phantom_cache_stats.pcs_added_new_entry++; 225 226 vpce->g_pages_held = pg_mask; 227 vpce->g_obj_offset = (m->offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK; 228 vpce->g_obj_id = m->object->phantom_object_id; 229 230 ghost_hash_index = vm_phantom_hash(vpce->g_obj_id, vpce->g_obj_offset); 231 vpce->g_next_index = vm_phantom_cache_hash[ghost_hash_index]; 232 vm_phantom_cache_hash[ghost_hash_index] = ghost_index; 233 234done: 235 if (m->object->phantom_isssd) 236 OSAddAtomic(1, &sample_period_ghost_added_count_ssd); 237 else 238 OSAddAtomic(1, &sample_period_ghost_added_count); 239} 240 241 242vm_ghost_t 243vm_phantom_cache_lookup_ghost(vm_page_t m, uint32_t pg_mask) 244{ 245 uint64_t g_obj_offset; 246 uint32_t g_obj_id; 247 uint32_t ghost_index; 248 249 if ((g_obj_id = m->object->phantom_object_id) == 0) { 250 /* 251 * no entries in phantom cache for this object 252 */ 253 return (NULL); 254 } 255 g_obj_offset = (m->offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK; 256 257 ghost_index = vm_phantom_cache_hash[vm_phantom_hash(g_obj_id, g_obj_offset)]; 258 259 while (ghost_index) { 260 vm_ghost_t vpce; 261 262 vpce = &vm_phantom_cache[ghost_index]; 263 264 if (vpce->g_obj_id == g_obj_id && vpce->g_obj_offset == g_obj_offset) { 265 266 if (pg_mask == 0 || (vpce->g_pages_held & pg_mask)) { 267 phantom_cache_stats.pcs_lookup_found_page_in_cache++; 268 269 return (vpce); 270 } 271 phantom_cache_stats.pcs_lookup_page_not_in_entry++; 272 273 return (NULL); 274 } 275 ghost_index = vpce->g_next_index; 276 } 277 phantom_cache_stats.pcs_lookup_entry_not_in_cache++; 278 279 return (NULL); 280} 281 282 283 284void 285vm_phantom_cache_update(vm_page_t m) 286{ 287 int pg_mask; 288 vm_ghost_t vpce; 289 290#if MACH_ASSERT || DEBUG 291 lck_mtx_assert(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); 292 vm_object_lock_assert_exclusive(m->object); 293#endif 294 295 if (vm_phantom_cache_num_entries == 0) 296 return; 297 298 pg_mask = pg_masks[(m->offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK]; 299 300 if ( (vpce = vm_phantom_cache_lookup_ghost(m, pg_mask)) ) { 301 302 vpce->g_pages_held &= ~pg_mask; 303 304 phantom_cache_stats.pcs_updated_phantom_state++; 305 306 if (m->object->phantom_isssd) 307 OSAddAtomic(1, &sample_period_ghost_found_count_ssd); 308 else 309 OSAddAtomic(1, &sample_period_ghost_found_count); 310 } 311} 312 313 314#define PHANTOM_CACHE_DEBUG 1 315 316#if PHANTOM_CACHE_DEBUG 317 318int sample_period_ghost_counts_indx = 0; 319 320struct { 321 uint32_t added; 322 uint32_t found; 323 uint32_t added_ssd; 324 uint32_t found_ssd; 325 uint32_t elapsed_ms; 326 boolean_t pressure_detected; 327} sample_period_ghost_counts[256]; 328 329#endif 330 331/* 332 * Determine if the file cache is thrashing from sampling interval statistics. 333 * 334 * Pages added to the phantom cache = pages evicted from the file cache. 335 * Pages found in the phantom cache = reads of pages that were recently evicted. 336 * Threshold is the latency-dependent number of reads we consider thrashing. 337 */ 338static boolean_t 339is_thrashing(uint32_t added, uint32_t found, uint32_t threshold) 340{ 341 /* Ignore normal activity below the threshold. */ 342 if (added < threshold || found < threshold) 343 return FALSE; 344 345 /* 346 * When thrashing in a way that we can mitigate, most of the pages read 347 * into the file cache were recently evicted, and 'found' will be close 348 * to 'added'. 349 * 350 * When replacing the current working set because a new app is 351 * launched, we see very high read traffic with sporadic phantom cache 352 * hits. 353 * 354 * This is not thrashing, or freeing up memory wouldn't help much 355 * anyway. 356 */ 357 if (found < added / 2) 358 return FALSE; 359 360 return TRUE; 361} 362 363/* 364 * the following function is never called 365 * from multiple threads simultaneously due 366 * to a condition variable used to serialize 367 * at the compressor level... thus no need 368 * to provide locking for the sample processing 369 */ 370boolean_t 371vm_phantom_cache_check_pressure() 372{ 373 clock_sec_t cur_ts_sec; 374 clock_nsec_t cur_ts_nsec; 375 uint64_t elapsed_msecs_in_eval; 376 boolean_t pressure_detected = FALSE; 377 378 clock_get_system_nanotime(&cur_ts_sec, &cur_ts_nsec); 379 380 elapsed_msecs_in_eval = vm_compressor_compute_elapsed_msecs(cur_ts_sec, cur_ts_nsec, pc_start_of_eval_period_sec, pc_start_of_eval_period_nsec); 381 382 /* 383 * Reset evaluation period after phantom_cache_eval_period_in_msecs or 384 * whenever vm_phantom_cache_restart_sample has been called. 385 */ 386 if (elapsed_msecs_in_eval >= phantom_cache_eval_period_in_msecs) { 387 pc_need_eval_reset = TRUE; 388 } 389 390 if (pc_need_eval_reset == TRUE) { 391 392#if PHANTOM_CACHE_DEBUG 393 /* 394 * maintain some info about the last 256 sample periods 395 */ 396 sample_period_ghost_counts[sample_period_ghost_counts_indx].added = sample_period_ghost_added_count; 397 sample_period_ghost_counts[sample_period_ghost_counts_indx].found = sample_period_ghost_found_count; 398 sample_period_ghost_counts[sample_period_ghost_counts_indx].added_ssd = sample_period_ghost_added_count_ssd; 399 sample_period_ghost_counts[sample_period_ghost_counts_indx].found_ssd = sample_period_ghost_found_count_ssd; 400 sample_period_ghost_counts[sample_period_ghost_counts_indx].elapsed_ms = (uint32_t)elapsed_msecs_in_eval; 401 402 sample_period_ghost_counts_indx++; 403 404 if (sample_period_ghost_counts_indx >= 256) 405 sample_period_ghost_counts_indx = 0; 406#endif 407 sample_period_ghost_added_count = 0; 408 sample_period_ghost_found_count = 0; 409 sample_period_ghost_added_count_ssd = 0; 410 sample_period_ghost_found_count_ssd = 0; 411 412 pc_start_of_eval_period_sec = cur_ts_sec; 413 pc_start_of_eval_period_nsec = cur_ts_nsec; 414 pc_history <<= 1; 415 pc_need_eval_reset = FALSE; 416 } else { 417 /* 418 * Since the trashing rate is really a function of the read latency of the disk 419 * we have to consider both the SSD and spinning disk case since the file cache 420 * could be backed by either or even both flavors. When the object is first 421 * assigned a phantom_object_id, we query the pager to determine if the backing 422 * backing media is an SSD and remember that answer in the vm_object. We use 423 * that info to maintains counts for both the SSD and spinning disk cases. 424 */ 425 if (is_thrashing(sample_period_ghost_added_count, 426 sample_period_ghost_found_count, 427 phantom_cache_thrashing_threshold) || 428 is_thrashing(sample_period_ghost_added_count_ssd, 429 sample_period_ghost_found_count_ssd, 430 phantom_cache_thrashing_threshold_ssd)) { 431 /* Thrashing in the current period: Set bit 0. */ 432 pc_history |= 1; 433 } 434 } 435 436 /* 437 * Declare pressure_detected after phantom_cache_contiguous_periods. 438 * 439 * Create a bitmask with the N low bits set. These bits must all be set 440 * in pc_history. The high bits of pc_history are ignored. 441 */ 442 uint32_t bitmask = (1u << phantom_cache_contiguous_periods) - 1; 443 if ((pc_history & bitmask) == bitmask) 444 pressure_detected = TRUE; 445 446 if (vm_page_external_count > ((AVAILABLE_MEMORY) * 50) / 100) 447 pressure_detected = FALSE; 448 449#if PHANTOM_CACHE_DEBUG 450 sample_period_ghost_counts[sample_period_ghost_counts_indx].pressure_detected = pressure_detected; 451#endif 452 return (pressure_detected); 453} 454 455/* 456 * Restart the current sampling because conditions have changed significantly, 457 * and we don't want to react to old data. 458 * 459 * This function can be called from any thread. 460 */ 461void 462vm_phantom_cache_restart_sample(void) 463{ 464 pc_need_eval_reset = TRUE; 465} 466