1/* 2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 
25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29 30#ifndef _I386_PMAP_INTERNAL_ 31#define _I386_PMAP_INTERNAL_ 32#ifdef MACH_KERNEL_PRIVATE 33 34#include <vm/pmap.h> 35#include <sys/kdebug.h> 36#include <kern/ledger.h> 37 38/* 39 * pmap locking 40 */ 41 42#define PMAP_LOCK(pmap) { \ 43 simple_lock(&(pmap)->lock); \ 44} 45 46#define PMAP_UNLOCK(pmap) { \ 47 simple_unlock(&(pmap)->lock); \ 48} 49 50#define PMAP_UPDATE_TLBS(pmap, s, e) \ 51 pmap_flush_tlbs(pmap, s, e, 0, NULL) 52 53 54#define PMAP_DELAY_TLB_FLUSH 0x01 55 56#define PMAP_UPDATE_TLBS_DELAYED(pmap, s, e, c) \ 57 pmap_flush_tlbs(pmap, s, e, PMAP_DELAY_TLB_FLUSH, c) 58 59 60#define iswired(pte) ((pte) & INTEL_PTE_WIRED) 61 62#ifdef PMAP_TRACES 63extern boolean_t pmap_trace; 64#define PMAP_TRACE(x,a,b,c,d,e) \ 65 if (pmap_trace) { \ 66 KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e); \ 67 } 68#else 69#define PMAP_TRACE(x,a,b,c,d,e) KERNEL_DEBUG(x,a,b,c,d,e) 70#endif /* PMAP_TRACES */ 71 72#define PMAP_TRACE_CONSTANT(x,a,b,c,d,e) \ 73 KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e); \ 74 75kern_return_t pmap_expand_pml4( 76 pmap_t map, 77 vm_map_offset_t v, 78 unsigned int options); 79 80kern_return_t pmap_expand_pdpt( 81 pmap_t map, 82 vm_map_offset_t v, 83 unsigned int options); 84 85void phys_attribute_set( 86 ppnum_t phys, 87 int bits); 88 89void pmap_set_reference( 90 ppnum_t pn); 91 92boolean_t phys_page_exists( 93 ppnum_t pn); 94 95void 96pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t, int, pmap_flush_context *); 97 98void 99pmap_update_cache_attributes_locked(ppnum_t, unsigned); 100 101extern const boolean_t cpu_64bit; 102 103/* 104 * Private data structures. 105 */ 106 107/* 108 * For each vm_page_t, there is a list of all currently 109 * valid virtual mappings of that page. An entry is 110 * a pv_rooted_entry_t; the list is the pv_table. 111 * 112 * N.B. 
with the new combo rooted/hashed scheme it is
 * only possible to remove individual non-rooted entries
 * if they are found via the hashed chains as there is no
 * way to unlink the singly linked hashed entries if navigated to
 * via the queue list off the rooted entries.  Think of it as
 * hash/walk/pull, keeping track of the prev pointer while walking
 * the singly linked hash list.  All of this is to save memory and
 * keep both types of pv_entries as small as possible.
 */

/*

PV HASHING Changes - JK 1/2007

Pve's establish physical to virtual mappings.  These are used for aliasing of a
physical page to (potentially many) virtual addresses within pmaps. In the
previous implementation the structure of the pv_entries (each 16 bytes in size) was

typedef struct pv_entry {
    struct pv_entry_t    next;
    pmap_t               pmap;
    vm_map_offset_t      va;
} *pv_entry_t;

An initial array of these is created at boot time, one per physical page of
memory, indexed by the physical page number. Additionally, a pool of entries
is created from a pv_zone to be used as needed by pmap_enter() when it is
creating new mappings.  Originally, we kept this pool around because the code
in pmap_enter() was unable to block if it needed an entry and none were
available - we'd panic.  Some time ago I restructured the pmap_enter() code
so that for user pmaps it can block while zalloc'ing a pv structure and restart,
removing a panic from the code (in the case of the kernel pmap we cannot block
and still panic, so, we keep a separate hot pool for use only on kernel pmaps).
The pool has not been removed since there is a large performance gain keeping
freed pv's around for reuse and not suffering the overhead of zalloc for every
new pv we need.

As pmap_enter() created new mappings it linked the new pve's for them off the
fixed pv array for that ppn (off the next pointer).
These pve's are accessed 151for several operations, one of them being address space teardown. In that case, 152we basically do this 153 154 for (every page/pte in the space) { 155 calc pve_ptr from the ppn in the pte 156 for (every pv in the list for the ppn) { 157 if (this pv is for this pmap/vaddr) { 158 do housekeeping 159 unlink/free the pv 160 } 161 } 162 } 163 164The problem arose when we were running, say 8000 (or even 2000) apache or 165other processes and one or all terminate. The list hanging off each pv array 166entry could have thousands of entries. We were continuously linearly searching 167each of these lists as we stepped through the address space we were tearing 168down. Because of the locks we hold, likely taking a cache miss for each node, 169and interrupt disabling for MP issues the system became completely unresponsive 170for many seconds while we did this. 171 172Realizing that pve's are accessed in two distinct ways (linearly running the 173list by ppn for operations like pmap_page_protect and finding and 174modifying/removing a single pve as part of pmap_enter processing) has led to 175modifying the pve structures and databases. 176 177There are now two types of pve structures. A "rooted" structure which is 178basically the original structure accessed in an array by ppn, and a ''hashed'' 179structure accessed on a hash list via a hash of [pmap, vaddr]. These have been 180designed with the two goals of minimizing wired memory and making the lookup of 181a ppn faster. Since a vast majority of pages in the system are not aliased 182and hence represented by a single pv entry I've kept the rooted entry size as 183small as possible because there is one of these dedicated for every physical 184page of memory. The hashed pve's are larger due to the addition of the hash 185link and the ppn entry needed for matching while running the hash list to find 186the entry we are looking for. 
This way, only systems that have lots of 187aliasing (like 2000+ httpd procs) will pay the extra memory price. Both 188structures have the same first three fields allowing some simplification in 189the code. 190 191They have these shapes 192 193typedef struct pv_rooted_entry { 194 queue_head_t qlink; 195 vm_map_offset_t va; 196 pmap_t pmap; 197} *pv_rooted_entry_t; 198 199 200typedef struct pv_hashed_entry { 201 queue_head_t qlink; 202 vm_map_offset_t va; 203 pmap_t pmap; 204 ppnum_t ppn; 205 struct pv_hashed_entry *nexth; 206} *pv_hashed_entry_t; 207 208The main flow difference is that the code is now aware of the rooted entry and 209the hashed entries. Code that runs the pv list still starts with the rooted 210entry and then continues down the qlink onto the hashed entries. Code that is 211looking up a specific pv entry first checks the rooted entry and then hashes 212and runs the hash list for the match. The hash list lengths are much smaller 213than the original pv lists that contained all aliases for the specific ppn. 
214 215*/ 216 217typedef struct pv_rooted_entry { 218 /* first three entries must match pv_hashed_entry_t */ 219 queue_head_t qlink; 220 vm_map_offset_t va; /* virtual address for mapping */ 221 pmap_t pmap; /* pmap where mapping lies */ 222} *pv_rooted_entry_t; 223 224#define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0) 225 226typedef struct pv_hashed_entry { 227 /* first three entries must match pv_rooted_entry_t */ 228 queue_head_t qlink; 229 vm_map_offset_t va; 230 pmap_t pmap; 231 ppnum_t ppn; 232 struct pv_hashed_entry *nexth; 233} *pv_hashed_entry_t; 234 235#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0) 236 237//#define PV_DEBUG 1 /* uncomment to enable some PV debugging code */ 238#ifdef PV_DEBUG 239#define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized"); 240#else 241#define CHK_NPVHASH(x) 242#endif 243 244#define NPVHASH 4095 /* MUST BE 2^N - 1 */ 245#define PV_HASHED_LOW_WATER_MARK_DEFAULT 5000 246#define PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT 2000 247#define PV_HASHED_ALLOC_CHUNK_INITIAL 2000 248#define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL 200 249 250extern volatile uint32_t mappingrecurse; 251extern uint32_t pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark; 252 253/* 254 * PV hash locking 255 */ 256 257#define LOCK_PV_HASH(hash) lock_hash_hash(hash) 258#define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash) 259extern uint32_t npvhash; 260extern pv_hashed_entry_t *pv_hash_table; /* hash lists */ 261extern pv_hashed_entry_t pv_hashed_free_list; 262extern pv_hashed_entry_t pv_hashed_kern_free_list; 263decl_simple_lock_data(extern, pv_hashed_free_list_lock) 264decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock) 265decl_simple_lock_data(extern, pv_hash_table_lock) 266 267extern zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry 268 * structures */ 269 270extern uint32_t pv_hashed_free_count; 271extern uint32_t pv_hashed_kern_free_count; 272/* 273 * Each entry in the pv_head_table is locked by a bit in the 274 * pv_lock_table. 
 The lock bits are accessed by the address of
 * the frame they lock.
 */
#define pv_lock_table_size(n)	(((n)+BYTE_SIZE-1)/BYTE_SIZE)
#define pv_hash_lock_table_size(n)  (((n)+BYTE_SIZE-1)/BYTE_SIZE)
extern char	*pv_lock_table;		/* pointer to array of bits */
extern char	*pv_hash_lock_table;
extern pv_rooted_entry_t pv_head_table;	/* array of entries, one per page */

/* Event the pv replenish thread waits on; woken when a pool runs low. */
extern event_t mapping_replenish_event;

/*
 * Pop one entry from the user pv_hashed free list into *pvh_ep, but only
 * while the kernel reserve pool is above its low-water mark.  On return
 * *pvh_ep may still be PV_HASHED_ENTRY_NULL; the caller must check and
 * fall back to a synchronous allocation (possibly throttled).
 */
static inline void	PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) {
	pmap_assert(*pvh_ep == PV_HASHED_ENTRY_NULL);
	simple_lock(&pv_hashed_free_list_lock);
	/* If the kernel reserved pool is low, let non-kernel mappings allocate
	 * synchronously, possibly subject to a throttle.
	 */
	if ((pv_hashed_kern_free_count > pv_hashed_kern_low_water_mark) && ((*pvh_ep = pv_hashed_free_list) != 0)) {
		pv_hashed_free_list = (pv_hashed_entry_t)(*pvh_ep)->qlink.next;
		pv_hashed_free_count--;
	}

	simple_unlock(&pv_hashed_free_list_lock);

	/* Low-water recheck is done after dropping the lock; a stale count
	 * only affects this wakeup hint, not list integrity.
	 */
	if (pv_hashed_free_count <= pv_hashed_low_water_mark) {
		if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse))
			thread_wakeup(&mapping_replenish_event);
	}
}

/*
 * Prepend the chain [pvh_eh .. pvh_et] (pv_cnt entries, linked via
 * qlink.next) onto the user pv_hashed free list.
 */
static inline void	PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
	simple_lock(&pv_hashed_free_list_lock);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list;
	pv_hashed_free_list = pvh_eh;
	pv_hashed_free_count += pv_cnt;
	simple_unlock(&pv_hashed_free_list_lock);
}

extern unsigned pmap_kern_reserve_alloc_stat;

/*
 * Pop one entry from the kernel-reserved free list into *pvh_e.
 * Unlike PV_HASHED_ALLOC there is no reserve gate: kernel mappings may
 * drain the pool completely (callers cannot block).
 */
static inline void	PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) {
	pmap_assert(*pvh_e == PV_HASHED_ENTRY_NULL);
	simple_lock(&pv_hashed_kern_free_list_lock);

	if ((*pvh_e = pv_hashed_kern_free_list) != 0) {
		pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next;
		pv_hashed_kern_free_count--;
		pmap_kern_reserve_alloc_stat++;
	}

	simple_unlock(&pv_hashed_kern_free_list_lock);

	if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) {
		if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse))
			thread_wakeup(&mapping_replenish_event);
	}
}

/*
 * Prepend the chain [pvh_eh .. pvh_et] (pv_cnt entries) onto the
 * kernel-reserved free list.
 */
static inline void	PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
	simple_lock(&pv_hashed_kern_free_list_lock);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list;
	pv_hashed_kern_free_list = pvh_eh;
	pv_hashed_kern_free_count += pv_cnt;
	simple_unlock(&pv_hashed_kern_free_list_lock);
}

extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters;
extern event_t pmap_user_pv_throttle_event;

/*
 * Throttle user (non-kernel) pv allocations while the kernel reserve is
 * below half its low-water mark: block for up to 1ms (interval 1 at a
 * scale factor of 1000 * NSEC_PER_USEC), or until the replenish thread
 * posts pmap_user_pv_throttle_event.
 */
static inline void pmap_pv_throttle(__unused pmap_t p) {
	pmap_assert(p != kernel_pmap);
	/* Apply throttle on non-kernel mappings */
	if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) {
		pmap_pv_throttle_stat++;
		/* This doesn't need to be strictly accurate, merely a hint
		 * to eliminate the timeout when the reserve is replenished.
		 */
		pmap_pv_throttled_waiters++;
		assert_wait_timeout(&pmap_user_pv_throttle_event, THREAD_UNINT, 1, 1000 * NSEC_PER_USEC);
		thread_block(THREAD_CONTINUE_NULL);
	}
}

/*
 * Index into pv_head table, its lock bits, and the modify/reference and managed bits
 */

#define pa_index(pa)		(i386_btop(pa))
#define ppn_to_pai(ppn)		((int)ppn)

#define pai_to_pvh(pai)		(&pv_head_table[pai])
#define lock_pvh_pai(pai)	bit_lock(pai, (void *)pv_lock_table)
#define unlock_pvh_pai(pai)	bit_unlock(pai, (void *)pv_lock_table)
#define pvhash(idx)		(&pv_hash_table[idx])
#define lock_hash_hash(hash)	bit_lock(hash, (void *)pv_hash_lock_table)
#define unlock_hash_hash(hash)	bit_unlock(hash, (void *)pv_hash_lock_table)

#define IS_MANAGED_PAGE(x)				\
	((unsigned int)(x) <= last_managed_page &&	\
	 (pmap_phys_attributes[x] & PHYS_MANAGED))
#define IS_INTERNAL_PAGE(x)			\
	(IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_INTERNAL))
#define IS_REUSABLE_PAGE(x)			\
	(IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_REUSABLE))

/*
 * Physical page attributes.  Copy bits from PTE definition.
 */
/* These attribute bits live in pmap_phys_attributes[]; their values are
 * borrowed from PTE bit definitions so they can be copied directly.
 */
#define	PHYS_MODIFIED	INTEL_PTE_MOD	/* page modified */
#define	PHYS_REFERENCED	INTEL_PTE_REF	/* page referenced */
#define PHYS_MANAGED	INTEL_PTE_VALID /* page is managed */
#define PHYS_NOENCRYPT	INTEL_PTE_USER	/* no need to encrypt this page in the hibernation image */
#define	PHYS_NCACHE	INTEL_PTE_NCACHE
#define	PHYS_PTA	INTEL_PTE_PTA
#define	PHYS_CACHEABILITY_MASK (INTEL_PTE_PTA | INTEL_PTE_NCACHE)
#define PHYS_INTERNAL	INTEL_PTE_WTHRU	/* page from internal object */
#define PHYS_REUSABLE	INTEL_PTE_WRITE	/* page is "reusable" */

extern const boolean_t	pmap_disable_kheap_nx;
extern const boolean_t	pmap_disable_kstack_nx;

#define PMAP_EXPAND_OPTIONS_NONE (0x0)
#define PMAP_EXPAND_OPTIONS_NOWAIT (PMAP_OPTIONS_NOWAIT)
#define PMAP_EXPAND_OPTIONS_NOENTER (PMAP_OPTIONS_NOENTER)

/*
 * Amount of virtual memory mapped by one
 * page-directory entry.
 */
#define	PDE_MAPPED_SIZE		(pdetova(1))


/*
 * Locking and TLB invalidation
 */

/*
 * Locking Protocols: (changed 2/2007 JK)
 *
 * There are two structures in the pmap module that need locking:
 * the pmaps themselves, and the per-page pv_lists (which are locked
 * by locking the pv_lock_table entry that corresponds to the pv_head
 * for the list in question.)  Most routines want to lock a pmap and
 * then do operations in it that require pv_list locking -- however
 * pmap_remove_all and pmap_copy_on_write operate on a physical page
 * basis and want to do the locking in the reverse order, i.e. lock
 * a pv_list and then go through all the pmaps referenced by that list.
 *
 * The system wide pmap lock has been removed.  Now, paths take a lock
 * on the pmap before changing its 'shape' and the reverse order lockers
 * (coming in by phys ppn) take a lock on the corresponding pv and then
 * retest to be sure nothing changed during the window before they locked
 * and can then run up/down the pv lists holding the list lock.  This also
 * lets the pmap layer run (nearly completely) interrupt enabled, unlike
 * previously.
 */

/*
 * PV locking
 */

/* Preemption stays disabled for as long as the per-page bit lock is held. */
#define LOCK_PVH(index)	{		\
	mp_disable_preemption();	\
	lock_pvh_pai(index);		\
}

#define UNLOCK_PVH(index) {		\
	unlock_pvh_pai(index);		\
	mp_enable_preemption();		\
}

extern uint64_t pde_mapped_size;

extern char		*pmap_phys_attributes;
extern ppnum_t		last_managed_page;

extern ppnum_t	lowest_lo;
extern ppnum_t	lowest_hi;
extern ppnum_t	highest_hi;

/*
 * when spinning through pmap_remove
 * ensure that we don't spend too much
 * time with preemption disabled.
 * I'm setting the current threshold
 * to 20us
 */
#define MAX_PREEMPTION_LATENCY_NS 20000
extern uint64_t max_preemption_latency_tsc;

/* #define DEBUGINTERRUPTS 1  uncomment to ensure pmap callers have interrupts enabled */
#ifdef DEBUGINTERRUPTS
#define pmap_intr_assert() {							\
	if (processor_avail_count > 1 && !ml_get_interrupts_enabled())		\
		panic("pmap interrupt assert %s, %d",__FILE__, __LINE__);	\
}
#else
#define pmap_intr_assert()
#endif

extern int		nx_enabled;
extern unsigned int    inuse_ptepages_count;

/*
 * Hash bucket index for a [pmap, va] pair: XOR of the pmap pointer with
 * the virtual page number, masked by npvhash (table size is 2^n - 1,
 * see NPVHASH).  The "& 0xFFFFFFFF" is a no-op after the uint32_t cast,
 * kept for emphasis.
 */
static inline uint32_t
pvhashidx(pmap_t pmap, vm_map_offset_t va)
{
	return ((uint32_t)(uintptr_t)pmap ^
		((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
	       npvhash;
}


/*
 * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain.
 * properly deals with the anchor.
 * must be called with the hash locked, does not unlock it
 */
static inline void
pmap_pvh_unlink(pv_hashed_entry_t pvh)
{
	pv_hashed_entry_t	curh;
	pv_hashed_entry_t	*pprevh;
	int			pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh->pmap, pvh->va);

	pprevh = pvhash(pvhash_idx);

#if PV_DEBUG
	if (NULL == *pprevh)
		panic("pvh_unlink null anchor"); /* JK DEBUG */
#endif
	curh = *pprevh;

	/* Walk the singly linked chain tracking the previous link so the
	 * entry can be spliced out when found.
	 */
	while (PV_HASHED_ENTRY_NULL != curh) {
		if (pvh == curh)
			break;
		pprevh = &curh->nexth;
		curh = curh->nexth;
	}
	/* The entry must be present; its absence indicates corruption. */
	if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh");
	*pprevh = pvh->nexth;
	return;
}

/*
 * Insert pvh_e onto both the rooted entry's circular qlink list and the
 * head of its hash chain.  Takes/drops the hash-bucket lock internally;
 * the caller is expected to hold the pv_head (PVH) lock for the page —
 * confirm at call sites.
 */
static inline void
pv_hash_add(pv_hashed_entry_t	pvh_e,
	    pv_rooted_entry_t	pv_h)
{
	pv_hashed_entry_t       *hashp;
	int                     pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	insque(&pvh_e->qlink, &pv_h->qlink);
	hashp = pvhash(pvhash_idx);
#if PV_DEBUG
	if (NULL==hashp)
		panic("pv_hash_add(%p) null hash bucket", pvh_e);
#endif
	pvh_e->nexth = *hashp;
	*hashp = pvh_e;
	UNLOCK_PV_HASH(pvhash_idx);
}

/*
 * Remove pvh_e from both the rooted qlink list and its hash chain.
 * Takes/drops the hash-bucket lock internally.
 */
static inline void
pv_hash_remove(pv_hashed_entry_t pvh_e)
{
	int                     pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	remque(&pvh_e->qlink);
	pmap_pvh_unlink(pvh_e);
	UNLOCK_PV_HASH(pvhash_idx);
}

/*
 * True when "distance" has at most one bit set (note: this includes
 * zero), i.e. the XOR of two values differs by a single bit flip or
 * not at all.
 */
static inline boolean_t popcnt1(uint64_t distance) {
	return ((distance & (distance - 1)) == 0);
}

/*
 * Routines to handle suppression of/recovery from some forms of pagetable corruption
 * incidents observed in the field.  These can be either software induced (wild
 * stores to the mapwindows where applicable, use after free errors
 * (typically of pages addressed physically), mis-directed DMAs etc., or due
 * to DRAM/memory hierarchy/interconnect errors.  Given the theoretical rarity of these errors,
 * the recording mechanism is deliberately not MP-safe.  The overarching goal is to
 * still assert on potential software races, but attempt recovery from incidents
 * identifiable as occurring due to issues beyond the control of the pmap module.
 * The latter includes single-bit errors and malformed pagetable entries.
 * We currently limit ourselves to recovery/suppression of one incident per
 * PMAP_PAGETABLE_CORRUPTION_INTERVAL seconds, and details of the incident
 * are logged.
 * Assertions are not suppressed if kernel debugging is enabled. (DRK 09)
 */

/* Classification of a corruption incident (what looked wrong). */
typedef enum {
	PTE_VALID		= 0x0,
	PTE_INVALID		= 0x1,
	PTE_RSVD		= 0x2,
	PTE_SUPERVISOR		= 0x4,
	PTE_BITFLIP		= 0x8,
	PV_BITFLIP		= 0x10,
	PTE_INVALID_CACHEABILITY = 0x20
} pmap_pagetable_corruption_t;

/* Whether the rooted pv entry for the page was present when detected. */
typedef enum {
	ROOT_PRESENT = 0,
	ROOT_ABSENT = 1
} pmap_pv_assertion_t;

/* Recovery disposition returned to the caller. */
typedef enum {
	PMAP_ACTION_IGNORE	= 0x0,
	PMAP_ACTION_ASSERT	= 0x1,
	PMAP_ACTION_RETRY	= 0x2,
	PMAP_ACTION_RETRY_RELOCK = 0x4
} pmap_pagetable_corruption_action_t;

/* At most one recovery/suppression per this many seconds (6 hours). */
#define	PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL)
extern uint64_t pmap_pagetable_corruption_interval_abstime;

extern uint32_t pmap_pagetable_corruption_incidents;
#define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8)
/* One ring-buffer record per incident; see pmap_pagetable_corruption_log. */
typedef struct {
	pmap_pv_assertion_t incident;
	pmap_pagetable_corruption_t reason;
	pmap_pagetable_corruption_action_t action;
	pmap_t	pmap;
	vm_map_offset_t vaddr;
	pt_entry_t pte;
	ppnum_t ppn;
	pmap_t pvpmap;
	vm_map_offset_t pvva;
	uint64_t abstime;
} pmap_pagetable_corruption_record_t;

extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[];
extern uint64_t pmap_pagetable_corruption_last_abstime;
extern thread_call_t	pmap_pagetable_corruption_log_call;
extern boolean_t pmap_pagetable_corruption_timeout;

/*
 * Record one corruption incident in the ring buffer (wraps at
 * PMAP_PAGETABLE_CORRUPTION_MAX_LOG) and schedule asynchronous logging.
 * Deliberately not MP-safe (see block comment above).
 */
static inline void
pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) {
	uint32_t pmap_pagetable_corruption_log_index;
	pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time();
	/* Asynchronously log */
	thread_call_enter(pmap_pagetable_corruption_log_call);
}

/*
 * Heuristically classify a pagetable/PV inconsistency and decide how the
 * caller should proceed.  Called with the PVH lock for *ppnp held; in the
 * PMAP_ACTION_RETRY_RELOCK case that lock has been DROPPED before return
 * and *ppnp updated to the corrected page number — the caller must
 * re-take the lock.  All other cases return with locking unchanged.
 */
static inline pmap_pagetable_corruption_action_t
pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) {
	pmap_pagetable_corruption_action_t	action = PMAP_ACTION_ASSERT;
	pmap_pagetable_corruption_t	suppress_reason = PTE_VALID;
	ppnum_t suppress_ppn = 0;	/* recorded for debugging; not otherwise read here */
	pt_entry_t cpte = *ptep;
	ppnum_t	cpn = pa_index(pte_to_pa(cpte));
	ppnum_t	ppn = *ppnp;
	pv_rooted_entry_t	pv_h = pai_to_pvh(ppn_to_pai(ppn));
	pv_rooted_entry_t	pv_e = pv_h;
	uint32_t	bitdex;
	pmap_t pvpmap = pv_h->pmap;
	vm_map_offset_t pvva = pv_h->va;
	boolean_t ppcd = FALSE;

	/* Ideally, we'd consult the Mach VM here to definitively determine
	 * the nature of the mapping for this address space and address.
	 * As that would be a layering violation in this context, we
	 * use various heuristics to recover from single bit errors,
	 * malformed pagetable entries etc. These are not intended
	 * to be comprehensive.
	 */

	/* As a precautionary measure, mark A+D */
	pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED);

	/*
	 * Correct potential single bit errors in either (but not both) element
	 * of the PV
	 */
	do {
		if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && pv_e->va == vaddr) ||
		    (pv_e->pmap == pmap && popcnt1(pv_e->va ^ vaddr))) {
			pv_e->pmap = pmap;
			pv_e->va = vaddr;
			suppress_reason = PV_BITFLIP;
			action = PMAP_ACTION_RETRY;
			goto pmap_cpc_exit;
		}
	} while (((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink))) && (pv_e != pv_h));

	/* Discover root entries with a Hamming
	 * distance of 1 from the supplied
	 * physical page frame.
	 */
	for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) {
		ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex);
		if (IS_MANAGED_PAGE(npn)) {
			pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn));
			if (npv_h->va == vaddr && npv_h->pmap == pmap) {
				suppress_reason = PTE_BITFLIP;
				suppress_ppn = npn;
				action = PMAP_ACTION_RETRY_RELOCK;
				/* Switching pages: drop the old page's PVH lock;
				 * caller re-locks the corrected *ppnp.
				 */
				UNLOCK_PVH(ppn_to_pai(ppn));
				*ppnp = npn;
				goto pmap_cpc_exit;
			}
		}
	}

	/* No recovery heuristics apply to the kernel pmap itself. */
	if (pmap == kernel_pmap) {
		action = PMAP_ACTION_ASSERT;
		goto pmap_cpc_exit;
	}

	/* Check for malformed/inconsistent entries */

	if ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_INVALID_CACHEABILITY;
	}
	else if (cpte & INTEL_PTE_RSVD) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_RSVD;
	}
	else if ((pmap != kernel_pmap) && ((cpte & INTEL_PTE_USER) == 0)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_SUPERVISOR;
	}
pmap_cpc_exit:
	/* Boot-arg escape hatch: force-suppress assertions when set. */
	PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd));

	/* Never suppress assertions when kernel debugging is enabled. */
	if (debug_boot_arg && !ppcd) {
		action = PMAP_ACTION_ASSERT;
	}

	/* Rate-limit: only one recovery per interval; otherwise assert. */
	if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
		action = PMAP_ACTION_ASSERT;
		pmap_pagetable_corruption_timeout = TRUE;
	}
	else
	{
		pmap_pagetable_corruption_last_abstime = mach_absolute_time();
	}
	pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva);
	return action;
}

/*
 * Remove pv list entry.
 * Called with pv_head_table entry locked.
 * Returns pv entry to be freed (or NULL).
 */
static inline __attribute__((always_inline)) pv_hashed_entry_t
pmap_pv_remove(pmap_t		pmap,
	       vm_map_offset_t	vaddr,
	       ppnum_t		*ppnp,
	       pt_entry_t	*pte)
{
	pv_hashed_entry_t       pvh_e;
	pv_rooted_entry_t	pv_h;
	pv_hashed_entry_t	*pprevh;
	int                     pvhash_idx;
	uint32_t                pv_cnt;
	ppnum_t                 ppn;

	/* Corruption recovery may change *ppnp and restart from here;
	 * RETRY_RELOCK paths re-take the PVH lock first (the classifier
	 * dropped it).
	 */
pmap_pv_remove_retry:
	ppn = *ppnp;
	pvh_e = PV_HASHED_ENTRY_NULL;
	pv_h = pai_to_pvh(ppn_to_pai(ppn));

	/* A null rooted pmap means no mapping is recorded for this page —
	 * inconsistent with the caller's PTE; classify and maybe recover.
	 */
	if (__improbable(pv_h->pmap == PMAP_NULL)) {
		pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
		if (pac == PMAP_ACTION_IGNORE)
			goto pmap_pv_remove_exit;
		else if (pac == PMAP_ACTION_ASSERT)
			panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p, %p): null pv_list!", pmap, vaddr, ppn, *pte, ppnp, pte);
		else if (pac == PMAP_ACTION_RETRY_RELOCK) {
			LOCK_PVH(ppn_to_pai(*ppnp));
			pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
			goto pmap_pv_remove_retry;
		}
		else if (pac == PMAP_ACTION_RETRY)
			goto pmap_pv_remove_retry;
	}

	if (pv_h->va == vaddr && pv_h->pmap == pmap) {
		/*
		 * Header is the pv_rooted_entry.
		 * We can't free that. If there is a queued
		 * entry after this one we remove that
		 * from the ppn queue, we remove it from the hash chain
		 * and copy it to the rooted entry. Then free it instead.
		 */
		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
		if (pv_h != (pv_rooted_entry_t) pvh_e) {
			/*
			 * Entry queued to root, remove this from hash
			 * and install as new root.
			 */
			CHK_NPVHASH();
			pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
			LOCK_PV_HASH(pvhash_idx);
			remque(&pvh_e->qlink);
			pprevh = pvhash(pvhash_idx);
			if (PV_HASHED_ENTRY_NULL == *pprevh) {
				panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x): "
				      "empty hash, removing rooted",
				      pmap, vaddr, ppn);
			}
			pmap_pvh_unlink(pvh_e);
			UNLOCK_PV_HASH(pvhash_idx);
			pv_h->pmap = pvh_e->pmap;
			pv_h->va = pvh_e->va;	/* dispose of pvh_e */
		} else {
			/* none queued after rooted */
			pv_h->pmap = PMAP_NULL;
			pvh_e = PV_HASHED_ENTRY_NULL;
		}
	} else {
		/*
		 * not removing rooted pv. find it on hash chain, remove from
		 * ppn queue and hash chain and free it
		 */
		CHK_NPVHASH();
		pvhash_idx = pvhashidx(pmap, vaddr);
		LOCK_PV_HASH(pvhash_idx);
		pprevh = pvhash(pvhash_idx);
		if (PV_HASHED_ENTRY_NULL == *pprevh) {
			panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p): empty hash",
			      pmap, vaddr, ppn, *pte, pte);
		}
		pvh_e = *pprevh;
		pmap_pv_hashlist_walks++;
		pv_cnt = 0;
		/* Walk the chain tracking pprevh so the match can be spliced out. */
		while (PV_HASHED_ENTRY_NULL != pvh_e) {
			pv_cnt++;
			if (pvh_e->pmap == pmap &&
			    pvh_e->va == vaddr &&
			    pvh_e->ppn == ppn)
				break;
			pprevh = &pvh_e->nexth;
			pvh_e = pvh_e->nexth;
		}

		if (PV_HASHED_ENTRY_NULL == pvh_e) {
			pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT);

			if (pac == PMAP_ACTION_ASSERT)
				panic("Possible memory corruption: pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, ppnp, pte, pv_h->pmap, pv_h->va);
			else {
				/* Drop the hash-bucket lock before any retry/exit. */
				UNLOCK_PV_HASH(pvhash_idx);
				if (pac == PMAP_ACTION_RETRY_RELOCK) {
					LOCK_PVH(ppn_to_pai(*ppnp));
					pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_RETRY) {
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_IGNORE) {
					goto pmap_pv_remove_exit;
				}
			}
		}

		/* Update hash-walk statistics (racy by design; hints only). */
		pmap_pv_hashlist_cnts += pv_cnt;
		if (pmap_pv_hashlist_max < pv_cnt)
			pmap_pv_hashlist_max = pv_cnt;
		*pprevh = pvh_e->nexth;
		remque(&pvh_e->qlink);
		UNLOCK_PV_HASH(pvhash_idx);
	}
pmap_pv_remove_exit:
	return pvh_e;
}


extern int 	pt_fake_zone_index;

/*
 * Credit "bytes" of private pagetable memory to the pmap's ledger and,
 * when the fake pagetable zone is registered, to the current task's
 * per-zone allocation statistics.
 */
static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap, vm_size_t bytes)
{
	thread_t thr = current_thread();
	task_t task;
	zinfo_usage_t zinfo;

	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);

	if (pt_fake_zone_index != -1 &&
	    (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
		OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].alloc);
}

/* Inverse of PMAP_ZINFO_PALLOC: debit the ledger / bump the free stat. */
static inline void
PMAP_ZINFO_PFREE(pmap_t pmap, vm_size_t bytes)
{
	thread_t thr = current_thread();
	task_t task;
	zinfo_usage_t zinfo;

	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);

	if (pt_fake_zone_index != -1 &&
	    (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
		OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].free);
}

/* Shared-memory ledger credit (no per-task zone accounting). */
static inline void
PMAP_ZINFO_SALLOC(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_credit(pmap, task_ledgers.tkm_shared, bytes);
}

/* Shared-memory ledger debit (no per-task zone accounting). */
static inline void
PMAP_ZINFO_SFREE(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_debit(pmap, task_ledgers.tkm_shared, bytes);
}

extern boolean_t	pmap_initialized;/* Has pmap_init completed? */
#define valid_page(x) (pmap_initialized && pmap_valid_page(x))

// XXX
#define HIGH_MEM_BASE  ((uint32_t)( -NBPDE) )  /* shared gdt etc seg addr */ /* XXX64 ??
 */
// XXX


int		phys_attribute_test(
			ppnum_t		phys,
			int		bits);
void		phys_attribute_clear(
			ppnum_t		phys,
			int		bits,
			unsigned int	options,
			void		*arg);

//#define PCID_DEBUG 1
#if	PCID_DEBUG
/* PCID tracing: emit to both kprintf and the system log when PCID_DEBUG is set. */
#define	pmap_pcid_log(fmt, args...)					\
	do {								\
		kprintf(fmt, ##args);					\
		printf(fmt, ##args);					\
	} while(0)
#else
#define pmap_pcid_log(fmt, args...)
#endif
void	pmap_pcid_configure(void);


/*
 * Atomic 64-bit compare and exchange of a page table entry.
 */
static inline boolean_t
pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new)
{
	boolean_t		ret;

	/*
	 * Load the old value into %rax
	 * Load the new value into another register
	 * Compare-exchange-quad at address entryp
	 * If the compare succeeds, the new value is stored, return TRUE.
	 * Otherwise, no swap is made, return FALSE.
	 */
	asm volatile(
		"	lock; cmpxchgq %2,(%3)	\n\t"
		"	setz	%%al		\n\t"
		"	movzbl	%%al,%0"
		: "=a" (ret)
		: "a" (old),
		  "r" (new),
		  "r" (entryp)
		: "memory");
	return ret;
}

extern uint32_t pmap_update_clear_pte_count;

/*
 * Atomically update a PTE: clear 'pclear_bits' and set 'pset_bits',
 * retrying via compare-and-swap until no concurrent modification
 * intervenes.  A PTE observed as zero is left alone (it was cleared
 * underneath us); that event is counted in pmap_update_clear_pte_count.
 */
static inline void pmap_update_pte(pt_entry_t *mptep, uint64_t pclear_bits, uint64_t pset_bits) {
	pt_entry_t npte, opte;
	do {
		opte = *mptep;
		if (__improbable(opte == 0)) {
			/* PTE vanished concurrently; record and bail without writing. */
			pmap_update_clear_pte_count++;
			break;
		}
		npte = opte & ~(pclear_bits);
		npte |= pset_bits;
	} while (!pmap_cmpx_pte(mptep, opte, npte));
}

#if defined(__x86_64__)
/*
 * The single pml4 page per pmap is allocated at pmap create time and exists
 * for the duration of the pmap. we allocate this page in kernel vm.
 * this returns the address of the requested pml4 entry in the top level page.
983 */ 984static inline 985pml4_entry_t * 986pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr) 987{ 988 if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) && 989 (vaddr < 0xFFFF800000000000ULL))) { 990 return (NULL); 991 } 992 993#if PMAP_ASSERT 994 return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]); 995#else 996 return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)]; 997#endif 998} 999 1000/* 1001 * Returns address of requested PDPT entry in the physmap. 1002 */ 1003static inline pdpt_entry_t * 1004pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr) 1005{ 1006 pml4_entry_t newpf; 1007 pml4_entry_t *pml4; 1008 1009 pml4 = pmap64_pml4(pmap, vaddr); 1010 if (pml4 && ((*pml4 & INTEL_PTE_VALID))) { 1011 newpf = *pml4 & PG_FRAME; 1012 return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf)) 1013 [(vaddr >> PDPTSHIFT) & (NPDPTPG-1)]; 1014 } 1015 return (NULL); 1016} 1017/* 1018 * Returns the address of the requested PDE entry in the physmap. 1019 */ 1020static inline pd_entry_t * 1021pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr) 1022{ 1023 pdpt_entry_t newpf; 1024 pdpt_entry_t *pdpt; 1025 1026 pdpt = pmap64_pdpt(pmap, vaddr); 1027 1028 if (pdpt && ((*pdpt & INTEL_PTE_VALID))) { 1029 newpf = *pdpt & PG_FRAME; 1030 return &((pd_entry_t *) PHYSMAP_PTOV(newpf)) 1031 [(vaddr >> PDSHIFT) & (NPDPG-1)]; 1032 } 1033 return (NULL); 1034} 1035 1036static inline pd_entry_t * 1037pmap_pde(pmap_t m, vm_map_offset_t v) 1038{ 1039 pd_entry_t *pde; 1040 1041 pde = pmap64_pde(m, v); 1042 1043 return pde; 1044} 1045 1046 1047/* 1048 * return address of mapped pte for vaddr va in pmap pmap. 1049 * 1050 * In case the pde maps a superpage, return the pde, which, in this case 1051 * is the actual page table entry. 
1052 */ 1053static inline pt_entry_t * 1054pmap_pte(pmap_t pmap, vm_map_offset_t vaddr) 1055{ 1056 pd_entry_t *pde; 1057 pd_entry_t newpf; 1058 1059 assert(pmap); 1060 pde = pmap64_pde(pmap, vaddr); 1061 1062 if (pde && ((*pde & INTEL_PTE_VALID))) { 1063 if (*pde & INTEL_PTE_PS) 1064 return pde; 1065 newpf = *pde & PG_FRAME; 1066 return &((pt_entry_t *)PHYSMAP_PTOV(newpf)) 1067 [i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)]; 1068 } 1069 return (NULL); 1070} 1071#endif 1072#if DEBUG 1073#define DPRINTF(x...) kprintf(x) 1074#else 1075#define DPRINTF(x...) 1076#endif 1077 1078#endif /* MACH_KERNEL_PRIVATE */ 1079#endif /* _I386_PMAP_INTERNAL_ */ 1080