1/* 2 * Copyright (c) 2000-2009 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 
25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29 30#ifndef _I386_PMAP_INTERNAL_ 31#define _I386_PMAP_INTERNAL_ 32#ifdef MACH_KERNEL_PRIVATE 33 34#include <vm/pmap.h> 35#include <sys/kdebug.h> 36#include <kern/ledger.h> 37 38/* 39 * pmap locking 40 */ 41 42#define PMAP_LOCK(pmap) { \ 43 simple_lock(&(pmap)->lock); \ 44} 45 46#define PMAP_UNLOCK(pmap) { \ 47 simple_unlock(&(pmap)->lock); \ 48} 49 50#define PMAP_UPDATE_TLBS(pmap, s, e) \ 51 pmap_flush_tlbs(pmap, s, e) 52 53#define iswired(pte) ((pte) & INTEL_PTE_WIRED) 54 55#ifdef PMAP_TRACES 56extern boolean_t pmap_trace; 57#define PMAP_TRACE(x,a,b,c,d,e) \ 58 if (pmap_trace) { \ 59 KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e); \ 60 } 61#else 62#define PMAP_TRACE(x,a,b,c,d,e) KERNEL_DEBUG(x,a,b,c,d,e) 63#endif /* PMAP_TRACES */ 64 65#define PMAP_TRACE_CONSTANT(x,a,b,c,d,e) \ 66 KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e); \ 67 68kern_return_t pmap_expand_pml4( 69 pmap_t map, 70 vm_map_offset_t v, 71 unsigned int options); 72 73kern_return_t pmap_expand_pdpt( 74 pmap_t map, 75 vm_map_offset_t v, 76 unsigned int options); 77 78void phys_attribute_set( 79 ppnum_t phys, 80 int bits); 81 82void pmap_set_reference( 83 ppnum_t pn); 84 85boolean_t phys_page_exists( 86 ppnum_t pn); 87 88void pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t); 89 90void 91pmap_update_cache_attributes_locked(ppnum_t, unsigned); 92 93#if CONFIG_YONAH 94extern boolean_t cpu_64bit; 95#else 96extern const boolean_t cpu_64bit; 97#endif 98 99/* 100 * Private data structures. 101 */ 102 103/* 104 * For each vm_page_t, there is a list of all currently 105 * valid virtual mappings of that page. An entry is 106 * a pv_rooted_entry_t; the list is the pv_table. 107 * 108 * N.B. 
with the new combo rooted/hashed scheme it is
 *            only possible to remove individual non-rooted entries
 *            if they are found via the hashed chains as there is no
 *            way to unlink the singly linked hashed entries if navigated to
 *            via the queue list off the rooted entries.  Think of it as
 *            hash/walk/pull, keeping track of the prev pointer while walking
 *            the singly linked hash list.  All of this is to save memory and
 *            keep both types of pv_entries as small as possible.
 */

/*

PV HASHING Changes - JK 1/2007

Pve's establish physical to virtual mappings.  These are used for aliasing of a
physical page to (potentially many) virtual addresses within pmaps. In the
previous implementation the structure of the pv_entries (each 16 bytes in size) was

typedef struct pv_entry {
    struct pv_entry      *next;
    pmap_t                pmap;
    vm_map_offset_t       va;
} *pv_entry_t;

An initial array of these is created at boot time, one per physical page of
memory, indexed by the physical page number. Additionally, a pool of entries
is created from a pv_zone to be used as needed by pmap_enter() when it is
creating new mappings.  Originally, we kept this pool around because the code
in pmap_enter() was unable to block if it needed an entry and none were
available - we'd panic.  Some time ago I restructured the pmap_enter() code
so that for user pmaps it can block while zalloc'ing a pv structure and restart,
removing a panic from the code (in the case of the kernel pmap we cannot block
and still panic, so, we keep a separate hot pool for use only on kernel pmaps).
The pool has not been removed since there is a large performance gain keeping
freed pv's around for reuse and not suffering the overhead of zalloc for every
new pv we need.

As pmap_enter() created new mappings it linked the new pve's for them off the
fixed pv array for that ppn (off the next pointer).  
These pve's are accessed
for several operations, one of them being address space teardown. In that case,
we basically do this

	for (every page/pte in the space) {
		calc pve_ptr from the ppn in the pte
		for (every pv in the list for the ppn) {
			if (this pv is for this pmap/vaddr) {
				do housekeeping
				unlink/free the pv
			}
		}
	}

The problem arose when we were running, say 8000 (or even 2000) apache or
other processes and one or all terminate. The list hanging off each pv array
entry could have thousands of entries.  We were continuously linearly searching
each of these lists as we stepped through the address space we were tearing
down.  Because of the locks we hold, likely taking a cache miss for each node,
and interrupt disabling for MP issues, the system became completely unresponsive
for many seconds while we did this.

Realizing that pve's are accessed in two distinct ways (linearly running the
list by ppn for operations like pmap_page_protect and finding and
modifying/removing a single pve as part of pmap_enter processing) has led to
modifying the pve structures and databases.

There are now two types of pve structures.  A "rooted" structure which is
basically the original structure accessed in an array by ppn, and a "hashed"
structure accessed on a hash list via a hash of [pmap, vaddr].  These have been
designed with the two goals of minimizing wired memory and making the lookup of
a ppn faster.  Since a vast majority of pages in the system are not aliased
and hence represented by a single pv entry I've kept the rooted entry size as
small as possible because there is one of these dedicated for every physical
page of memory.  The hashed pve's are larger due to the addition of the hash
link and the ppn entry needed for matching while running the hash list to find
the entry we are looking for.  
This way, only systems that have lots of
aliasing (like 2000+ httpd procs) will pay the extra memory price. Both
structures have the same first three fields allowing some simplification in
the code.

They have these shapes

typedef struct pv_rooted_entry {
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
} *pv_rooted_entry_t;


typedef struct pv_hashed_entry {
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
	ppnum_t			ppn;
	struct pv_hashed_entry *nexth;
} *pv_hashed_entry_t;

The main flow difference is that the code is now aware of the rooted entry and
the hashed entries.  Code that runs the pv list still starts with the rooted
entry and then continues down the qlink onto the hashed entries.  Code that is
looking up a specific pv entry first checks the rooted entry and then hashes
and runs the hash list for the match. The hash list lengths are much smaller
than the original pv lists that contained all aliases for the specific ppn.
210 211*/ 212 213typedef struct pv_rooted_entry { 214 /* first three entries must match pv_hashed_entry_t */ 215 queue_head_t qlink; 216 vm_map_offset_t va; /* virtual address for mapping */ 217 pmap_t pmap; /* pmap where mapping lies */ 218} *pv_rooted_entry_t; 219 220#define PV_ROOTED_ENTRY_NULL ((pv_rooted_entry_t) 0) 221 222typedef struct pv_hashed_entry { 223 /* first three entries must match pv_rooted_entry_t */ 224 queue_head_t qlink; 225 vm_map_offset_t va; 226 pmap_t pmap; 227 ppnum_t ppn; 228 struct pv_hashed_entry *nexth; 229} *pv_hashed_entry_t; 230 231#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0) 232 233//#define PV_DEBUG 1 /* uncomment to enable some PV debugging code */ 234#ifdef PV_DEBUG 235#define CHK_NPVHASH() if(0 == npvhash) panic("npvhash uninitialized"); 236#else 237#define CHK_NPVHASH(x) 238#endif 239 240#define NPVHASH 4095 /* MUST BE 2^N - 1 */ 241#define PV_HASHED_LOW_WATER_MARK_DEFAULT 5000 242#define PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT 2000 243#define PV_HASHED_ALLOC_CHUNK_INITIAL 2000 244#define PV_HASHED_KERN_ALLOC_CHUNK_INITIAL 200 245 246extern volatile uint32_t mappingrecurse; 247extern uint32_t pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark; 248 249/* 250 * PV hash locking 251 */ 252 253#define LOCK_PV_HASH(hash) lock_hash_hash(hash) 254#define UNLOCK_PV_HASH(hash) unlock_hash_hash(hash) 255extern uint32_t npvhash; 256extern pv_hashed_entry_t *pv_hash_table; /* hash lists */ 257extern pv_hashed_entry_t pv_hashed_free_list; 258extern pv_hashed_entry_t pv_hashed_kern_free_list; 259decl_simple_lock_data(extern, pv_hashed_free_list_lock) 260decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock) 261decl_simple_lock_data(extern, pv_hash_table_lock) 262 263extern zone_t pv_hashed_list_zone; /* zone of pv_hashed_entry 264 * structures */ 265 266extern uint32_t pv_hashed_free_count; 267extern uint32_t pv_hashed_kern_free_count; 268/* 269 * Each entry in the pv_head_table is locked by a bit in the 270 * pv_lock_table. 
The lock bits are accessed by the address of 271 * the frame they lock. 272 */ 273#define pv_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) 274#define pv_hash_lock_table_size(n) (((n)+BYTE_SIZE-1)/BYTE_SIZE) 275extern char *pv_lock_table; /* pointer to array of bits */ 276extern char *pv_hash_lock_table; 277extern pv_rooted_entry_t pv_head_table; /* array of entries, one per page */ 278 279extern event_t mapping_replenish_event; 280 281static inline void PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) { 282 pmap_assert(*pvh_ep == PV_HASHED_ENTRY_NULL); 283 simple_lock(&pv_hashed_free_list_lock); 284 /* If the kernel reserved pool is low, let non-kernel mappings allocate 285 * synchronously, possibly subject to a throttle. 286 */ 287 if ((pv_hashed_kern_free_count > pv_hashed_kern_low_water_mark) && ((*pvh_ep = pv_hashed_free_list) != 0)) { 288 pv_hashed_free_list = (pv_hashed_entry_t)(*pvh_ep)->qlink.next; 289 pv_hashed_free_count--; 290 } 291 292 simple_unlock(&pv_hashed_free_list_lock); 293 294 if (pv_hashed_free_count <= pv_hashed_low_water_mark) { 295 if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse)) 296 thread_wakeup(&mapping_replenish_event); 297 } 298} 299 300static inline void PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) { 301 simple_lock(&pv_hashed_free_list_lock); 302 pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list; 303 pv_hashed_free_list = pvh_eh; 304 pv_hashed_free_count += pv_cnt; 305 simple_unlock(&pv_hashed_free_list_lock); 306} 307 308extern unsigned pmap_kern_reserve_alloc_stat; 309 310static inline void PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) { 311 pmap_assert(*pvh_e == PV_HASHED_ENTRY_NULL); 312 simple_lock(&pv_hashed_kern_free_list_lock); 313 314 if ((*pvh_e = pv_hashed_kern_free_list) != 0) { 315 pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next; 316 pv_hashed_kern_free_count--; 317 pmap_kern_reserve_alloc_stat++; 318 } 319 320 
simple_unlock(&pv_hashed_kern_free_list_lock); 321 322 if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) { 323 if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse)) 324 thread_wakeup(&mapping_replenish_event); 325 } 326} 327 328static inline void PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) { 329 simple_lock(&pv_hashed_kern_free_list_lock); 330 pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list; 331 pv_hashed_kern_free_list = pvh_eh; 332 pv_hashed_kern_free_count += pv_cnt; 333 simple_unlock(&pv_hashed_kern_free_list_lock); 334} 335 336extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters; 337extern event_t pmap_user_pv_throttle_event; 338 339static inline void pmap_pv_throttle(__unused pmap_t p) { 340 pmap_assert(p != kernel_pmap); 341 /* Apply throttle on non-kernel mappings */ 342 if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) { 343 pmap_pv_throttle_stat++; 344 /* This doesn't need to be strictly accurate, merely a hint 345 * to eliminate the timeout when the reserve is replenished. 
346 */ 347 pmap_pv_throttled_waiters++; 348 assert_wait_timeout(&pmap_user_pv_throttle_event, THREAD_UNINT, 1, 1000 * NSEC_PER_USEC); 349 thread_block(THREAD_CONTINUE_NULL); 350 } 351} 352 353/* 354 * Index into pv_head table, its lock bits, and the modify/reference and managed bits 355 */ 356 357#define pa_index(pa) (i386_btop(pa)) 358#define ppn_to_pai(ppn) ((int)ppn) 359 360#define pai_to_pvh(pai) (&pv_head_table[pai]) 361#define lock_pvh_pai(pai) bit_lock(pai, (void *)pv_lock_table) 362#define unlock_pvh_pai(pai) bit_unlock(pai, (void *)pv_lock_table) 363#define pvhash(idx) (&pv_hash_table[idx]) 364#define lock_hash_hash(hash) bit_lock(hash, (void *)pv_hash_lock_table) 365#define unlock_hash_hash(hash) bit_unlock(hash, (void *)pv_hash_lock_table) 366 367#define IS_MANAGED_PAGE(x) \ 368 ((unsigned int)(x) <= last_managed_page && \ 369 (pmap_phys_attributes[x] & PHYS_MANAGED)) 370 371/* 372 * Physical page attributes. Copy bits from PTE definition. 373 */ 374#define PHYS_MODIFIED INTEL_PTE_MOD /* page modified */ 375#define PHYS_REFERENCED INTEL_PTE_REF /* page referenced */ 376#define PHYS_MANAGED INTEL_PTE_VALID /* page is managed */ 377#define PHYS_NOENCRYPT INTEL_PTE_USER /* no need to encrypt this page in the hibernation image */ 378#define PHYS_NCACHE INTEL_PTE_NCACHE 379#define PHYS_PTA INTEL_PTE_PTA 380#define PHYS_CACHEABILITY_MASK (INTEL_PTE_PTA | INTEL_PTE_NCACHE) 381 382extern const boolean_t pmap_disable_kheap_nx; 383extern const boolean_t pmap_disable_kstack_nx; 384 385#define PMAP_EXPAND_OPTIONS_NONE (0x0) 386#define PMAP_EXPAND_OPTIONS_NOWAIT (PMAP_OPTIONS_NOWAIT) 387#define PMAP_EXPAND_OPTIONS_NOENTER (PMAP_OPTIONS_NOENTER) 388 389/* 390 * Amount of virtual memory mapped by one 391 * page-directory entry. 
392 */ 393#define PDE_MAPPED_SIZE (pdetova(1)) 394 395 396/* 397 * Locking and TLB invalidation 398 */ 399 400/* 401 * Locking Protocols: (changed 2/2007 JK) 402 * 403 * There are two structures in the pmap module that need locking: 404 * the pmaps themselves, and the per-page pv_lists (which are locked 405 * by locking the pv_lock_table entry that corresponds to the pv_head 406 * for the list in question.) Most routines want to lock a pmap and 407 * then do operations in it that require pv_list locking -- however 408 * pmap_remove_all and pmap_copy_on_write operate on a physical page 409 * basis and want to do the locking in the reverse order, i.e. lock 410 * a pv_list and then go through all the pmaps referenced by that list. 411 * 412 * The system wide pmap lock has been removed. Now, paths take a lock 413 * on the pmap before changing its 'shape' and the reverse order lockers 414 * (coming in by phys ppn) take a lock on the corresponding pv and then 415 * retest to be sure nothing changed during the window before they locked 416 * and can then run up/down the pv lists holding the list lock. This also 417 * lets the pmap layer run (nearly completely) interrupt enabled, unlike 418 * previously. 419 */ 420 421/* 422 * PV locking 423 */ 424 425#define LOCK_PVH(index) { \ 426 mp_disable_preemption(); \ 427 lock_pvh_pai(index); \ 428} 429 430#define UNLOCK_PVH(index) { \ 431 unlock_pvh_pai(index); \ 432 mp_enable_preemption(); \ 433} 434 435extern uint64_t pde_mapped_size; 436 437extern char *pmap_phys_attributes; 438extern ppnum_t last_managed_page; 439 440extern ppnum_t lowest_lo; 441extern ppnum_t lowest_hi; 442extern ppnum_t highest_hi; 443 444/* 445 * when spinning through pmap_remove 446 * ensure that we don't spend too much 447 * time with preemption disabled. 
448 * I'm setting the current threshold 449 * to 20us 450 */ 451#define MAX_PREEMPTION_LATENCY_NS 20000 452extern uint64_t max_preemption_latency_tsc; 453 454/* #define DEBUGINTERRUPTS 1 uncomment to ensure pmap callers have interrupts enabled */ 455#ifdef DEBUGINTERRUPTS 456#define pmap_intr_assert() { \ 457 if (processor_avail_count > 1 && !ml_get_interrupts_enabled()) \ 458 panic("pmap interrupt assert %s, %d",__FILE__, __LINE__); \ 459} 460#else 461#define pmap_intr_assert() 462#endif 463 464extern int nx_enabled; 465extern unsigned int inuse_ptepages_count; 466 467static inline uint32_t 468pvhashidx(pmap_t pmap, vm_map_offset_t va) 469{ 470 return ((uint32_t)(uintptr_t)pmap ^ 471 ((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) & 472 npvhash; 473} 474 475 476/* 477 * unlinks the pv_hashed_entry_t pvh from the singly linked hash chain. 478 * properly deals with the anchor. 479 * must be called with the hash locked, does not unlock it 480 */ 481static inline void 482pmap_pvh_unlink(pv_hashed_entry_t pvh) 483{ 484 pv_hashed_entry_t curh; 485 pv_hashed_entry_t *pprevh; 486 int pvhash_idx; 487 488 CHK_NPVHASH(); 489 pvhash_idx = pvhashidx(pvh->pmap, pvh->va); 490 491 pprevh = pvhash(pvhash_idx); 492 493#if PV_DEBUG 494 if (NULL == *pprevh) 495 panic("pvh_unlink null anchor"); /* JK DEBUG */ 496#endif 497 curh = *pprevh; 498 499 while (PV_HASHED_ENTRY_NULL != curh) { 500 if (pvh == curh) 501 break; 502 pprevh = &curh->nexth; 503 curh = curh->nexth; 504 } 505 if (PV_HASHED_ENTRY_NULL == curh) panic("pmap_pvh_unlink no pvh"); 506 *pprevh = pvh->nexth; 507 return; 508} 509 510static inline void 511pv_hash_add(pv_hashed_entry_t pvh_e, 512 pv_rooted_entry_t pv_h) 513{ 514 pv_hashed_entry_t *hashp; 515 int pvhash_idx; 516 517 CHK_NPVHASH(); 518 pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va); 519 LOCK_PV_HASH(pvhash_idx); 520 insque(&pvh_e->qlink, &pv_h->qlink); 521 hashp = pvhash(pvhash_idx); 522#if PV_DEBUG 523 if (NULL==hashp) 524 panic("pv_hash_add(%p) null hash bucket", 
pvh_e); 525#endif 526 pvh_e->nexth = *hashp; 527 *hashp = pvh_e; 528 UNLOCK_PV_HASH(pvhash_idx); 529} 530 531static inline void 532pv_hash_remove(pv_hashed_entry_t pvh_e) 533{ 534 int pvhash_idx; 535 536 CHK_NPVHASH(); 537 pvhash_idx = pvhashidx(pvh_e->pmap,pvh_e->va); 538 LOCK_PV_HASH(pvhash_idx); 539 remque(&pvh_e->qlink); 540 pmap_pvh_unlink(pvh_e); 541 UNLOCK_PV_HASH(pvhash_idx); 542} 543 544static inline boolean_t popcnt1(uint64_t distance) { 545 return ((distance & (distance - 1)) == 0); 546} 547 548/* 549 * Routines to handle suppression of/recovery from some forms of pagetable corruption 550 * incidents observed in the field. These can be either software induced (wild 551 * stores to the mapwindows where applicable, use after free errors 552 * (typically of pages addressed physically), mis-directed DMAs etc., or due 553 * to DRAM/memory hierarchy/interconnect errors. Given the theoretical rarity of these errors, 554 * the recording mechanism is deliberately not MP-safe. The overarching goal is to 555 * still assert on potential software races, but attempt recovery from incidents 556 * identifiable as occurring due to issues beyond the control of the pmap module. 557 * The latter includes single-bit errors and malformed pagetable entries. 558 * We currently limit ourselves to recovery/suppression of one incident per 559 * PMAP_PAGETABLE_CORRUPTION_INTERVAL seconds, and details of the incident 560 * are logged. 561 * Assertions are not suppressed if kernel debugging is enabled. 
(DRK 09) 562 */ 563 564typedef enum { 565 PTE_VALID = 0x0, 566 PTE_INVALID = 0x1, 567 PTE_RSVD = 0x2, 568 PTE_SUPERVISOR = 0x4, 569 PTE_BITFLIP = 0x8, 570 PV_BITFLIP = 0x10, 571 PTE_INVALID_CACHEABILITY = 0x20 572} pmap_pagetable_corruption_t; 573 574typedef enum { 575 ROOT_PRESENT = 0, 576 ROOT_ABSENT = 1 577} pmap_pv_assertion_t; 578 579typedef enum { 580 PMAP_ACTION_IGNORE = 0x0, 581 PMAP_ACTION_ASSERT = 0x1, 582 PMAP_ACTION_RETRY = 0x2, 583 PMAP_ACTION_RETRY_RELOCK = 0x4 584} pmap_pagetable_corruption_action_t; 585 586#define PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL) 587extern uint64_t pmap_pagetable_corruption_interval_abstime; 588 589extern uint32_t pmap_pagetable_corruption_incidents; 590#define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8) 591typedef struct { 592 pmap_pv_assertion_t incident; 593 pmap_pagetable_corruption_t reason; 594 pmap_pagetable_corruption_action_t action; 595 pmap_t pmap; 596 vm_map_offset_t vaddr; 597 pt_entry_t pte; 598 ppnum_t ppn; 599 pmap_t pvpmap; 600 vm_map_offset_t pvva; 601 uint64_t abstime; 602} pmap_pagetable_corruption_record_t; 603 604extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[]; 605extern uint64_t pmap_pagetable_corruption_last_abstime; 606extern thread_call_t pmap_pagetable_corruption_log_call; 607extern boolean_t pmap_pagetable_corruption_timeout; 608 609static inline void 610pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) { 611 uint32_t pmap_pagetable_corruption_log_index; 612 pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG; 613 pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident; 614 pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason; 
615 pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action; 616 pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap; 617 pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr; 618 pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep; 619 pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn; 620 pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap; 621 pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva; 622 pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time(); 623 /* Asynchronously log */ 624 thread_call_enter(pmap_pagetable_corruption_log_call); 625} 626 627static inline pmap_pagetable_corruption_action_t 628pmap_classify_pagetable_corruption(pmap_t pmap, vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) { 629 pmap_pagetable_corruption_action_t action = PMAP_ACTION_ASSERT; 630 pmap_pagetable_corruption_t suppress_reason = PTE_VALID; 631 ppnum_t suppress_ppn = 0; 632 pt_entry_t cpte = *ptep; 633 ppnum_t cpn = pa_index(pte_to_pa(cpte)); 634 ppnum_t ppn = *ppnp; 635 pv_rooted_entry_t pv_h = pai_to_pvh(ppn_to_pai(ppn)); 636 pv_rooted_entry_t pv_e = pv_h; 637 uint32_t bitdex; 638 pmap_t pvpmap = pv_h->pmap; 639 vm_map_offset_t pvva = pv_h->va; 640 boolean_t ppcd = FALSE; 641 642 /* Ideally, we'd consult the Mach VM here to definitively determine 643 * the nature of the mapping for this address space and address. 644 * As that would be a layering violation in this context, we 645 * use various heuristics to recover from single bit errors, 646 * malformed pagetable entries etc. These are not intended 647 * to be comprehensive. 
648 */ 649 650 /* As a precautionary measure, mark A+D */ 651 pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED); 652 653 /* 654 * Correct potential single bit errors in either (but not both) element 655 * of the PV 656 */ 657 do { 658 if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && pv_e->va == vaddr) || 659 (pv_e->pmap == pmap && popcnt1(pv_e->va ^ vaddr))) { 660 pv_e->pmap = pmap; 661 pv_e->va = vaddr; 662 suppress_reason = PV_BITFLIP; 663 action = PMAP_ACTION_RETRY; 664 goto pmap_cpc_exit; 665 } 666 } while (((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink))) && (pv_e != pv_h)); 667 668 /* Discover root entries with a Hamming 669 * distance of 1 from the supplied 670 * physical page frame. 671 */ 672 for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) { 673 ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex); 674 if (IS_MANAGED_PAGE(npn)) { 675 pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn)); 676 if (npv_h->va == vaddr && npv_h->pmap == pmap) { 677 suppress_reason = PTE_BITFLIP; 678 suppress_ppn = npn; 679 action = PMAP_ACTION_RETRY_RELOCK; 680 UNLOCK_PVH(ppn_to_pai(ppn)); 681 *ppnp = npn; 682 goto pmap_cpc_exit; 683 } 684 } 685 } 686 687 if (pmap == kernel_pmap) { 688 action = PMAP_ACTION_ASSERT; 689 goto pmap_cpc_exit; 690 } 691 692 /* Check for malformed/inconsistent entries */ 693 694 if ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU)) { 695 action = PMAP_ACTION_IGNORE; 696 suppress_reason = PTE_INVALID_CACHEABILITY; 697 } 698 else if (cpte & INTEL_PTE_RSVD) { 699 action = PMAP_ACTION_IGNORE; 700 suppress_reason = PTE_RSVD; 701 } 702 else if ((pmap != kernel_pmap) && ((cpte & INTEL_PTE_USER) == 0)) { 703 action = PMAP_ACTION_IGNORE; 704 suppress_reason = PTE_SUPERVISOR; 705 } 706pmap_cpc_exit: 707 PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd)); 708 709 if (debug_boot_arg && !ppcd) { 710 action = PMAP_ACTION_ASSERT; 711 } 712 
713 if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) { 714 action = PMAP_ACTION_ASSERT; 715 pmap_pagetable_corruption_timeout = TRUE; 716 } 717 else 718 { 719 pmap_pagetable_corruption_last_abstime = mach_absolute_time(); 720 } 721 pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva); 722 return action; 723} 724 725/* 726 * Remove pv list entry. 727 * Called with pv_head_table entry locked. 728 * Returns pv entry to be freed (or NULL). 729 */ 730static inline __attribute__((always_inline)) pv_hashed_entry_t 731pmap_pv_remove(pmap_t pmap, 732 vm_map_offset_t vaddr, 733 ppnum_t *ppnp, 734 pt_entry_t *pte) 735{ 736 pv_hashed_entry_t pvh_e; 737 pv_rooted_entry_t pv_h; 738 pv_hashed_entry_t *pprevh; 739 int pvhash_idx; 740 uint32_t pv_cnt; 741 ppnum_t ppn; 742 743pmap_pv_remove_retry: 744 ppn = *ppnp; 745 pvh_e = PV_HASHED_ENTRY_NULL; 746 pv_h = pai_to_pvh(ppn_to_pai(ppn)); 747 748 if (__improbable(pv_h->pmap == PMAP_NULL)) { 749 pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT); 750 if (pac == PMAP_ACTION_IGNORE) 751 goto pmap_pv_remove_exit; 752 else if (pac == PMAP_ACTION_ASSERT) 753 panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p, %p): null pv_list!", pmap, vaddr, ppn, *pte, ppnp, pte); 754 else if (pac == PMAP_ACTION_RETRY_RELOCK) { 755 LOCK_PVH(ppn_to_pai(*ppnp)); 756 pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED); 757 goto pmap_pv_remove_retry; 758 } 759 else if (pac == PMAP_ACTION_RETRY) 760 goto pmap_pv_remove_retry; 761 } 762 763 if (pv_h->va == vaddr && pv_h->pmap == pmap) { 764 /* 765 * Header is the pv_rooted_entry. 766 * We can't free that. If there is a queued 767 * entry after this one we remove that 768 * from the ppn queue, we remove it from the hash chain 769 * and copy it to the rooted entry. Then free it instead. 
770 */ 771 pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink); 772 if (pv_h != (pv_rooted_entry_t) pvh_e) { 773 /* 774 * Entry queued to root, remove this from hash 775 * and install as new root. 776 */ 777 CHK_NPVHASH(); 778 pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va); 779 LOCK_PV_HASH(pvhash_idx); 780 remque(&pvh_e->qlink); 781 pprevh = pvhash(pvhash_idx); 782 if (PV_HASHED_ENTRY_NULL == *pprevh) { 783 panic("pmap_pv_remove(%p,0x%llx,0x%x): " 784 "empty hash, removing rooted", 785 pmap, vaddr, ppn); 786 } 787 pmap_pvh_unlink(pvh_e); 788 UNLOCK_PV_HASH(pvhash_idx); 789 pv_h->pmap = pvh_e->pmap; 790 pv_h->va = pvh_e->va; /* dispose of pvh_e */ 791 } else { 792 /* none queued after rooted */ 793 pv_h->pmap = PMAP_NULL; 794 pvh_e = PV_HASHED_ENTRY_NULL; 795 } 796 } else { 797 /* 798 * not removing rooted pv. find it on hash chain, remove from 799 * ppn queue and hash chain and free it 800 */ 801 CHK_NPVHASH(); 802 pvhash_idx = pvhashidx(pmap, vaddr); 803 LOCK_PV_HASH(pvhash_idx); 804 pprevh = pvhash(pvhash_idx); 805 if (PV_HASHED_ENTRY_NULL == *pprevh) { 806 panic("pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p): empty hash", 807 pmap, vaddr, ppn, *pte, pte); 808 } 809 pvh_e = *pprevh; 810 pmap_pv_hashlist_walks++; 811 pv_cnt = 0; 812 while (PV_HASHED_ENTRY_NULL != pvh_e) { 813 pv_cnt++; 814 if (pvh_e->pmap == pmap && 815 pvh_e->va == vaddr && 816 pvh_e->ppn == ppn) 817 break; 818 pprevh = &pvh_e->nexth; 819 pvh_e = pvh_e->nexth; 820 } 821 822 if (PV_HASHED_ENTRY_NULL == pvh_e) { 823 pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT); 824 825 if (pac == PMAP_ACTION_ASSERT) 826 panic("pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, ppnp, pte, pv_h->pmap, pv_h->va); 827 else { 828 UNLOCK_PV_HASH(pvhash_idx); 829 if (pac == PMAP_ACTION_RETRY_RELOCK) { 830 LOCK_PVH(ppn_to_pai(*ppnp)); 831 pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | 
PHYS_REFERENCED); 832 goto pmap_pv_remove_retry; 833 } 834 else if (pac == PMAP_ACTION_RETRY) { 835 goto pmap_pv_remove_retry; 836 } 837 else if (pac == PMAP_ACTION_IGNORE) { 838 goto pmap_pv_remove_exit; 839 } 840 } 841 } 842 843 pmap_pv_hashlist_cnts += pv_cnt; 844 if (pmap_pv_hashlist_max < pv_cnt) 845 pmap_pv_hashlist_max = pv_cnt; 846 *pprevh = pvh_e->nexth; 847 remque(&pvh_e->qlink); 848 UNLOCK_PV_HASH(pvhash_idx); 849 } 850pmap_pv_remove_exit: 851 return pvh_e; 852} 853 854 855extern int pt_fake_zone_index; 856static inline void 857PMAP_ZINFO_PALLOC(pmap_t pmap, vm_size_t bytes) 858{ 859 thread_t thr = current_thread(); 860 task_t task; 861 zinfo_usage_t zinfo; 862 863 pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes); 864 865 if (pt_fake_zone_index != -1 && 866 (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) 867 OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].alloc); 868} 869 870static inline void 871PMAP_ZINFO_PFREE(pmap_t pmap, vm_size_t bytes) 872{ 873 thread_t thr = current_thread(); 874 task_t task; 875 zinfo_usage_t zinfo; 876 877 pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes); 878 879 if (pt_fake_zone_index != -1 && 880 (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL) 881 OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].free); 882} 883 884static inline void 885PMAP_ZINFO_SALLOC(pmap_t pmap, vm_size_t bytes) 886{ 887 pmap_ledger_credit(pmap, task_ledgers.tkm_shared, bytes); 888} 889 890static inline void 891PMAP_ZINFO_SFREE(pmap_t pmap, vm_size_t bytes) 892{ 893 pmap_ledger_debit(pmap, task_ledgers.tkm_shared, bytes); 894} 895 896extern boolean_t pmap_initialized;/* Has pmap_init completed? */ 897#define valid_page(x) (pmap_initialized && pmap_valid_page(x)) 898 899// XXX 900#define HIGH_MEM_BASE ((uint32_t)( -NBPDE) ) /* shared gdt etc seg addr */ /* XXX64 ?? 
*/ 901// XXX 902 903 904int phys_attribute_test( 905 ppnum_t phys, 906 int bits); 907void phys_attribute_clear( 908 ppnum_t phys, 909 int bits); 910 911//#define PCID_DEBUG 1 912#if PCID_DEBUG 913#define pmap_pcid_log(fmt, args...) \ 914 do { \ 915 kprintf(fmt, ##args); \ 916 printf(fmt, ##args); \ 917 } while(0) 918#else 919#define pmap_pcid_log(fmt, args...) 920#endif 921void pmap_pcid_configure(void); 922 923 924/* 925 * Atomic 64-bit compare and exchange of a page table entry. 926 */ 927static inline boolean_t 928pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new) 929{ 930 boolean_t ret; 931 932#ifdef __i386__ 933 /* 934 * Load the old value into %edx:%eax 935 * Load the new value into %ecx:%ebx 936 * Compare-exchange-8bytes at address entryp (loaded in %edi) 937 * If the compare succeeds, the new value is stored, return TRUE. 938 * Otherwise, no swap is made, return FALSE. 939 */ 940 asm volatile( 941 " lock; cmpxchg8b (%1) \n\t" 942 " setz %%al \n\t" 943 " movzbl %%al,%0" 944 : "=a" (ret) 945 : "D" (entryp), 946 "a" ((uint32_t)old), 947 "d" ((uint32_t)(old >> 32)), 948 "b" ((uint32_t)new), 949 "c" ((uint32_t)(new >> 32)) 950 : "memory"); 951#else 952 /* 953 * Load the old value into %rax 954 * Load the new value into another register 955 * Compare-exchange-quad at address entryp 956 * If the compare succeeds, the new value is stored, return TRUE. 957 * Otherwise, no swap is made, return FALSE. 
958 */ 959 asm volatile( 960 " lock; cmpxchgq %2,(%3) \n\t" 961 " setz %%al \n\t" 962 " movzbl %%al,%0" 963 : "=a" (ret) 964 : "a" (old), 965 "r" (new), 966 "r" (entryp) 967 : "memory"); 968#endif 969 return ret; 970} 971 972extern uint32_t pmap_update_clear_pte_count; 973 974static inline void pmap_update_pte(pt_entry_t *mptep, uint64_t pclear_bits, uint64_t pset_bits) { 975 pt_entry_t npte, opte; 976 do { 977 opte = *mptep; 978 if (__improbable(opte == 0)) { 979 pmap_update_clear_pte_count++; 980 break; 981 } 982 npte = opte & ~(pclear_bits); 983 npte |= pset_bits; 984 } while (!pmap_cmpx_pte(mptep, opte, npte)); 985} 986 987#if defined(__x86_64__) 988/* 989 * The single pml4 page per pmap is allocated at pmap create time and exists 990 * for the duration of the pmap. we allocate this page in kernel vm. 991 * this returns the address of the requested pml4 entry in the top level page. 992 */ 993static inline 994pml4_entry_t * 995pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr) 996{ 997 if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) && 998 (vaddr < 0xFFFF800000000000ULL))) { 999 return (NULL); 1000 } 1001 1002#if PMAP_ASSERT 1003 return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]); 1004#else 1005 return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)]; 1006#endif 1007} 1008 1009/* 1010 * Returns address of requested PDPT entry in the physmap. 1011 */ 1012static inline pdpt_entry_t * 1013pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr) 1014{ 1015 pml4_entry_t newpf; 1016 pml4_entry_t *pml4; 1017 1018 pml4 = pmap64_pml4(pmap, vaddr); 1019 if (pml4 && ((*pml4 & INTEL_PTE_VALID))) { 1020 newpf = *pml4 & PG_FRAME; 1021 return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf)) 1022 [(vaddr >> PDPTSHIFT) & (NPDPTPG-1)]; 1023 } 1024 return (NULL); 1025} 1026/* 1027 * Returns the address of the requested PDE entry in the physmap. 
1028 */ 1029static inline pd_entry_t * 1030pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr) 1031{ 1032 pdpt_entry_t newpf; 1033 pdpt_entry_t *pdpt; 1034 1035 pdpt = pmap64_pdpt(pmap, vaddr); 1036 1037 if (pdpt && ((*pdpt & INTEL_PTE_VALID))) { 1038 newpf = *pdpt & PG_FRAME; 1039 return &((pd_entry_t *) PHYSMAP_PTOV(newpf)) 1040 [(vaddr >> PDSHIFT) & (NPDPG-1)]; 1041 } 1042 return (NULL); 1043} 1044 1045static inline pd_entry_t * 1046pmap_pde(pmap_t m, vm_map_offset_t v) 1047{ 1048 pd_entry_t *pde; 1049 1050 pde = pmap64_pde(m, v); 1051 1052 return pde; 1053} 1054 1055 1056/* 1057 * return address of mapped pte for vaddr va in pmap pmap. 1058 * 1059 * In case the pde maps a superpage, return the pde, which, in this case 1060 * is the actual page table entry. 1061 */ 1062static inline pt_entry_t * 1063pmap_pte(pmap_t pmap, vm_map_offset_t vaddr) 1064{ 1065 pd_entry_t *pde; 1066 pd_entry_t newpf; 1067 1068 assert(pmap); 1069 pde = pmap64_pde(pmap, vaddr); 1070 1071 if (pde && ((*pde & INTEL_PTE_VALID))) { 1072 if (*pde & INTEL_PTE_PS) 1073 return pde; 1074 newpf = *pde & PG_FRAME; 1075 return &((pt_entry_t *)PHYSMAP_PTOV(newpf)) 1076 [i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)]; 1077 } 1078 return (NULL); 1079} 1080#endif 1081#if DEBUG 1082#define DPRINTF(x...) kprintf(x) 1083#else 1084#define DPRINTF(x...) 1085#endif 1086 1087#endif /* MACH_KERNEL_PRIVATE */ 1088#endif /* _I386_PMAP_INTERNAL_ */ 1089