/*
 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */


#ifndef	_I386_PMAP_INTERNAL_
#define	_I386_PMAP_INTERNAL_
#ifdef	MACH_KERNEL_PRIVATE

#include <vm/pmap.h>
#include <sys/kdebug.h>
#include <kern/ledger.h>
#include <kern/simple_lock.h>
#include <i386/bit_routines.h>

/*
 * pmap locking
 */

#define PMAP_LOCK(pmap) {		\
	simple_lock(&(pmap)->lock);	\
}

#define PMAP_UNLOCK(pmap) {		\
	simple_unlock(&(pmap)->lock);	\
}

#define PMAP_UPDATE_TLBS(pmap, s, e)		\
	pmap_flush_tlbs(pmap, s, e, 0, NULL)


#define	PMAP_DELAY_TLB_FLUSH		0x01

#define PMAP_UPDATE_TLBS_DELAYED(pmap, s, e, c)		\
	pmap_flush_tlbs(pmap, s, e, PMAP_DELAY_TLB_FLUSH, c)


#define	iswired(pte)	((pte) & INTEL_PTE_WIRED)

#ifdef	PMAP_TRACES
extern boolean_t pmap_trace;
#define PMAP_TRACE(x, a, b, c, d, e)				\
	if (pmap_trace) {					\
		KERNEL_DEBUG_CONSTANT(x, a, b, c, d, e);	\
	}
#else
#define PMAP_TRACE(x, a, b, c, d, e)	KERNEL_DEBUG(x, a, b, c, d, e)
#endif /* PMAP_TRACES */

#define PMAP_TRACE_CONSTANT(x, a, b, c, d, e)		\
	KERNEL_DEBUG_CONSTANT(x, a, b, c, d, e);

kern_return_t	pmap_expand_pml4(
			pmap_t		map,
			vm_map_offset_t	v,
			unsigned int	options);

kern_return_t	pmap_expand_pdpt(
			pmap_t		map,
			vm_map_offset_t	v,
			unsigned int	options);

void		phys_attribute_set(
			ppnum_t		phys,
			int		bits);

void		pmap_set_reference(
			ppnum_t		pn);

boolean_t	phys_page_exists(
			ppnum_t		pn);

void
pmap_flush_tlbs(pmap_t, vm_map_offset_t, vm_map_offset_t, int, pmap_flush_context *);

void
pmap_update_cache_attributes_locked(ppnum_t, unsigned);

extern const boolean_t	cpu_64bit;
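
/*
 * Illustrative sketch (not part of the original header): the usual pattern
 * for callers that edit mappings is to take the pmap lock, change the
 * relevant PTEs, flush the TLBs for the affected VA range, and then drop
 * the lock.  "example_protect_range" and the PTE edit it alludes to are
 * hypothetical; only PMAP_LOCK/PMAP_UNLOCK/PMAP_UPDATE_TLBS come from this
 * header.
 */
#if 0	/* example only; not compiled */
static void
example_protect_range(pmap_t pmap, vm_map_offset_t start, vm_map_offset_t end)
{
	PMAP_LOCK(pmap);
	/* ... clear INTEL_PTE_WRITE in each PTE covering [start, end) ... */
	PMAP_UPDATE_TLBS(pmap, start, end);
	PMAP_UNLOCK(pmap);
}
#endif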

/*
 *	Private data structures.
 */

/*
 *	For each vm_page_t, there is a list of all currently
 *	valid virtual mappings of that page.  An entry is
 *	a pv_rooted_entry_t; the list is the pv_table.
 *
 *	N.B. with the new combo rooted/hashed scheme it is
 *	only possible to remove individual non-rooted entries
 *	if they are found via the hashed chains, as there is no
 *	way to unlink the singly linked hashed entries if navigated to
 *	via the queue list off the rooted entries.  Think of it as
 *	hash/walk/pull, keeping track of the prev pointer while walking
 *	the singly linked hash list.  All of this is to save memory and
 *	keep both types of pv_entries as small as possible.
 */

/*

PV HASHING Changes - JK 1/2007

Pve's establish physical to virtual mappings.  These are used for aliasing of a
physical page to (potentially many) virtual addresses within pmaps.  In the
previous implementation the structure of the pv_entries (each 16 bytes in size) was

typedef struct pv_entry {
    struct pv_entry	*next;
    pmap_t		pmap;
    vm_map_offset_t	va;
} *pv_entry_t;

An initial array of these is created at boot time, one per physical page of
memory, indexed by the physical page number.  Additionally, a pool of entries
is created from a pv_zone to be used as needed by pmap_enter() when it is
creating new mappings.  Originally, we kept this pool around because the code
in pmap_enter() was unable to block if it needed an entry and none were
available - we'd panic.  Some time ago I restructured the pmap_enter() code
so that for user pmaps it can block while zalloc'ing a pv structure and restart,
removing a panic from the code (in the case of the kernel pmap we cannot block
and still panic, so we keep a separate hot pool for use only on kernel pmaps).
The pool has not been removed since there is a large performance gain in keeping
freed pv's around for reuse and not suffering the overhead of zalloc for every
new pv we need.

As pmap_enter() created new mappings, it linked the new pve's for them off the
fixed pv array for that ppn (off the next pointer).  These pve's are accessed
for several operations, one of them being address space teardown.  In that case,
we basically do this

	for (every page/pte in the space) {
		calc pve_ptr from the ppn in the pte
		for (every pv in the list for the ppn) {
			if (this pv is for this pmap/vaddr) {
				do housekeeping
				unlink/free the pv
			}
		}
	}

The problem arose when we were running, say, 8000 (or even 2000) apache or
other processes and one or all terminate.  The list hanging off each pv array
entry could have thousands of entries.  We were continuously linearly searching
each of these lists as we stepped through the address space we were tearing
down.  Because of the locks we hold, likely taking a cache miss for each node,
and interrupt disabling for MP issues, the system became completely unresponsive
for many seconds while we did this.

Realizing that pve's are accessed in two distinct ways (linearly running the
list by ppn for operations like pmap_page_protect, and finding and
modifying/removing a single pve as part of pmap_enter processing) has led to
modifying the pve structures and databases.

There are now two types of pve structures.  A "rooted" structure, which is
basically the original structure accessed in an array by ppn, and a "hashed"
structure, accessed on a hash list via a hash of [pmap, vaddr].  These have been
designed with the two goals of minimizing wired memory and making the lookup of
a ppn faster.  Since a vast majority of pages in the system are not aliased
and hence represented by a single pv entry, I've kept the rooted entry size as
small as possible because there is one of these dedicated for every physical
page of memory.
The hashed pve's are larger due to the addition of the hash
link and the ppn entry needed for matching while running the hash list to find
the entry we are looking for.  This way, only systems that have lots of
aliasing (like 2000+ httpd procs) will pay the extra memory price.  Both
structures have the same first three fields, allowing some simplification in
the code.

They have these shapes

typedef struct pv_rooted_entry {
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
} *pv_rooted_entry_t;


typedef struct pv_hashed_entry {
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
	ppnum_t			ppn;
	struct pv_hashed_entry	*nexth;
} *pv_hashed_entry_t;

The main flow difference is that the code is now aware of the rooted entry and
the hashed entries.  Code that runs the pv list still starts with the rooted
entry and then continues down the qlink onto the hashed entries.  Code that is
looking up a specific pv entry first checks the rooted entry and then hashes
and runs the hash list for the match.  The hash list lengths are much smaller
than the original pv lists that contained all aliases for the specific ppn.

*/

typedef struct pv_rooted_entry {
	/* first three entries must match pv_hashed_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va;	/* virtual address for mapping */
	pmap_t			pmap;	/* pmap where mapping lies */
} *pv_rooted_entry_t;

#define PV_ROOTED_ENTRY_NULL	((pv_rooted_entry_t) 0)

typedef struct pv_hashed_entry {
	/* first three entries must match pv_rooted_entry_t */
	queue_head_t		qlink;
	vm_map_offset_t		va;
	pmap_t			pmap;
	ppnum_t			ppn;
	struct pv_hashed_entry	*nexth;
} *pv_hashed_entry_t;

#define PV_HASHED_ENTRY_NULL ((pv_hashed_entry_t)0)

//#define PV_DEBUG 1		/* uncomment to enable some PV debugging code */
#ifdef PV_DEBUG
#define CHK_NPVHASH()	if (0 == npvhashmask) panic("npvhash uninitialized");
#else
#define CHK_NPVHASH()
#endif

#define	NPVHASHBUCKETS	(4096)
#define	NPVHASHMASK	((NPVHASHBUCKETS) - 1)	/* MUST BE 2^N - 1 */
#define	PV_HASHED_LOW_WATER_MARK_DEFAULT	5000
#define	PV_HASHED_KERN_LOW_WATER_MARK_DEFAULT	2000
#define	PV_HASHED_ALLOC_CHUNK_INITIAL		2000
#define	PV_HASHED_KERN_ALLOC_CHUNK_INITIAL	200

extern volatile uint32_t	mappingrecurse;
extern uint32_t	pv_hashed_low_water_mark, pv_hashed_kern_low_water_mark;

/*
 * PV hash locking
 */

#define LOCK_PV_HASH(hash)	lock_hash_hash(hash)
#define UNLOCK_PV_HASH(hash)	unlock_hash_hash(hash)
extern uint32_t npvhashmask;
extern pv_hashed_entry_t	*pv_hash_table;  /* hash lists */
extern pv_hashed_entry_t	pv_hashed_free_list;
extern pv_hashed_entry_t	pv_hashed_kern_free_list;
decl_simple_lock_data(extern, pv_hashed_free_list_lock)
decl_simple_lock_data(extern, pv_hashed_kern_free_list_lock)
decl_simple_lock_data(extern, pv_hash_table_lock)
decl_simple_lock_data(extern, phys_backup_lock)

extern zone_t		pv_hashed_list_zone;	/* zone of pv_hashed_entry
						 * structures */

extern uint32_t		pv_hashed_free_count;
extern uint32_t		pv_hashed_kern_free_count;
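
/*
 * Illustrative sketch (not part of the original header): how a lookup of a
 * specific [pmap, vaddr] mapping proceeds under the rooted/hashed scheme
 * described above -- check the rooted entry first, then hash [pmap, vaddr]
 * and walk the (short) hash chain.  The helpers pai_to_pvh(), ppn_to_pai(),
 * pvhashidx() and pvhash() are declared later in this file;
 * "pv_lookup_example" itself is hypothetical.
 */
#if 0	/* example only; not compiled */
static pv_hashed_entry_t
pv_lookup_example(pmap_t pmap, vm_map_offset_t va, ppnum_t ppn)
{
	pv_rooted_entry_t	pv_h = pai_to_pvh(ppn_to_pai(ppn));
	pv_hashed_entry_t	pvh_e;

	if (pv_h->pmap == pmap && pv_h->va == va)
		return (pv_hashed_entry_t) pv_h;	/* hit on the rooted entry */

	/* Otherwise hash [pmap, va] and walk the short chain. */
	for (pvh_e = *pvhash(pvhashidx(pmap, va));
	     pvh_e != PV_HASHED_ENTRY_NULL;
	     pvh_e = pvh_e->nexth) {
		if (pvh_e->pmap == pmap && pvh_e->va == va && pvh_e->ppn == ppn)
			return pvh_e;
	}
	return PV_HASHED_ENTRY_NULL;
}
#endif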
/*
 *	Each entry in the pv_head_table is locked by a bit in the
 *	pv_lock_table.  The lock bits are accessed by the address of
 *	the frame they lock.
 */
#define pv_lock_table_size(n)		(((n)+BYTE_SIZE-1)/BYTE_SIZE)
#define pv_hash_lock_table_size(n)	(((n)+BYTE_SIZE-1)/BYTE_SIZE)
extern char		*pv_lock_table;		/* pointer to array of bits */
extern char		*pv_hash_lock_table;
extern pv_rooted_entry_t pv_head_table;		/* array of entries, one per page */

extern event_t mapping_replenish_event;

static inline void	PV_HASHED_ALLOC(pv_hashed_entry_t *pvh_ep) {
	pmap_assert(*pvh_ep == PV_HASHED_ENTRY_NULL);
	simple_lock(&pv_hashed_free_list_lock);
	/* If the kernel reserved pool is low, let non-kernel mappings allocate
	 * synchronously, possibly subject to a throttle.
	 */
	if ((pv_hashed_kern_free_count > pv_hashed_kern_low_water_mark) && ((*pvh_ep = pv_hashed_free_list) != 0)) {
		pv_hashed_free_list = (pv_hashed_entry_t)(*pvh_ep)->qlink.next;
		pv_hashed_free_count--;
	}

	simple_unlock(&pv_hashed_free_list_lock);

	if (pv_hashed_free_count <= pv_hashed_low_water_mark) {
		if (!mappingrecurse && hw_compare_and_store(0, 1, &mappingrecurse))
			thread_wakeup(&mapping_replenish_event);
	}
}

static inline void	PV_HASHED_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
	simple_lock(&pv_hashed_free_list_lock);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_free_list;
	pv_hashed_free_list = pvh_eh;
	pv_hashed_free_count += pv_cnt;
	simple_unlock(&pv_hashed_free_list_lock);
}

extern unsigned pmap_kern_reserve_alloc_stat;

static inline void	PV_HASHED_KERN_ALLOC(pv_hashed_entry_t *pvh_e) {
	pmap_assert(*pvh_e == PV_HASHED_ENTRY_NULL);
	simple_lock(&pv_hashed_kern_free_list_lock);

	if ((*pvh_e = pv_hashed_kern_free_list) != 0) {
		pv_hashed_kern_free_list = (pv_hashed_entry_t)(*pvh_e)->qlink.next;
		pv_hashed_kern_free_count--;
		pmap_kern_reserve_alloc_stat++;
	}

	simple_unlock(&pv_hashed_kern_free_list_lock);

	if (pv_hashed_kern_free_count < pv_hashed_kern_low_water_mark) {
		if (!mappingrecurse && hw_compare_and_store(0, 1, &mappingrecurse))
			thread_wakeup(&mapping_replenish_event);
	}
}

static inline void	PV_HASHED_KERN_FREE_LIST(pv_hashed_entry_t pvh_eh, pv_hashed_entry_t pvh_et, int pv_cnt) {
	simple_lock(&pv_hashed_kern_free_list_lock);
	pvh_et->qlink.next = (queue_entry_t)pv_hashed_kern_free_list;
	pv_hashed_kern_free_list = pvh_eh;
	pv_hashed_kern_free_count += pv_cnt;
	simple_unlock(&pv_hashed_kern_free_list_lock);
}
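
/*
 * Illustrative sketch (not part of the original header): how pmap_enter()-
 * style code is expected to obtain a pv_hashed_entry_t under the scheme
 * described in the PV HASHING comment above -- try the cached free list
 * first, fall back to the kernel reserve for the kernel pmap, and block in
 * zalloc() (possibly throttled) for user pmaps.  "example_alloc_pv" is
 * hypothetical; zalloc() comes from the zone allocator and
 * pmap_pv_throttle() is defined just below.
 */
#if 0	/* example only; not compiled */
static pv_hashed_entry_t
example_alloc_pv(pmap_t pmap)
{
	pv_hashed_entry_t pvh_e = PV_HASHED_ENTRY_NULL;

	PV_HASHED_ALLOC(&pvh_e);
	if (pvh_e == PV_HASHED_ENTRY_NULL) {
		if (pmap == kernel_pmap) {
			/* Kernel mappings may not block; use the reserve. */
			PV_HASHED_KERN_ALLOC(&pvh_e);
		} else {
			/* User mappings may block (and may be throttled). */
			pmap_pv_throttle(pmap);
			pvh_e = (pv_hashed_entry_t) zalloc(pv_hashed_list_zone);
		}
	}
	return pvh_e;
}
#endif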
extern uint64_t pmap_pv_throttle_stat, pmap_pv_throttled_waiters;
extern event_t pmap_user_pv_throttle_event;

static inline void	pmap_pv_throttle(__unused pmap_t p) {
	pmap_assert(p != kernel_pmap);
	/* Apply throttle on non-kernel mappings */
	if (pv_hashed_kern_free_count < (pv_hashed_kern_low_water_mark / 2)) {
		pmap_pv_throttle_stat++;
		/* This doesn't need to be strictly accurate, merely a hint
		 * to eliminate the timeout when the reserve is replenished.
		 */
		pmap_pv_throttled_waiters++;
		assert_wait_timeout(&pmap_user_pv_throttle_event, THREAD_UNINT, 1, 1000 * NSEC_PER_USEC);
		thread_block(THREAD_CONTINUE_NULL);
	}
}

/*
 *	Index into pv_head table, its lock bits, and the modify/reference and managed bits
 */

#define pa_index(pa)		(i386_btop(pa))
#define ppn_to_pai(ppn)		((int)ppn)

#define pai_to_pvh(pai)		(&pv_head_table[pai])
#define lock_pvh_pai(pai)	bit_lock(pai, (void *)pv_lock_table)
#define unlock_pvh_pai(pai)	bit_unlock(pai, (void *)pv_lock_table)
#define pvhash(idx)		(&pv_hash_table[idx])
#define lock_hash_hash(hash)	bit_lock(hash, (void *)pv_hash_lock_table)
#define unlock_hash_hash(hash)	bit_unlock(hash, (void *)pv_hash_lock_table)

#define IS_MANAGED_PAGE(x)				\
	((unsigned int)(x) <= last_managed_page &&	\
	 (pmap_phys_attributes[x] & PHYS_MANAGED))
#define IS_INTERNAL_PAGE(x)				\
	(IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_INTERNAL))
#define IS_REUSABLE_PAGE(x)				\
	(IS_MANAGED_PAGE(x) && (pmap_phys_attributes[x] & PHYS_REUSABLE))

/*
 *	Physical page attributes.  Copy bits from PTE definition.
 */
#define	PHYS_MODIFIED	INTEL_PTE_MOD	/* page modified */
#define	PHYS_REFERENCED	INTEL_PTE_REF	/* page referenced */
#define	PHYS_MANAGED	INTEL_PTE_VALID	/* page is managed */
#define	PHYS_NOENCRYPT	INTEL_PTE_USER	/* no need to encrypt this page in the hibernation image */
#define	PHYS_NCACHE	INTEL_PTE_NCACHE
#define	PHYS_PTA	INTEL_PTE_PTA
#define	PHYS_CACHEABILITY_MASK (INTEL_PTE_PTA | INTEL_PTE_NCACHE)
#define	PHYS_INTERNAL	INTEL_PTE_WTHRU	/* page from internal object */
#define	PHYS_REUSABLE	INTEL_PTE_WRITE	/* page is "reusable" */

extern const boolean_t	pmap_disable_kheap_nx;
extern const boolean_t	pmap_disable_kstack_nx;

#define PMAP_EXPAND_OPTIONS_NONE	(0x0)
#define PMAP_EXPAND_OPTIONS_NOWAIT	(PMAP_OPTIONS_NOWAIT)
#define PMAP_EXPAND_OPTIONS_NOENTER	(PMAP_OPTIONS_NOENTER)

/*
 *	Amount of virtual memory mapped by one
 *	page-directory entry.
 */
#define	PDE_MAPPED_SIZE		(pdetova(1))


/*
 *	Locking and TLB invalidation
 */

/*
 *	Locking Protocols: (changed 2/2007 JK)
 *
 *	There are two structures in the pmap module that need locking:
 *	the pmaps themselves, and the per-page pv_lists (which are locked
 *	by locking the pv_lock_table entry that corresponds to the pv_head
 *	for the list in question.)  Most routines want to lock a pmap and
 *	then do operations in it that require pv_list locking -- however
 *	pmap_remove_all and pmap_copy_on_write operate on a physical page
 *	basis and want to do the locking in the reverse order, i.e. lock
 *	a pv_list and then go through all the pmaps referenced by that list.
 *
 *	The system-wide pmap lock has been removed.  Now, paths take a lock
 *	on the pmap before changing its 'shape', while the reverse-order lockers
 *	(coming in by phys ppn) take a lock on the corresponding pv and then
 *	retest to be sure nothing changed during the window before they locked;
 *	they can then run up/down the pv lists holding the list lock.  This also
 *	lets the pmap layer run (nearly completely) interrupt enabled, unlike
 *	previously.
 */

/*
 * PV locking
 */

#define LOCK_PVH(index)	{		\
	mp_disable_preemption();	\
	lock_pvh_pai(index);		\
}

#define UNLOCK_PVH(index) {		\
	unlock_pvh_pai(index);		\
	mp_enable_preemption();		\
}

extern uint64_t pde_mapped_size;

extern char		*pmap_phys_attributes;
extern ppnum_t		last_managed_page;

extern ppnum_t	lowest_lo;
extern ppnum_t	lowest_hi;
extern ppnum_t	highest_hi;

/*
 * When spinning through pmap_remove, ensure that we don't spend too much
 * time with preemption disabled.  The current threshold is 20us.
 */
#define MAX_PREEMPTION_LATENCY_NS 20000
extern uint64_t max_preemption_latency_tsc;

/* #define DEBUGINTERRUPTS 1  uncomment to ensure pmap callers have interrupts enabled */
#ifdef DEBUGINTERRUPTS
#define pmap_intr_assert() {							\
	if (processor_avail_count > 1 && !ml_get_interrupts_enabled())		\
		panic("pmap interrupt assert %s, %d", __FILE__, __LINE__);	\
}
#else
#define pmap_intr_assert()
#endif

extern int		nx_enabled;
extern unsigned int	inuse_ptepages_count;

static inline uint32_t
pvhashidx(pmap_t pmap, vm_map_offset_t va)
{
	uint32_t hashidx = ((uint32_t)(uintptr_t)pmap ^
		((uint32_t)(va >> PAGE_SHIFT) & 0xFFFFFFFF)) &
	       npvhashmask;
	return hashidx;
}


/*
 * Unlinks the pv_hashed_entry_t pvh from the singly linked hash chain
 * and properly deals with the anchor.
 * Must be called with the hash locked; does not unlock it.
 */
static inline void
pmap_pvh_unlink(pv_hashed_entry_t pvh)
{
	pv_hashed_entry_t	curh;
	pv_hashed_entry_t	*pprevh;
	int			pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh->pmap, pvh->va);

	pprevh = pvhash(pvhash_idx);

#if PV_DEBUG
	if (NULL == *pprevh)
		panic("pvh_unlink null anchor"); /* JK DEBUG */
#endif
	curh = *pprevh;

	while (PV_HASHED_ENTRY_NULL != curh) {
		if (pvh == curh)
			break;
		pprevh = &curh->nexth;
		curh = curh->nexth;
	}
	if (PV_HASHED_ENTRY_NULL == curh)
		panic("pmap_pvh_unlink no pvh");
	*pprevh = pvh->nexth;
	return;
}

static inline void
pv_hash_add(pv_hashed_entry_t	pvh_e,
	    pv_rooted_entry_t	pv_h)
{
	pv_hashed_entry_t	*hashp;
	int			pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	insque(&pvh_e->qlink, &pv_h->qlink);
	hashp = pvhash(pvhash_idx);
#if PV_DEBUG
	if (NULL == hashp)
		panic("pv_hash_add(%p) null hash bucket", pvh_e);
#endif
	pvh_e->nexth = *hashp;
	*hashp = pvh_e;
	UNLOCK_PV_HASH(pvhash_idx);
}

static inline void
pv_hash_remove(pv_hashed_entry_t pvh_e)
{
	int pvhash_idx;

	CHK_NPVHASH();
	pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
	LOCK_PV_HASH(pvhash_idx);
	remque(&pvh_e->qlink);
	pmap_pvh_unlink(pvh_e);
	UNLOCK_PV_HASH(pvhash_idx);
}

static inline boolean_t popcnt1(uint64_t distance) {
	return ((distance & (distance - 1)) == 0);
}
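
/*
 * Illustrative sketch (not part of the original header): a physical-page
 * based operation, in the "reverse" lock order described in the Locking
 * Protocols comment above -- lock the pv_head_table entry, retest that the
 * rooted entry is still live, then walk the qlink chain of all mappings of
 * the page.  "example_for_each_mapping" and its callback are hypothetical.
 */
#if 0	/* example only; not compiled */
static void
example_for_each_mapping(ppnum_t pn, void (*cb)(pmap_t, vm_map_offset_t))
{
	int			pai = ppn_to_pai(pn);
	pv_rooted_entry_t	pv_h = pai_to_pvh(pai);
	pv_rooted_entry_t	pv_e;

	LOCK_PVH(pai);
	if (pv_h->pmap != PMAP_NULL) {		/* retest under the lock */
		pv_e = pv_h;
		do {
			cb(pv_e->pmap, pv_e->va);
			pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink);
		} while (pv_e != pv_h);
	}
	UNLOCK_PVH(pai);
}
#endif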

/*
 * Routines to handle suppression of/recovery from some forms of pagetable corruption
 * incidents observed in the field.  These can be either software induced (wild
 * stores to the mapwindows where applicable, use-after-free errors
 * (typically of pages addressed physically), mis-directed DMAs, etc.) or due
 * to DRAM/memory hierarchy/interconnect errors.  Given the theoretical rarity of these errors,
 * the recording mechanism is deliberately not MP-safe.  The overarching goal is to
 * still assert on potential software races, but attempt recovery from incidents
 * identifiable as occurring due to issues beyond the control of the pmap module.
 * The latter includes single-bit errors and malformed pagetable entries.
 * We currently limit ourselves to recovery/suppression of one incident per
 * PMAP_PAGETABLE_CORRUPTION_INTERVAL seconds, and details of the incident
 * are logged.
 * Assertions are not suppressed if kernel debugging is enabled. (DRK 09)
 */

typedef enum {
	PTE_VALID		= 0x0,
	PTE_INVALID		= 0x1,
	PTE_RSVD		= 0x2,
	PTE_SUPERVISOR		= 0x4,
	PTE_BITFLIP		= 0x8,
	PV_BITFLIP		= 0x10,
	PTE_INVALID_CACHEABILITY = 0x20
} pmap_pagetable_corruption_t;

typedef enum {
	ROOT_PRESENT = 0,
	ROOT_ABSENT = 1
} pmap_pv_assertion_t;

typedef enum {
	PMAP_ACTION_IGNORE	= 0x0,
	PMAP_ACTION_ASSERT	= 0x1,
	PMAP_ACTION_RETRY	= 0x2,
	PMAP_ACTION_RETRY_RELOCK = 0x4
} pmap_pagetable_corruption_action_t;

#define	PMAP_PAGETABLE_CORRUPTION_INTERVAL (6ULL * 3600ULL)
extern uint64_t pmap_pagetable_corruption_interval_abstime;

extern uint32_t pmap_pagetable_corruption_incidents;
#define PMAP_PAGETABLE_CORRUPTION_MAX_LOG (8)
typedef struct {
	pmap_pv_assertion_t incident;
	pmap_pagetable_corruption_t reason;
	pmap_pagetable_corruption_action_t action;
	pmap_t	pmap;
	vm_map_offset_t vaddr;
	pt_entry_t pte;
	ppnum_t ppn;
	pmap_t pvpmap;
	vm_map_offset_t pvva;
	uint64_t abstime;
} pmap_pagetable_corruption_record_t;

extern pmap_pagetable_corruption_record_t pmap_pagetable_corruption_records[];
extern uint64_t pmap_pagetable_corruption_last_abstime;
extern thread_call_t	pmap_pagetable_corruption_log_call;
extern boolean_t pmap_pagetable_corruption_timeout;

static inline void
pmap_pagetable_corruption_log(pmap_pv_assertion_t incident, pmap_pagetable_corruption_t suppress_reason, pmap_pagetable_corruption_action_t action, pmap_t pmap, vm_map_offset_t vaddr, pt_entry_t *ptep, ppnum_t ppn, pmap_t pvpmap, vm_map_offset_t pvva) {
	uint32_t pmap_pagetable_corruption_log_index;
	pmap_pagetable_corruption_log_index = pmap_pagetable_corruption_incidents++ % PMAP_PAGETABLE_CORRUPTION_MAX_LOG;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].incident = incident;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].reason = suppress_reason;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].action = action;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pmap = pmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].vaddr = vaddr;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pte = *ptep;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].ppn = ppn;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvpmap = pvpmap;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].pvva = pvva;
	pmap_pagetable_corruption_records[pmap_pagetable_corruption_log_index].abstime = mach_absolute_time();
	/* Asynchronously log */
	thread_call_enter(pmap_pagetable_corruption_log_call);
}

static inline pmap_pagetable_corruption_action_t
pmap_classify_pagetable_corruption(pmap_t pmap,
    vm_map_offset_t vaddr, ppnum_t *ppnp, pt_entry_t *ptep, pmap_pv_assertion_t incident) {
	pmap_pagetable_corruption_action_t	action = PMAP_ACTION_ASSERT;
	pmap_pagetable_corruption_t	suppress_reason = PTE_VALID;
	ppnum_t			suppress_ppn = 0;
	pt_entry_t		cpte = *ptep;
	ppnum_t			cpn = pa_index(pte_to_pa(cpte));
	ppnum_t			ppn = *ppnp;
	pv_rooted_entry_t	pv_h = pai_to_pvh(ppn_to_pai(ppn));
	pv_rooted_entry_t	pv_e = pv_h;
	uint32_t		bitdex;
	pmap_t			pvpmap = pv_h->pmap;
	vm_map_offset_t		pvva = pv_h->va;
	boolean_t		ppcd = FALSE;

	/* Ideally, we'd consult the Mach VM here to definitively determine
	 * the nature of the mapping for this address space and address.
	 * As that would be a layering violation in this context, we
	 * use various heuristics to recover from single bit errors,
	 * malformed pagetable entries etc.  These are not intended
	 * to be comprehensive.
	 */

	/* As a precautionary measure, mark A+D */
	pmap_phys_attributes[ppn_to_pai(ppn)] |= (PHYS_MODIFIED | PHYS_REFERENCED);

	/*
	 * Correct potential single bit errors in either (but not both) element
	 * of the PV
	 */
	do {
		if ((popcnt1((uintptr_t)pv_e->pmap ^ (uintptr_t)pmap) && pv_e->va == vaddr) ||
		    (pv_e->pmap == pmap && popcnt1(pv_e->va ^ vaddr))) {
			pv_e->pmap = pmap;
			pv_e->va = vaddr;
			suppress_reason = PV_BITFLIP;
			action = PMAP_ACTION_RETRY;
			goto pmap_cpc_exit;
		}
	} while (((pv_e = (pv_rooted_entry_t) queue_next(&pv_e->qlink))) && (pv_e != pv_h));

	/* Discover root entries with a Hamming
	 * distance of 1 from the supplied
	 * physical page frame.
	 */
	for (bitdex = 0; bitdex < (sizeof(ppnum_t) << 3); bitdex++) {
		ppnum_t npn = cpn ^ (ppnum_t) (1ULL << bitdex);
		if (IS_MANAGED_PAGE(npn)) {
			pv_rooted_entry_t npv_h = pai_to_pvh(ppn_to_pai(npn));
			if (npv_h->va == vaddr && npv_h->pmap == pmap) {
				suppress_reason = PTE_BITFLIP;
				suppress_ppn = npn;
				action = PMAP_ACTION_RETRY_RELOCK;
				UNLOCK_PVH(ppn_to_pai(ppn));
				*ppnp = npn;
				goto pmap_cpc_exit;
			}
		}
	}

	if (pmap == kernel_pmap) {
		action = PMAP_ACTION_ASSERT;
		goto pmap_cpc_exit;
	}

	/* Check for malformed/inconsistent entries */

	if ((cpte & (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU | INTEL_PTE_PTA)) == (INTEL_PTE_NCACHE | INTEL_PTE_WTHRU)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_INVALID_CACHEABILITY;
	}
	else if (cpte & INTEL_PTE_RSVD) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_RSVD;
	}
	else if ((pmap != kernel_pmap) && ((cpte & INTEL_PTE_USER) == 0)) {
		action = PMAP_ACTION_IGNORE;
		suppress_reason = PTE_SUPERVISOR;
	}
pmap_cpc_exit:
	PE_parse_boot_argn("-pmap_pagetable_corruption_deassert", &ppcd, sizeof(ppcd));

	if (debug_boot_arg && !ppcd) {
		action = PMAP_ACTION_ASSERT;
	}

	if ((mach_absolute_time() - pmap_pagetable_corruption_last_abstime) < pmap_pagetable_corruption_interval_abstime) {
		action = PMAP_ACTION_ASSERT;
		pmap_pagetable_corruption_timeout = TRUE;
	}
	else
	{
		pmap_pagetable_corruption_last_abstime = mach_absolute_time();
	}
	pmap_pagetable_corruption_log(incident, suppress_reason, action, pmap, vaddr, &cpte, *ppnp, pvpmap, pvva);
	return action;
}

/*
 * Remove pv list entry.
 * Called with pv_head_table entry locked.
 * Returns pv entry to be freed (or NULL).
 */
static inline __attribute__((always_inline)) pv_hashed_entry_t
pmap_pv_remove(pmap_t		pmap,
	       vm_map_offset_t	vaddr,
	       ppnum_t		*ppnp,
	       pt_entry_t	*pte)
{
	pv_hashed_entry_t	pvh_e;
	pv_rooted_entry_t	pv_h;
	pv_hashed_entry_t	*pprevh;
	int			pvhash_idx;
	uint32_t		pv_cnt;
	ppnum_t			ppn;

pmap_pv_remove_retry:
	ppn = *ppnp;
	pvh_e = PV_HASHED_ENTRY_NULL;
	pv_h = pai_to_pvh(ppn_to_pai(ppn));

	if (__improbable(pv_h->pmap == PMAP_NULL)) {
		pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_ABSENT);
		if (pac == PMAP_ACTION_IGNORE)
			goto pmap_pv_remove_exit;
		else if (pac == PMAP_ACTION_ASSERT)
			panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p, %p): null pv_list!", pmap, vaddr, ppn, *pte, ppnp, pte);
		else if (pac == PMAP_ACTION_RETRY_RELOCK) {
			LOCK_PVH(ppn_to_pai(*ppnp));
			pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
			goto pmap_pv_remove_retry;
		}
		else if (pac == PMAP_ACTION_RETRY)
			goto pmap_pv_remove_retry;
	}

	if (pv_h->va == vaddr && pv_h->pmap == pmap) {
		/*
		 * Header is the pv_rooted_entry.
		 * We can't free that.  If there is a queued
		 * entry after this one, we remove that one
		 * from the ppn queue, remove it from the hash chain
		 * and copy it to the rooted entry.  Then free it instead.
		 */
		pvh_e = (pv_hashed_entry_t) queue_next(&pv_h->qlink);
		if (pv_h != (pv_rooted_entry_t) pvh_e) {
			/*
			 * Entry queued to root, remove this from hash
			 * and install as new root.
			 */
			CHK_NPVHASH();
			pvhash_idx = pvhashidx(pvh_e->pmap, pvh_e->va);
			LOCK_PV_HASH(pvhash_idx);
			remque(&pvh_e->qlink);
			pprevh = pvhash(pvhash_idx);
			if (PV_HASHED_ENTRY_NULL == *pprevh) {
				panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x): "
				      "empty hash, removing rooted",
				      pmap, vaddr, ppn);
			}
			pmap_pvh_unlink(pvh_e);
			UNLOCK_PV_HASH(pvhash_idx);
			pv_h->pmap = pvh_e->pmap;
			pv_h->va = pvh_e->va;	/* dispose of pvh_e */
		} else {
			/* none queued after rooted */
			pv_h->pmap = PMAP_NULL;
			pvh_e = PV_HASHED_ENTRY_NULL;
		}
	} else {
		/*
		 * Not removing the rooted pv: find it on the hash chain,
		 * remove it from the ppn queue and the hash chain, and free it.
		 */
		CHK_NPVHASH();
		pvhash_idx = pvhashidx(pmap, vaddr);
		LOCK_PV_HASH(pvhash_idx);
		pprevh = pvhash(pvhash_idx);
		if (PV_HASHED_ENTRY_NULL == *pprevh) {
			panic("Possible memory corruption: pmap_pv_remove(%p,0x%llx,0x%x, 0x%llx, %p): empty hash",
			      pmap, vaddr, ppn, *pte, pte);
		}
		pvh_e = *pprevh;
		pmap_pv_hashlist_walks++;
		pv_cnt = 0;
		while (PV_HASHED_ENTRY_NULL != pvh_e) {
			pv_cnt++;
			if (pvh_e->pmap == pmap &&
			    pvh_e->va == vaddr &&
			    pvh_e->ppn == ppn)
				break;
			pprevh = &pvh_e->nexth;
			pvh_e = pvh_e->nexth;
		}

		if (PV_HASHED_ENTRY_NULL == pvh_e) {
			pmap_pagetable_corruption_action_t pac = pmap_classify_pagetable_corruption(pmap, vaddr, ppnp, pte, ROOT_PRESENT);

			if (pac == PMAP_ACTION_ASSERT)
				panic("Possible memory corruption: pmap_pv_remove(%p, 0x%llx, 0x%x, 0x%llx, %p, %p): pv not on hash, head: %p, 0x%llx", pmap, vaddr, ppn, *pte, ppnp, pte, pv_h->pmap, pv_h->va);
			else {
				UNLOCK_PV_HASH(pvhash_idx);
				if (pac == PMAP_ACTION_RETRY_RELOCK) {
					LOCK_PVH(ppn_to_pai(*ppnp));
					pmap_phys_attributes[ppn_to_pai(*ppnp)] |= (PHYS_MODIFIED | PHYS_REFERENCED);
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_RETRY) {
					goto pmap_pv_remove_retry;
				}
				else if (pac == PMAP_ACTION_IGNORE) {
					goto pmap_pv_remove_exit;
				}
			}
		}

		pmap_pv_hashlist_cnts += pv_cnt;
		if (pmap_pv_hashlist_max < pv_cnt)
			pmap_pv_hashlist_max = pv_cnt;
		*pprevh = pvh_e->nexth;
		remque(&pvh_e->qlink);
		UNLOCK_PV_HASH(pvhash_idx);
	}
pmap_pv_remove_exit:
	return pvh_e;
}


extern int	pt_fake_zone_index;
static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap, vm_size_t bytes)
{
	thread_t thr = current_thread();
	task_t task;
	zinfo_usage_t zinfo;

	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);

	if (pt_fake_zone_index != -1 &&
	    (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
		OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].alloc);
}

static inline void
PMAP_ZINFO_PFREE(pmap_t pmap, vm_size_t bytes)
{
	thread_t thr = current_thread();
	task_t task;
	zinfo_usage_t zinfo;

	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);

	if (pt_fake_zone_index != -1 &&
	    (task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
		OSAddAtomic64(bytes, (int64_t *)&zinfo[pt_fake_zone_index].free);
}

static inline void
PMAP_ZINFO_SALLOC(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_credit(pmap, task_ledgers.tkm_shared, bytes);
}

static inline void
PMAP_ZINFO_SFREE(pmap_t pmap, vm_size_t bytes)
{
	pmap_ledger_debit(pmap, task_ledgers.tkm_shared, bytes);
}

extern boolean_t	pmap_initialized;	/* Has pmap_init completed? */
#define valid_page(x) (pmap_initialized && pmap_valid_page(x))

// XXX
#define HIGH_MEM_BASE	((uint32_t)( -NBPDE) )	/* shared gdt etc seg addr */ /* XXX64 ?? */
// XXX


int		phys_attribute_test(
			ppnum_t		phys,
			int		bits);
void		phys_attribute_clear(
			ppnum_t		phys,
			int		bits,
			unsigned int	options,
			void		*arg);

//#define PCID_DEBUG 1
#if	PCID_DEBUG
#define pmap_pcid_log(fmt, args...)		\
	do {					\
		kprintf(fmt, ##args);		\
		printf(fmt, ##args);		\
	} while(0)
#else
#define pmap_pcid_log(fmt, args...)
#endif
void	pmap_pcid_configure(void);


/*
 * Atomic 64-bit compare and exchange of a page table entry.
 */
static inline boolean_t
pmap_cmpx_pte(pt_entry_t *entryp, pt_entry_t old, pt_entry_t new)
{
	boolean_t ret;

	/*
	 * Load the old value into %rax
	 * Load the new value into another register
	 * Compare-exchange-quad at address entryp
	 * If the compare succeeds, the new value is stored, return TRUE.
	 * Otherwise, no swap is made, return FALSE.
	 */
	asm volatile(
		"	lock; cmpxchgq %2,(%3)	\n\t"
		"	setz	%%al		\n\t"
		"	movzbl	%%al,%0"
		: "=a" (ret)
		: "a" (old),
		  "r" (new),
		  "r" (entryp)
		: "memory");
	return ret;
}

extern uint32_t pmap_update_clear_pte_count;

static inline void pmap_update_pte(pt_entry_t *mptep, uint64_t pclear_bits, uint64_t pset_bits) {
	pt_entry_t npte, opte;
	do {
		opte = *mptep;
		if (__improbable(opte == 0)) {
			pmap_update_clear_pte_count++;
			break;
		}
		npte = opte & ~(pclear_bits);
		npte |= pset_bits;
	} while (!pmap_cmpx_pte(mptep, opte, npte));
}

#if	defined(__x86_64__)
/*
 * The single pml4 page per pmap is allocated at pmap create time and exists
 * for the duration of the pmap.  We allocate this page in kernel vm.
 * This returns the address of the requested pml4 entry in the top level page.
 */
static inline
pml4_entry_t *
pmap64_pml4(pmap_t pmap, vm_map_offset_t vaddr)
{
	if (__improbable((vaddr > 0x00007FFFFFFFFFFFULL) &&
		(vaddr < 0xFFFF800000000000ULL))) {
		return (NULL);
	}

#if	PMAP_ASSERT
	return PHYSMAP_PTOV(&((pml4_entry_t *)pmap->pm_cr3)[(vaddr >> PML4SHIFT) & (NPML4PG-1)]);
#else
	return &pmap->pm_pml4[(vaddr >> PML4SHIFT) & (NPML4PG-1)];
#endif
}

/*
 * Returns the address of the requested PDPT entry in the physmap.
 */
static inline pdpt_entry_t *
pmap64_pdpt(pmap_t pmap, vm_map_offset_t vaddr)
{
	pml4_entry_t	newpf;
	pml4_entry_t	*pml4;

	pml4 = pmap64_pml4(pmap, vaddr);
	if (pml4 && ((*pml4 & INTEL_PTE_VALID))) {
		newpf = *pml4 & PG_FRAME;
		return &((pdpt_entry_t *) PHYSMAP_PTOV(newpf))
			[(vaddr >> PDPTSHIFT) & (NPDPTPG-1)];
	}
	return (NULL);
}

/*
 * Returns the address of the requested PDE entry in the physmap.
 */
static inline pd_entry_t *
pmap64_pde(pmap_t pmap, vm_map_offset_t vaddr)
{
	pdpt_entry_t	newpf;
	pdpt_entry_t	*pdpt;

	pdpt = pmap64_pdpt(pmap, vaddr);

	if (pdpt && ((*pdpt & INTEL_PTE_VALID))) {
		newpf = *pdpt & PG_FRAME;
		return &((pd_entry_t *) PHYSMAP_PTOV(newpf))
			[(vaddr >> PDSHIFT) & (NPDPG-1)];
	}
	return (NULL);
}

static inline pd_entry_t *
pmap_pde(pmap_t m, vm_map_offset_t v)
{
	pd_entry_t	*pde;

	pde = pmap64_pde(m, v);

	return pde;
}
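
/*
 * Illustrative sketch (not part of the original header): pmap_update_pte()
 * above is how PTE bits are flipped without losing concurrent hardware A/D
 * updates -- e.g. write-protecting a mapping by clearing INTEL_PTE_WRITE.
 * "example_write_protect" is hypothetical; pmap_pte() is defined just below,
 * and the caller is assumed to hold the pmap lock and to flush TLBs for the
 * affected range afterwards.
 */
#if 0	/* example only; not compiled */
static void
example_write_protect(pmap_t pmap, vm_map_offset_t va)
{
	pt_entry_t *ptep = pmap_pte(pmap, va);

	if (ptep != NULL && (*ptep & INTEL_PTE_VALID)) {
		pmap_update_pte(ptep, INTEL_PTE_WRITE, 0);
		PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
	}
}
#endif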

/*
 * Return the address of the mapped pte for vaddr in pmap.
 *
 * In case the pde maps a superpage, return the pde, which, in this case,
 * is the actual page table entry.
 */
static inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_map_offset_t vaddr)
{
	pd_entry_t	*pde;
	pd_entry_t	newpf;

	assert(pmap);
	pde = pmap64_pde(pmap, vaddr);

	if (pde && ((*pde & INTEL_PTE_VALID))) {
		if (*pde & INTEL_PTE_PS)
			return pde;
		newpf = *pde & PG_FRAME;
		return &((pt_entry_t *)PHYSMAP_PTOV(newpf))
			[i386_btop(vaddr) & (ppnum_t)(NPTEPG-1)];
	}
	return (NULL);
}
#endif
#if	DEBUG
#define DPRINTF(x...)	kprintf(x)
#else
#define DPRINTF(x...)
#endif

#endif /* MACH_KERNEL_PRIVATE */
#endif /* _I386_PMAP_INTERNAL_ */