/*
 * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */

/*
 *	File:	pmap.c
 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
 *	(These guys wrote the Vax version)
 *
 *	Physical Map management code for Intel i386, i486, and i860.
 *
 *	Manages physical address maps.
 *
 *	In addition to hardware address maps, this
 *	module is called upon to provide software-use-only
 *	maps which may or may not be stored in the same
 *	form as hardware maps.  These pseudo-maps are
 *	used to store intermediate results from copy
 *	operations to and from address spaces.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidates expensive,
 *	this module may delay invalidate or reduced protection
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and to when physical maps must be made correct.
 */

#include <string.h>
#include <mach_ldebug.h>

#include <libkern/OSAtomic.h>

#include <mach/machine/vm_types.h>

#include <mach/boolean.h>
#include <kern/thread.h>
#include <kern/zalloc.h>
#include <kern/queue.h>
#include <kern/ledger.h>
#include <kern/mach_param.h>

#include <kern/lock.h>
#include <kern/kalloc.h>
#include <kern/spl.h>

#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <mach/vm_param.h>
#include <mach/vm_prot.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>

#include <mach/machine/vm_param.h>
#include <machine/thread.h>

#include <kern/misc_protos.h>	/* prototyping */
#include <i386/misc_protos.h>
#include <i386/i386_lowmem.h>
#include <x86_64/lowglobals.h>

#include <i386/cpuid.h>
#include <i386/cpu_data.h>
#include <i386/cpu_number.h>
#include <i386/machine_cpu.h>
#include <i386/seg.h>
#include <i386/serial_io.h>
#include <i386/cpu_capabilities.h>
#include <i386/machine_routines.h>
#include <i386/proc_reg.h>
#include <i386/tsc.h>
#include <i386/pmap_internal.h>
#include <i386/pmap_pcid.h>

#include <vm/vm_protos.h>

#include <i386/mp.h>
#include <i386/mp_desc.h>
#include <libkern/kernel_mach_header.h>

#include <pexpert/i386/efi.h>


#ifdef IWANTTODEBUG
#undef DEBUG
#define DEBUG 1
#define POSTCODE_DELAY 1
#include <i386/postcode.h>
#endif /* IWANTTODEBUG */

#ifdef PMAP_DEBUG
#define DBG(x...)	kprintf("DBG: " x)
#else
#define DBG(x...)
#endif
/* Compile time assert to ensure adjacency/alignment of per-CPU data fields used
 * in the trampolines for kernel/user boundary TLB coherency.
 */
char pmap_cpu_data_assert[(((offsetof(cpu_data_t, cpu_tlb_invalid) - offsetof(cpu_data_t, cpu_active_cr3)) == 8) && (offsetof(cpu_data_t, cpu_active_cr3) % 64 == 0)) ? 1 : -1];
boolean_t pmap_trace = FALSE;

boolean_t	no_shared_cr3 = DEBUG;	/* TRUE for DEBUG by default */

int nx_enabled = 1;			/* enable no-execute protection */
int allow_data_exec  = VM_ABI_32;	/* 32-bit apps may execute data by default, 64-bit apps may not */
int allow_stack_exec = 0;		/* No apps may execute from the stack by default */

const boolean_t cpu_64bit = TRUE;	/* Mais oui! */

uint64_t max_preemption_latency_tsc = 0;

pv_hashed_entry_t	*pv_hash_table;	/* hash lists */

uint32_t npvhash = 0;

pv_hashed_entry_t	pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
pv_hashed_entry_t	pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
decl_simple_lock_data(,pv_hashed_free_list_lock)
decl_simple_lock_data(,pv_hashed_kern_free_list_lock)
decl_simple_lock_data(,pv_hash_table_lock)

zone_t		pv_hashed_list_zone;	/* zone of pv_hashed_entry structures */

/*
 *	First and last physical addresses that we maintain any information
 *	for.  Initialized to zero so that pmap operations done before
 *	pmap_init won't touch any non-existent structures.
 */
boolean_t	pmap_initialized = FALSE;	/* Has pmap_init completed? */

static struct vm_object kptobj_object_store;
static struct vm_object kpml4obj_object_store;
static struct vm_object kpdptobj_object_store;

/*
 *	Array of physical page attributes for managed pages.
 *	One byte per physical page.
 */
char		*pmap_phys_attributes;
ppnum_t		last_managed_page = 0;

/*
 *	Amount of virtual memory mapped by one
 *	page-directory entry.
 */

uint64_t pde_mapped_size = PDE_MAPPED_SIZE;

unsigned pmap_memory_region_count;
unsigned pmap_memory_region_current;

pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];

/*
 *	Other useful macros.
 */
#define current_pmap()		(vm_map_pmap(current_thread()->map))

struct pmap	kernel_pmap_store;
pmap_t		kernel_pmap;

struct zone	*pmap_zone;		/* zone of pmap structures */

struct zone	*pmap_anchor_zone;
int		pmap_debug = 0;		/* flag for debugging prints */

unsigned int	inuse_ptepages_count = 0;
long long	alloc_ptepages_count __attribute__((aligned(8))) = 0; /* aligned for atomic access */
unsigned int	bootstrap_wired_pages = 0;
int		pt_fake_zone_index = -1;

extern long	NMIPI_acks;

boolean_t	kernel_text_ps_4K = TRUE;
boolean_t	wpkernel = TRUE;

extern char	end;

static int	nkpt;

pt_entry_t	*DMAP1, *DMAP2;
caddr_t		DADDR1;
caddr_t		DADDR2;

const boolean_t	pmap_disable_kheap_nx = FALSE;
const boolean_t	pmap_disable_kstack_nx = FALSE;
extern boolean_t doconstro_override;

extern long __stack_chk_guard[];

/*
 *	Map memory at initialization.  The physical addresses being
 *	mapped are not managed and are never unmapped.
 *
 *	For now, VM is already on, we only need to map the
 *	specified memory.
 */
vm_offset_t
pmap_map(
	vm_offset_t	virt,
	vm_map_offset_t	start_addr,
	vm_map_offset_t	end_addr,
	vm_prot_t	prot,
	unsigned int	flags)
{
	int	ps;

	ps = PAGE_SIZE;
	while (start_addr < end_addr) {
		pmap_enter(kernel_pmap, (vm_map_offset_t)virt,
			   (ppnum_t) i386_btop(start_addr), prot, VM_PROT_NONE, flags, TRUE);
		virt += ps;
		start_addr += ps;
	}
	return(virt);
}

extern	char		*first_avail;
extern	vm_offset_t	virtual_avail, virtual_end;
extern	pmap_paddr_t	avail_start, avail_end;
extern	vm_offset_t	sHIB;
extern	vm_offset_t	eHIB;
extern	vm_offset_t	stext;
extern	vm_offset_t	etext;
extern	vm_offset_t	sdata, edata;
extern	vm_offset_t	sconstdata, econstdata;

extern void		*KPTphys;

boolean_t pmap_smep_enabled = FALSE;

void
pmap_cpu_init(void)
{
	cpu_data_t	*cdp = current_cpu_datap();
	/*
	 * Here early in the life of a processor (from cpu_mode_init()).
	 * Ensure global page feature is disabled at this point.
	 */

	set_cr4(get_cr4() &~ CR4_PGE);

	/*
	 * Initialize the per-cpu, TLB-related fields.
	 */
308 */ 309 cdp->cpu_kernel_cr3 = kernel_pmap->pm_cr3; 310 cdp->cpu_active_cr3 = kernel_pmap->pm_cr3; 311 cdp->cpu_tlb_invalid = FALSE; 312 cdp->cpu_task_map = TASK_MAP_64BIT; 313 pmap_pcid_configure(); 314 if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMEP) { 315 boolean_t nsmep; 316 if (!PE_parse_boot_argn("-pmap_smep_disable", &nsmep, sizeof(nsmep))) { 317 set_cr4(get_cr4() | CR4_SMEP); 318 pmap_smep_enabled = TRUE; 319 } 320 } 321 322 if (cdp->cpu_fixed_pmcs_enabled) { 323 boolean_t enable = TRUE; 324 cpu_pmc_control(&enable); 325 } 326} 327 328 329 330/* 331 * Bootstrap the system enough to run with virtual memory. 332 * Map the kernel's code and data, and allocate the system page table. 333 * Called with mapping OFF. Page_size must already be set. 334 */ 335 336void 337pmap_bootstrap( 338 __unused vm_offset_t load_start, 339 __unused boolean_t IA32e) 340{ 341#if NCOPY_WINDOWS > 0 342 vm_offset_t va; 343 int i; 344#endif 345 assert(IA32e); 346 347 vm_last_addr = VM_MAX_KERNEL_ADDRESS; /* Set the highest address 348 * known to VM */ 349 /* 350 * The kernel's pmap is statically allocated so we don't 351 * have to use pmap_create, which is unlikely to work 352 * correctly at this part of the boot sequence. 353 */ 354 355 kernel_pmap = &kernel_pmap_store; 356 kernel_pmap->ref_count = 1; 357 kernel_pmap->nx_enabled = TRUE; 358 kernel_pmap->pm_task_map = TASK_MAP_64BIT; 359 kernel_pmap->pm_obj = (vm_object_t) NULL; 360 kernel_pmap->dirbase = (pd_entry_t *)((uintptr_t)IdlePTD); 361 kernel_pmap->pm_pdpt = (pd_entry_t *) ((uintptr_t)IdlePDPT); 362 kernel_pmap->pm_pml4 = IdlePML4; 363 kernel_pmap->pm_cr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4); 364 pmap_pcid_initialize_kernel(kernel_pmap); 365 366 367 368 current_cpu_datap()->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3; 369 370 nkpt = NKPT; 371 OSAddAtomic(NKPT, &inuse_ptepages_count); 372 OSAddAtomic64(NKPT, &alloc_ptepages_count); 373 bootstrap_wired_pages = NKPT; 374 375 virtual_avail = (vm_offset_t)(VM_MIN_KERNEL_ADDRESS) + (vm_offset_t)first_avail; 376 virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS); 377 378#if NCOPY_WINDOWS > 0 379 /* 380 * Reserve some special page table entries/VA space for temporary 381 * mapping of pages. 
382 */ 383#define SYSMAP(c, p, v, n) \ 384 v = (c)va; va += ((n)*INTEL_PGBYTES); 385 386 va = virtual_avail; 387 388 for (i=0; i<PMAP_NWINDOWS; i++) { 389#if 1 390 kprintf("trying to do SYSMAP idx %d %p\n", i, 391 current_cpu_datap()); 392 kprintf("cpu_pmap %p\n", current_cpu_datap()->cpu_pmap); 393 kprintf("mapwindow %p\n", current_cpu_datap()->cpu_pmap->mapwindow); 394 kprintf("two stuff %p %p\n", 395 (void *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP), 396 (void *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR)); 397#endif 398 SYSMAP(caddr_t, 399 (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP), 400 (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR), 401 1); 402 current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP = 403 &(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP_store); 404 *current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP = 0; 405 } 406 407 /* DMAP user for debugger */ 408 SYSMAP(caddr_t, DMAP1, DADDR1, 1); 409 SYSMAP(caddr_t, DMAP2, DADDR2, 1); /* XXX temporary - can remove */ 410 411 virtual_avail = va; 412#endif 413 414 if (PE_parse_boot_argn("npvhash", &npvhash, sizeof (npvhash))) { 415 if (0 != ((npvhash + 1) & npvhash)) { 416 kprintf("invalid hash %d, must be ((2^N)-1), " 417 "using default %d\n", npvhash, NPVHASH); 418 npvhash = NPVHASH; 419 } 420 } else { 421 npvhash = NPVHASH; 422 } 423 424 simple_lock_init(&kernel_pmap->lock, 0); 425 simple_lock_init(&pv_hashed_free_list_lock, 0); 426 simple_lock_init(&pv_hashed_kern_free_list_lock, 0); 427 simple_lock_init(&pv_hash_table_lock,0); 428 429 pmap_cpu_init(); 430 431 if (pmap_pcid_ncpus) 432 printf("PMAP: PCID enabled\n"); 433 434 if (pmap_smep_enabled) 435 printf("PMAP: Supervisor Mode Execute Protection enabled\n"); 436 437#if DEBUG 438 printf("Stack canary: 0x%lx\n", __stack_chk_guard[0]); 439 printf("ml_early_random(): 0x%qx\n", ml_early_random()); 440#endif 441 boolean_t ptmp; 442 /* Check if the user has requested disabling stack or heap no-execute 443 * enforcement. These are "const" variables; that qualifier is cast away 444 * when altering them. The TEXT/DATA const sections are marked 445 * write protected later in the kernel startup sequence, so altering 446 * them is possible at this point, in pmap_bootstrap(). 447 */ 448 if (PE_parse_boot_argn("-pmap_disable_kheap_nx", &ptmp, sizeof(ptmp))) { 449 boolean_t *pdknxp = (boolean_t *) &pmap_disable_kheap_nx; 450 *pdknxp = TRUE; 451 } 452 453 if (PE_parse_boot_argn("-pmap_disable_kstack_nx", &ptmp, sizeof(ptmp))) { 454 boolean_t *pdknhp = (boolean_t *) &pmap_disable_kstack_nx; 455 *pdknhp = TRUE; 456 } 457 458 boot_args *args = (boot_args *)PE_state.bootArgs; 459 if (args->efiMode == kBootArgsEfiMode32) { 460 printf("EFI32: kernel virtual space limited to 4GB\n"); 461 virtual_end = VM_MAX_KERNEL_ADDRESS_EFI32; 462 } 463 kprintf("Kernel virtual space from 0x%lx to 0x%lx.\n", 464 (long)KERNEL_BASE, (long)virtual_end); 465 kprintf("Available physical space from 0x%llx to 0x%llx\n", 466 avail_start, avail_end); 467 468 /* 469 * The -no_shared_cr3 boot-arg is a debugging feature (set by default 470 * in the DEBUG kernel) to force the kernel to switch to its own map 471 * (and cr3) when control is in kernelspace. The kernel's map does not 472 * include (i.e. share) userspace so wild references will cause 473 * a panic. Only copyin and copyout are exempt from this. 
474 */ 475 (void) PE_parse_boot_argn("-no_shared_cr3", 476 &no_shared_cr3, sizeof (no_shared_cr3)); 477 if (no_shared_cr3) 478 kprintf("Kernel not sharing user map\n"); 479 480#ifdef PMAP_TRACES 481 if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof (pmap_trace))) { 482 kprintf("Kernel traces for pmap operations enabled\n"); 483 } 484#endif /* PMAP_TRACES */ 485} 486 487void 488pmap_virtual_space( 489 vm_offset_t *startp, 490 vm_offset_t *endp) 491{ 492 *startp = virtual_avail; 493 *endp = virtual_end; 494} 495 496/* 497 * Initialize the pmap module. 498 * Called by vm_init, to initialize any structures that the pmap 499 * system needs to map virtual memory. 500 */ 501void 502pmap_init(void) 503{ 504 long npages; 505 vm_offset_t addr; 506 vm_size_t s, vsize; 507 vm_map_offset_t vaddr; 508 ppnum_t ppn; 509 510 511 kernel_pmap->pm_obj_pml4 = &kpml4obj_object_store; 512 _vm_object_allocate((vm_object_size_t)NPML4PGS, &kpml4obj_object_store); 513 514 kernel_pmap->pm_obj_pdpt = &kpdptobj_object_store; 515 _vm_object_allocate((vm_object_size_t)NPDPTPGS, &kpdptobj_object_store); 516 517 kernel_pmap->pm_obj = &kptobj_object_store; 518 _vm_object_allocate((vm_object_size_t)NPDEPGS, &kptobj_object_store); 519 520 /* 521 * Allocate memory for the pv_head_table and its lock bits, 522 * the modify bit array, and the pte_page table. 523 */ 524 525 /* 526 * zero bias all these arrays now instead of off avail_start 527 * so we cover all memory 528 */ 529 530 npages = i386_btop(avail_end); 531 s = (vm_size_t) (sizeof(struct pv_rooted_entry) * npages 532 + (sizeof (struct pv_hashed_entry_t *) * (npvhash+1)) 533 + pv_lock_table_size(npages) 534 + pv_hash_lock_table_size((npvhash+1)) 535 + npages); 536 537 s = round_page(s); 538 if (kernel_memory_allocate(kernel_map, &addr, s, 0, 539 KMA_KOBJECT | KMA_PERMANENT) 540 != KERN_SUCCESS) 541 panic("pmap_init"); 542 543 memset((char *)addr, 0, s); 544 545 vaddr = addr; 546 vsize = s; 547 548#if PV_DEBUG 549 if (0 == npvhash) panic("npvhash not initialized"); 550#endif 551 552 /* 553 * Allocate the structures first to preserve word-alignment. 554 */ 555 pv_head_table = (pv_rooted_entry_t) addr; 556 addr = (vm_offset_t) (pv_head_table + npages); 557 558 pv_hash_table = (pv_hashed_entry_t *)addr; 559 addr = (vm_offset_t) (pv_hash_table + (npvhash + 1)); 560 561 pv_lock_table = (char *) addr; 562 addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages)); 563 564 pv_hash_lock_table = (char *) addr; 565 addr = (vm_offset_t) (pv_hash_lock_table + pv_hash_lock_table_size((npvhash+1))); 566 567 pmap_phys_attributes = (char *) addr; 568 569 ppnum_t last_pn = i386_btop(avail_end); 570 unsigned int i; 571 pmap_memory_region_t *pmptr = pmap_memory_regions; 572 for (i = 0; i < pmap_memory_region_count; i++, pmptr++) { 573 if (pmptr->type != kEfiConventionalMemory) 574 continue; 575 ppnum_t pn; 576 for (pn = pmptr->base; pn <= pmptr->end; pn++) { 577 if (pn < last_pn) { 578 pmap_phys_attributes[pn] |= PHYS_MANAGED; 579 580 if (pn > last_managed_page) 581 last_managed_page = pn; 582 583 if (pn >= lowest_hi && pn <= highest_hi) 584 pmap_phys_attributes[pn] |= PHYS_NOENCRYPT; 585 } 586 } 587 } 588 while (vsize) { 589 ppn = pmap_find_phys(kernel_pmap, vaddr); 590 591 pmap_phys_attributes[ppn] |= PHYS_NOENCRYPT; 592 593 vaddr += PAGE_SIZE; 594 vsize -= PAGE_SIZE; 595 } 596 /* 597 * Create the zone of physical maps, 598 * and of the physical-to-virtual entries. 
599 */ 600 s = (vm_size_t) sizeof(struct pmap); 601 pmap_zone = zinit(s, 400*s, 4096, "pmap"); /* XXX */ 602 zone_change(pmap_zone, Z_NOENCRYPT, TRUE); 603 604 pmap_anchor_zone = zinit(PAGE_SIZE, task_max, PAGE_SIZE, "pagetable anchors"); 605 zone_change(pmap_anchor_zone, Z_NOENCRYPT, TRUE); 606 607 /* The anchor is required to be page aligned. Zone debugging adds 608 * padding which may violate that requirement. Tell the zone 609 * subsystem that alignment is required. 610 */ 611 612 zone_change(pmap_anchor_zone, Z_ALIGNMENT_REQUIRED, TRUE); 613 614 s = (vm_size_t) sizeof(struct pv_hashed_entry); 615 pv_hashed_list_zone = zinit(s, 10000*s /* Expandable zone */, 616 4096 * 3 /* LCM x86_64*/, "pv_list"); 617 zone_change(pv_hashed_list_zone, Z_NOENCRYPT, TRUE); 618 619 /* create pv entries for kernel pages mapped by low level 620 startup code. these have to exist so we can pmap_remove() 621 e.g. kext pages from the middle of our addr space */ 622 623 vaddr = (vm_map_offset_t) VM_MIN_KERNEL_ADDRESS; 624 for (ppn = VM_MIN_KERNEL_PAGE; ppn < i386_btop(avail_start); ppn++) { 625 pv_rooted_entry_t pv_e; 626 627 pv_e = pai_to_pvh(ppn); 628 pv_e->va = vaddr; 629 vaddr += PAGE_SIZE; 630 pv_e->pmap = kernel_pmap; 631 queue_init(&pv_e->qlink); 632 } 633 pmap_initialized = TRUE; 634 635 max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t); 636 637 /* 638 * Ensure the kernel's PML4 entry exists for the basement 639 * before this is shared with any user. 640 */ 641 pmap_expand_pml4(kernel_pmap, KERNEL_BASEMENT, PMAP_EXPAND_OPTIONS_NONE); 642} 643 644static 645void pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX, boolean_t ro) { 646 uint64_t ev = sv + nxrosz, cv = sv; 647 pd_entry_t *pdep; 648 pt_entry_t *ptep = NULL; 649 650 assert(((sv & 0xFFFULL) | (nxrosz & 0xFFFULL)) == 0); 651 652 for (pdep = pmap_pde(npmap, cv); pdep != NULL && (cv < ev);) { 653 uint64_t pdev = (cv & ~((uint64_t)PDEMASK)); 654 655 if (*pdep & INTEL_PTE_PS) { 656 if (NX) 657 *pdep |= INTEL_PTE_NX; 658 if (ro) 659 *pdep &= ~INTEL_PTE_WRITE; 660 cv += NBPD; 661 cv &= ~((uint64_t) PDEMASK); 662 pdep = pmap_pde(npmap, cv); 663 continue; 664 } 665 666 for (ptep = pmap_pte(npmap, cv); ptep != NULL && (cv < (pdev + NBPD)) && (cv < ev);) { 667 if (NX) 668 *ptep |= INTEL_PTE_NX; 669 if (ro) 670 *ptep &= ~INTEL_PTE_WRITE; 671 cv += NBPT; 672 ptep = pmap_pte(npmap, cv); 673 } 674 } 675 DPRINTF("%s(0x%llx, 0x%llx, %u, %u): 0x%llx, 0x%llx\n", __FUNCTION__, sv, nxrosz, NX, ro, cv, ptep ? *ptep: 0); 676} 677 678/* 679 * Called once VM is fully initialized so that we can release unused 680 * sections of low memory to the general pool. 681 * Also complete the set-up of identity-mapped sections of the kernel: 682 * 1) write-protect kernel text 683 * 2) map kernel text using large pages if possible 684 * 3) read and write-protect page zero (for K32) 685 * 4) map the global page at the appropriate virtual address. 686 * 687 * Use of large pages 688 * ------------------ 689 * To effectively map and write-protect all kernel text pages, the text 690 * must be 2M-aligned at the base, and the data section above must also be 691 * 2M-aligned. That is, there's padding below and above. This is achieved 692 * through linker directives. Large pages are used only if this alignment 693 * exists (and not overriden by the -kernel_text_page_4K boot-arg). 
 * memory layout is:
 *
 *                       :              :
 *                       |    __DATA    |
 *               sdata:  ================  2Meg
 *                       |              |
 *                       | zero-padding |
 *                       |              |
 *               etext:  ----------------
 *                       |              |
 *                       :              :
 *                       |              |
 *                       |    __TEXT    |
 *                       |              |
 *                       :              :
 *                       |              |
 *               stext:  ================  2Meg
 *                       |              |
 *                       | zero-padding |
 *                       |              |
 *               eHIB:   ----------------
 *                       |    __HIB     |
 *                       :              :
 *
 * Prior to changing the mapping from 4K to 2M, the zero-padding pages
 * [eHIB,stext] and [etext,sdata] are ml_static_mfree()'d. Then all the
 * 4K pages covering [stext,etext] are coalesced as 2M large pages.
 * The now unused level-1 PTE pages are also freed.
 */
extern ppnum_t vm_kernel_base_page;
void
pmap_lowmem_finalize(void)
{
	spl_t	spl;
	int	i;

	/*
	 * Update wired memory statistics for early boot pages
	 */
	PMAP_ZINFO_PALLOC(kernel_pmap, bootstrap_wired_pages * PAGE_SIZE);

	/*
	 * Free pages in pmap regions below the base:
	 * rdar://6332712
	 *	We can't free all the pages to VM that EFI reports available.
	 *	Pages in the range 0xc0000-0xff000 aren't safe over sleep/wake.
	 *	There's also a size miscalculation here: pend is one page less
	 *	than it should be but this is not fixed to be backwards
	 *	compatible.
	 * This is important for KASLR because up to 256*2MB = 512MB of space
	 * has to be released to VM.
	 */
	for (i = 0;
	     pmap_memory_regions[i].end < vm_kernel_base_page;
	     i++) {
		vm_offset_t	pbase = i386_ptob(pmap_memory_regions[i].base);
		vm_offset_t	pend  = i386_ptob(pmap_memory_regions[i].end+1);

		DBG("pmap region %d [%p..[%p\n",
		    i, (void *) pbase, (void *) pend);

		if (pmap_memory_regions[i].attribute & EFI_MEMORY_KERN_RESERVED)
			continue;
		/*
		 * rdar://6332712
		 * Adjust limits not to free pages in range 0xc0000-0xff000.
		 */
		if (pbase >= 0xc0000 && pend <= 0x100000)
			continue;
		if (pbase < 0xc0000 && pend > 0x100000) {
			/* page range entirely within region, free lower part */
			DBG("- ml_static_mfree(%p,%p)\n",
			    (void *) ml_static_ptovirt(pbase),
			    (void *) (0xc0000-pbase));
			ml_static_mfree(ml_static_ptovirt(pbase),0xc0000-pbase);
			pbase = 0x100000;
		}
		if (pbase < 0xc0000)
			pend = MIN(pend, 0xc0000);
		if (pend > 0x100000)
			pbase = MAX(pbase, 0x100000);
		DBG("- ml_static_mfree(%p,%p)\n",
		    (void *) ml_static_ptovirt(pbase),
		    (void *) (pend - pbase));
		ml_static_mfree(ml_static_ptovirt(pbase), pend - pbase);
	}

	/* A final pass to get rid of all initial identity mappings to
	 * low pages.
	 */
	DPRINTF("%s: Removing mappings from 0->0x%lx\n", __FUNCTION__, vm_kernel_base);

	/* Remove all mappings past the descriptor aliases and low globals */
	pmap_remove(kernel_pmap, LOWGLOBAL_ALIAS + PAGE_SIZE, vm_kernel_base);

	/*
	 * If text and data are both 2MB-aligned,
	 * we can map text with large-pages,
	 * unless the -kernel_text_ps_4K boot-arg overrides.
	 */
793 */ 794 if ((stext & I386_LPGMASK) == 0 && (sdata & I386_LPGMASK) == 0) { 795 kprintf("Kernel text is 2MB aligned"); 796 kernel_text_ps_4K = FALSE; 797 if (PE_parse_boot_argn("-kernel_text_ps_4K", 798 &kernel_text_ps_4K, 799 sizeof (kernel_text_ps_4K))) 800 kprintf(" but will be mapped with 4K pages\n"); 801 else 802 kprintf(" and will be mapped with 2M pages\n"); 803 } 804 805 (void) PE_parse_boot_argn("wpkernel", &wpkernel, sizeof (wpkernel)); 806 if (wpkernel) 807 kprintf("Kernel text %p-%p to be write-protected\n", 808 (void *) stext, (void *) etext); 809 810 spl = splhigh(); 811 812 /* 813 * Scan over text if mappings are to be changed: 814 * - Remap kernel text readonly unless the "wpkernel" boot-arg is 0 815 * - Change to large-pages if possible and not overriden. 816 */ 817 if (kernel_text_ps_4K && wpkernel) { 818 vm_offset_t myva; 819 for (myva = stext; myva < etext; myva += PAGE_SIZE) { 820 pt_entry_t *ptep; 821 822 ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva); 823 if (ptep) 824 pmap_store_pte(ptep, *ptep & ~INTEL_PTE_WRITE); 825 } 826 } 827 828 if (!kernel_text_ps_4K) { 829 vm_offset_t myva; 830 831 /* 832 * Release zero-filled page padding used for 2M-alignment. 833 */ 834 DBG("ml_static_mfree(%p,%p) for padding below text\n", 835 (void *) eHIB, (void *) (stext - eHIB)); 836 ml_static_mfree(eHIB, stext - eHIB); 837 DBG("ml_static_mfree(%p,%p) for padding above text\n", 838 (void *) etext, (void *) (sdata - etext)); 839 ml_static_mfree(etext, sdata - etext); 840 841 /* 842 * Coalesce text pages into large pages. 843 */ 844 for (myva = stext; myva < sdata; myva += I386_LPGBYTES) { 845 pt_entry_t *ptep; 846 vm_offset_t pte_phys; 847 pt_entry_t *pdep; 848 pt_entry_t pde; 849 850 pdep = pmap_pde(kernel_pmap, (vm_map_offset_t)myva); 851 ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva); 852 DBG("myva: %p pdep: %p ptep: %p\n", 853 (void *) myva, (void *) pdep, (void *) ptep); 854 if ((*ptep & INTEL_PTE_VALID) == 0) 855 continue; 856 pte_phys = (vm_offset_t)(*ptep & PG_FRAME); 857 pde = *pdep & PTMASK; /* page attributes from pde */ 858 pde |= INTEL_PTE_PS; /* make it a 2M entry */ 859 pde |= pte_phys; /* take page frame from pte */ 860 861 if (wpkernel) 862 pde &= ~INTEL_PTE_WRITE; 863 DBG("pmap_store_pte(%p,0x%llx)\n", 864 (void *)pdep, pde); 865 pmap_store_pte(pdep, pde); 866 867 /* 868 * Free the now-unused level-1 pte. 869 * Note: ptep is a virtual address to the pte in the 870 * recursive map. We can't use this address to free 871 * the page. Instead we need to compute its address 872 * in the Idle PTEs in "low memory". 
873 */ 874 vm_offset_t vm_ptep = (vm_offset_t) KPTphys 875 + (pte_phys >> PTPGSHIFT); 876 DBG("ml_static_mfree(%p,0x%x) for pte\n", 877 (void *) vm_ptep, PAGE_SIZE); 878 ml_static_mfree(vm_ptep, PAGE_SIZE); 879 } 880 881 /* Change variable read by sysctl machdep.pmap */ 882 pmap_kernel_text_ps = I386_LPGBYTES; 883 } 884 885 boolean_t doconstro = TRUE; 886 887 (void) PE_parse_boot_argn("dataconstro", &doconstro, sizeof(doconstro)); 888 889 if ((sconstdata | econstdata) & PAGE_MASK) { 890 kprintf("Const DATA misaligned 0x%lx 0x%lx\n", sconstdata, econstdata); 891 if ((sconstdata & PAGE_MASK) || (doconstro_override == FALSE)) 892 doconstro = FALSE; 893 } 894 895 if ((sconstdata > edata) || (sconstdata < sdata) || ((econstdata - sconstdata) >= (edata - sdata))) { 896 kprintf("Const DATA incorrect size 0x%lx 0x%lx 0x%lx 0x%lx\n", sconstdata, econstdata, sdata, edata); 897 doconstro = FALSE; 898 } 899 900 if (doconstro) 901 kprintf("Marking const DATA read-only\n"); 902 903 vm_offset_t dva; 904 905 for (dva = sdata; dva < edata; dva += I386_PGBYTES) { 906 assert(((sdata | edata) & PAGE_MASK) == 0); 907 if ( (sdata | edata) & PAGE_MASK) { 908 kprintf("DATA misaligned, 0x%lx, 0x%lx\n", sdata, edata); 909 break; 910 } 911 912 pt_entry_t dpte, *dptep = pmap_pte(kernel_pmap, dva); 913 914 dpte = *dptep; 915 916 assert((dpte & INTEL_PTE_VALID)); 917 if ((dpte & INTEL_PTE_VALID) == 0) { 918 kprintf("Missing data mapping 0x%lx 0x%lx 0x%lx\n", dva, sdata, edata); 919 continue; 920 } 921 922 dpte |= INTEL_PTE_NX; 923 if (doconstro && (dva >= sconstdata) && (dva < econstdata)) { 924 dpte &= ~INTEL_PTE_WRITE; 925 } 926 pmap_store_pte(dptep, dpte); 927 } 928 kernel_segment_command_t * seg; 929 kernel_section_t * sec; 930 931 for (seg = firstseg(); seg != NULL; seg = nextsegfromheader(&_mh_execute_header, seg)) { 932 if (!strcmp(seg->segname, "__TEXT") || 933 !strcmp(seg->segname, "__DATA")) { 934 continue; 935 } 936 //XXX 937 if (!strcmp(seg->segname, "__KLD")) { 938 continue; 939 } 940 if (!strcmp(seg->segname, "__HIB")) { 941 for (sec = firstsect(seg); sec != NULL; sec = nextsect(seg, sec)) { 942 if (sec->addr & PAGE_MASK) 943 panic("__HIB segment's sections misaligned"); 944 if (!strcmp(sec->sectname, "__text")) { 945 pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), FALSE, TRUE); 946 } else { 947 pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), TRUE, FALSE); 948 } 949 } 950 } else { 951 pmap_mark_range(kernel_pmap, seg->vmaddr, round_page_64(seg->vmsize), TRUE, FALSE); 952 } 953 } 954 955 /* 956 * If we're debugging, map the low global vector page at the fixed 957 * virtual address. Otherwise, remove the mapping for this. 
958 */ 959 if (debug_boot_arg) { 960 pt_entry_t *pte = NULL; 961 if (0 == (pte = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS))) 962 panic("lowmem pte"); 963 /* make sure it is defined on page boundary */ 964 assert(0 == ((vm_offset_t) &lowGlo & PAGE_MASK)); 965 pmap_store_pte(pte, kvtophys((vm_offset_t)&lowGlo) 966 | INTEL_PTE_REF 967 | INTEL_PTE_MOD 968 | INTEL_PTE_WIRED 969 | INTEL_PTE_VALID 970 | INTEL_PTE_WRITE 971 | INTEL_PTE_NX); 972 } else { 973 pmap_remove(kernel_pmap, 974 LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE); 975 } 976 977 splx(spl); 978 if (pmap_pcid_ncpus) 979 tlb_flush_global(); 980 else 981 flush_tlb_raw(); 982} 983 984/* 985 * this function is only used for debugging fron the vm layer 986 */ 987boolean_t 988pmap_verify_free( 989 ppnum_t pn) 990{ 991 pv_rooted_entry_t pv_h; 992 int pai; 993 boolean_t result; 994 995 assert(pn != vm_page_fictitious_addr); 996 997 if (!pmap_initialized) 998 return(TRUE); 999 1000 if (pn == vm_page_guard_addr) 1001 return TRUE; 1002 1003 pai = ppn_to_pai(pn); 1004 if (!IS_MANAGED_PAGE(pai)) 1005 return(FALSE); 1006 pv_h = pai_to_pvh(pn); 1007 result = (pv_h->pmap == PMAP_NULL); 1008 return(result); 1009} 1010 1011boolean_t 1012pmap_is_empty( 1013 pmap_t pmap, 1014 vm_map_offset_t va_start, 1015 vm_map_offset_t va_end) 1016{ 1017 vm_map_offset_t offset; 1018 ppnum_t phys_page; 1019 1020 if (pmap == PMAP_NULL) { 1021 return TRUE; 1022 } 1023 1024 /* 1025 * Check the resident page count 1026 * - if it's zero, the pmap is completely empty. 1027 * This short-circuit test prevents a virtual address scan which is 1028 * painfully slow for 64-bit spaces. 1029 * This assumes the count is correct 1030 * .. the debug kernel ought to be checking perhaps by page table walk. 1031 */ 1032 if (pmap->stats.resident_count == 0) 1033 return TRUE; 1034 1035 for (offset = va_start; 1036 offset < va_end; 1037 offset += PAGE_SIZE_64) { 1038 phys_page = pmap_find_phys(pmap, offset); 1039 if (phys_page) { 1040 kprintf("pmap_is_empty(%p,0x%llx,0x%llx): " 1041 "page %d at 0x%llx\n", 1042 pmap, va_start, va_end, phys_page, offset); 1043 return FALSE; 1044 } 1045 } 1046 1047 return TRUE; 1048} 1049 1050 1051/* 1052 * Create and return a physical map. 1053 * 1054 * If the size specified for the map 1055 * is zero, the map is an actual physical 1056 * map, and may be referenced by the 1057 * hardware. 1058 * 1059 * If the size specified is non-zero, 1060 * the map will be used in software only, and 1061 * is bounded by that size. 1062 */ 1063pmap_t 1064pmap_create( 1065 ledger_t ledger, 1066 vm_map_size_t sz, 1067 boolean_t is_64bit) 1068{ 1069 pmap_t p; 1070 vm_size_t size; 1071 pml4_entry_t *pml4; 1072 pml4_entry_t *kpml4; 1073 1074 PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, 1075 (uint32_t) (sz>>32), (uint32_t) sz, is_64bit, 0, 0); 1076 1077 size = (vm_size_t) sz; 1078 1079 /* 1080 * A software use-only map doesn't even need a map. 1081 */ 1082 1083 if (size != 0) { 1084 return(PMAP_NULL); 1085 } 1086 1087 p = (pmap_t) zalloc(pmap_zone); 1088 if (PMAP_NULL == p) 1089 panic("pmap_create zalloc"); 1090 /* Zero all fields */ 1091 bzero(p, sizeof(*p)); 1092 /* init counts now since we'll be bumping some */ 1093 simple_lock_init(&p->lock, 0); 1094 p->stats.resident_count = 0; 1095 p->stats.resident_max = 0; 1096 p->stats.wired_count = 0; 1097 p->ref_count = 1; 1098 p->nx_enabled = 1; 1099 p->pm_shared = FALSE; 1100 ledger_reference(ledger); 1101 p->ledger = ledger; 1102 1103 p->pm_task_map = is_64bit ? 
	if (pmap_pcid_ncpus)
		pmap_pcid_initialize(p);

	p->pm_pml4 = zalloc(pmap_anchor_zone);

	pmap_assert((((uintptr_t)p->pm_pml4) & PAGE_MASK) == 0);

	memset((char *)p->pm_pml4, 0, PAGE_SIZE);

	p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4);

	/* allocate the vm_objs to hold the pdpt, pde and pte pages */

	p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS));
	if (NULL == p->pm_obj_pml4)
		panic("pmap_create pdpt obj");

	p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS));
	if (NULL == p->pm_obj_pdpt)
		panic("pmap_create pdpt obj");

	p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS));
	if (NULL == p->pm_obj)
		panic("pmap_create pte obj");

	/* All pmaps share the kernel's pml4 */
	pml4 = pmap64_pml4(p, 0ULL);
	kpml4 = kernel_pmap->pm_pml4;
	pml4[KERNEL_PML4_INDEX]		= kpml4[KERNEL_PML4_INDEX];
	pml4[KERNEL_KEXTS_INDEX]	= kpml4[KERNEL_KEXTS_INDEX];
	pml4[KERNEL_PHYSMAP_PML4_INDEX]	= kpml4[KERNEL_PHYSMAP_PML4_INDEX];

	PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START,
		   p, is_64bit, 0, 0, 0);

	return(p);
}

/*
 *	Retire the given physical map from service.
 *	Should only be called if the map contains
 *	no valid mappings.
 */

void
pmap_destroy(pmap_t p)
{
	int c;

	if (p == PMAP_NULL)
		return;

	PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
		   p, 0, 0, 0, 0);

	PMAP_LOCK(p);

	c = --p->ref_count;

	pmap_assert((current_thread() && (current_thread()->map)) ? (current_thread()->map->pmap != p) : TRUE);

	if (c == 0) {
		/*
		 * If some cpu is not using the physical pmap pointer that it
		 * is supposed to be (see set_dirbase), we might be using the
		 * pmap that is being destroyed! Make sure we are
		 * physically on the right pmap:
		 */
		PMAP_UPDATE_TLBS(p, 0x0ULL, 0xFFFFFFFFFFFFF000ULL);
		if (pmap_pcid_ncpus)
			pmap_destroy_pcid_sync(p);
	}

	PMAP_UNLOCK(p);

	if (c != 0) {
		PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
			   p, 1, 0, 0, 0);
		pmap_assert(p == kernel_pmap);
		return;	/* still in use */
	}

	/*
	 *	Free the memory maps, then the
	 *	pmap structure.
	 */
	int inuse_ptepages = 0;

	zfree(pmap_anchor_zone, p->pm_pml4);

	inuse_ptepages += p->pm_obj_pml4->resident_page_count;
	vm_object_deallocate(p->pm_obj_pml4);

	inuse_ptepages += p->pm_obj_pdpt->resident_page_count;
	vm_object_deallocate(p->pm_obj_pdpt);

	inuse_ptepages += p->pm_obj->resident_page_count;
	vm_object_deallocate(p->pm_obj);

	OSAddAtomic(-inuse_ptepages, &inuse_ptepages_count);
	PMAP_ZINFO_PFREE(p, inuse_ptepages * PAGE_SIZE);
	ledger_dereference(p->ledger);
	zfree(pmap_zone, p);

	PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
		   0, 0, 0, 0, 0);
}

/*
 *	Add a reference to the specified pmap.
 */
1214 */ 1215 1216void 1217pmap_reference(pmap_t p) 1218{ 1219 if (p != PMAP_NULL) { 1220 PMAP_LOCK(p); 1221 p->ref_count++; 1222 PMAP_UNLOCK(p);; 1223 } 1224} 1225 1226/* 1227 * Remove phys addr if mapped in specified map 1228 * 1229 */ 1230void 1231pmap_remove_some_phys( 1232 __unused pmap_t map, 1233 __unused ppnum_t pn) 1234{ 1235 1236/* Implement to support working set code */ 1237 1238} 1239 1240/* 1241 * Set the physical protection on the 1242 * specified range of this map as requested. 1243 * Will not increase permissions. 1244 */ 1245void 1246pmap_protect( 1247 pmap_t map, 1248 vm_map_offset_t sva, 1249 vm_map_offset_t eva, 1250 vm_prot_t prot) 1251{ 1252 pt_entry_t *pde; 1253 pt_entry_t *spte, *epte; 1254 vm_map_offset_t lva; 1255 vm_map_offset_t orig_sva; 1256 boolean_t set_NX; 1257 int num_found = 0; 1258 1259 pmap_intr_assert(); 1260 1261 if (map == PMAP_NULL) 1262 return; 1263 1264 if (prot == VM_PROT_NONE) { 1265 pmap_remove(map, sva, eva); 1266 return; 1267 } 1268 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START, 1269 map, 1270 (uint32_t) (sva >> 32), (uint32_t) sva, 1271 (uint32_t) (eva >> 32), (uint32_t) eva); 1272 1273 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !map->nx_enabled) 1274 set_NX = FALSE; 1275 else 1276 set_NX = TRUE; 1277 1278 PMAP_LOCK(map); 1279 1280 orig_sva = sva; 1281 while (sva < eva) { 1282 lva = (sva + pde_mapped_size) & ~(pde_mapped_size - 1); 1283 if (lva > eva) 1284 lva = eva; 1285 pde = pmap_pde(map, sva); 1286 if (pde && (*pde & INTEL_PTE_VALID)) { 1287 if (*pde & INTEL_PTE_PS) { 1288 /* superpage */ 1289 spte = pde; 1290 epte = spte+1; /* excluded */ 1291 } else { 1292 spte = pmap_pte(map, (sva & ~(pde_mapped_size - 1))); 1293 spte = &spte[ptenum(sva)]; 1294 epte = &spte[intel_btop(lva - sva)]; 1295 } 1296 1297 for (; spte < epte; spte++) { 1298 if (!(*spte & INTEL_PTE_VALID)) 1299 continue; 1300 1301 if (prot & VM_PROT_WRITE) 1302 pmap_update_pte(spte, 0, INTEL_PTE_WRITE); 1303 else 1304 pmap_update_pte(spte, INTEL_PTE_WRITE, 0); 1305 1306 if (set_NX) 1307 pmap_update_pte(spte, 0, INTEL_PTE_NX); 1308 else 1309 pmap_update_pte(spte, INTEL_PTE_NX, 0); 1310 num_found++; 1311 } 1312 } 1313 sva = lva; 1314 } 1315 if (num_found) 1316 PMAP_UPDATE_TLBS(map, orig_sva, eva); 1317 1318 PMAP_UNLOCK(map); 1319 1320 PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END, 1321 0, 0, 0, 0, 0); 1322 1323} 1324 1325/* Map a (possibly) autogenned block */ 1326void 1327pmap_map_block( 1328 pmap_t pmap, 1329 addr64_t va, 1330 ppnum_t pa, 1331 uint32_t size, 1332 vm_prot_t prot, 1333 int attr, 1334 __unused unsigned int flags) 1335{ 1336 uint32_t page; 1337 int cur_page_size; 1338 1339 if (attr & VM_MEM_SUPERPAGE) 1340 cur_page_size = SUPERPAGE_SIZE; 1341 else 1342 cur_page_size = PAGE_SIZE; 1343 1344 for (page = 0; page < size; page+=cur_page_size/PAGE_SIZE) { 1345 pmap_enter(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE); 1346 va += cur_page_size; 1347 pa+=cur_page_size/PAGE_SIZE; 1348 } 1349} 1350 1351kern_return_t 1352pmap_expand_pml4( 1353 pmap_t map, 1354 vm_map_offset_t vaddr, 1355 unsigned int options) 1356{ 1357 vm_page_t m; 1358 pmap_paddr_t pa; 1359 uint64_t i; 1360 ppnum_t pn; 1361 pml4_entry_t *pml4p; 1362 1363 DBG("pmap_expand_pml4(%p,%p)\n", map, (void *)vaddr); 1364 1365 /* 1366 * Allocate a VM page for the pml4 page 1367 */ 1368 while ((m = vm_page_grab()) == VM_PAGE_NULL) { 1369 if (options & PMAP_EXPAND_OPTIONS_NOWAIT) 1370 return KERN_RESOURCE_SHORTAGE; 1371 VM_PAGE_WAIT(); 1372 } 1373 /* 1374 * put the page into the pmap's obj list so it 1375 
	pn = m->phys_page;
	pa = i386_ptob(pn);
	i = pml4idx(map, vaddr);

	/*
	 *	Zero the page.
	 */
	pmap_zero_page(pn);

	vm_page_lockspin_queues();
	vm_page_wire(m);
	vm_page_unlock_queues();

	OSAddAtomic(1, &inuse_ptepages_count);
	OSAddAtomic64(1, &alloc_ptepages_count);
	PMAP_ZINFO_PALLOC(map, PAGE_SIZE);

	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
	vm_object_lock(map->pm_obj_pml4);

	PMAP_LOCK(map);
	/*
	 *	See if someone else expanded us first
	 */
	if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
		PMAP_UNLOCK(map);
		vm_object_unlock(map->pm_obj_pml4);

		VM_PAGE_FREE(m);

		OSAddAtomic(-1, &inuse_ptepages_count);
		PMAP_ZINFO_PFREE(map, PAGE_SIZE);
		return KERN_SUCCESS;
	}

#if 0 /* DEBUG */
	if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i)) {
		panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
		      map, map->pm_obj_pml4, vaddr, i);
	}
#endif
	vm_page_insert(m, map->pm_obj_pml4, (vm_object_offset_t)i);
	vm_object_unlock(map->pm_obj_pml4);

	/*
	 *	Set the page directory entry for this page table.
	 */
	pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */

	pmap_store_pte(pml4p, pa_to_pte(pa)
				| INTEL_PTE_VALID
				| INTEL_PTE_USER
				| INTEL_PTE_WRITE);

	PMAP_UNLOCK(map);

	return KERN_SUCCESS;
}

kern_return_t
pmap_expand_pdpt(pmap_t map, vm_map_offset_t vaddr, unsigned int options)
{
	vm_page_t	m;
	pmap_paddr_t	pa;
	uint64_t	i;
	ppnum_t		pn;
	pdpt_entry_t	*pdptp;

	DBG("pmap_expand_pdpt(%p,%p)\n", map, (void *)vaddr);

	while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
		kern_return_t pep4kr = pmap_expand_pml4(map, vaddr, options);
		if (pep4kr != KERN_SUCCESS)
			return pep4kr;
	}

	/*
	 *	Allocate a VM page for the pdpt page
	 */
	while ((m = vm_page_grab()) == VM_PAGE_NULL) {
		if (options & PMAP_EXPAND_OPTIONS_NOWAIT)
			return KERN_RESOURCE_SHORTAGE;
		VM_PAGE_WAIT();
	}

	/*
	 *	put the page into the pmap's obj list so it
	 *	can be found later.
	 */
	pn = m->phys_page;
	pa = i386_ptob(pn);
	i = pdptidx(map, vaddr);

	/*
	 *	Zero the page.
	 */
1472 */ 1473 pmap_zero_page(pn); 1474 1475 vm_page_lockspin_queues(); 1476 vm_page_wire(m); 1477 vm_page_unlock_queues(); 1478 1479 OSAddAtomic(1, &inuse_ptepages_count); 1480 OSAddAtomic64(1, &alloc_ptepages_count); 1481 PMAP_ZINFO_PALLOC(map, PAGE_SIZE); 1482 1483 /* Take the oject lock (mutex) before the PMAP_LOCK (spinlock) */ 1484 vm_object_lock(map->pm_obj_pdpt); 1485 1486 PMAP_LOCK(map); 1487 /* 1488 * See if someone else expanded us first 1489 */ 1490 if (pmap64_pde(map, vaddr) != PD_ENTRY_NULL) { 1491 PMAP_UNLOCK(map); 1492 vm_object_unlock(map->pm_obj_pdpt); 1493 1494 VM_PAGE_FREE(m); 1495 1496 OSAddAtomic(-1, &inuse_ptepages_count); 1497 PMAP_ZINFO_PFREE(map, PAGE_SIZE); 1498 return KERN_SUCCESS; 1499 } 1500 1501#if 0 /* DEBUG */ 1502 if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i)) { 1503 panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n", 1504 map, map->pm_obj_pdpt, vaddr, i); 1505 } 1506#endif 1507 vm_page_insert(m, map->pm_obj_pdpt, (vm_object_offset_t)i); 1508 vm_object_unlock(map->pm_obj_pdpt); 1509 1510 /* 1511 * Set the page directory entry for this page table. 1512 */ 1513 pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */ 1514 1515 pmap_store_pte(pdptp, pa_to_pte(pa) 1516 | INTEL_PTE_VALID 1517 | INTEL_PTE_USER 1518 | INTEL_PTE_WRITE); 1519 1520 PMAP_UNLOCK(map); 1521 1522 return KERN_SUCCESS; 1523 1524} 1525 1526 1527 1528/* 1529 * Routine: pmap_expand 1530 * 1531 * Expands a pmap to be able to map the specified virtual address. 1532 * 1533 * Allocates new virtual memory for the P0 or P1 portion of the 1534 * pmap, then re-maps the physical pages that were in the old 1535 * pmap to be in the new pmap. 1536 * 1537 * Must be called with the pmap system and the pmap unlocked, 1538 * since these must be unlocked to use vm_allocate or vm_deallocate. 1539 * Thus it must be called in a loop that checks whether the map 1540 * has been expanded enough. 1541 * (We won't loop forever, since page tables aren't shrunk.) 1542 */ 1543kern_return_t 1544pmap_expand( 1545 pmap_t map, 1546 vm_map_offset_t vaddr, 1547 unsigned int options) 1548{ 1549 pt_entry_t *pdp; 1550 register vm_page_t m; 1551 register pmap_paddr_t pa; 1552 uint64_t i; 1553 ppnum_t pn; 1554 1555 1556 /* 1557 * For the kernel, the virtual address must be in or above the basement 1558 * which is for kexts and is in the 512GB immediately below the kernel.. 1559 * XXX - should use VM_MIN_KERNEL_AND_KEXT_ADDRESS not KERNEL_BASEMENT 1560 */ 1561 if (map == kernel_pmap && 1562 !(vaddr >= KERNEL_BASEMENT && vaddr <= VM_MAX_KERNEL_ADDRESS)) 1563 panic("pmap_expand: bad vaddr 0x%llx for kernel pmap", vaddr); 1564 1565 1566 while ((pdp = pmap64_pde(map, vaddr)) == PD_ENTRY_NULL) { 1567 kern_return_t pepkr = pmap_expand_pdpt(map, vaddr, options); 1568 if (pepkr != KERN_SUCCESS) 1569 return pepkr; 1570 } 1571 1572 /* 1573 * Allocate a VM page for the pde entries. 1574 */ 1575 while ((m = vm_page_grab()) == VM_PAGE_NULL) { 1576 if (options & PMAP_EXPAND_OPTIONS_NOWAIT) 1577 return KERN_RESOURCE_SHORTAGE; 1578 VM_PAGE_WAIT(); 1579 } 1580 1581 /* 1582 * put the page into the pmap's obj list so it 1583 * can be found later. 1584 */ 1585 pn = m->phys_page; 1586 pa = i386_ptob(pn); 1587 i = pdeidx(map, vaddr); 1588 1589 /* 1590 * Zero the page. 
1591 */ 1592 pmap_zero_page(pn); 1593 1594 vm_page_lockspin_queues(); 1595 vm_page_wire(m); 1596 vm_page_unlock_queues(); 1597 1598 OSAddAtomic(1, &inuse_ptepages_count); 1599 OSAddAtomic64(1, &alloc_ptepages_count); 1600 PMAP_ZINFO_PALLOC(map, PAGE_SIZE); 1601 1602 /* Take the oject lock (mutex) before the PMAP_LOCK (spinlock) */ 1603 vm_object_lock(map->pm_obj); 1604 1605 PMAP_LOCK(map); 1606 1607 /* 1608 * See if someone else expanded us first 1609 */ 1610 if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) { 1611 PMAP_UNLOCK(map); 1612 vm_object_unlock(map->pm_obj); 1613 1614 VM_PAGE_FREE(m); 1615 1616 OSAddAtomic(-1, &inuse_ptepages_count); 1617 PMAP_ZINFO_PFREE(map, PAGE_SIZE); 1618 return KERN_SUCCESS; 1619 } 1620 1621#if 0 /* DEBUG */ 1622 if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i)) { 1623 panic("pmap_expand: obj not empty, pmap 0x%x pm_obj 0x%x vaddr 0x%llx i 0x%llx\n", 1624 map, map->pm_obj, vaddr, i); 1625 } 1626#endif 1627 vm_page_insert(m, map->pm_obj, (vm_object_offset_t)i); 1628 vm_object_unlock(map->pm_obj); 1629 1630 /* 1631 * Set the page directory entry for this page table. 1632 */ 1633 pdp = pmap_pde(map, vaddr); 1634 pmap_store_pte(pdp, pa_to_pte(pa) 1635 | INTEL_PTE_VALID 1636 | INTEL_PTE_USER 1637 | INTEL_PTE_WRITE); 1638 1639 PMAP_UNLOCK(map); 1640 1641 return KERN_SUCCESS; 1642} 1643 1644/* On K64 machines with more than 32GB of memory, pmap_steal_memory 1645 * will allocate past the 1GB of pre-expanded virtual kernel area. This 1646 * function allocates all the page tables using memory from the same pool 1647 * that pmap_steal_memory uses, rather than calling vm_page_grab (which 1648 * isn't available yet). */ 1649void 1650pmap_pre_expand(pmap_t pmap, vm_map_offset_t vaddr) 1651{ 1652 ppnum_t pn; 1653 pt_entry_t *pte; 1654 1655 PMAP_LOCK(pmap); 1656 1657 if(pmap64_pdpt(pmap, vaddr) == PDPT_ENTRY_NULL) { 1658 if (!pmap_next_page_hi(&pn)) 1659 panic("pmap_pre_expand"); 1660 1661 pmap_zero_page(pn); 1662 1663 pte = pmap64_pml4(pmap, vaddr); 1664 1665 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) 1666 | INTEL_PTE_VALID 1667 | INTEL_PTE_USER 1668 | INTEL_PTE_WRITE); 1669 } 1670 1671 if(pmap64_pde(pmap, vaddr) == PD_ENTRY_NULL) { 1672 if (!pmap_next_page_hi(&pn)) 1673 panic("pmap_pre_expand"); 1674 1675 pmap_zero_page(pn); 1676 1677 pte = pmap64_pdpt(pmap, vaddr); 1678 1679 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) 1680 | INTEL_PTE_VALID 1681 | INTEL_PTE_USER 1682 | INTEL_PTE_WRITE); 1683 } 1684 1685 if(pmap_pte(pmap, vaddr) == PT_ENTRY_NULL) { 1686 if (!pmap_next_page_hi(&pn)) 1687 panic("pmap_pre_expand"); 1688 1689 pmap_zero_page(pn); 1690 1691 pte = pmap64_pde(pmap, vaddr); 1692 1693 pmap_store_pte(pte, pa_to_pte(i386_ptob(pn)) 1694 | INTEL_PTE_VALID 1695 | INTEL_PTE_USER 1696 | INTEL_PTE_WRITE); 1697 } 1698 1699 PMAP_UNLOCK(pmap); 1700} 1701 1702/* 1703 * pmap_sync_page_data_phys(ppnum_t pa) 1704 * 1705 * Invalidates all of the instruction cache on a physical page and 1706 * pushes any dirty data from the data cache for the same physical page 1707 * Not required in i386. 1708 */ 1709void 1710pmap_sync_page_data_phys(__unused ppnum_t pa) 1711{ 1712 return; 1713} 1714 1715/* 1716 * pmap_sync_page_attributes_phys(ppnum_t pa) 1717 * 1718 * Write back and invalidate all cachelines on a physical page. 
1719 */ 1720void 1721pmap_sync_page_attributes_phys(ppnum_t pa) 1722{ 1723 cache_flush_page_phys(pa); 1724} 1725 1726 1727 1728#ifdef CURRENTLY_UNUSED_AND_UNTESTED 1729 1730int collect_ref; 1731int collect_unref; 1732 1733/* 1734 * Routine: pmap_collect 1735 * Function: 1736 * Garbage collects the physical map system for 1737 * pages which are no longer used. 1738 * Success need not be guaranteed -- that is, there 1739 * may well be pages which are not referenced, but 1740 * others may be collected. 1741 * Usage: 1742 * Called by the pageout daemon when pages are scarce. 1743 */ 1744void 1745pmap_collect( 1746 pmap_t p) 1747{ 1748 register pt_entry_t *pdp, *ptp; 1749 pt_entry_t *eptp; 1750 int wired; 1751 1752 if (p == PMAP_NULL) 1753 return; 1754 1755 if (p == kernel_pmap) 1756 return; 1757 1758 /* 1759 * Garbage collect map. 1760 */ 1761 PMAP_LOCK(p); 1762 1763 for (pdp = (pt_entry_t *)p->dirbase; 1764 pdp < (pt_entry_t *)&p->dirbase[(UMAXPTDI+1)]; 1765 pdp++) 1766 { 1767 if (*pdp & INTEL_PTE_VALID) { 1768 if(*pdp & INTEL_PTE_REF) { 1769 pmap_store_pte(pdp, *pdp & ~INTEL_PTE_REF); 1770 collect_ref++; 1771 } else { 1772 collect_unref++; 1773 ptp = pmap_pte(p, pdetova(pdp - (pt_entry_t *)p->dirbase)); 1774 eptp = ptp + NPTEPG; 1775 1776 /* 1777 * If the pte page has any wired mappings, we cannot 1778 * free it. 1779 */ 1780 wired = 0; 1781 { 1782 register pt_entry_t *ptep; 1783 for (ptep = ptp; ptep < eptp; ptep++) { 1784 if (iswired(*ptep)) { 1785 wired = 1; 1786 break; 1787 } 1788 } 1789 } 1790 if (!wired) { 1791 /* 1792 * Remove the virtual addresses mapped by this pte page. 1793 */ 1794 pmap_remove_range(p, 1795 pdetova(pdp - (pt_entry_t *)p->dirbase), 1796 ptp, 1797 eptp); 1798 1799 /* 1800 * Invalidate the page directory pointer. 1801 */ 1802 pmap_store_pte(pdp, 0x0); 1803 1804 PMAP_UNLOCK(p); 1805 1806 /* 1807 * And free the pte page itself. 1808 */ 1809 { 1810 register vm_page_t m; 1811 1812 vm_object_lock(p->pm_obj); 1813 1814 m = vm_page_lookup(p->pm_obj,(vm_object_offset_t)(pdp - (pt_entry_t *)&p->dirbase[0])); 1815 if (m == VM_PAGE_NULL) 1816 panic("pmap_collect: pte page not in object"); 1817 1818 vm_object_unlock(p->pm_obj); 1819 1820 VM_PAGE_FREE(m); 1821 1822 OSAddAtomic(-1, &inuse_ptepages_count); 1823 PMAP_ZINFO_PFREE(p, PAGE_SIZE); 1824 } 1825 1826 PMAP_LOCK(p); 1827 } 1828 } 1829 } 1830 } 1831 1832 PMAP_UPDATE_TLBS(p, 0x0, 0xFFFFFFFFFFFFF000ULL); 1833 PMAP_UNLOCK(p); 1834 return; 1835 1836} 1837#endif 1838 1839 1840void 1841pmap_copy_page(ppnum_t src, ppnum_t dst) 1842{ 1843 bcopy_phys((addr64_t)i386_ptob(src), 1844 (addr64_t)i386_ptob(dst), 1845 PAGE_SIZE); 1846} 1847 1848 1849/* 1850 * Routine: pmap_pageable 1851 * Function: 1852 * Make the specified pages (by pmap, offset) 1853 * pageable (or not) as requested. 1854 * 1855 * A page which is not pageable may not take 1856 * a fault; therefore, its page table entry 1857 * must remain valid for the duration. 1858 * 1859 * This routine is merely advisory; pmap_enter 1860 * will specify that these pages are to be wired 1861 * down (or not) as appropriate. 
1862 */ 1863void 1864pmap_pageable( 1865 __unused pmap_t pmap, 1866 __unused vm_map_offset_t start_addr, 1867 __unused vm_map_offset_t end_addr, 1868 __unused boolean_t pageable) 1869{ 1870#ifdef lint 1871 pmap++; start_addr++; end_addr++; pageable++; 1872#endif /* lint */ 1873} 1874 1875void 1876invalidate_icache(__unused vm_offset_t addr, 1877 __unused unsigned cnt, 1878 __unused int phys) 1879{ 1880 return; 1881} 1882 1883void 1884flush_dcache(__unused vm_offset_t addr, 1885 __unused unsigned count, 1886 __unused int phys) 1887{ 1888 return; 1889} 1890 1891#if CONFIG_DTRACE 1892/* 1893 * Constrain DTrace copyin/copyout actions 1894 */ 1895extern kern_return_t dtrace_copyio_preflight(addr64_t); 1896extern kern_return_t dtrace_copyio_postflight(addr64_t); 1897 1898kern_return_t dtrace_copyio_preflight(__unused addr64_t va) 1899{ 1900 thread_t thread = current_thread(); 1901 uint64_t ccr3; 1902 1903 if (current_map() == kernel_map) 1904 return KERN_FAILURE; 1905 else if (((ccr3 = get_cr3_base()) != thread->map->pmap->pm_cr3) && (no_shared_cr3 == FALSE)) 1906 return KERN_FAILURE; 1907 else if (no_shared_cr3 && (ccr3 != kernel_pmap->pm_cr3)) 1908 return KERN_FAILURE; 1909 else if (thread->machine.specFlags & CopyIOActive) 1910 return KERN_FAILURE; 1911 else 1912 return KERN_SUCCESS; 1913} 1914 1915kern_return_t dtrace_copyio_postflight(__unused addr64_t va) 1916{ 1917 return KERN_SUCCESS; 1918} 1919#endif /* CONFIG_DTRACE */ 1920 1921#include <mach_vm_debug.h> 1922#if MACH_VM_DEBUG 1923#include <vm/vm_debug.h> 1924 1925int 1926pmap_list_resident_pages( 1927 __unused pmap_t pmap, 1928 __unused vm_offset_t *listp, 1929 __unused int space) 1930{ 1931 return 0; 1932} 1933#endif /* MACH_VM_DEBUG */ 1934 1935 1936 1937/* temporary workaround */ 1938boolean_t 1939coredumpok(__unused vm_map_t map, __unused vm_offset_t va) 1940{ 1941#if 0 1942 pt_entry_t *ptep; 1943 1944 ptep = pmap_pte(map->pmap, va); 1945 if (0 == ptep) 1946 return FALSE; 1947 return ((*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)); 1948#else 1949 return TRUE; 1950#endif 1951} 1952 1953 1954boolean_t 1955phys_page_exists(ppnum_t pn) 1956{ 1957 assert(pn != vm_page_fictitious_addr); 1958 1959 if (!pmap_initialized) 1960 return TRUE; 1961 1962 if (pn == vm_page_guard_addr) 1963 return FALSE; 1964 1965 if (!IS_MANAGED_PAGE(ppn_to_pai(pn))) 1966 return FALSE; 1967 1968 return TRUE; 1969} 1970 1971 1972 1973void 1974pmap_switch(pmap_t tpmap) 1975{ 1976 spl_t s; 1977 1978 s = splhigh(); /* Make sure interruptions are disabled */ 1979 set_dirbase(tpmap, current_thread()); 1980 splx(s); 1981} 1982 1983 1984/* 1985 * disable no-execute capability on 1986 * the specified pmap 1987 */ 1988void 1989pmap_disable_NX(pmap_t pmap) 1990{ 1991 pmap->nx_enabled = 0; 1992} 1993 1994void 1995pt_fake_zone_init(int zone_index) 1996{ 1997 pt_fake_zone_index = zone_index; 1998} 1999 2000void 2001pt_fake_zone_info( 2002 int *count, 2003 vm_size_t *cur_size, 2004 vm_size_t *max_size, 2005 vm_size_t *elem_size, 2006 vm_size_t *alloc_size, 2007 uint64_t *sum_size, 2008 int *collectable, 2009 int *exhaustable, 2010 int *caller_acct) 2011{ 2012 *count = inuse_ptepages_count; 2013 *cur_size = PAGE_SIZE * inuse_ptepages_count; 2014 *max_size = PAGE_SIZE * (inuse_ptepages_count + 2015 vm_page_inactive_count + 2016 vm_page_active_count + 2017 vm_page_free_count); 2018 *elem_size = PAGE_SIZE; 2019 *alloc_size = PAGE_SIZE; 2020 *sum_size = alloc_ptepages_count * PAGE_SIZE; 2021 2022 *collectable = 1; 2023 *exhaustable = 0; 2024 
	*caller_acct = 1;
}

static inline void
pmap_cpuset_NMIPI(cpu_set cpu_mask) {
	unsigned int cpu, cpu_bit;
	uint64_t deadline;

	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
		if (cpu_mask & cpu_bit)
			cpu_NMI_interrupt(cpu);
	}
	deadline = mach_absolute_time() + (LockTimeOut);
	while (mach_absolute_time() < deadline)
		cpu_pause();
}

/*
 * Called with pmap locked, we:
 *	- scan through per-cpu data to see which other cpus need to flush
 *	- send an IPI to each non-idle cpu to be flushed
 *	- wait for all to signal back that they are inactive or we see that
 *	  they are at a safe point (idle).
 *	- flush the local tlb if active for this pmap
 *	- return ... the caller will unlock the pmap
 */

void
pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv)
{
	unsigned int	cpu;
	unsigned int	cpu_bit;
	cpu_set		cpus_to_signal;
	unsigned int	my_cpu = cpu_number();
	pmap_paddr_t	pmap_cr3 = pmap->pm_cr3;
	boolean_t	flush_self = FALSE;
	uint64_t	deadline;
	boolean_t	pmap_is_shared = (pmap->pm_shared || (pmap == kernel_pmap));

	assert((processor_avail_count < 2) ||
	       (ml_get_interrupts_enabled() && get_preemption_level() != 0));

	/*
	 * Scan other cpus for matching active or task CR3.
	 * For idle cpus (with no active map) we mark them invalid but
	 * don't signal -- they'll check as they go busy.
	 */
	cpus_to_signal = 0;

	if (pmap_pcid_ncpus) {
		pmap_pcid_invalidate_all_cpus(pmap);
		__asm__ volatile("mfence":::"memory");
	}

	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
		if (!cpu_datap(cpu)->cpu_running)
			continue;
		uint64_t	cpu_active_cr3 = CPU_GET_ACTIVE_CR3(cpu);
		uint64_t	cpu_task_cr3 = CPU_GET_TASK_CR3(cpu);

		if ((pmap_cr3 == cpu_task_cr3) ||
		    (pmap_cr3 == cpu_active_cr3) ||
		    (pmap_is_shared)) {
			if (cpu == my_cpu) {
				flush_self = TRUE;
				continue;
			}
			if (pmap_pcid_ncpus && pmap_is_shared)
				cpu_datap(cpu)->cpu_tlb_invalid_global = TRUE;
			else
				cpu_datap(cpu)->cpu_tlb_invalid_local = TRUE;
			__asm__ volatile("mfence":::"memory");

			/*
			 * We don't need to signal processors which will flush
			 * lazily at the idle state or kernel boundary.
			 * For example, if we're invalidating the kernel pmap,
			 * processors currently in userspace don't need to flush
			 * their TLBs until the next time they enter the kernel.
			 * Alterations to the address space of a task active
			 * on a remote processor result in a signal, to
			 * account for copy operations. (There may be room
			 * for optimization in such cases).
			 * The order of the loads below with respect
			 * to the store to the "cpu_tlb_invalid" field above
			 * is important--hence the barrier.
			 */
			if (CPU_CR3_IS_ACTIVE(cpu) &&
			    (pmap_cr3 == CPU_GET_ACTIVE_CR3(cpu) ||
			     pmap->pm_shared ||
			     (pmap_cr3 == CPU_GET_TASK_CR3(cpu)))) {
				cpus_to_signal |= cpu_bit;
				i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
			}
		}
	}

	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_START,
			    pmap, cpus_to_signal, flush_self, startv, endv);

	/*
	 * Flush local tlb if required.
	 * Do this now to overlap with other processors responding.
	 */
	 */
	if (flush_self) {
		if (pmap_pcid_ncpus) {
			pmap_pcid_validate_cpu(pmap, my_cpu);
			if (pmap_is_shared)
				tlb_flush_global();
			else
				flush_tlb_raw();
		}
		else
			flush_tlb_raw();
	}

	if (cpus_to_signal) {
		cpu_set	cpus_to_respond = cpus_to_signal;

		deadline = mach_absolute_time() + LockTimeOut;
		/*
		 * Wait for those other cpus to acknowledge
		 */
		while (cpus_to_respond != 0) {
			long orig_acks = 0;

			for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
				/* Consider checking local/global invalidity
				 * as appropriate in the PCID case.
				 */
				if ((cpus_to_respond & cpu_bit) != 0) {
					if (!cpu_datap(cpu)->cpu_running ||
					    cpu_datap(cpu)->cpu_tlb_invalid == FALSE ||
					    !CPU_CR3_IS_ACTIVE(cpu)) {
						cpus_to_respond &= ~cpu_bit;
					}
					cpu_pause();
				}
				if (cpus_to_respond == 0)
					break;
			}
			if (cpus_to_respond && (mach_absolute_time() > deadline)) {
				if (machine_timeout_suspended())
					continue;
				pmap_tlb_flush_timeout = TRUE;
				orig_acks = NMIPI_acks;
				pmap_cpuset_NMIPI(cpus_to_respond);

				panic("TLB invalidation IPI timeout: "
				    "CPU(s) failed to respond to interrupts, unresponsive CPU bitmap: 0x%lx, NMIPI acks: orig: 0x%lx, now: 0x%lx",
				    cpus_to_respond, orig_acks, NMIPI_acks);
			}
		}
	}

	if (__improbable((pmap == kernel_pmap) && (flush_self != TRUE))) {
		panic("pmap_flush_tlbs: pmap == kernel_pmap && flush_self != TRUE; kernel CR3: 0x%llX, CPU active CR3: 0x%llX, CPU Task Map: %d", kernel_pmap->pm_cr3, current_cpu_datap()->cpu_active_cr3, current_cpu_datap()->cpu_task_map);
	}

	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_END,
			    pmap, cpus_to_signal, startv, endv, 0);
}

void
process_pmap_updates(void)
{
	int ccpu = cpu_number();
	pmap_assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
	if (pmap_pcid_ncpus) {
		pmap_pcid_validate_current();
		if (cpu_datap(ccpu)->cpu_tlb_invalid_global) {
			cpu_datap(ccpu)->cpu_tlb_invalid = FALSE;
			tlb_flush_global();
		}
		else {
			cpu_datap(ccpu)->cpu_tlb_invalid_local = FALSE;
			flush_tlb_raw();
		}
	}
	else {
		current_cpu_datap()->cpu_tlb_invalid = FALSE;
		flush_tlb_raw();
	}

	__asm__ volatile("mfence");
}

void
pmap_update_interrupt(void)
{
	PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START,
		   0, 0, 0, 0, 0);

	process_pmap_updates();

	PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END,
		   0, 0, 0, 0, 0);
}

#include <mach/mach_vm.h>	/* mach_vm_region_recurse() */
/* Scan kernel pmap for W+X PTEs, scan kernel VM map for W+X map entries
 * and identify ranges with mismatched VM permissions and PTE permissions
 */
kern_return_t
pmap_permissions_verify(pmap_t ipmap, vm_map_t ivmmap, vm_offset_t sv, vm_offset_t ev) {
	vm_offset_t cv = sv;
	kern_return_t rv = KERN_SUCCESS;
	uint64_t skip4 = 0, skip2 = 0;

	sv &= ~PAGE_MASK_64;
	ev &= ~PAGE_MASK_64;
	while (cv < ev) {
		if (__improbable((cv > 0x00007FFFFFFFFFFFULL) &&
		    (cv < 0xFFFF800000000000ULL))) {
			cv = 0xFFFF800000000000ULL;
		}
		/* Potential inconsistencies can arise because the pmap lock is
		 * not held, but they are harmless for the moment.
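		 * The scan below advances one page at a time, skipping the
		 * non-canonical hole and any ranges with empty PML4 or PDE
		 * entries, and flags any valid PTE that is both writable and
		 * executable.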
		 */
		if (((cv & PML4MASK) == 0) && (pmap64_pml4(ipmap, cv) == 0)) {
			if ((cv + NBPML4) > cv)
				cv += NBPML4;
			else
				break;
			skip4++;
			continue;
		}
		if (((cv & PDMASK) == 0) && (pmap_pde(ipmap, cv) == 0)) {
			if ((cv + NBPD) > cv)
				cv += NBPD;
			else
				break;
			skip2++;
			continue;
		}

		pt_entry_t *ptep = pmap_pte(ipmap, cv);
		if (ptep && (*ptep & INTEL_PTE_VALID)) {
			if (*ptep & INTEL_PTE_WRITE) {
				if (!(*ptep & INTEL_PTE_NX)) {
					kprintf("W+X PTE at 0x%lx, P4: 0x%llx, P3: 0x%llx, P2: 0x%llx, PT: 0x%llx, VP: %u\n", cv, *pmap64_pml4(ipmap, cv), *pmap64_pdpt(ipmap, cv), *pmap64_pde(ipmap, cv), *ptep, pmap_valid_page((ppnum_t)(i386_btop(pte_to_pa(*ptep)))));
					rv = KERN_FAILURE;
				}
			}
		}
		cv += PAGE_SIZE;
	}
	kprintf("Completed pmap scan\n");
	cv = sv;

	struct vm_region_submap_info_64 vbr;
	mach_msg_type_number_t vbrcount = 0;
	mach_vm_size_t	vmsize;
	vm_prot_t	prot;
	uint32_t nesting_depth = 0;
	kern_return_t kret;

	while (cv < ev) {

		for (;;) {
			vbrcount = VM_REGION_SUBMAP_INFO_COUNT_64;
			if ((kret = mach_vm_region_recurse(ivmmap,
			    (mach_vm_address_t *) &cv, &vmsize, &nesting_depth,
			    (vm_region_recurse_info_t)&vbr,
			    &vbrcount)) != KERN_SUCCESS) {
				break;
			}

			if (vbr.is_submap) {
				nesting_depth++;
				continue;
			} else {
				break;
			}
		}

		if (kret != KERN_SUCCESS)
			break;

		prot = vbr.protection;

		if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) {
			kprintf("W+X map entry at address 0x%lx\n", cv);
			rv = KERN_FAILURE;
		}

		if (prot) {
			vm_offset_t pcv;
			for (pcv = cv; pcv < cv + vmsize; pcv += PAGE_SIZE) {
				pt_entry_t *ptep = pmap_pte(ipmap, pcv);
				vm_prot_t tprot;

				if ((ptep == NULL) || !(*ptep & INTEL_PTE_VALID))
					continue;
				tprot = VM_PROT_READ;
				if (*ptep & INTEL_PTE_WRITE)
					tprot |= VM_PROT_WRITE;
				if ((*ptep & INTEL_PTE_NX) == 0)
					tprot |= VM_PROT_EXECUTE;
				if (tprot != prot) {
					kprintf("PTE/map entry permissions mismatch at address 0x%lx, pte: 0x%llx, protection: 0x%x\n", pcv, *ptep, prot);
					rv = KERN_FAILURE;
				}
			}
		}
		cv += vmsize;
	}
	return rv;
}
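/*
 * Illustrative (hypothetical) use of the verifier above -- not an existing
 * caller in this file. A debug path could scan the whole kernel range:
 *
 *	if (pmap_permissions_verify(kernel_pmap, kernel_map,
 *	    VM_MIN_KERNEL_ADDRESS, VM_MAX_KERNEL_ADDRESS) != KERN_SUCCESS)
 *		kprintf("W+X or mismatched mappings detected\n");
 *
 * KERN_FAILURE indicates that at least one W+X or mismatched mapping was
 * reported via kprintf() during the scan.
 */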