1/*- 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * Copyright (c) 2003 Peter Wemm 9 * All rights reserved. 10 * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu> 11 * All rights reserved. 12 * 13 * This code is derived from software contributed to Berkeley by 14 * the Systems Programming Group of the University of Utah Computer 15 * Science Department and William Jolitz of UUNET Technologies Inc. 16 * 17 * Redistribution and use in source and binary forms, with or without 18 * modification, are permitted provided that the following conditions 19 * are met: 20 * 1. Redistributions of source code must retain the above copyright 21 * notice, this list of conditions and the following disclaimer. 22 * 2. Redistributions in binary form must reproduce the above copyright 23 * notice, this list of conditions and the following disclaimer in the 24 * documentation and/or other materials provided with the distribution. 25 * 3. All advertising materials mentioning features or use of this software 26 * must display the following acknowledgement: 27 * This product includes software developed by the University of 28 * California, Berkeley and its contributors. 29 * 4. Neither the name of the University nor the names of its contributors 30 * may be used to endorse or promote products derived from this software 31 * without specific prior written permission. 32 * 33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 36 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 43 * SUCH DAMAGE. 44 * 45 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 46 */ 47/*- 48 * Copyright (c) 2003 Networks Associates Technology, Inc. 49 * All rights reserved. 50 * 51 * This software was developed for the FreeBSD Project by Jake Burkholder, 52 * Safeport Network Services, and Network Associates Laboratories, the 53 * Security Research Division of Network Associates, Inc. under 54 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 55 * CHATS research program. 56 * 57 * Redistribution and use in source and binary forms, with or without 58 * modification, are permitted provided that the following conditions 59 * are met: 60 * 1. Redistributions of source code must retain the above copyright 61 * notice, this list of conditions and the following disclaimer. 62 * 2. Redistributions in binary form must reproduce the above copyright 63 * notice, this list of conditions and the following disclaimer in the 64 * documentation and/or other materials provided with the distribution. 
65 * 66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 67 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 68 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 69 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 70 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 71 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 72 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 73 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 74 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 75 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 76 * SUCH DAMAGE. 77 */ 78 79#include <sys/cdefs.h> 80__FBSDID("$FreeBSD: head/sys/amd64/amd64/pmap.c 151910 2005-10-31 21:25:33Z alc $"); 81 82/* 83 * Manages physical address maps. 84 * 85 * In addition to hardware address maps, this 86 * module is called upon to provide software-use-only 87 * maps which may or may not be stored in the same 88 * form as hardware maps. These pseudo-maps are 89 * used to store intermediate results from copy 90 * operations to and from address spaces. 91 * 92 * Since the information managed by this module is 93 * also stored by the logical address mapping module, 94 * this module may throw away valid virtual-to-physical 95 * mappings at almost any time. However, invalidations 96 * of virtual-to-physical mappings must be done as 97 * requested. 98 * 99 * In order to cope with hardware architectures which 100 * make virtual-to-physical map invalidates expensive, 101 * this module may delay invalidate or reduced protection 102 * operations until such time as they are actually 103 * necessary. This module is given full information as 104 * to which processors are currently using which maps, 105 * and to when physical maps must be made correct. 
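 *
 *	One concrete instance of the delayed-invalidate policy in the
 *	code below: pmap_remove() tracks in a local "anyvalid" flag
 *	whether any live mapping was actually torn down and issues a
 *	single pmap_invalidate_all() after its loop, rather than one
 *	TLB shootdown per page.  Global (PG_G) kernel mappings are the
 *	exception and are invalidated immediately in pmap_remove_pte().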
106 */ 107 108#include "opt_msgbuf.h" 109 110#include <sys/param.h> 111#include <sys/systm.h> 112#include <sys/kernel.h> 113#include <sys/lock.h> 114#include <sys/malloc.h> 115#include <sys/mman.h> 116#include <sys/msgbuf.h> 117#include <sys/mutex.h> 118#include <sys/proc.h> 119#include <sys/sx.h> 120#include <sys/vmmeter.h> 121#include <sys/sched.h> 122#include <sys/sysctl.h> 123#ifdef SMP 124#include <sys/smp.h> 125#endif 126 127#include <vm/vm.h> 128#include <vm/vm_param.h> 129#include <vm/vm_kern.h> 130#include <vm/vm_page.h> 131#include <vm/vm_map.h> 132#include <vm/vm_object.h> 133#include <vm/vm_extern.h> 134#include <vm/vm_pageout.h> 135#include <vm/vm_pager.h> 136#include <vm/uma.h> 137 138#include <machine/cpu.h> 139#include <machine/cputypes.h> 140#include <machine/md_var.h> 141#include <machine/pcb.h> 142#include <machine/specialreg.h> 143#ifdef SMP 144#include <machine/smp.h> 145#endif 146 147#ifndef PMAP_SHPGPERPROC 148#define PMAP_SHPGPERPROC 200 149#endif 150 151#if defined(DIAGNOSTIC) 152#define PMAP_DIAGNOSTIC 153#endif 154 155#define MINPV 2048 156 157#if !defined(PMAP_DIAGNOSTIC) 158#define PMAP_INLINE __inline 159#else 160#define PMAP_INLINE 161#endif 162 163struct pmap kernel_pmap_store; 164 165vm_paddr_t avail_start; /* PA of first available physical page */ 166vm_paddr_t avail_end; /* PA of last available physical page */ 167vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 168vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 169 170static int nkpt; 171static int ndmpdp; 172static vm_paddr_t dmaplimit; 173vm_offset_t kernel_vm_end; 174pt_entry_t pg_nx; 175 176static u_int64_t KPTphys; /* phys addr of kernel level 1 */ 177static u_int64_t KPDphys; /* phys addr of kernel level 2 */ 178static u_int64_t KPDPphys; /* phys addr of kernel level 3 */ 179u_int64_t KPML4phys; /* phys addr of kernel level 4 */ 180 181static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ 182static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ 183 184/* 185 * Data for the pv entry allocation mechanism 186 */ 187static uma_zone_t pvzone; 188static struct vm_object pvzone_obj; 189static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; 190int pmap_pagedaemon_waken; 191 192/* 193 * All those kernel PT submaps that BSD is so fond of 194 */ 195pt_entry_t *CMAP1 = 0; 196caddr_t CADDR1 = 0; 197struct msgbuf *msgbufp = 0; 198 199/* 200 * Crashdump maps. 
 */
static caddr_t crashdumpmap;

static PMAP_INLINE void	free_pv_entry(pv_entry_t pv);
static pv_entry_t get_pv_entry(void);
static pv_entry_t pv_entry_reclaim(pmap_t locked_pmap);
static void	pmap_clear_ptes(vm_page_t m, long bit);

static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
		vm_offset_t sva, pd_entry_t ptepde);
static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde);
static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
		vm_offset_t va);
static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);

static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags);
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);

static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags);
static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);

CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));

/*
 * Move the kernel virtual free pointer to the next
 * 2MB.  This is used to help improve performance
 * by using a large (2MB) page for much of the kernel
 * (.text, .data, .bss)
 */
static vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
	vm_offset_t newaddr = addr;

	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
	return newaddr;
}

/********************/
/* Inline functions */
/********************/

/* Return a non-clipped PD index for a given VA */
static __inline vm_pindex_t
pmap_pde_pindex(vm_offset_t va)
{
	return va >> PDRSHIFT;
}


/* Return various clipped indexes for a given VA */
static __inline vm_pindex_t
pmap_pte_index(vm_offset_t va)
{

	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
}

static __inline vm_pindex_t
pmap_pde_index(vm_offset_t va)
{

	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
}

static __inline vm_pindex_t
pmap_pdpe_index(vm_offset_t va)
{

	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
}

static __inline vm_pindex_t
pmap_pml4e_index(vm_offset_t va)
{

	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
}

/* Return a pointer to the PML4 slot that corresponds to a VA */
static __inline pml4_entry_t *
pmap_pml4e(pmap_t pmap, vm_offset_t va)
{

	if (!pmap)
		return NULL;
	return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
	return (&pdpe[pmap_pdpe_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pdpe(pmap_t pmap, vm_offset_t va)
{
	pml4_entry_t *pml4e;

	pml4e = pmap_pml4e(pmap, va);
	if (pml4e == NULL || (*pml4e & PG_V) == 0)
		return NULL;
	return (pmap_pml4e_to_pdpe(pml4e, va));
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
	return (&pde[pmap_pde_index(va)]);
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pdpe;

	pdpe = pmap_pdpe(pmap, va);
	if (pdpe == NULL || (*pdpe & PG_V) == 0)
		return NULL;
	return (pmap_pdpe_to_pde(pdpe, va));
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
	return (&pte[pmap_pte_index(va)]);
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (pde == NULL || (*pde & PG_V) == 0)
		return NULL;
	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
		return ((pt_entry_t *)pde);
	return (pmap_pde_to_pte(pde, va));
}


static __inline pt_entry_t *
pmap_pte_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *ptepde)
{
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (pde == NULL || (*pde & PG_V) == 0)
		return NULL;
	*ptepde = *pde;
	if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
		return ((pt_entry_t *)pde);
	return (pmap_pde_to_pte(pde, va));
}


PMAP_INLINE pt_entry_t *
vtopte(vm_offset_t va)
{
	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);

	return (PTmap + ((va >> PAGE_SHIFT) & mask));
}

static __inline pd_entry_t *
vtopde(vm_offset_t va)
{
	u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);

	return (PDmap + ((va >> PDRSHIFT) & mask));
}

static u_int64_t
allocpages(int n)
{
	u_int64_t ret;

	ret = avail_start;
	bzero((void *)ret, n * PAGE_SIZE);
	avail_start += n * PAGE_SIZE;
	return (ret);
}

static void
create_pagetables(void)
{
	int i;

	/* Allocate pages */
	KPTphys = allocpages(NKPT);
	KPML4phys = allocpages(1);
	KPDPphys = allocpages(NKPML4E);
	KPDphys = allocpages(NKPDPE);

	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
		ndmpdp = 4;
	DMPDPphys = allocpages(NDMPML4E);
	DMPDphys = allocpages(ndmpdp);
	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;

	/* Fill in the underlying page table pages */
	/* Read-only from zero to physfree */
	/* XXX not fully used, underneath 2M pages */
	for (i = 0; (i << PAGE_SHIFT) < avail_start; i++) {
		((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
		((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
	}

	/* Now map the page tables at their location within PTmap */
	for (i = 0; i < NKPT; i++) {
		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
	}

	/* Map from zero to end of allocations under 2M pages */
	/* This replaces some of the KPTphys entries above */
	for (i = 0; (i << PDRSHIFT) < avail_start; i++) {
		((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
	}

	/* And connect up the PD to the PDP */
	for (i = 0; i < NKPDPE; i++) {
		((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys + (i << PAGE_SHIFT);
		((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
	}
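
	/*
	 * Sizing sketch (assuming the usual amd64 constants: 512 entries
	 * per page table page, NBPDR == 2MB, NBPDP == 1GB): a PD page
	 * filled with PG_PS entries maps 512 * 2MB = 1GB, so each PDP
	 * entry covers 1GB and ndmpdp above is simply physical memory
	 * rounded up to 1GB units, e.g. for a 16GB machine
	 *
	 *	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT == 16,
	 *
	 * with a floor of 4 (4GB) for the direct map built just below.
	 */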
448 449 450 /* Now set up the direct map space using 2MB pages */ 451 for (i = 0; i < NPDEPG * ndmpdp; i++) { 452 ((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT; 453 ((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G; 454 } 455 456 /* And the direct map space's PDP */ 457 for (i = 0; i < ndmpdp; i++) { 458 ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (i << PAGE_SHIFT); 459 ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U; 460 } 461 462 /* And recursively map PML4 to itself in order to get PTmap */ 463 ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys; 464 ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U; 465 466 /* Connect the Direct Map slot up to the PML4 */ 467 ((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys; 468 ((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U; 469 470 /* Connect the KVA slot up to the PML4 */ 471 ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys; 472 ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U; 473} 474 475/* 476 * Bootstrap the system enough to run with virtual memory. 477 * 478 * On amd64 this is called after mapping has already been enabled 479 * and just syncs the pmap module with what has already been done. 480 * [We can't call it easily with mapping off since the kernel is not 481 * mapped with PA == VA, hence we would have to relocate every address 482 * from the linked base (virtual) address "KERNBASE" to the actual 483 * (physical) address starting relative to 0] 484 */ 485void 486pmap_bootstrap(firstaddr) 487 vm_paddr_t *firstaddr; 488{ 489 vm_offset_t va; 490 pt_entry_t *pte, *unused; 491 492 avail_start = *firstaddr; 493 494 /* 495 * Create an initial set of page tables to run the kernel in. 496 */ 497 create_pagetables(); 498 *firstaddr = avail_start; 499 500 virtual_avail = (vm_offset_t) KERNBASE + avail_start; 501 virtual_avail = pmap_kmem_choose(virtual_avail); 502 503 virtual_end = VM_MAX_KERNEL_ADDRESS; 504 505 506 /* XXX do %cr0 as well */ 507 load_cr4(rcr4() | CR4_PGE | CR4_PSE); 508 load_cr3(KPML4phys); 509 510 /* 511 * Initialize the kernel pmap (which is statically allocated). 512 */ 513 PMAP_LOCK_INIT(kernel_pmap); 514 kernel_pmap->pm_pml4 = (pdp_entry_t *) (KERNBASE + KPML4phys); 515 kernel_pmap->pm_active = -1; /* don't allow deactivation */ 516 TAILQ_INIT(&kernel_pmap->pm_pvlist); 517 nkpt = NKPT; 518 519 /* 520 * Reserve some special page table entries/VA space for temporary 521 * mapping of pages. 522 */ 523#define SYSMAP(c, p, v, n) \ 524 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 525 526 va = virtual_avail; 527 pte = vtopte(va); 528 529 /* 530 * CMAP1 is only used for the memory test. 531 */ 532 SYSMAP(caddr_t, CMAP1, CADDR1, 1) 533 534 /* 535 * Crashdump maps. 536 */ 537 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) 538 539 /* 540 * msgbufp is used to map the system message buffer. 541 */ 542 SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE))) 543 544 virtual_avail = va; 545 546 *CMAP1 = 0; 547 548 invltlb(); 549} 550 551/* 552 * Initialize a vm_page's machine-dependent fields. 553 */ 554void 555pmap_page_init(vm_page_t m) 556{ 557 558 TAILQ_INIT(&m->md.pv_list); 559 m->md.pv_list_count = 0; 560} 561 562/* 563 * Initialize the pmap module. 564 * Called by vm_init, to initialize any structures that the pmap 565 * system needs to map virtual memory. 
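 *
 *	Concretely, the amd64 version below only creates the UMA zone
 *	that backs pv_entry allocation (uma_zcreate) and preallocates
 *	MINPV entries with uma_prealloc(); the zone's maximum size and
 *	the pv_entry high-water mark are set later, in pmap_init2().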
566 */ 567void 568pmap_init(void) 569{ 570 571 /* 572 * init the pv free list 573 */ 574 pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL, 575 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); 576 uma_prealloc(pvzone, MINPV); 577} 578 579/* 580 * Initialize the address space (zone) for the pv_entries. Set a 581 * high water mark so that the system can recover from excessive 582 * numbers of pv entries. 583 */ 584void 585pmap_init2() 586{ 587 int shpgperproc = PMAP_SHPGPERPROC; 588 589 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 590 pv_entry_max = shpgperproc * maxproc + vm_page_array_size; 591 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 592 pv_entry_high_water = 9 * (pv_entry_max / 10); 593 uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max); 594} 595 596 597/*************************************************** 598 * Low level helper routines..... 599 ***************************************************/ 600 601 602/* 603 * this routine defines the region(s) of memory that should 604 * not be tested for the modified bit. 605 */ 606static PMAP_INLINE int 607pmap_track_modified(vm_offset_t va) 608{ 609 if ((va < kmi.clean_sva) || (va >= kmi.clean_eva)) 610 return 1; 611 else 612 return 0; 613} 614 615#ifdef SMP 616/* 617 * For SMP, these functions have to use the IPI mechanism for coherence. 618 */ 619void 620pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 621{ 622 u_int cpumask; 623 u_int other_cpus; 624 625 if (smp_started) { 626 if (!(read_rflags() & PSL_I)) 627 panic("%s: interrupts disabled", __func__); 628 mtx_lock_spin(&smp_ipi_mtx); 629 } else 630 critical_enter(); 631 /* 632 * We need to disable interrupt preemption but MUST NOT have 633 * interrupts disabled here. 634 * XXX we may need to hold schedlock to get a coherent pm_active 635 * XXX critical sections disable interrupts again 636 */ 637 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { 638 invlpg(va); 639 smp_invlpg(va); 640 } else { 641 cpumask = PCPU_GET(cpumask); 642 other_cpus = PCPU_GET(other_cpus); 643 if (pmap->pm_active & cpumask) 644 invlpg(va); 645 if (pmap->pm_active & other_cpus) 646 smp_masked_invlpg(pmap->pm_active & other_cpus, va); 647 } 648 if (smp_started) 649 mtx_unlock_spin(&smp_ipi_mtx); 650 else 651 critical_exit(); 652} 653 654void 655pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 656{ 657 u_int cpumask; 658 u_int other_cpus; 659 vm_offset_t addr; 660 661 if (smp_started) { 662 if (!(read_rflags() & PSL_I)) 663 panic("%s: interrupts disabled", __func__); 664 mtx_lock_spin(&smp_ipi_mtx); 665 } else 666 critical_enter(); 667 /* 668 * We need to disable interrupt preemption but MUST NOT have 669 * interrupts disabled here. 
670 * XXX we may need to hold schedlock to get a coherent pm_active 671 * XXX critical sections disable interrupts again 672 */ 673 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { 674 for (addr = sva; addr < eva; addr += PAGE_SIZE) 675 invlpg(addr); 676 smp_invlpg_range(sva, eva); 677 } else { 678 cpumask = PCPU_GET(cpumask); 679 other_cpus = PCPU_GET(other_cpus); 680 if (pmap->pm_active & cpumask) 681 for (addr = sva; addr < eva; addr += PAGE_SIZE) 682 invlpg(addr); 683 if (pmap->pm_active & other_cpus) 684 smp_masked_invlpg_range(pmap->pm_active & other_cpus, 685 sva, eva); 686 } 687 if (smp_started) 688 mtx_unlock_spin(&smp_ipi_mtx); 689 else 690 critical_exit(); 691} 692 693void 694pmap_invalidate_all(pmap_t pmap) 695{ 696 u_int cpumask; 697 u_int other_cpus; 698 699 if (smp_started) { 700 if (!(read_rflags() & PSL_I)) 701 panic("%s: interrupts disabled", __func__); 702 mtx_lock_spin(&smp_ipi_mtx); 703 } else 704 critical_enter(); 705 /* 706 * We need to disable interrupt preemption but MUST NOT have 707 * interrupts disabled here. 708 * XXX we may need to hold schedlock to get a coherent pm_active 709 * XXX critical sections disable interrupts again 710 */ 711 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { 712 invltlb(); 713 smp_invltlb(); 714 } else { 715 cpumask = PCPU_GET(cpumask); 716 other_cpus = PCPU_GET(other_cpus); 717 if (pmap->pm_active & cpumask) 718 invltlb(); 719 if (pmap->pm_active & other_cpus) 720 smp_masked_invltlb(pmap->pm_active & other_cpus); 721 } 722 if (smp_started) 723 mtx_unlock_spin(&smp_ipi_mtx); 724 else 725 critical_exit(); 726} 727#else /* !SMP */ 728/* 729 * Normal, non-SMP, invalidation functions. 730 * We inline these within pmap.c for speed. 731 */ 732PMAP_INLINE void 733pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 734{ 735 736 if (pmap == kernel_pmap || pmap->pm_active) 737 invlpg(va); 738} 739 740PMAP_INLINE void 741pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 742{ 743 vm_offset_t addr; 744 745 if (pmap == kernel_pmap || pmap->pm_active) 746 for (addr = sva; addr < eva; addr += PAGE_SIZE) 747 invlpg(addr); 748} 749 750PMAP_INLINE void 751pmap_invalidate_all(pmap_t pmap) 752{ 753 754 if (pmap == kernel_pmap || pmap->pm_active) 755 invltlb(); 756} 757#endif /* !SMP */ 758 759/* 760 * Are we current address space or kernel? 761 */ 762static __inline int 763pmap_is_current(pmap_t pmap) 764{ 765 return (pmap == kernel_pmap || 766 (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME)); 767} 768 769/* 770 * Routine: pmap_extract 771 * Function: 772 * Extract the physical page address associated 773 * with the given map/virtual_address pair. 774 */ 775vm_paddr_t 776pmap_extract(pmap_t pmap, vm_offset_t va) 777{ 778 vm_paddr_t rtval; 779 pt_entry_t *pte; 780 pd_entry_t pde, *pdep; 781 782 rtval = 0; 783 PMAP_LOCK(pmap); 784 pdep = pmap_pde(pmap, va); 785 if (pdep != NULL) { 786 pde = *pdep; 787 if (pde) { 788 if ((pde & PG_PS) != 0) { 789 KASSERT((pde & PG_FRAME & PDRMASK) == 0, 790 ("pmap_extract: bad pde")); 791 rtval = (pde & PG_FRAME) | (va & PDRMASK); 792 PMAP_UNLOCK(pmap); 793 return rtval; 794 } 795 pte = pmap_pde_to_pte(pdep, va); 796 rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); 797 } 798 } 799 PMAP_UNLOCK(pmap); 800 return (rtval); 801} 802 803/* 804 * Routine: pmap_extract_and_hold 805 * Function: 806 * Atomically extract and hold the physical page 807 * with the given pmap and virtual address pair 808 * if that mapping permits the given protection. 
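 *
 *	Minimal usage sketch (hypothetical caller, not taken from this
 *	file); the returned page, if any, is held and must later be
 *	released with vm_page_unhold():
 *
 *		m = pmap_extract_and_hold(pmap, va, VM_PROT_READ);
 *		if (m != NULL) {
 *			... access the page ...
 *			vm_page_lock_queues();
 *			vm_page_unhold(m);
 *			vm_page_unlock_queues();
 *		}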
809 */ 810vm_page_t 811pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 812{ 813 pd_entry_t pde, *pdep; 814 pt_entry_t pte; 815 vm_page_t m; 816 817 m = NULL; 818 vm_page_lock_queues(); 819 PMAP_LOCK(pmap); 820 pdep = pmap_pde(pmap, va); 821 if (pdep != NULL && (pde = *pdep)) { 822 if (pde & PG_PS) { 823 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { 824 KASSERT((pde & PG_FRAME & PDRMASK) == 0, 825 ("pmap_extract_and_hold: bad pde")); 826 m = PHYS_TO_VM_PAGE((pde & PG_FRAME) | 827 (va & PDRMASK)); 828 vm_page_hold(m); 829 } 830 } else { 831 pte = *pmap_pde_to_pte(pdep, va); 832 if ((pte & PG_V) && 833 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { 834 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 835 vm_page_hold(m); 836 } 837 } 838 } 839 vm_page_unlock_queues(); 840 PMAP_UNLOCK(pmap); 841 return (m); 842} 843 844vm_paddr_t 845pmap_kextract(vm_offset_t va) 846{ 847 pd_entry_t *pde; 848 vm_paddr_t pa; 849 850 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 851 pa = DMAP_TO_PHYS(va); 852 } else { 853 pde = vtopde(va); 854 if (*pde & PG_PS) { 855 pa = (*pde & ~(NBPDR - 1)) | (va & (NBPDR - 1)); 856 } else { 857 pa = *vtopte(va); 858 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 859 } 860 } 861 return pa; 862} 863 864/*************************************************** 865 * Low level mapping routines..... 866 ***************************************************/ 867 868/* 869 * Add a wired page to the kva. 870 * Note: not SMP coherent. 871 */ 872PMAP_INLINE void 873pmap_kenter(vm_offset_t va, vm_paddr_t pa) 874{ 875 pt_entry_t *pte; 876 877 pte = vtopte(va); 878 pte_store(pte, pa | PG_RW | PG_V | PG_G); 879} 880 881/* 882 * Remove a page from the kernel pagetables. 883 * Note: not SMP coherent. 884 */ 885PMAP_INLINE void 886pmap_kremove(vm_offset_t va) 887{ 888 pt_entry_t *pte; 889 890 pte = vtopte(va); 891 pte_clear(pte); 892} 893 894/* 895 * Used to map a range of physical addresses into kernel 896 * virtual address space. 897 * 898 * The value passed in '*virt' is a suggested virtual address for 899 * the mapping. Architectures which can support a direct-mapped 900 * physical to virtual region can return the appropriate address 901 * within that region, leaving '*virt' unchanged. Other 902 * architectures should map the pages starting at '*virt' and 903 * update '*virt' with the first usable address after the mapped 904 * region. 905 */ 906vm_offset_t 907pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 908{ 909 return PHYS_TO_DMAP(start); 910} 911 912 913/* 914 * Add a list of wired pages to the kva 915 * this routine is only used for temporary 916 * kernel mappings that do not need to have 917 * page modification or references recorded. 918 * Note that old mappings are simply written 919 * over. The page *must* be wired. 920 * Note: SMP coherent. Uses a ranged shootdown IPI. 921 */ 922void 923pmap_qenter(vm_offset_t sva, vm_page_t *m, int count) 924{ 925 vm_offset_t va; 926 927 va = sva; 928 while (count-- > 0) { 929 pmap_kenter(va, VM_PAGE_TO_PHYS(*m)); 930 va += PAGE_SIZE; 931 m++; 932 } 933 pmap_invalidate_range(kernel_pmap, sva, va); 934} 935 936/* 937 * This routine tears out page mappings from the 938 * kernel -- it is meant only for temporary mappings. 939 * Note: SMP coherent. Uses a ranged shootdown IPI. 
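 *
 *	Typical pairing with pmap_qenter() (a hedged sketch; "va" is a
 *	kernel virtual range the caller already owns and "pages" is an
 *	array of wired pages):
 *
 *		pmap_qenter(va, pages, npages);
 *		... use the temporary mapping at va ...
 *		pmap_qremove(va, npages);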
940 */ 941void 942pmap_qremove(vm_offset_t sva, int count) 943{ 944 vm_offset_t va; 945 946 va = sva; 947 while (count-- > 0) { 948 pmap_kremove(va); 949 va += PAGE_SIZE; 950 } 951 pmap_invalidate_range(kernel_pmap, sva, va); 952} 953 954/*************************************************** 955 * Page table page management routines..... 956 ***************************************************/ 957 958/* 959 * This routine unholds page table pages, and if the hold count 960 * drops to zero, then it decrements the wire count. 961 */ 962static PMAP_INLINE int 963pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m) 964{ 965 966 --m->wire_count; 967 if (m->wire_count == 0) 968 return _pmap_unwire_pte_hold(pmap, va, m); 969 else 970 return 0; 971} 972 973static int 974_pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m) 975{ 976 vm_offset_t pteva; 977 978 /* 979 * unmap the page table page 980 */ 981 if (m->pindex >= (NUPDE + NUPDPE)) { 982 /* PDP page */ 983 pml4_entry_t *pml4; 984 pml4 = pmap_pml4e(pmap, va); 985 pteva = (vm_offset_t) PDPmap + amd64_ptob(m->pindex - (NUPDE + NUPDPE)); 986 *pml4 = 0; 987 } else if (m->pindex >= NUPDE) { 988 /* PD page */ 989 pdp_entry_t *pdp; 990 pdp = pmap_pdpe(pmap, va); 991 pteva = (vm_offset_t) PDmap + amd64_ptob(m->pindex - NUPDE); 992 *pdp = 0; 993 } else { 994 /* PTE page */ 995 pd_entry_t *pd; 996 pd = pmap_pde(pmap, va); 997 pteva = (vm_offset_t) PTmap + amd64_ptob(m->pindex); 998 *pd = 0; 999 } 1000 --pmap->pm_stats.resident_count; 1001 if (m->pindex < NUPDE) { 1002 /* We just released a PT, unhold the matching PD */ 1003 vm_page_t pdpg; 1004 1005 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 1006 pmap_unwire_pte_hold(pmap, va, pdpg); 1007 } 1008 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { 1009 /* We just released a PD, unhold the matching PDP */ 1010 vm_page_t pdppg; 1011 1012 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 1013 pmap_unwire_pte_hold(pmap, va, pdppg); 1014 } 1015 1016 /* 1017 * Do an invltlb to make the invalidated mapping 1018 * take effect immediately. 1019 */ 1020 pmap_invalidate_page(pmap, pteva); 1021 1022 vm_page_free_zero(m); 1023 atomic_subtract_int(&cnt.v_wire_count, 1); 1024 return 1; 1025} 1026 1027/* 1028 * After removing a page table entry, this routine is used to 1029 * conditionally free the page, and manage the hold/wire counts. 1030 */ 1031static int 1032pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde) 1033{ 1034 vm_page_t mpte; 1035 1036 if (va >= VM_MAXUSER_ADDRESS) 1037 return 0; 1038 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 1039 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 1040 return pmap_unwire_pte_hold(pmap, va, mpte); 1041} 1042 1043void 1044pmap_pinit0(pmap) 1045 struct pmap *pmap; 1046{ 1047 1048 PMAP_LOCK_INIT(pmap); 1049 pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys); 1050 pmap->pm_active = 0; 1051 TAILQ_INIT(&pmap->pm_pvlist); 1052 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1053} 1054 1055/* 1056 * Initialize a preallocated and zeroed pmap structure, 1057 * such as one in a vmspace structure. 
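 *
 *	Besides allocating the PML4 page, the routine below wires the
 *	kernel and direct-map slots (KPML4I, DMPML4I) into the new top
 *	level and installs the self-referential entry at PML4PML4I, so
 *	the recursive PTmap/PDmap lookups work for this pmap as well.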
1058 */ 1059void 1060pmap_pinit(pmap) 1061 register struct pmap *pmap; 1062{ 1063 vm_page_t pml4pg; 1064 static vm_pindex_t color; 1065 1066 PMAP_LOCK_INIT(pmap); 1067 1068 /* 1069 * allocate the page directory page 1070 */ 1071 while ((pml4pg = vm_page_alloc(NULL, color++, VM_ALLOC_NOOBJ | 1072 VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) 1073 VM_WAIT; 1074 1075 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); 1076 1077 if ((pml4pg->flags & PG_ZERO) == 0) 1078 pagezero(pmap->pm_pml4); 1079 1080 /* Wire in kernel global address entries. */ 1081 pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U; 1082 pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U; 1083 1084 /* install self-referential address mapping entry(s) */ 1085 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M; 1086 1087 pmap->pm_active = 0; 1088 TAILQ_INIT(&pmap->pm_pvlist); 1089 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1090} 1091 1092/* 1093 * this routine is called if the page table page is not 1094 * mapped correctly. 1095 * 1096 * Note: If a page allocation fails at page table level two or three, 1097 * one or two pages may be held during the wait, only to be released 1098 * afterwards. This conservative approach is easily argued to avoid 1099 * race conditions. 1100 */ 1101static vm_page_t 1102_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags) 1103{ 1104 vm_page_t m, pdppg, pdpg; 1105 1106 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || 1107 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, 1108 ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); 1109 1110 /* 1111 * Allocate a page table page. 1112 */ 1113 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 1114 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 1115 if (flags & M_WAITOK) { 1116 PMAP_UNLOCK(pmap); 1117 vm_page_unlock_queues(); 1118 VM_WAIT; 1119 vm_page_lock_queues(); 1120 PMAP_LOCK(pmap); 1121 } 1122 1123 /* 1124 * Indicate the need to retry. While waiting, the page table 1125 * page may have been allocated. 1126 */ 1127 return (NULL); 1128 } 1129 if ((m->flags & PG_ZERO) == 0) 1130 pmap_zero_page(m); 1131 1132 /* 1133 * Map the pagetable page into the process address space, if 1134 * it isn't already there. 
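	 *
	 * As a reading aid for the branches below (the same ranges that
	 * _pmap_unwire_pte_hold() tests): a pindex below NUPDE names a
	 * page table page, a pindex in [NUPDE, NUPDE + NUPDPE) names a
	 * page directory page, and NUPDE + NUPDPE and above names a PDP
	 * page; each recursive call simply asks for the next level up
	 * by that encoding.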
 */

	pmap->pm_stats.resident_count++;

	if (ptepindex >= (NUPDE + NUPDPE)) {
		pml4_entry_t *pml4;
		vm_pindex_t pml4index;

		/* Wire up a new PDPE page */
		pml4index = ptepindex - (NUPDE + NUPDPE);
		pml4 = &pmap->pm_pml4[pml4index];
		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;

	} else if (ptepindex >= NUPDE) {
		vm_pindex_t pml4index;
		vm_pindex_t pdpindex;
		pml4_entry_t *pml4;
		pdp_entry_t *pdp;

		/* Wire up a new PDE page */
		pdpindex = ptepindex - NUPDE;
		pml4index = pdpindex >> NPML4EPGSHIFT;

		pml4 = &pmap->pm_pml4[pml4index];
		if ((*pml4 & PG_V) == 0) {
			/* Have to allocate a new pdp, recurse */
			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
			    flags) == NULL) {
				--m->wire_count;
				vm_page_free(m);
				return (NULL);
			}
		} else {
			/* Add reference to pdp page */
			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
			pdppg->wire_count++;
		}
		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);

		/* Now find the pdp page */
		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;

	} else {
		vm_pindex_t pml4index;
		vm_pindex_t pdpindex;
		pml4_entry_t *pml4;
		pdp_entry_t *pdp;
		pd_entry_t *pd;

		/* Wire up a new PTE page */
		pdpindex = ptepindex >> NPDPEPGSHIFT;
		pml4index = pdpindex >> NPML4EPGSHIFT;

		/* First, find the pdp and check that it's valid. */
		pml4 = &pmap->pm_pml4[pml4index];
		if ((*pml4 & PG_V) == 0) {
			/* Have to allocate a new pd, recurse */
			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
			    flags) == NULL) {
				--m->wire_count;
				vm_page_free(m);
				return (NULL);
			}
			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
		} else {
			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
			if ((*pdp & PG_V) == 0) {
				/* Have to allocate a new pd, recurse */
				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
				    flags) == NULL) {
					--m->wire_count;
					vm_page_free(m);
					return (NULL);
				}
			} else {
				/* Add reference to the pd page */
				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
				pdpg->wire_count++;
			}
		}
		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);

		/* Now we know where the page directory page is */
		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
	}

	return m;
}

static vm_page_t
pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags)
{
	vm_pindex_t pdpindex, ptepindex;
	pdp_entry_t *pdpe;
	vm_page_t pdpg;

	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
	    ("pmap_allocpde: flags is neither M_NOWAIT nor M_WAITOK"));
retry:
	pdpe = pmap_pdpe(pmap, va);
	if (pdpe != NULL && (*pdpe & PG_V) != 0) {
		/* Add a reference to the pd page. */
		pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
		pdpg->wire_count++;
	} else {
		/* Allocate a pd page.
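		 * If the allocation sleeps (M_WAITOK) and still returns
		 * NULL, another thread may have installed the page while
		 * we slept, so we jump back to "retry:" above and
		 * re-evaluate the PDP entry rather than failing.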
*/ 1246 ptepindex = pmap_pde_pindex(va); 1247 pdpindex = ptepindex >> NPDPEPGSHIFT; 1248 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, flags); 1249 if (pdpg == NULL && (flags & M_WAITOK)) 1250 goto retry; 1251 } 1252 return (pdpg); 1253} 1254 1255static vm_page_t 1256pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags) 1257{ 1258 vm_pindex_t ptepindex; 1259 pd_entry_t *pd; 1260 vm_page_t m; 1261 1262 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || 1263 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, 1264 ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); 1265 1266 /* 1267 * Calculate pagetable page index 1268 */ 1269 ptepindex = pmap_pde_pindex(va); 1270retry: 1271 /* 1272 * Get the page directory entry 1273 */ 1274 pd = pmap_pde(pmap, va); 1275 1276 /* 1277 * This supports switching from a 2MB page to a 1278 * normal 4K page. 1279 */ 1280 if (pd != 0 && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 1281 *pd = 0; 1282 pd = 0; 1283 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 1284 pmap_unuse_pt(pmap, va, *pmap_pdpe(pmap, va)); 1285 pmap_invalidate_all(kernel_pmap); 1286 } 1287 1288 /* 1289 * If the page table page is mapped, we just increment the 1290 * hold count, and activate it. 1291 */ 1292 if (pd != 0 && (*pd & PG_V) != 0) { 1293 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 1294 m->wire_count++; 1295 } else { 1296 /* 1297 * Here if the pte page isn't mapped, or if it has been 1298 * deallocated. 1299 */ 1300 m = _pmap_allocpte(pmap, ptepindex, flags); 1301 if (m == NULL && (flags & M_WAITOK)) 1302 goto retry; 1303 } 1304 return (m); 1305} 1306 1307 1308/*************************************************** 1309 * Pmap allocation/deallocation routines. 1310 ***************************************************/ 1311 1312/* 1313 * Release any resources held by the given physical map. 1314 * Called when a pmap initialized by pmap_pinit is being released. 1315 * Should only be called if the map contains no valid mappings. 
1316 */ 1317void 1318pmap_release(pmap_t pmap) 1319{ 1320 vm_page_t m; 1321 1322 KASSERT(pmap->pm_stats.resident_count == 0, 1323 ("pmap_release: pmap resident count %ld != 0", 1324 pmap->pm_stats.resident_count)); 1325 1326 m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME); 1327 1328 pmap->pm_pml4[KPML4I] = 0; /* KVA */ 1329 pmap->pm_pml4[DMPML4I] = 0; /* Direct Map */ 1330 pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ 1331 1332 vm_page_lock_queues(); 1333 m->wire_count--; 1334 atomic_subtract_int(&cnt.v_wire_count, 1); 1335 vm_page_free_zero(m); 1336 vm_page_unlock_queues(); 1337 PMAP_LOCK_DESTROY(pmap); 1338} 1339 1340static int 1341kvm_size(SYSCTL_HANDLER_ARGS) 1342{ 1343 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; 1344 1345 return sysctl_handle_long(oidp, &ksize, 0, req); 1346} 1347SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 1348 0, 0, kvm_size, "IU", "Size of KVM"); 1349 1350static int 1351kvm_free(SYSCTL_HANDLER_ARGS) 1352{ 1353 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 1354 1355 return sysctl_handle_long(oidp, &kfree, 0, req); 1356} 1357SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 1358 0, 0, kvm_free, "IU", "Amount of KVM free"); 1359 1360/* 1361 * grow the number of kernel page table entries, if needed 1362 */ 1363void 1364pmap_growkernel(vm_offset_t addr) 1365{ 1366 vm_paddr_t paddr; 1367 vm_page_t nkpg; 1368 pd_entry_t *pde, newpdir; 1369 pdp_entry_t newpdp; 1370 1371 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 1372 if (kernel_vm_end == 0) { 1373 kernel_vm_end = KERNBASE; 1374 nkpt = 0; 1375 while ((*pmap_pde(kernel_pmap, kernel_vm_end) & PG_V) != 0) { 1376 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1377 nkpt++; 1378 } 1379 } 1380 addr = roundup2(addr, PAGE_SIZE * NPTEPG); 1381 while (kernel_vm_end < addr) { 1382 pde = pmap_pde(kernel_pmap, kernel_vm_end); 1383 if (pde == NULL) { 1384 /* We need a new PDP entry */ 1385 nkpg = vm_page_alloc(NULL, nkpt, 1386 VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); 1387 if (!nkpg) 1388 panic("pmap_growkernel: no memory to grow kernel"); 1389 pmap_zero_page(nkpg); 1390 paddr = VM_PAGE_TO_PHYS(nkpg); 1391 newpdp = (pdp_entry_t) 1392 (paddr | PG_V | PG_RW | PG_A | PG_M); 1393 *pmap_pdpe(kernel_pmap, kernel_vm_end) = newpdp; 1394 continue; /* try again */ 1395 } 1396 if ((*pde & PG_V) != 0) { 1397 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1398 continue; 1399 } 1400 1401 /* 1402 * This index is bogus, but out of the way 1403 */ 1404 nkpg = vm_page_alloc(NULL, nkpt, 1405 VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); 1406 if (!nkpg) 1407 panic("pmap_growkernel: no memory to grow kernel"); 1408 1409 nkpt++; 1410 1411 pmap_zero_page(nkpg); 1412 paddr = VM_PAGE_TO_PHYS(nkpg); 1413 newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M); 1414 *pmap_pde(kernel_pmap, kernel_vm_end) = newpdir; 1415 1416 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1417 } 1418} 1419 1420 1421/*************************************************** 1422 * page management routines. 1423 ***************************************************/ 1424 1425/* 1426 * free the pv_entry back to the free list 1427 */ 1428static PMAP_INLINE void 1429free_pv_entry(pv_entry_t pv) 1430{ 1431 pv_entry_count--; 1432 uma_zfree(pvzone, pv); 1433} 1434 1435/* 1436 * get a new pv_entry, allocating a block from the system 1437 * when needed. 
1438 * the memory allocation is performed bypassing the malloc code 1439 * because of the possibility of allocations at interrupt time. 1440 */ 1441static pv_entry_t 1442get_pv_entry(void) 1443{ 1444 pv_entry_count++; 1445 if (pv_entry_high_water && 1446 (pv_entry_count > pv_entry_high_water) && 1447 (pmap_pagedaemon_waken == 0)) { 1448 pmap_pagedaemon_waken = 1; 1449 wakeup (&vm_pages_needed); 1450 } 1451 return uma_zalloc(pvzone, M_NOWAIT); 1452} 1453 1454/* 1455 * Reclaim a pv entry by removing a mapping to an inactive page. 1456 */ 1457static pv_entry_t 1458pv_entry_reclaim(pmap_t locked_pmap) 1459{ 1460 pd_entry_t ptepde; 1461 pmap_t pmap; 1462 pt_entry_t *pte, tpte; 1463 pv_entry_t pv; 1464 vm_offset_t va; 1465 vm_page_t m; 1466 1467 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 1468 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1469 TAILQ_FOREACH(m, &vm_page_queues[PQ_INACTIVE].pl, pageq) { 1470 if (m->hold_count || m->busy || (m->flags & PG_BUSY)) 1471 continue; 1472 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 1473 va = pv->pv_va; 1474 pmap = pv->pv_pmap; 1475 if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) 1476 continue; 1477 pmap->pm_stats.resident_count--; 1478 pte = pmap_pte_pde(pmap, va, &ptepde); 1479 tpte = pte_load_clear(pte); 1480 KASSERT((tpte & PG_W) == 0, 1481 ("pv_entry_reclaim: wired pte %#lx", tpte)); 1482 if (tpte & PG_A) 1483 vm_page_flag_set(m, PG_REFERENCED); 1484 if (tpte & PG_M) { 1485 KASSERT((tpte & PG_RW), 1486 ("pv_entry_reclaim: modified page not writable: va: %#lx, pte: %#lx", 1487 va, tpte)); 1488 if (pmap_track_modified(va)) 1489 vm_page_dirty(m); 1490 } 1491 pmap_invalidate_page(pmap, va); 1492 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); 1493 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 1494 if (TAILQ_EMPTY(&m->md.pv_list)) 1495 vm_page_flag_clear(m, PG_WRITEABLE); 1496 m->md.pv_list_count--; 1497 pmap_unuse_pt(pmap, va, ptepde); 1498 if (pmap != locked_pmap) 1499 PMAP_UNLOCK(pmap); 1500 return (pv); 1501 } 1502 } 1503 panic("pv_entry_reclaim: increase vm.pmap.shpgperproc"); 1504} 1505 1506static void 1507pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 1508{ 1509 pv_entry_t pv; 1510 1511 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1512 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1513 if (m->md.pv_list_count < pmap->pm_stats.resident_count) { 1514 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 1515 if (pmap == pv->pv_pmap && va == pv->pv_va) 1516 break; 1517 } 1518 } else { 1519 TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { 1520 if (va == pv->pv_va) 1521 break; 1522 } 1523 } 1524 KASSERT(pv != NULL, ("pmap_remove_entry: pv not found")); 1525 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 1526 m->md.pv_list_count--; 1527 if (TAILQ_EMPTY(&m->md.pv_list)) 1528 vm_page_flag_clear(m, PG_WRITEABLE); 1529 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); 1530 free_pv_entry(pv); 1531} 1532 1533/* 1534 * Create a pv entry for page at pa for 1535 * (pmap, va). 
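 *
 *	If the pv zone cannot supply an entry (get_pv_entry() allocates
 *	with M_NOWAIT), the code below undoes the count it took and
 *	falls back to pv_entry_reclaim(), which recycles a pv entry by
 *	unmapping an inactive page.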
1536 */ 1537static void 1538pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 1539{ 1540 pv_entry_t pv; 1541 1542 pv = get_pv_entry(); 1543 if (pv == NULL) { 1544 pv_entry_count--; 1545 pv = pv_entry_reclaim(pmap); 1546 } 1547 pv->pv_va = va; 1548 pv->pv_pmap = pmap; 1549 1550 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1551 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1552 TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); 1553 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 1554 m->md.pv_list_count++; 1555} 1556 1557/* 1558 * pmap_remove_pte: do the things to unmap a page in a process 1559 */ 1560static int 1561pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t ptepde) 1562{ 1563 pt_entry_t oldpte; 1564 vm_page_t m; 1565 1566 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1567 oldpte = pte_load_clear(ptq); 1568 if (oldpte & PG_W) 1569 pmap->pm_stats.wired_count -= 1; 1570 /* 1571 * Machines that don't support invlpg, also don't support 1572 * PG_G. 1573 */ 1574 if (oldpte & PG_G) 1575 pmap_invalidate_page(kernel_pmap, va); 1576 pmap->pm_stats.resident_count -= 1; 1577 if (oldpte & PG_MANAGED) { 1578 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 1579 if (oldpte & PG_M) { 1580 KASSERT((oldpte & PG_RW), 1581 ("pmap_remove_pte: modified page not writable: va: %#lx, pte: %#lx", 1582 va, oldpte)); 1583 if (pmap_track_modified(va)) 1584 vm_page_dirty(m); 1585 } 1586 if (oldpte & PG_A) 1587 vm_page_flag_set(m, PG_REFERENCED); 1588 pmap_remove_entry(pmap, m, va); 1589 } 1590 return (pmap_unuse_pt(pmap, va, ptepde)); 1591} 1592 1593/* 1594 * Remove a single page from a process address space 1595 */ 1596static void 1597pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde) 1598{ 1599 pt_entry_t *pte; 1600 1601 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1602 if ((*pde & PG_V) == 0) 1603 return; 1604 pte = pmap_pde_to_pte(pde, va); 1605 if ((*pte & PG_V) == 0) 1606 return; 1607 pmap_remove_pte(pmap, pte, va, *pde); 1608 pmap_invalidate_page(pmap, va); 1609} 1610 1611/* 1612 * Remove the given range of addresses from the specified map. 1613 * 1614 * It is assumed that the start and end are properly 1615 * rounded to the page size. 1616 */ 1617void 1618pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1619{ 1620 vm_offset_t va_next; 1621 pml4_entry_t *pml4e; 1622 pdp_entry_t *pdpe; 1623 pd_entry_t ptpaddr, *pde; 1624 pt_entry_t *pte; 1625 int anyvalid; 1626 1627 /* 1628 * Perform an unsynchronized read. This is, however, safe. 1629 */ 1630 if (pmap->pm_stats.resident_count == 0) 1631 return; 1632 1633 anyvalid = 0; 1634 1635 vm_page_lock_queues(); 1636 PMAP_LOCK(pmap); 1637 1638 /* 1639 * special handling of removing one page. a very 1640 * common operation and easy to short circuit some 1641 * code. 1642 */ 1643 if (sva + PAGE_SIZE == eva) { 1644 pde = pmap_pde(pmap, sva); 1645 if (pde && (*pde & PG_PS) == 0) { 1646 pmap_remove_page(pmap, sva, pde); 1647 goto out; 1648 } 1649 } 1650 1651 for (; sva < eva; sva = va_next) { 1652 1653 if (pmap->pm_stats.resident_count == 0) 1654 break; 1655 1656 pml4e = pmap_pml4e(pmap, sva); 1657 if ((*pml4e & PG_V) == 0) { 1658 va_next = (sva + NBPML4) & ~PML4MASK; 1659 continue; 1660 } 1661 1662 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 1663 if ((*pdpe & PG_V) == 0) { 1664 va_next = (sva + NBPDP) & ~PDPMASK; 1665 continue; 1666 } 1667 1668 /* 1669 * Calculate index for next page table. 1670 */ 1671 va_next = (sva + NBPDR) & ~PDRMASK; 1672 1673 pde = pmap_pdpe_to_pde(pdpe, sva); 1674 ptpaddr = *pde; 1675 1676 /* 1677 * Weed out invalid mappings. 
1678 */ 1679 if (ptpaddr == 0) 1680 continue; 1681 1682 /* 1683 * Check for large page. 1684 */ 1685 if ((ptpaddr & PG_PS) != 0) { 1686 *pde = 0; 1687 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 1688 pmap_unuse_pt(pmap, sva, *pdpe); 1689 anyvalid = 1; 1690 continue; 1691 } 1692 1693 /* 1694 * Limit our scan to either the end of the va represented 1695 * by the current page table page, or to the end of the 1696 * range being removed. 1697 */ 1698 if (va_next > eva) 1699 va_next = eva; 1700 1701 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 1702 sva += PAGE_SIZE) { 1703 if (*pte == 0) 1704 continue; 1705 anyvalid = 1; 1706 if (pmap_remove_pte(pmap, pte, sva, ptpaddr)) 1707 break; 1708 } 1709 } 1710out: 1711 vm_page_unlock_queues(); 1712 if (anyvalid) 1713 pmap_invalidate_all(pmap); 1714 PMAP_UNLOCK(pmap); 1715} 1716 1717/* 1718 * Routine: pmap_remove_all 1719 * Function: 1720 * Removes this physical page from 1721 * all physical maps in which it resides. 1722 * Reflects back modify bits to the pager. 1723 * 1724 * Notes: 1725 * Original versions of this routine were very 1726 * inefficient because they iteratively called 1727 * pmap_remove (slow...) 1728 */ 1729 1730void 1731pmap_remove_all(vm_page_t m) 1732{ 1733 register pv_entry_t pv; 1734 pt_entry_t *pte, tpte; 1735 pd_entry_t ptepde; 1736 1737#if defined(PMAP_DIAGNOSTIC) 1738 /* 1739 * XXX This makes pmap_remove_all() illegal for non-managed pages! 1740 */ 1741 if (m->flags & PG_FICTITIOUS) { 1742 panic("pmap_remove_all: illegal for unmanaged page, va: 0x%lx", 1743 VM_PAGE_TO_PHYS(m)); 1744 } 1745#endif 1746 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1747 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 1748 PMAP_LOCK(pv->pv_pmap); 1749 pv->pv_pmap->pm_stats.resident_count--; 1750 pte = pmap_pte_pde(pv->pv_pmap, pv->pv_va, &ptepde); 1751 tpte = pte_load_clear(pte); 1752 if (tpte & PG_W) 1753 pv->pv_pmap->pm_stats.wired_count--; 1754 if (tpte & PG_A) 1755 vm_page_flag_set(m, PG_REFERENCED); 1756 1757 /* 1758 * Update the vm_page_t clean and reference bits. 1759 */ 1760 if (tpte & PG_M) { 1761 KASSERT((tpte & PG_RW), 1762 ("pmap_remove_all: modified page not writable: va: %#lx, pte: %#lx", 1763 pv->pv_va, tpte)); 1764 if (pmap_track_modified(pv->pv_va)) 1765 vm_page_dirty(m); 1766 } 1767 pmap_invalidate_page(pv->pv_pmap, pv->pv_va); 1768 TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); 1769 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 1770 m->md.pv_list_count--; 1771 pmap_unuse_pt(pv->pv_pmap, pv->pv_va, ptepde); 1772 PMAP_UNLOCK(pv->pv_pmap); 1773 free_pv_entry(pv); 1774 } 1775 vm_page_flag_clear(m, PG_WRITEABLE); 1776} 1777 1778/* 1779 * Set the physical protection on the 1780 * specified range of this map as requested. 
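 *
 *	As implemented below: a request that removes read access
 *	degenerates to pmap_remove(), a request that still includes
 *	write access is a no-op, and anything else clears PG_RW and
 *	PG_M from each pte with an atomic cmpset loop so that a
 *	concurrent hardware update of the modified bit is not lost.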
1781 */ 1782void 1783pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 1784{ 1785 vm_offset_t va_next; 1786 pml4_entry_t *pml4e; 1787 pdp_entry_t *pdpe; 1788 pd_entry_t ptpaddr, *pde; 1789 pt_entry_t *pte; 1790 int anychanged; 1791 1792 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 1793 pmap_remove(pmap, sva, eva); 1794 return; 1795 } 1796 1797 if (prot & VM_PROT_WRITE) 1798 return; 1799 1800 anychanged = 0; 1801 1802 vm_page_lock_queues(); 1803 PMAP_LOCK(pmap); 1804 for (; sva < eva; sva = va_next) { 1805 1806 pml4e = pmap_pml4e(pmap, sva); 1807 if ((*pml4e & PG_V) == 0) { 1808 va_next = (sva + NBPML4) & ~PML4MASK; 1809 continue; 1810 } 1811 1812 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 1813 if ((*pdpe & PG_V) == 0) { 1814 va_next = (sva + NBPDP) & ~PDPMASK; 1815 continue; 1816 } 1817 1818 va_next = (sva + NBPDR) & ~PDRMASK; 1819 1820 pde = pmap_pdpe_to_pde(pdpe, sva); 1821 ptpaddr = *pde; 1822 1823 /* 1824 * Weed out invalid mappings. 1825 */ 1826 if (ptpaddr == 0) 1827 continue; 1828 1829 /* 1830 * Check for large page. 1831 */ 1832 if ((ptpaddr & PG_PS) != 0) { 1833 *pde &= ~(PG_M|PG_RW); 1834 anychanged = 1; 1835 continue; 1836 } 1837 1838 if (va_next > eva) 1839 va_next = eva; 1840 1841 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 1842 sva += PAGE_SIZE) { 1843 pt_entry_t obits, pbits; 1844 vm_page_t m; 1845 1846retry: 1847 obits = pbits = *pte; 1848 if (pbits & PG_MANAGED) { 1849 m = NULL; 1850 if (pbits & PG_A) { 1851 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 1852 vm_page_flag_set(m, PG_REFERENCED); 1853 pbits &= ~PG_A; 1854 } 1855 if ((pbits & PG_M) != 0 && 1856 pmap_track_modified(sva)) { 1857 if (m == NULL) 1858 m = PHYS_TO_VM_PAGE(pbits & 1859 PG_FRAME); 1860 vm_page_dirty(m); 1861 } 1862 } 1863 1864 pbits &= ~(PG_RW | PG_M); 1865 1866 if (pbits != obits) { 1867 if (!atomic_cmpset_long(pte, obits, pbits)) 1868 goto retry; 1869 if (obits & PG_G) 1870 pmap_invalidate_page(pmap, sva); 1871 else 1872 anychanged = 1; 1873 } 1874 } 1875 } 1876 vm_page_unlock_queues(); 1877 if (anychanged) 1878 pmap_invalidate_all(pmap); 1879 PMAP_UNLOCK(pmap); 1880} 1881 1882/* 1883 * Insert the given physical page (p) at 1884 * the specified virtual address (v) in the 1885 * target physical map with the protection requested. 1886 * 1887 * If specified, the page will be wired down, meaning 1888 * that the related pte can not be reclaimed. 1889 * 1890 * NB: This is the only routine which MAY NOT lazy-evaluate 1891 * or lose information. That is, this routine must actually 1892 * insert this page into the given map NOW. 1893 */ 1894void 1895pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 1896 boolean_t wired) 1897{ 1898 vm_paddr_t pa; 1899 register pt_entry_t *pte; 1900 vm_paddr_t opa; 1901 pt_entry_t origpte, newpte; 1902 vm_page_t mpte, om; 1903 boolean_t invlva; 1904 1905 va = trunc_page(va); 1906#ifdef PMAP_DIAGNOSTIC 1907 if (va > VM_MAX_KERNEL_ADDRESS) 1908 panic("pmap_enter: toobig"); 1909 if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) 1910 panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va); 1911#endif 1912 1913 mpte = NULL; 1914 1915 vm_page_lock_queues(); 1916 PMAP_LOCK(pmap); 1917 1918 /* 1919 * In the case that a page table page is not 1920 * resident, we are creating it here. 
1921 */ 1922 if (va < VM_MAXUSER_ADDRESS) { 1923 mpte = pmap_allocpte(pmap, va, M_WAITOK); 1924 } 1925#if 0 && defined(PMAP_DIAGNOSTIC) 1926 else { 1927 pd_entry_t *pdeaddr = pmap_pde(pmap, va); 1928 origpte = *pdeaddr; 1929 if ((origpte & PG_V) == 0) { 1930 panic("pmap_enter: invalid kernel page table page, pde=%p, va=%p\n", 1931 origpte, va); 1932 } 1933 } 1934#endif 1935 1936 pte = pmap_pte(pmap, va); 1937 1938 /* 1939 * Page Directory table entry not valid, we need a new PT page 1940 */ 1941 if (pte == NULL) 1942 panic("pmap_enter: invalid page directory va=%#lx\n", va); 1943 1944 pa = VM_PAGE_TO_PHYS(m); 1945 om = NULL; 1946 origpte = *pte; 1947 opa = origpte & PG_FRAME; 1948 1949 if (origpte & PG_PS) 1950 panic("pmap_enter: attempted pmap_enter on 2MB page"); 1951 1952 /* 1953 * Mapping has not changed, must be protection or wiring change. 1954 */ 1955 if (origpte && (opa == pa)) { 1956 /* 1957 * Wiring change, just update stats. We don't worry about 1958 * wiring PT pages as they remain resident as long as there 1959 * are valid mappings in them. Hence, if a user page is wired, 1960 * the PT page will be also. 1961 */ 1962 if (wired && ((origpte & PG_W) == 0)) 1963 pmap->pm_stats.wired_count++; 1964 else if (!wired && (origpte & PG_W)) 1965 pmap->pm_stats.wired_count--; 1966 1967 /* 1968 * Remove extra pte reference 1969 */ 1970 if (mpte) 1971 mpte->wire_count--; 1972 1973 /* 1974 * We might be turning off write access to the page, 1975 * so we go ahead and sense modify status. 1976 */ 1977 if (origpte & PG_MANAGED) { 1978 om = m; 1979 pa |= PG_MANAGED; 1980 } 1981 goto validate; 1982 } 1983 /* 1984 * Mapping has changed, invalidate old range and fall through to 1985 * handle validating new mapping. 1986 */ 1987 if (opa) { 1988 if (origpte & PG_W) 1989 pmap->pm_stats.wired_count--; 1990 if (origpte & PG_MANAGED) { 1991 om = PHYS_TO_VM_PAGE(opa); 1992 pmap_remove_entry(pmap, om, va); 1993 } 1994 if (mpte != NULL) { 1995 mpte->wire_count--; 1996 KASSERT(mpte->wire_count > 0, 1997 ("pmap_enter: missing reference to page table page," 1998 " va: 0x%lx", va)); 1999 } 2000 } else 2001 pmap->pm_stats.resident_count++; 2002 2003 /* 2004 * Enter on the PV list if part of our managed memory. 2005 */ 2006 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { 2007 pmap_insert_entry(pmap, va, m); 2008 pa |= PG_MANAGED; 2009 } 2010 2011 /* 2012 * Increment counters 2013 */ 2014 if (wired) 2015 pmap->pm_stats.wired_count++; 2016 2017validate: 2018 /* 2019 * Now validate mapping with desired protection/wiring. 2020 */ 2021 newpte = (pt_entry_t)(pa | PG_V); 2022 if ((prot & VM_PROT_WRITE) != 0) 2023 newpte |= PG_RW; 2024 if ((prot & VM_PROT_EXECUTE) == 0) 2025 newpte |= pg_nx; 2026 if (wired) 2027 newpte |= PG_W; 2028 if (va < VM_MAXUSER_ADDRESS) 2029 newpte |= PG_U; 2030 if (pmap == kernel_pmap) 2031 newpte |= PG_G; 2032 2033 /* 2034 * if the mapping or permission bits are different, we need 2035 * to update the pte. 
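	 *
	 * A stale TLB entry is only possible if the hardware saw the
	 * old pte, so the "invlva" logic below invalidates only when
	 * PG_A was set and the physical page changed or execute
	 * permission is being removed, or when PG_M was set and write
	 * permission is being removed.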
2036 */ 2037 if ((origpte & ~(PG_M|PG_A)) != newpte) { 2038 if (origpte & PG_V) { 2039 invlva = FALSE; 2040 origpte = pte_load_store(pte, newpte | PG_A); 2041 if (origpte & PG_A) { 2042 if (origpte & PG_MANAGED) 2043 vm_page_flag_set(om, PG_REFERENCED); 2044 if (opa != VM_PAGE_TO_PHYS(m) || ((origpte & 2045 PG_NX) == 0 && (newpte & PG_NX))) 2046 invlva = TRUE; 2047 } 2048 if (origpte & PG_M) { 2049 KASSERT((origpte & PG_RW), 2050 ("pmap_enter: modified page not writable: va: %#lx, pte: %#lx", 2051 va, origpte)); 2052 if ((origpte & PG_MANAGED) && 2053 pmap_track_modified(va)) 2054 vm_page_dirty(om); 2055 if ((newpte & PG_RW) == 0) 2056 invlva = TRUE; 2057 } 2058 if (invlva) 2059 pmap_invalidate_page(pmap, va); 2060 } else 2061 pte_store(pte, newpte | PG_A); 2062 } 2063 vm_page_unlock_queues(); 2064 PMAP_UNLOCK(pmap); 2065} 2066 2067/* 2068 * this code makes some *MAJOR* assumptions: 2069 * 1. Current pmap & pmap exists. 2070 * 2. Not wired. 2071 * 3. Read access. 2072 * 4. No page table pages. 2073 * but is *MUCH* faster than pmap_enter... 2074 */ 2075 2076vm_page_t 2077pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2078 vm_page_t mpte) 2079{ 2080 pt_entry_t *pte; 2081 vm_paddr_t pa; 2082 2083 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2084 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 2085 PMAP_LOCK(pmap); 2086 2087 /* 2088 * In the case that a page table page is not 2089 * resident, we are creating it here. 2090 */ 2091 if (va < VM_MAXUSER_ADDRESS) { 2092 vm_pindex_t ptepindex; 2093 pd_entry_t *ptepa; 2094 2095 /* 2096 * Calculate pagetable page index 2097 */ 2098 ptepindex = pmap_pde_pindex(va); 2099 if (mpte && (mpte->pindex == ptepindex)) { 2100 mpte->wire_count++; 2101 } else { 2102 retry: 2103 /* 2104 * Get the page directory entry 2105 */ 2106 ptepa = pmap_pde(pmap, va); 2107 2108 /* 2109 * If the page table page is mapped, we just increment 2110 * the hold count, and activate it. 2111 */ 2112 if (ptepa && (*ptepa & PG_V) != 0) { 2113 if (*ptepa & PG_PS) 2114 panic("pmap_enter_quick: unexpected mapping into 2MB page"); 2115 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); 2116 mpte->wire_count++; 2117 } else { 2118 mpte = _pmap_allocpte(pmap, ptepindex, 2119 M_NOWAIT); 2120 if (mpte == NULL) { 2121 PMAP_UNLOCK(pmap); 2122 vm_page_busy(m); 2123 vm_page_unlock_queues(); 2124 VM_OBJECT_UNLOCK(m->object); 2125 VM_WAIT; 2126 VM_OBJECT_LOCK(m->object); 2127 vm_page_lock_queues(); 2128 vm_page_wakeup(m); 2129 PMAP_LOCK(pmap); 2130 goto retry; 2131 } 2132 } 2133 } 2134 } else { 2135 mpte = NULL; 2136 } 2137 2138 /* 2139 * This call to vtopte makes the assumption that we are 2140 * entering the page into the current pmap. In order to support 2141 * quick entry into any pmap, one would likely use pmap_pte. 2142 * But that isn't as quick as vtopte. 2143 */ 2144 pte = vtopte(va); 2145 if (*pte) { 2146 if (mpte != NULL) { 2147 pmap_unwire_pte_hold(pmap, va, mpte); 2148 mpte = NULL; 2149 } 2150 goto out; 2151 } 2152 2153 /* 2154 * Enter on the PV list if part of our managed memory. Note that we 2155 * raise IPL while manipulating pv_table since pmap_enter can be 2156 * called at interrupt time. 
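	 *
	 * (Callers that prefault a run of pages can chain the page table
	 * page returned by one call into the next -- pseudocode, illustrative
	 * only:
	 *
	 *	mpte = NULL;
	 *	for each page m at address addr in the run
	 *		mpte = pmap_enter_quick(pmap, addr, m, prot, mpte);
	 *
	 * so that the page directory lookup above is skipped whenever
	 * consecutive pages fall under the same page table page.)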
2157 */ 2158 if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) 2159 pmap_insert_entry(pmap, va, m); 2160 2161 /* 2162 * Increment counters 2163 */ 2164 pmap->pm_stats.resident_count++; 2165 2166 pa = VM_PAGE_TO_PHYS(m); 2167 if ((prot & VM_PROT_EXECUTE) == 0) 2168 pa |= pg_nx; 2169 2170 /* 2171 * Now validate mapping with RO protection 2172 */ 2173 if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) 2174 pte_store(pte, pa | PG_V | PG_U); 2175 else 2176 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); 2177out: 2178 PMAP_UNLOCK(pmap); 2179 return mpte; 2180} 2181 2182/* 2183 * Make a temporary mapping for a physical address. This is only intended 2184 * to be used for panic dumps. 2185 */ 2186void * 2187pmap_kenter_temporary(vm_paddr_t pa, int i) 2188{ 2189 vm_offset_t va; 2190 2191 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 2192 pmap_kenter(va, pa); 2193 invlpg(va); 2194 return ((void *)crashdumpmap); 2195} 2196 2197/* 2198 * This code maps large physical mmap regions into the 2199 * processor address space. Note that some shortcuts 2200 * are taken, but the code works. 2201 */ 2202void 2203pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, 2204 vm_object_t object, vm_pindex_t pindex, 2205 vm_size_t size) 2206{ 2207 vm_offset_t va; 2208 vm_page_t p, pdpg; 2209 2210 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 2211 KASSERT(object->type == OBJT_DEVICE, 2212 ("pmap_object_init_pt: non-device object")); 2213 if (((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) { 2214 vm_page_t m[1]; 2215 pd_entry_t ptepa, *pde; 2216 2217 PMAP_LOCK(pmap); 2218 pde = pmap_pde(pmap, addr); 2219 if (pde != 0 && (*pde & PG_V) != 0) 2220 goto out; 2221 PMAP_UNLOCK(pmap); 2222retry: 2223 p = vm_page_lookup(object, pindex); 2224 if (p != NULL) { 2225 vm_page_lock_queues(); 2226 if (vm_page_sleep_if_busy(p, FALSE, "init4p")) 2227 goto retry; 2228 } else { 2229 p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); 2230 if (p == NULL) 2231 return; 2232 m[0] = p; 2233 2234 if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { 2235 vm_page_lock_queues(); 2236 vm_page_free(p); 2237 vm_page_unlock_queues(); 2238 return; 2239 } 2240 2241 p = vm_page_lookup(object, pindex); 2242 vm_page_lock_queues(); 2243 vm_page_wakeup(p); 2244 } 2245 vm_page_unlock_queues(); 2246 2247 ptepa = VM_PAGE_TO_PHYS(p); 2248 if (ptepa & (NBPDR - 1)) 2249 return; 2250 2251 p->valid = VM_PAGE_BITS_ALL; 2252 2253 PMAP_LOCK(pmap); 2254 for (va = addr; va < addr + size; va += NBPDR) { 2255 while ((pdpg = 2256 pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) { 2257 PMAP_UNLOCK(pmap); 2258 vm_page_lock_queues(); 2259 vm_page_busy(p); 2260 vm_page_unlock_queues(); 2261 VM_OBJECT_UNLOCK(object); 2262 VM_WAIT; 2263 VM_OBJECT_LOCK(object); 2264 vm_page_lock_queues(); 2265 vm_page_wakeup(p); 2266 vm_page_unlock_queues(); 2267 PMAP_LOCK(pmap); 2268 } 2269 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 2270 pde = &pde[pmap_pde_index(va)]; 2271 if ((*pde & PG_V) == 0) { 2272 pde_store(pde, ptepa | PG_PS | PG_M | PG_A | 2273 PG_U | PG_RW | PG_V); 2274 pmap->pm_stats.resident_count += 2275 NBPDR / PAGE_SIZE; 2276 } else { 2277 pdpg->wire_count--; 2278 KASSERT(pdpg->wire_count > 0, 2279 ("pmap_object_init_pt: missing reference " 2280 "to page directory page, va: 0x%lx", va)); 2281 } 2282 ptepa += NBPDR; 2283 } 2284 pmap_invalidate_all(pmap); 2285out: 2286 PMAP_UNLOCK(pmap); 2287 } 2288} 2289 2290/* 2291 * Routine: pmap_change_wiring 2292 * Function: Change the wiring attribute for a map/virtual-address 2293 * pair. 
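 * Example:	(illustrative only, not a call site in this file)
 *			pmap_change_wiring(pmap, va, TRUE) wires the existing
 *			mapping at va; calling it again with FALSE unwires it.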
2294 * In/out conditions: 2295 * The mapping must already exist in the pmap. 2296 */ 2297void 2298pmap_change_wiring(pmap, va, wired) 2299 register pmap_t pmap; 2300 vm_offset_t va; 2301 boolean_t wired; 2302{ 2303 register pt_entry_t *pte; 2304 2305 /* 2306 * Wiring is not a hardware characteristic so there is no need to 2307 * invalidate TLB. 2308 */ 2309 PMAP_LOCK(pmap); 2310 pte = pmap_pte(pmap, va); 2311 if (wired && (*pte & PG_W) == 0) { 2312 pmap->pm_stats.wired_count++; 2313 atomic_set_long(pte, PG_W); 2314 } else if (!wired && (*pte & PG_W) != 0) { 2315 pmap->pm_stats.wired_count--; 2316 atomic_clear_long(pte, PG_W); 2317 } 2318 PMAP_UNLOCK(pmap); 2319} 2320 2321 2322 2323/* 2324 * Copy the range specified by src_addr/len 2325 * from the source map to the range dst_addr/len 2326 * in the destination map. 2327 * 2328 * This routine is only advisory and need not do anything. 2329 */ 2330 2331void 2332pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 2333 vm_offset_t src_addr) 2334{ 2335 vm_offset_t addr; 2336 vm_offset_t end_addr = src_addr + len; 2337 vm_offset_t va_next; 2338 vm_page_t m; 2339 2340 if (dst_addr != src_addr) 2341 return; 2342 2343 if (!pmap_is_current(src_pmap)) 2344 return; 2345 2346 vm_page_lock_queues(); 2347 if (dst_pmap < src_pmap) { 2348 PMAP_LOCK(dst_pmap); 2349 PMAP_LOCK(src_pmap); 2350 } else { 2351 PMAP_LOCK(src_pmap); 2352 PMAP_LOCK(dst_pmap); 2353 } 2354 for (addr = src_addr; addr < end_addr; addr = va_next) { 2355 pt_entry_t *src_pte, *dst_pte; 2356 vm_page_t dstmpde, dstmpte, srcmpte; 2357 pml4_entry_t *pml4e; 2358 pdp_entry_t *pdpe; 2359 pd_entry_t srcptepaddr, *pde; 2360 2361 if (addr >= UPT_MIN_ADDRESS) 2362 panic("pmap_copy: invalid to pmap_copy page tables"); 2363 2364 /* 2365 * Don't let optional prefaulting of pages make us go 2366 * way below the low water mark of free pages or way 2367 * above high water mark of used pv entries. 2368 */ 2369 if (cnt.v_free_count < cnt.v_free_reserved || 2370 pv_entry_count > pv_entry_high_water) 2371 break; 2372 2373 pml4e = pmap_pml4e(src_pmap, addr); 2374 if ((*pml4e & PG_V) == 0) { 2375 va_next = (addr + NBPML4) & ~PML4MASK; 2376 continue; 2377 } 2378 2379 pdpe = pmap_pml4e_to_pdpe(pml4e, addr); 2380 if ((*pdpe & PG_V) == 0) { 2381 va_next = (addr + NBPDP) & ~PDPMASK; 2382 continue; 2383 } 2384 2385 va_next = (addr + NBPDR) & ~PDRMASK; 2386 2387 pde = pmap_pdpe_to_pde(pdpe, addr); 2388 srcptepaddr = *pde; 2389 if (srcptepaddr == 0) 2390 continue; 2391 2392 if (srcptepaddr & PG_PS) { 2393 dstmpde = pmap_allocpde(dst_pmap, addr, M_NOWAIT); 2394 if (dstmpde == NULL) 2395 break; 2396 pde = (pd_entry_t *) 2397 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde)); 2398 pde = &pde[pmap_pde_index(addr)]; 2399 if (*pde == 0) { 2400 *pde = srcptepaddr; 2401 dst_pmap->pm_stats.resident_count += 2402 NBPDR / PAGE_SIZE; 2403 } else 2404 pmap_unwire_pte_hold(dst_pmap, addr, dstmpde); 2405 continue; 2406 } 2407 2408 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); 2409 if (srcmpte->wire_count == 0) 2410 panic("pmap_copy: source page table page is unused"); 2411 2412 if (va_next > end_addr) 2413 va_next = end_addr; 2414 2415 src_pte = vtopte(addr); 2416 while (addr < va_next) { 2417 pt_entry_t ptetemp; 2418 ptetemp = *src_pte; 2419 /* 2420 * we only virtual copy managed pages 2421 */ 2422 if ((ptetemp & PG_MANAGED) != 0) { 2423 /* 2424 * We have to check after allocpte for the 2425 * pte still being around... allocpte can 2426 * block. 
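				 * (Hence the *dst_pte == 0 re-check just
				 * below instead of assuming the slot is
				 * still empty.)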
2427 */ 2428 dstmpte = pmap_allocpte(dst_pmap, addr, 2429 M_NOWAIT); 2430 if (dstmpte == NULL) 2431 break; 2432 dst_pte = (pt_entry_t *) 2433 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 2434 dst_pte = &dst_pte[pmap_pte_index(addr)]; 2435 if (*dst_pte == 0) { 2436 /* 2437 * Clear the modified and 2438 * accessed (referenced) bits 2439 * during the copy. 2440 */ 2441 m = PHYS_TO_VM_PAGE(ptetemp & PG_FRAME); 2442 *dst_pte = ptetemp & ~(PG_M | PG_A); 2443 dst_pmap->pm_stats.resident_count++; 2444 pmap_insert_entry(dst_pmap, addr, m); 2445 } else 2446 pmap_unwire_pte_hold(dst_pmap, addr, dstmpte); 2447 if (dstmpte->wire_count >= srcmpte->wire_count) 2448 break; 2449 } 2450 addr += PAGE_SIZE; 2451 src_pte++; 2452 } 2453 } 2454 vm_page_unlock_queues(); 2455 PMAP_UNLOCK(src_pmap); 2456 PMAP_UNLOCK(dst_pmap); 2457} 2458 2459/* 2460 * pmap_zero_page zeros the specified hardware page by mapping 2461 * the page into KVM and using bzero to clear its contents. 2462 */ 2463void 2464pmap_zero_page(vm_page_t m) 2465{ 2466 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 2467 2468 pagezero((void *)va); 2469} 2470 2471/* 2472 * pmap_zero_page_area zeros the specified hardware page by mapping 2473 * the page into KVM and using bzero to clear its contents. 2474 * 2475 * off and size may not cover an area beyond a single hardware page. 2476 */ 2477void 2478pmap_zero_page_area(vm_page_t m, int off, int size) 2479{ 2480 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 2481 2482 if (off == 0 && size == PAGE_SIZE) 2483 pagezero((void *)va); 2484 else 2485 bzero((char *)va + off, size); 2486} 2487 2488/* 2489 * pmap_zero_page_idle zeros the specified hardware page by mapping 2490 * the page into KVM and using bzero to clear its contents. This 2491 * is intended to be called from the vm_pagezero process only and 2492 * outside of Giant. 2493 */ 2494void 2495pmap_zero_page_idle(vm_page_t m) 2496{ 2497 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 2498 2499 pagezero((void *)va); 2500} 2501 2502/* 2503 * pmap_copy_page copies the specified (machine independent) 2504 * page by mapping the page into virtual memory and using 2505 * bcopy to copy the page, one machine dependent page at a 2506 * time. 2507 */ 2508void 2509pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 2510{ 2511 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 2512 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 2513 2514 pagecopy((void *)src, (void *)dst); 2515} 2516 2517/* 2518 * Returns true if the pmap's pv is one of the first 2519 * 16 pvs linked to from this page. This count may 2520 * be changed upwards or downwards in the future; it 2521 * is only necessary that true be returned for a small 2522 * subset of pmaps for proper page aging. 2523 */ 2524boolean_t 2525pmap_page_exists_quick(pmap, m) 2526 pmap_t pmap; 2527 vm_page_t m; 2528{ 2529 pv_entry_t pv; 2530 int loops = 0; 2531 2532 if (m->flags & PG_FICTITIOUS) 2533 return FALSE; 2534 2535 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2536 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 2537 if (pv->pv_pmap == pmap) { 2538 return TRUE; 2539 } 2540 loops++; 2541 if (loops >= 16) 2542 break; 2543 } 2544 return (FALSE); 2545} 2546 2547#define PMAP_REMOVE_PAGES_CURPROC_ONLY 2548/* 2549 * Remove all pages from specified address space 2550 * this aids process exit speeds. Also, this code 2551 * is special cased for current process only, but 2552 * can have the more generic (and slightly slower) 2553 * mode enabled. 
This is much faster than pmap_remove 2554 * in the case of running down an entire address space. 2555 */ 2556void 2557pmap_remove_pages(pmap, sva, eva) 2558 pmap_t pmap; 2559 vm_offset_t sva, eva; 2560{ 2561 pt_entry_t *pte, tpte; 2562 vm_page_t m; 2563 pv_entry_t pv, npv; 2564 2565#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY 2566 if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { 2567 printf("warning: pmap_remove_pages called with non-current pmap\n"); 2568 return; 2569 } 2570#endif 2571 vm_page_lock_queues(); 2572 PMAP_LOCK(pmap); 2573 for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { 2574 2575 if (pv->pv_va >= eva || pv->pv_va < sva) { 2576 npv = TAILQ_NEXT(pv, pv_plist); 2577 continue; 2578 } 2579 2580#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY 2581 pte = vtopte(pv->pv_va); 2582#else 2583 pte = pmap_pte(pmap, pv->pv_va); 2584#endif 2585 tpte = *pte; 2586 2587 if (tpte == 0) { 2588 printf("TPTE at %p IS ZERO @ VA %08lx\n", 2589 pte, pv->pv_va); 2590 panic("bad pte"); 2591 } 2592 2593/* 2594 * We cannot remove wired pages from a process' mapping at this time 2595 */ 2596 if (tpte & PG_W) { 2597 npv = TAILQ_NEXT(pv, pv_plist); 2598 continue; 2599 } 2600 2601 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 2602 KASSERT(m->phys_addr == (tpte & PG_FRAME), 2603 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 2604 m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); 2605 2606 KASSERT(m < &vm_page_array[vm_page_array_size], 2607 ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); 2608 2609 pmap->pm_stats.resident_count--; 2610 2611 pte_clear(pte); 2612 2613 /* 2614 * Update the vm_page_t clean and reference bits. 2615 */ 2616 if (tpte & PG_M) { 2617 vm_page_dirty(m); 2618 } 2619 2620 npv = TAILQ_NEXT(pv, pv_plist); 2621 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); 2622 2623 m->md.pv_list_count--; 2624 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2625 if (TAILQ_EMPTY(&m->md.pv_list)) 2626 vm_page_flag_clear(m, PG_WRITEABLE); 2627 2628 pmap_unuse_pt(pmap, pv->pv_va, *vtopde(pv->pv_va)); 2629 free_pv_entry(pv); 2630 } 2631 pmap_invalidate_all(pmap); 2632 PMAP_UNLOCK(pmap); 2633 vm_page_unlock_queues(); 2634} 2635 2636/* 2637 * pmap_is_modified: 2638 * 2639 * Return whether or not the specified physical page was modified 2640 * in any physical maps. 2641 */ 2642boolean_t 2643pmap_is_modified(vm_page_t m) 2644{ 2645 pv_entry_t pv; 2646 pt_entry_t *pte; 2647 boolean_t rv; 2648 2649 rv = FALSE; 2650 if (m->flags & PG_FICTITIOUS) 2651 return (rv); 2652 2653 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2654 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 2655 /* 2656 * if the bit being tested is the modified bit, then 2657 * mark clean_map and ptes as never 2658 * modified. 2659 */ 2660 if (!pmap_track_modified(pv->pv_va)) 2661 continue; 2662 PMAP_LOCK(pv->pv_pmap); 2663 pte = pmap_pte(pv->pv_pmap, pv->pv_va); 2664 rv = (*pte & PG_M) != 0; 2665 PMAP_UNLOCK(pv->pv_pmap); 2666 if (rv) 2667 break; 2668 } 2669 return (rv); 2670} 2671 2672/* 2673 * pmap_is_prefaultable: 2674 * 2675 * Return whether or not the specified virtual address is elgible 2676 * for prefault. 2677 */ 2678boolean_t 2679pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 2680{ 2681 pd_entry_t *pde; 2682 pt_entry_t *pte; 2683 boolean_t rv; 2684 2685 rv = FALSE; 2686 PMAP_LOCK(pmap); 2687 pde = pmap_pde(pmap, addr); 2688 if (pde != NULL && (*pde & PG_V)) { 2689 pte = vtopte(addr); 2690 rv = (*pte & PG_V) == 0; 2691 } 2692 PMAP_UNLOCK(pmap); 2693 return (rv); 2694} 2695 2696/* 2697 * Clear the given bit in each of the given page's ptes. 
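 *
 * In practice the bit is PG_M (via pmap_clear_modify()), PG_A (via
 * pmap_clear_reference()) or PG_RW (via pmap_page_protect()); clearing
 * PG_RW also clears PG_M and write-protects the page in every pmap
 * that currently maps it.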
2698 */ 2699static __inline void 2700pmap_clear_ptes(vm_page_t m, long bit) 2701{ 2702 register pv_entry_t pv; 2703 pt_entry_t pbits, *pte; 2704 2705 if ((m->flags & PG_FICTITIOUS) || 2706 (bit == PG_RW && (m->flags & PG_WRITEABLE) == 0)) 2707 return; 2708 2709 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2710 /* 2711 * Loop over all current mappings setting/clearing as appropos If 2712 * setting RO do we need to clear the VAC? 2713 */ 2714 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 2715 /* 2716 * don't write protect pager mappings 2717 */ 2718 if (bit == PG_RW) { 2719 if (!pmap_track_modified(pv->pv_va)) 2720 continue; 2721 } 2722 2723 PMAP_LOCK(pv->pv_pmap); 2724 pte = pmap_pte(pv->pv_pmap, pv->pv_va); 2725retry: 2726 pbits = *pte; 2727 if (pbits & bit) { 2728 if (bit == PG_RW) { 2729 if (!atomic_cmpset_long(pte, pbits, 2730 pbits & ~(PG_RW | PG_M))) 2731 goto retry; 2732 if (pbits & PG_M) { 2733 vm_page_dirty(m); 2734 } 2735 } else { 2736 atomic_clear_long(pte, bit); 2737 } 2738 pmap_invalidate_page(pv->pv_pmap, pv->pv_va); 2739 } 2740 PMAP_UNLOCK(pv->pv_pmap); 2741 } 2742 if (bit == PG_RW) 2743 vm_page_flag_clear(m, PG_WRITEABLE); 2744} 2745 2746/* 2747 * pmap_page_protect: 2748 * 2749 * Lower the permission for all mappings to a given page. 2750 */ 2751void 2752pmap_page_protect(vm_page_t m, vm_prot_t prot) 2753{ 2754 if ((prot & VM_PROT_WRITE) == 0) { 2755 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { 2756 pmap_clear_ptes(m, PG_RW); 2757 } else { 2758 pmap_remove_all(m); 2759 } 2760 } 2761} 2762 2763/* 2764 * pmap_ts_referenced: 2765 * 2766 * Return a count of reference bits for a page, clearing those bits. 2767 * It is not necessary for every reference bit to be cleared, but it 2768 * is necessary that 0 only be returned when there are truly no 2769 * reference bits set. 2770 * 2771 * XXX: The exact number of bits to check and clear is a matter that 2772 * should be tested and standardized at some point in the future for 2773 * optimal aging of shared pages. 2774 */ 2775int 2776pmap_ts_referenced(vm_page_t m) 2777{ 2778 register pv_entry_t pv, pvf, pvn; 2779 pt_entry_t *pte; 2780 pt_entry_t v; 2781 int rtval = 0; 2782 2783 if (m->flags & PG_FICTITIOUS) 2784 return (rtval); 2785 2786 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2787 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2788 2789 pvf = pv; 2790 2791 do { 2792 pvn = TAILQ_NEXT(pv, pv_list); 2793 2794 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2795 2796 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2797 2798 if (!pmap_track_modified(pv->pv_va)) 2799 continue; 2800 2801 PMAP_LOCK(pv->pv_pmap); 2802 pte = pmap_pte(pv->pv_pmap, pv->pv_va); 2803 2804 if (pte && ((v = pte_load(pte)) & PG_A) != 0) { 2805 atomic_clear_long(pte, PG_A); 2806 pmap_invalidate_page(pv->pv_pmap, pv->pv_va); 2807 2808 rtval++; 2809 if (rtval > 4) { 2810 PMAP_UNLOCK(pv->pv_pmap); 2811 break; 2812 } 2813 } 2814 PMAP_UNLOCK(pv->pv_pmap); 2815 } while ((pv = pvn) != NULL && pv != pvf); 2816 } 2817 2818 return (rtval); 2819} 2820 2821/* 2822 * Clear the modify bits on the specified physical page. 2823 */ 2824void 2825pmap_clear_modify(vm_page_t m) 2826{ 2827 pmap_clear_ptes(m, PG_M); 2828} 2829 2830/* 2831 * pmap_clear_reference: 2832 * 2833 * Clear the reference bit on the specified physical page. 2834 */ 2835void 2836pmap_clear_reference(vm_page_t m) 2837{ 2838 pmap_clear_ptes(m, PG_A); 2839} 2840 2841/* 2842 * Miscellaneous support routines follow 2843 */ 2844 2845/* 2846 * Map a set of physical memory pages into the kernel virtual 2847 * address space. 
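 * Physical addresses that fall below dmaplimit are served out of the
 * direct map and consume no new kernel virtual address space; anything
 * else gets a freshly allocated KVA range.  A minimal usage sketch
 * (illustrative only):
 *
 *	va = pmap_mapdev(pa, len);
 *	... access the device registers through va ...
 *	pmap_unmapdev((vm_offset_t)va, len);
 *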
Return a pointer to where it is mapped. This 2848 * routine is intended to be used for mapping device memory, 2849 * NOT real memory. 2850 */ 2851void * 2852pmap_mapdev(pa, size) 2853 vm_paddr_t pa; 2854 vm_size_t size; 2855{ 2856 vm_offset_t va, tmpva, offset; 2857 2858 /* If this fits within the direct map window, use it */ 2859 if (pa < dmaplimit && (pa + size) < dmaplimit) 2860 return ((void *)PHYS_TO_DMAP(pa)); 2861 offset = pa & PAGE_MASK; 2862 size = roundup(offset + size, PAGE_SIZE); 2863 va = kmem_alloc_nofault(kernel_map, size); 2864 if (!va) 2865 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 2866 pa = trunc_page(pa); 2867 for (tmpva = va; size > 0; ) { 2868 pmap_kenter(tmpva, pa); 2869 size -= PAGE_SIZE; 2870 tmpva += PAGE_SIZE; 2871 pa += PAGE_SIZE; 2872 } 2873 pmap_invalidate_range(kernel_pmap, va, tmpva); 2874 return ((void *)(va + offset)); 2875} 2876 2877void 2878pmap_unmapdev(va, size) 2879 vm_offset_t va; 2880 vm_size_t size; 2881{ 2882 vm_offset_t base, offset, tmpva; 2883 2884 /* If we gave a direct map region in pmap_mapdev, do nothing */ 2885 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 2886 return; 2887 base = trunc_page(va); 2888 offset = va & PAGE_MASK; 2889 size = roundup(offset + size, PAGE_SIZE); 2890 for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) 2891 pmap_kremove(tmpva); 2892 pmap_invalidate_range(kernel_pmap, va, tmpva); 2893 kmem_free(kernel_map, base, size); 2894} 2895 2896/* 2897 * perform the pmap work for mincore 2898 */ 2899int 2900pmap_mincore(pmap, addr) 2901 pmap_t pmap; 2902 vm_offset_t addr; 2903{ 2904 pt_entry_t *ptep, pte; 2905 vm_page_t m; 2906 int val = 0; 2907 2908 PMAP_LOCK(pmap); 2909 ptep = pmap_pte(pmap, addr); 2910 pte = (ptep != NULL) ? *ptep : 0; 2911 PMAP_UNLOCK(pmap); 2912 2913 if (pte != 0) { 2914 vm_paddr_t pa; 2915 2916 val = MINCORE_INCORE; 2917 if ((pte & PG_MANAGED) == 0) 2918 return val; 2919 2920 pa = pte & PG_FRAME; 2921 2922 m = PHYS_TO_VM_PAGE(pa); 2923 2924 /* 2925 * Modified by us 2926 */ 2927 if (pte & PG_M) 2928 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; 2929 else { 2930 /* 2931 * Modified by someone else 2932 */ 2933 vm_page_lock_queues(); 2934 if (m->dirty || pmap_is_modified(m)) 2935 val |= MINCORE_MODIFIED_OTHER; 2936 vm_page_unlock_queues(); 2937 } 2938 /* 2939 * Referenced by us 2940 */ 2941 if (pte & PG_A) 2942 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; 2943 else { 2944 /* 2945 * Referenced by someone else 2946 */ 2947 vm_page_lock_queues(); 2948 if ((m->flags & PG_REFERENCED) || 2949 pmap_ts_referenced(m)) { 2950 val |= MINCORE_REFERENCED_OTHER; 2951 vm_page_flag_set(m, PG_REFERENCED); 2952 } 2953 vm_page_unlock_queues(); 2954 } 2955 } 2956 return val; 2957} 2958 2959void 2960pmap_activate(struct thread *td) 2961{ 2962 struct proc *p = td->td_proc; 2963 pmap_t pmap, oldpmap; 2964 u_int64_t cr3; 2965 2966 critical_enter(); 2967 pmap = vmspace_pmap(td->td_proc->p_vmspace); 2968 oldpmap = PCPU_GET(curpmap); 2969#ifdef SMP 2970if (oldpmap) /* XXX FIXME */ 2971 atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask)); 2972 atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); 2973#else 2974if (oldpmap) /* XXX FIXME */ 2975 oldpmap->pm_active &= ~PCPU_GET(cpumask); 2976 pmap->pm_active |= PCPU_GET(cpumask); 2977#endif 2978 cr3 = vtophys(pmap->pm_pml4); 2979 /* XXXKSE this is wrong. 2980 * pmap_activate is for the current thread on the current cpu 2981 */ 2982 if (p->p_flag & P_SA) { 2983 /* Make sure all other cr3 entries are updated. 
*/ 2984 /* what if they are running? XXXKSE (maybe abort them) */ 2985 FOREACH_THREAD_IN_PROC(p, td) { 2986 td->td_pcb->pcb_cr3 = cr3; 2987 } 2988 } else { 2989 td->td_pcb->pcb_cr3 = cr3; 2990 } 2991 load_cr3(cr3); 2992 critical_exit(); 2993} 2994 2995vm_offset_t 2996pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) 2997{ 2998 2999 if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { 3000 return addr; 3001 } 3002 3003 addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); 3004 return addr; 3005} 3006
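
/*
 * For illustration (assuming NBPDR is the 2MB superpage size used
 * throughout this file): pmap_addr_hint() rounds a device-object
 * mapping hint such as 0x200001 up to the next 2MB boundary,
 *
 *	(0x200001 + (NBPDR - 1)) & ~(NBPDR - 1) == 0x400000,
 *
 * so that a later pmap_object_init_pt() call can install PG_PS
 * superpage mappings for the object.
 */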