pmap.c revision 177851
1/*- 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * Copyright (c) 2003 Peter Wemm 9 * All rights reserved. 10 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu> 11 * All rights reserved. 12 * 13 * This code is derived from software contributed to Berkeley by 14 * the Systems Programming Group of the University of Utah Computer 15 * Science Department and William Jolitz of UUNET Technologies Inc. 16 * 17 * Redistribution and use in source and binary forms, with or without 18 * modification, are permitted provided that the following conditions 19 * are met: 20 * 1. Redistributions of source code must retain the above copyright 21 * notice, this list of conditions and the following disclaimer. 22 * 2. Redistributions in binary form must reproduce the above copyright 23 * notice, this list of conditions and the following disclaimer in the 24 * documentation and/or other materials provided with the distribution. 25 * 3. All advertising materials mentioning features or use of this software 26 * must display the following acknowledgement: 27 * This product includes software developed by the University of 28 * California, Berkeley and its contributors. 29 * 4. Neither the name of the University nor the names of its contributors 30 * may be used to endorse or promote products derived from this software 31 * without specific prior written permission. 32 * 33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 36 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 43 * SUCH DAMAGE. 44 * 45 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 46 */ 47/*- 48 * Copyright (c) 2003 Networks Associates Technology, Inc. 49 * All rights reserved. 50 * 51 * This software was developed for the FreeBSD Project by Jake Burkholder, 52 * Safeport Network Services, and Network Associates Laboratories, the 53 * Security Research Division of Network Associates, Inc. under 54 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 55 * CHATS research program. 56 * 57 * Redistribution and use in source and binary forms, with or without 58 * modification, are permitted provided that the following conditions 59 * are met: 60 * 1. Redistributions of source code must retain the above copyright 61 * notice, this list of conditions and the following disclaimer. 62 * 2. Redistributions in binary form must reproduce the above copyright 63 * notice, this list of conditions and the following disclaimer in the 64 * documentation and/or other materials provided with the distribution. 
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/amd64/amd64/pmap.c 177851 2008-04-02 04:39:47Z alc $");

/*
 * Manages physical address maps.
 *
 * In addition to hardware address maps, this
 * module is called upon to provide software-use-only
 * maps which may or may not be stored in the same
 * form as hardware maps. These pseudo-maps are
 * used to store intermediate results from copy
 * operations to and from address spaces.
 *
 * Since the information managed by this module is
 * also stored by the logical address mapping module,
 * this module may throw away valid virtual-to-physical
 * mappings at almost any time. However, invalidations
 * of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which
 * make virtual-to-physical map invalidates expensive,
 * this module may delay invalidate or reduced protection
 * operations until such time as they are actually
 * necessary. This module is given full information as
 * to which processors are currently using which maps,
 * and to when physical maps must be made correct.
 */

#include "opt_msgbuf.h"
#include "opt_pmap.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sx.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#ifdef SMP
#include <sys/smp.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>

#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#ifdef SMP
#include <machine/smp.h>
#endif

#ifndef PMAP_SHPGPERPROC
#define PMAP_SHPGPERPROC 200
#endif

#if !defined(DIAGNOSTIC)
#define PMAP_INLINE __gnu89_inline
#else
#define PMAP_INLINE
#endif

#define PV_STATS
#ifdef PV_STATS
#define PV_STAT(x) do { x ; } while (0)
#else
#define PV_STAT(x) do { } while (0)
#endif

#define pa_index(pa) ((pa) >> PDRSHIFT)
#define pa_to_pvh(pa) (&pv_table[pa_index(pa)])

struct pmap kernel_pmap_store;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */

static int nkpt;
static int ndmpdp;
static vm_paddr_t dmaplimit;
vm_offset_t kernel_vm_end;
pt_entry_t pg_nx;

SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");

static int pg_ps_enabled;
SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RD, &pg_ps_enabled, 0,
    "Are large page mappings enabled?");

static u_int64_t KPTphys;	/* phys addr of kernel level 1 */
static u_int64_t KPDphys;	/* phys addr of kernel level 2 */
u_int64_t KPDPphys;		/* phys addr of kernel level 3 */
u_int64_t KPML4phys;		/* phys addr of kernel level 4 */

static u_int64_t DMPDphys;	/* phys addr of direct mapped level 2 */
static u_int64_t DMPDPphys;	/* phys addr of direct mapped level 3 */

/*
 * Data for the pv entry allocation mechanism
 */
static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
static struct md_page *pv_table;
static int shpgperproc = PMAP_SHPGPERPROC;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = 0;
caddr_t CADDR1 = 0;
struct msgbuf *msgbufp = 0;

/*
 * Crashdump maps.
212 */ 213static caddr_t crashdumpmap; 214 215static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 216static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try); 217static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); 218static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_page_t m); 219static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); 220static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 221static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 222 vm_offset_t va); 223 224static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 225static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, 226 vm_prot_t prot); 227static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 228 vm_page_t m, vm_prot_t prot, vm_page_t mpte); 229static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); 230static boolean_t pmap_is_modified_pvh(struct md_page *pvh); 231static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va); 232static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 233static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, 234 vm_prot_t prot); 235static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 236 vm_page_t *free); 237static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, 238 vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free); 239static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte); 240static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 241 vm_page_t *free); 242static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, 243 vm_offset_t va); 244static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); 245static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 246 vm_page_t m); 247 248static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags); 249static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags); 250 251static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags); 252static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, 253 vm_page_t* free); 254static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *); 255static vm_offset_t pmap_kmem_choose(vm_offset_t addr); 256 257CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); 258CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); 259 260/* 261 * Move the kernel virtual free pointer to the next 262 * 2MB. 
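 * (For example, with NBPDR == 2MB, an address such as 0xffffffff80311000
 * would be rounded up to 0xffffffff80400000.)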
This is used to help improve performance 263 * by using a large (2MB) page for much of the kernel 264 * (.text, .data, .bss) 265 */ 266static vm_offset_t 267pmap_kmem_choose(vm_offset_t addr) 268{ 269 vm_offset_t newaddr = addr; 270 271 newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); 272 return newaddr; 273} 274 275/********************/ 276/* Inline functions */ 277/********************/ 278 279/* Return a non-clipped PD index for a given VA */ 280static __inline vm_pindex_t 281pmap_pde_pindex(vm_offset_t va) 282{ 283 return va >> PDRSHIFT; 284} 285 286 287/* Return various clipped indexes for a given VA */ 288static __inline vm_pindex_t 289pmap_pte_index(vm_offset_t va) 290{ 291 292 return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); 293} 294 295static __inline vm_pindex_t 296pmap_pde_index(vm_offset_t va) 297{ 298 299 return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); 300} 301 302static __inline vm_pindex_t 303pmap_pdpe_index(vm_offset_t va) 304{ 305 306 return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); 307} 308 309static __inline vm_pindex_t 310pmap_pml4e_index(vm_offset_t va) 311{ 312 313 return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); 314} 315 316/* Return a pointer to the PML4 slot that corresponds to a VA */ 317static __inline pml4_entry_t * 318pmap_pml4e(pmap_t pmap, vm_offset_t va) 319{ 320 321 return (&pmap->pm_pml4[pmap_pml4e_index(va)]); 322} 323 324/* Return a pointer to the PDP slot that corresponds to a VA */ 325static __inline pdp_entry_t * 326pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) 327{ 328 pdp_entry_t *pdpe; 329 330 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); 331 return (&pdpe[pmap_pdpe_index(va)]); 332} 333 334/* Return a pointer to the PDP slot that corresponds to a VA */ 335static __inline pdp_entry_t * 336pmap_pdpe(pmap_t pmap, vm_offset_t va) 337{ 338 pml4_entry_t *pml4e; 339 340 pml4e = pmap_pml4e(pmap, va); 341 if ((*pml4e & PG_V) == 0) 342 return NULL; 343 return (pmap_pml4e_to_pdpe(pml4e, va)); 344} 345 346/* Return a pointer to the PD slot that corresponds to a VA */ 347static __inline pd_entry_t * 348pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) 349{ 350 pd_entry_t *pde; 351 352 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); 353 return (&pde[pmap_pde_index(va)]); 354} 355 356/* Return a pointer to the PD slot that corresponds to a VA */ 357static __inline pd_entry_t * 358pmap_pde(pmap_t pmap, vm_offset_t va) 359{ 360 pdp_entry_t *pdpe; 361 362 pdpe = pmap_pdpe(pmap, va); 363 if (pdpe == NULL || (*pdpe & PG_V) == 0) 364 return NULL; 365 return (pmap_pdpe_to_pde(pdpe, va)); 366} 367 368/* Return a pointer to the PT slot that corresponds to a VA */ 369static __inline pt_entry_t * 370pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) 371{ 372 pt_entry_t *pte; 373 374 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 375 return (&pte[pmap_pte_index(va)]); 376} 377 378/* Return a pointer to the PT slot that corresponds to a VA */ 379static __inline pt_entry_t * 380pmap_pte(pmap_t pmap, vm_offset_t va) 381{ 382 pd_entry_t *pde; 383 384 pde = pmap_pde(pmap, va); 385 if (pde == NULL || (*pde & PG_V) == 0) 386 return NULL; 387 if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ 388 return ((pt_entry_t *)pde); 389 return (pmap_pde_to_pte(pde, va)); 390} 391 392 393PMAP_INLINE pt_entry_t * 394vtopte(vm_offset_t va) 395{ 396 u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 397 398 return (PTmap + ((va >> PAGE_SHIFT) & mask)); 399} 400 401static __inline 
pd_entry_t * 402vtopde(vm_offset_t va) 403{ 404 u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 405 406 return (PDmap + ((va >> PDRSHIFT) & mask)); 407} 408 409static u_int64_t 410allocpages(vm_paddr_t *firstaddr, int n) 411{ 412 u_int64_t ret; 413 414 ret = *firstaddr; 415 bzero((void *)ret, n * PAGE_SIZE); 416 *firstaddr += n * PAGE_SIZE; 417 return (ret); 418} 419 420static void 421create_pagetables(vm_paddr_t *firstaddr) 422{ 423 int i; 424 425 /* Allocate pages */ 426 KPTphys = allocpages(firstaddr, NKPT); 427 KPML4phys = allocpages(firstaddr, 1); 428 KPDPphys = allocpages(firstaddr, NKPML4E); 429 KPDphys = allocpages(firstaddr, NKPDPE); 430 431 ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT; 432 if (ndmpdp < 4) /* Minimum 4GB of dirmap */ 433 ndmpdp = 4; 434 DMPDPphys = allocpages(firstaddr, NDMPML4E); 435 if ((amd_feature & AMDID_PAGE1GB) == 0) 436 DMPDphys = allocpages(firstaddr, ndmpdp); 437 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; 438 439 /* Fill in the underlying page table pages */ 440 /* Read-only from zero to physfree */ 441 /* XXX not fully used, underneath 2M pages */ 442 for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) { 443 ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT; 444 ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G; 445 } 446 447 /* Now map the page tables at their location within PTmap */ 448 for (i = 0; i < NKPT; i++) { 449 ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT); 450 ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V; 451 } 452 453 /* Map from zero to end of allocations under 2M pages */ 454 /* This replaces some of the KPTphys entries above */ 455 for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) { 456 ((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT; 457 ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G; 458 } 459 460 /* And connect up the PD to the PDP */ 461 for (i = 0; i < NKPDPE; i++) { 462 ((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys + 463 (i << PAGE_SHIFT); 464 ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U; 465 } 466 467 /* Now set up the direct map space using either 2MB or 1GB pages */ 468 if ((amd_feature & AMDID_PAGE1GB) == 0) { 469 for (i = 0; i < NPDEPG * ndmpdp; i++) { 470 ((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT; 471 ((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS | 472 PG_G; 473 } 474 /* And the direct map space's PDP */ 475 for (i = 0; i < ndmpdp; i++) { 476 ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + 477 (i << PAGE_SHIFT); 478 ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U; 479 } 480 } else { 481 for (i = 0; i < ndmpdp; i++) { 482 ((pdp_entry_t *)DMPDPphys)[i] = 483 (vm_paddr_t)i << PDPSHIFT; 484 ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS | 485 PG_G; 486 } 487 } 488 489 /* And recursively map PML4 to itself in order to get PTmap */ 490 ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys; 491 ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U; 492 493 /* Connect the Direct Map slot up to the PML4 */ 494 ((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys; 495 ((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U; 496 497 /* Connect the KVA slot up to the PML4 */ 498 ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys; 499 ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U; 500} 501 502/* 503 * Bootstrap the system enough to run with virtual memory. 504 * 505 * On amd64 this is called after mapping has already been enabled 506 * and just syncs the pmap module with what has already been done. 
507 * [We can't call it easily with mapping off since the kernel is not 508 * mapped with PA == VA, hence we would have to relocate every address 509 * from the linked base (virtual) address "KERNBASE" to the actual 510 * (physical) address starting relative to 0] 511 */ 512void 513pmap_bootstrap(vm_paddr_t *firstaddr) 514{ 515 vm_offset_t va; 516 pt_entry_t *pte, *unused; 517 518 /* 519 * Create an initial set of page tables to run the kernel in. 520 */ 521 create_pagetables(firstaddr); 522 523 virtual_avail = (vm_offset_t) KERNBASE + *firstaddr; 524 virtual_avail = pmap_kmem_choose(virtual_avail); 525 526 virtual_end = VM_MAX_KERNEL_ADDRESS; 527 528 529 /* XXX do %cr0 as well */ 530 load_cr4(rcr4() | CR4_PGE | CR4_PSE); 531 load_cr3(KPML4phys); 532 533 /* 534 * Initialize the kernel pmap (which is statically allocated). 535 */ 536 PMAP_LOCK_INIT(kernel_pmap); 537 kernel_pmap->pm_pml4 = (pdp_entry_t *) (KERNBASE + KPML4phys); 538 kernel_pmap->pm_root = NULL; 539 kernel_pmap->pm_active = -1; /* don't allow deactivation */ 540 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 541 nkpt = NKPT; 542 543 /* 544 * Reserve some special page table entries/VA space for temporary 545 * mapping of pages. 546 */ 547#define SYSMAP(c, p, v, n) \ 548 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 549 550 va = virtual_avail; 551 pte = vtopte(va); 552 553 /* 554 * CMAP1 is only used for the memory test. 555 */ 556 SYSMAP(caddr_t, CMAP1, CADDR1, 1) 557 558 /* 559 * Crashdump maps. 560 */ 561 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) 562 563 /* 564 * msgbufp is used to map the system message buffer. 565 */ 566 SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE))) 567 568 virtual_avail = va; 569 570 *CMAP1 = 0; 571 572 invltlb(); 573 574 /* Initialize the PAT MSR. */ 575 pmap_init_pat(); 576} 577 578/* 579 * Setup the PAT MSR. 580 */ 581void 582pmap_init_pat(void) 583{ 584 uint64_t pat_msr; 585 586 /* Bail if this CPU doesn't implement PAT. */ 587 if (!(cpu_feature & CPUID_PAT)) 588 panic("no PAT??"); 589 590#ifdef PAT_WORKS 591 /* 592 * Leave the indices 0-3 at the default of WB, WT, UC, and UC-. 593 * Program 4 and 5 as WP and WC. 594 * Leave 6 and 7 as UC and UC-. 595 */ 596 pat_msr = rdmsr(MSR_PAT); 597 pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5)); 598 pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) | 599 PAT_VALUE(5, PAT_WRITE_COMBINING); 600#else 601 /* 602 * Due to some Intel errata, we can only safely use the lower 4 603 * PAT entries. Thus, just replace PAT Index 2 with WC instead 604 * of UC-. 605 * 606 * Intel Pentium III Processor Specification Update 607 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B 608 * or Mode C Paging) 609 * 610 * Intel Pentium IV Processor Specification Update 611 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) 612 */ 613 pat_msr = rdmsr(MSR_PAT); 614 pat_msr &= ~PAT_MASK(2); 615 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); 616#endif 617 wrmsr(MSR_PAT, pat_msr); 618} 619 620/* 621 * Initialize a vm_page's machine-dependent fields. 622 */ 623void 624pmap_page_init(vm_page_t m) 625{ 626 627 TAILQ_INIT(&m->md.pv_list); 628} 629 630/* 631 * Initialize the pmap module. 632 * Called by vm_init, to initialize any structures that the pmap 633 * system needs to map virtual memory. 634 */ 635void 636pmap_init(void) 637{ 638 pd_entry_t *pd; 639 vm_page_t mpte; 640 vm_size_t s; 641 int i, pv_npg; 642 643 /* 644 * Initialize the vm page array entries for the kernel pmap's 645 * page table pages. 
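 * These page table pages were allocated by create_pagetables() before the
 * VM system was initialized, so their vm_page structures never went
 * through vm_page_alloc(); the pindex and phys_addr fields are therefore
 * initialized here by hand.  PDEs that already map 2MB pages are skipped,
 * since they have no page table page.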
646 */ 647 pd = pmap_pde(kernel_pmap, VM_MIN_KERNEL_ADDRESS); 648 for (i = 0; i < nkpt; i++) { 649 if ((pd[i] & (PG_PS | PG_V)) == (PG_PS | PG_V)) 650 continue; 651 mpte = PHYS_TO_VM_PAGE(pd[i] & PG_FRAME); 652 KASSERT(mpte >= vm_page_array && 653 mpte < &vm_page_array[vm_page_array_size], 654 ("pmap_init: page table page is out of range")); 655 mpte->pindex = pmap_pde_pindex(VM_MIN_KERNEL_ADDRESS) + i; 656 mpte->phys_addr = pd[i] & PG_FRAME; 657 } 658 659 /* 660 * Initialize the address space (zone) for the pv entries. Set a 661 * high water mark so that the system can recover from excessive 662 * numbers of pv entries. 663 */ 664 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 665 pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; 666 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 667 pv_entry_high_water = 9 * (pv_entry_max / 10); 668 669 /* 670 * Are large page mappings enabled? 671 */ 672 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); 673 674 /* 675 * Calculate the size of the pv head table for superpages. 676 */ 677 for (i = 0; phys_avail[i + 1]; i += 2); 678 pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR; 679 680 /* 681 * Allocate memory for the pv head table for superpages. 682 */ 683 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 684 s = round_page(s); 685 pv_table = (struct md_page *)kmem_alloc(kernel_map, s); 686 for (i = 0; i < pv_npg; i++) 687 TAILQ_INIT(&pv_table[i].pv_list); 688} 689 690static int 691pmap_pventry_proc(SYSCTL_HANDLER_ARGS) 692{ 693 int error; 694 695 error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); 696 if (error == 0 && req->newptr) { 697 shpgperproc = (pv_entry_max - cnt.v_page_count) / maxproc; 698 pv_entry_high_water = 9 * (pv_entry_max / 10); 699 } 700 return (error); 701} 702SYSCTL_PROC(_vm_pmap, OID_AUTO, pv_entry_max, CTLTYPE_INT|CTLFLAG_RW, 703 &pv_entry_max, 0, pmap_pventry_proc, "IU", "Max number of PV entries"); 704 705static int 706pmap_shpgperproc_proc(SYSCTL_HANDLER_ARGS) 707{ 708 int error; 709 710 error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); 711 if (error == 0 && req->newptr) { 712 pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; 713 pv_entry_high_water = 9 * (pv_entry_max / 10); 714 } 715 return (error); 716} 717SYSCTL_PROC(_vm_pmap, OID_AUTO, shpgperproc, CTLTYPE_INT|CTLFLAG_RW, 718 &shpgperproc, 0, pmap_shpgperproc_proc, "IU", "Page share factor per proc"); 719 720SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, 721 "2MB page mapping counters"); 722 723static u_long pmap_pde_demotions; 724SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, 725 &pmap_pde_demotions, 0, "2MB page demotions"); 726 727static u_long pmap_pde_mappings; 728SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, 729 &pmap_pde_mappings, 0, "2MB page mappings"); 730 731static u_long pmap_pde_p_failures; 732SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, 733 &pmap_pde_p_failures, 0, "2MB page promotion failures"); 734 735static u_long pmap_pde_promotions; 736SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, 737 &pmap_pde_promotions, 0, "2MB page promotions"); 738 739 740/*************************************************** 741 * Low level helper routines..... 742 ***************************************************/ 743 744/* 745 * Determine the appropriate bits to set in a PTE or PDE for a specified 746 * caching mode. 
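 *
 * The 3-bit PAT index produced below is encoded into the entry as the
 * PAT flag (bit 2, PG_PDE_PAT or PG_PTE_PAT), PG_NC_PCD (bit 1), and
 * PG_NC_PWT (bit 0).  For example, with the default table (PAT_WORKS not
 * defined), PAT_WRITE_COMBINING maps to index 2 and so sets only
 * PG_NC_PCD, while PAT_WRITE_THROUGH maps to index 1 and sets only
 * PG_NC_PWT.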
747 */ 748static int 749pmap_cache_bits(int mode, boolean_t is_pde) 750{ 751 int pat_flag, pat_index, cache_bits; 752 753 /* The PAT bit is different for PTE's and PDE's. */ 754 pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; 755 756 /* If we don't support PAT, map extended modes to older ones. */ 757 if (!(cpu_feature & CPUID_PAT)) { 758 switch (mode) { 759 case PAT_UNCACHEABLE: 760 case PAT_WRITE_THROUGH: 761 case PAT_WRITE_BACK: 762 break; 763 case PAT_UNCACHED: 764 case PAT_WRITE_COMBINING: 765 case PAT_WRITE_PROTECTED: 766 mode = PAT_UNCACHEABLE; 767 break; 768 } 769 } 770 771 /* Map the caching mode to a PAT index. */ 772 switch (mode) { 773#ifdef PAT_WORKS 774 case PAT_UNCACHEABLE: 775 pat_index = 3; 776 break; 777 case PAT_WRITE_THROUGH: 778 pat_index = 1; 779 break; 780 case PAT_WRITE_BACK: 781 pat_index = 0; 782 break; 783 case PAT_UNCACHED: 784 pat_index = 2; 785 break; 786 case PAT_WRITE_COMBINING: 787 pat_index = 5; 788 break; 789 case PAT_WRITE_PROTECTED: 790 pat_index = 4; 791 break; 792#else 793 case PAT_UNCACHED: 794 case PAT_UNCACHEABLE: 795 case PAT_WRITE_PROTECTED: 796 pat_index = 3; 797 break; 798 case PAT_WRITE_THROUGH: 799 pat_index = 1; 800 break; 801 case PAT_WRITE_BACK: 802 pat_index = 0; 803 break; 804 case PAT_WRITE_COMBINING: 805 pat_index = 2; 806 break; 807#endif 808 default: 809 panic("Unknown caching mode %d\n", mode); 810 } 811 812 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ 813 cache_bits = 0; 814 if (pat_index & 0x4) 815 cache_bits |= pat_flag; 816 if (pat_index & 0x2) 817 cache_bits |= PG_NC_PCD; 818 if (pat_index & 0x1) 819 cache_bits |= PG_NC_PWT; 820 return (cache_bits); 821} 822#ifdef SMP 823/* 824 * For SMP, these functions have to use the IPI mechanism for coherence. 825 * 826 * N.B.: Before calling any of the following TLB invalidation functions, 827 * the calling processor must ensure that all stores updating a non- 828 * kernel page table are globally performed. Otherwise, another 829 * processor could cache an old, pre-update entry without being 830 * invalidated. This can happen one of two ways: (1) The pmap becomes 831 * active on another processor after its pm_active field is checked by 832 * one of the following functions but before a store updating the page 833 * table is globally performed. (2) The pmap becomes active on another 834 * processor before its pm_active field is checked but due to 835 * speculative loads one of the following functions stills reads the 836 * pmap as inactive on the other processor. 837 * 838 * The kernel page table is exempt because its pm_active field is 839 * immutable. The kernel page table is always active on every 840 * processor. 
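 *
 * In practice this means a caller should (1) perform the store that
 * updates the page table entry, e.g. via pte_store() or pte_load_clear(),
 * (2) complete any related bookkeeping, and only then (3) call one of the
 * invalidation functions below, so that the update is globally visible
 * before the shootdown IPI is delivered.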
841 */ 842void 843pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 844{ 845 u_int cpumask; 846 u_int other_cpus; 847 848 sched_pin(); 849 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { 850 invlpg(va); 851 smp_invlpg(va); 852 } else { 853 cpumask = PCPU_GET(cpumask); 854 other_cpus = PCPU_GET(other_cpus); 855 if (pmap->pm_active & cpumask) 856 invlpg(va); 857 if (pmap->pm_active & other_cpus) 858 smp_masked_invlpg(pmap->pm_active & other_cpus, va); 859 } 860 sched_unpin(); 861} 862 863void 864pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 865{ 866 u_int cpumask; 867 u_int other_cpus; 868 vm_offset_t addr; 869 870 sched_pin(); 871 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { 872 for (addr = sva; addr < eva; addr += PAGE_SIZE) 873 invlpg(addr); 874 smp_invlpg_range(sva, eva); 875 } else { 876 cpumask = PCPU_GET(cpumask); 877 other_cpus = PCPU_GET(other_cpus); 878 if (pmap->pm_active & cpumask) 879 for (addr = sva; addr < eva; addr += PAGE_SIZE) 880 invlpg(addr); 881 if (pmap->pm_active & other_cpus) 882 smp_masked_invlpg_range(pmap->pm_active & other_cpus, 883 sva, eva); 884 } 885 sched_unpin(); 886} 887 888void 889pmap_invalidate_all(pmap_t pmap) 890{ 891 u_int cpumask; 892 u_int other_cpus; 893 894 sched_pin(); 895 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { 896 invltlb(); 897 smp_invltlb(); 898 } else { 899 cpumask = PCPU_GET(cpumask); 900 other_cpus = PCPU_GET(other_cpus); 901 if (pmap->pm_active & cpumask) 902 invltlb(); 903 if (pmap->pm_active & other_cpus) 904 smp_masked_invltlb(pmap->pm_active & other_cpus); 905 } 906 sched_unpin(); 907} 908 909void 910pmap_invalidate_cache(void) 911{ 912 913 sched_pin(); 914 wbinvd(); 915 smp_cache_flush(); 916 sched_unpin(); 917} 918#else /* !SMP */ 919/* 920 * Normal, non-SMP, invalidation functions. 921 * We inline these within pmap.c for speed. 922 */ 923PMAP_INLINE void 924pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 925{ 926 927 if (pmap == kernel_pmap || pmap->pm_active) 928 invlpg(va); 929} 930 931PMAP_INLINE void 932pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 933{ 934 vm_offset_t addr; 935 936 if (pmap == kernel_pmap || pmap->pm_active) 937 for (addr = sva; addr < eva; addr += PAGE_SIZE) 938 invlpg(addr); 939} 940 941PMAP_INLINE void 942pmap_invalidate_all(pmap_t pmap) 943{ 944 945 if (pmap == kernel_pmap || pmap->pm_active) 946 invltlb(); 947} 948 949PMAP_INLINE void 950pmap_invalidate_cache(void) 951{ 952 953 wbinvd(); 954} 955#endif /* !SMP */ 956 957/* 958 * Are we current address space or kernel? 959 */ 960static __inline int 961pmap_is_current(pmap_t pmap) 962{ 963 return (pmap == kernel_pmap || 964 (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME)); 965} 966 967/* 968 * Routine: pmap_extract 969 * Function: 970 * Extract the physical page address associated 971 * with the given map/virtual_address pair. 
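 *	A return value of 0 means that no valid mapping exists at the
 *	given virtual address.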
972 */ 973vm_paddr_t 974pmap_extract(pmap_t pmap, vm_offset_t va) 975{ 976 vm_paddr_t rtval; 977 pt_entry_t *pte; 978 pd_entry_t pde, *pdep; 979 980 rtval = 0; 981 PMAP_LOCK(pmap); 982 pdep = pmap_pde(pmap, va); 983 if (pdep != NULL) { 984 pde = *pdep; 985 if (pde) { 986 if ((pde & PG_PS) != 0) { 987 rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); 988 PMAP_UNLOCK(pmap); 989 return rtval; 990 } 991 pte = pmap_pde_to_pte(pdep, va); 992 rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); 993 } 994 } 995 PMAP_UNLOCK(pmap); 996 return (rtval); 997} 998 999/* 1000 * Routine: pmap_extract_and_hold 1001 * Function: 1002 * Atomically extract and hold the physical page 1003 * with the given pmap and virtual address pair 1004 * if that mapping permits the given protection. 1005 */ 1006vm_page_t 1007pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1008{ 1009 pd_entry_t pde, *pdep; 1010 pt_entry_t pte; 1011 vm_page_t m; 1012 1013 m = NULL; 1014 vm_page_lock_queues(); 1015 PMAP_LOCK(pmap); 1016 pdep = pmap_pde(pmap, va); 1017 if (pdep != NULL && (pde = *pdep)) { 1018 if (pde & PG_PS) { 1019 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { 1020 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | 1021 (va & PDRMASK)); 1022 vm_page_hold(m); 1023 } 1024 } else { 1025 pte = *pmap_pde_to_pte(pdep, va); 1026 if ((pte & PG_V) && 1027 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { 1028 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 1029 vm_page_hold(m); 1030 } 1031 } 1032 } 1033 vm_page_unlock_queues(); 1034 PMAP_UNLOCK(pmap); 1035 return (m); 1036} 1037 1038vm_paddr_t 1039pmap_kextract(vm_offset_t va) 1040{ 1041 pd_entry_t *pde; 1042 vm_paddr_t pa; 1043 1044 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 1045 pa = DMAP_TO_PHYS(va); 1046 } else { 1047 pde = vtopde(va); 1048 if (*pde & PG_PS) { 1049 pa = (*pde & PG_PS_FRAME) | (va & PDRMASK); 1050 } else { 1051 pa = *vtopte(va); 1052 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 1053 } 1054 } 1055 return pa; 1056} 1057 1058/*************************************************** 1059 * Low level mapping routines..... 1060 ***************************************************/ 1061 1062/* 1063 * Add a wired page to the kva. 1064 * Note: not SMP coherent. 1065 */ 1066PMAP_INLINE void 1067pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1068{ 1069 pt_entry_t *pte; 1070 1071 pte = vtopte(va); 1072 pte_store(pte, pa | PG_RW | PG_V | PG_G); 1073} 1074 1075PMAP_INLINE void 1076pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 1077{ 1078 pt_entry_t *pte; 1079 1080 pte = vtopte(va); 1081 pte_store(pte, pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0)); 1082} 1083 1084/* 1085 * Remove a page from the kernel pagetables. 1086 * Note: not SMP coherent. 1087 */ 1088PMAP_INLINE void 1089pmap_kremove(vm_offset_t va) 1090{ 1091 pt_entry_t *pte; 1092 1093 pte = vtopte(va); 1094 pte_clear(pte); 1095} 1096 1097/* 1098 * Used to map a range of physical addresses into kernel 1099 * virtual address space. 1100 * 1101 * The value passed in '*virt' is a suggested virtual address for 1102 * the mapping. Architectures which can support a direct-mapped 1103 * physical to virtual region can return the appropriate address 1104 * within that region, leaving '*virt' unchanged. Other 1105 * architectures should map the pages starting at '*virt' and 1106 * update '*virt' with the first usable address after the mapped 1107 * region. 
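 *
 * On amd64 the direct map covers all of physical memory, so the
 * implementation below simply returns the direct-map address of 'start'
 * and leaves '*virt' unchanged.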
1108 */ 1109vm_offset_t 1110pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1111{ 1112 return PHYS_TO_DMAP(start); 1113} 1114 1115 1116/* 1117 * Add a list of wired pages to the kva 1118 * this routine is only used for temporary 1119 * kernel mappings that do not need to have 1120 * page modification or references recorded. 1121 * Note that old mappings are simply written 1122 * over. The page *must* be wired. 1123 * Note: SMP coherent. Uses a ranged shootdown IPI. 1124 */ 1125void 1126pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1127{ 1128 pt_entry_t *endpte, oldpte, *pte; 1129 1130 oldpte = 0; 1131 pte = vtopte(sva); 1132 endpte = pte + count; 1133 while (pte < endpte) { 1134 oldpte |= *pte; 1135 pte_store(pte, VM_PAGE_TO_PHYS(*ma) | PG_G | PG_RW | PG_V); 1136 pte++; 1137 ma++; 1138 } 1139 if ((oldpte & PG_V) != 0) 1140 pmap_invalidate_range(kernel_pmap, sva, sva + count * 1141 PAGE_SIZE); 1142} 1143 1144/* 1145 * This routine tears out page mappings from the 1146 * kernel -- it is meant only for temporary mappings. 1147 * Note: SMP coherent. Uses a ranged shootdown IPI. 1148 */ 1149void 1150pmap_qremove(vm_offset_t sva, int count) 1151{ 1152 vm_offset_t va; 1153 1154 va = sva; 1155 while (count-- > 0) { 1156 pmap_kremove(va); 1157 va += PAGE_SIZE; 1158 } 1159 pmap_invalidate_range(kernel_pmap, sva, va); 1160} 1161 1162/*************************************************** 1163 * Page table page management routines..... 1164 ***************************************************/ 1165static __inline void 1166pmap_free_zero_pages(vm_page_t free) 1167{ 1168 vm_page_t m; 1169 1170 while (free != NULL) { 1171 m = free; 1172 free = m->right; 1173 /* Preserve the page's PG_ZERO setting. */ 1174 vm_page_free_toq(m); 1175 } 1176} 1177 1178/* 1179 * Schedule the specified unused page table page to be freed. Specifically, 1180 * add the page to the specified list of pages that will be released to the 1181 * physical memory manager after the TLB has been updated. 1182 */ 1183static __inline void 1184pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO) 1185{ 1186 1187 if (set_PG_ZERO) 1188 m->flags |= PG_ZERO; 1189 else 1190 m->flags &= ~PG_ZERO; 1191 m->right = *free; 1192 *free = m; 1193} 1194 1195/* 1196 * Inserts the specified page table page into the specified pmap's collection 1197 * of idle page table pages. Each of a pmap's page table pages is responsible 1198 * for mapping a distinct range of virtual addresses. The pmap's collection is 1199 * ordered by this virtual address range. 1200 */ 1201static void 1202pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) 1203{ 1204 vm_page_t root; 1205 1206 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1207 root = pmap->pm_root; 1208 if (root == NULL) { 1209 mpte->left = NULL; 1210 mpte->right = NULL; 1211 } else { 1212 root = vm_page_splay(mpte->pindex, root); 1213 if (mpte->pindex < root->pindex) { 1214 mpte->left = root->left; 1215 mpte->right = root; 1216 root->left = NULL; 1217 } else if (mpte->pindex == root->pindex) 1218 panic("pmap_insert_pt_page: pindex already inserted"); 1219 else { 1220 mpte->right = root->right; 1221 mpte->left = root; 1222 root->right = NULL; 1223 } 1224 } 1225 pmap->pm_root = mpte; 1226} 1227 1228/* 1229 * Looks for a page table page mapping the specified virtual address in the 1230 * specified pmap's collection of idle page table pages. Returns NULL if there 1231 * is no page table page corresponding to the specified virtual address. 
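 * The collection is kept as a splay tree keyed by pindex; each lookup
 * splays the tree and leaves the last page examined at the root, so
 * repeated lookups of the same or nearby addresses tend to be cheap.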
1232 */ 1233static vm_page_t 1234pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) 1235{ 1236 vm_page_t mpte; 1237 vm_pindex_t pindex = pmap_pde_pindex(va); 1238 1239 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1240 if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) { 1241 mpte = vm_page_splay(pindex, mpte); 1242 if ((pmap->pm_root = mpte)->pindex != pindex) 1243 mpte = NULL; 1244 } 1245 return (mpte); 1246} 1247 1248/* 1249 * Removes the specified page table page from the specified pmap's collection 1250 * of idle page table pages. The specified page table page must be a member of 1251 * the pmap's collection. 1252 */ 1253static void 1254pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) 1255{ 1256 vm_page_t root; 1257 1258 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1259 if (mpte != pmap->pm_root) { 1260 root = vm_page_splay(mpte->pindex, pmap->pm_root); 1261 KASSERT(mpte == root, 1262 ("pmap_remove_pt_page: mpte %p is missing from pmap %p", 1263 mpte, pmap)); 1264 } 1265 if (mpte->left == NULL) 1266 root = mpte->right; 1267 else { 1268 root = vm_page_splay(mpte->pindex, mpte->left); 1269 root->right = mpte->right; 1270 } 1271 pmap->pm_root = root; 1272} 1273 1274/* 1275 * This routine unholds page table pages, and if the hold count 1276 * drops to zero, then it decrements the wire count. 1277 */ 1278static __inline int 1279pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free) 1280{ 1281 1282 --m->wire_count; 1283 if (m->wire_count == 0) 1284 return _pmap_unwire_pte_hold(pmap, va, m, free); 1285 else 1286 return 0; 1287} 1288 1289static int 1290_pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, 1291 vm_page_t *free) 1292{ 1293 vm_offset_t pteva; 1294 1295 /* 1296 * unmap the page table page 1297 */ 1298 if (m->pindex >= (NUPDE + NUPDPE)) { 1299 /* PDP page */ 1300 pml4_entry_t *pml4; 1301 pml4 = pmap_pml4e(pmap, va); 1302 pteva = (vm_offset_t) PDPmap + amd64_ptob(m->pindex - (NUPDE + NUPDPE)); 1303 *pml4 = 0; 1304 } else if (m->pindex >= NUPDE) { 1305 /* PD page */ 1306 pdp_entry_t *pdp; 1307 pdp = pmap_pdpe(pmap, va); 1308 pteva = (vm_offset_t) PDmap + amd64_ptob(m->pindex - NUPDE); 1309 *pdp = 0; 1310 } else { 1311 /* PTE page */ 1312 pd_entry_t *pd; 1313 pd = pmap_pde(pmap, va); 1314 pteva = (vm_offset_t) PTmap + amd64_ptob(m->pindex); 1315 *pd = 0; 1316 } 1317 --pmap->pm_stats.resident_count; 1318 if (m->pindex < NUPDE) { 1319 /* We just released a PT, unhold the matching PD */ 1320 vm_page_t pdpg; 1321 1322 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 1323 pmap_unwire_pte_hold(pmap, va, pdpg, free); 1324 } 1325 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { 1326 /* We just released a PD, unhold the matching PDP */ 1327 vm_page_t pdppg; 1328 1329 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 1330 pmap_unwire_pte_hold(pmap, va, pdppg, free); 1331 } 1332 1333 /* 1334 * This is a release store so that the ordinary store unmapping 1335 * the page table page is globally performed before TLB shoot- 1336 * down is begun. 1337 */ 1338 atomic_subtract_rel_int(&cnt.v_wire_count, 1); 1339 1340 /* 1341 * Do an invltlb to make the invalidated mapping 1342 * take effect immediately. 
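 * Here pteva is the address of the just-unmapped page table page within
 * the recursive mapping (PTmap/PDmap/PDPmap), so this flushes the stale
 * recursive-map TLB entry for that page.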
1343 */ 1344 pmap_invalidate_page(pmap, pteva); 1345 1346 /* 1347 * Put page on a list so that it is released after 1348 * *ALL* TLB shootdown is done 1349 */ 1350 pmap_add_delayed_free_list(m, free, TRUE); 1351 1352 return 1; 1353} 1354 1355/* 1356 * After removing a page table entry, this routine is used to 1357 * conditionally free the page, and manage the hold/wire counts. 1358 */ 1359static int 1360pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, vm_page_t *free) 1361{ 1362 vm_page_t mpte; 1363 1364 if (va >= VM_MAXUSER_ADDRESS) 1365 return 0; 1366 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 1367 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 1368 return pmap_unwire_pte_hold(pmap, va, mpte, free); 1369} 1370 1371void 1372pmap_pinit0(pmap_t pmap) 1373{ 1374 1375 PMAP_LOCK_INIT(pmap); 1376 pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys); 1377 pmap->pm_root = NULL; 1378 pmap->pm_active = 0; 1379 TAILQ_INIT(&pmap->pm_pvchunk); 1380 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1381} 1382 1383/* 1384 * Initialize a preallocated and zeroed pmap structure, 1385 * such as one in a vmspace structure. 1386 */ 1387int 1388pmap_pinit(pmap_t pmap) 1389{ 1390 vm_page_t pml4pg; 1391 static vm_pindex_t color; 1392 1393 PMAP_LOCK_INIT(pmap); 1394 1395 /* 1396 * allocate the page directory page 1397 */ 1398 while ((pml4pg = vm_page_alloc(NULL, color++, VM_ALLOC_NOOBJ | 1399 VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) 1400 VM_WAIT; 1401 1402 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); 1403 1404 if ((pml4pg->flags & PG_ZERO) == 0) 1405 pagezero(pmap->pm_pml4); 1406 1407 /* Wire in kernel global address entries. */ 1408 pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U; 1409 pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U; 1410 1411 /* install self-referential address mapping entry(s) */ 1412 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M; 1413 1414 pmap->pm_root = NULL; 1415 pmap->pm_active = 0; 1416 TAILQ_INIT(&pmap->pm_pvchunk); 1417 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1418 1419 return (1); 1420} 1421 1422/* 1423 * this routine is called if the page table page is not 1424 * mapped correctly. 1425 * 1426 * Note: If a page allocation fails at page table level two or three, 1427 * one or two pages may be held during the wait, only to be released 1428 * afterwards. This conservative approach is easily argued to avoid 1429 * race conditions. 1430 */ 1431static vm_page_t 1432_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags) 1433{ 1434 vm_page_t m, pdppg, pdpg; 1435 1436 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || 1437 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, 1438 ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); 1439 1440 /* 1441 * Allocate a page table page. 1442 */ 1443 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 1444 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 1445 if (flags & M_WAITOK) { 1446 PMAP_UNLOCK(pmap); 1447 vm_page_unlock_queues(); 1448 VM_WAIT; 1449 vm_page_lock_queues(); 1450 PMAP_LOCK(pmap); 1451 } 1452 1453 /* 1454 * Indicate the need to retry. While waiting, the page table 1455 * page may have been allocated. 1456 */ 1457 return (NULL); 1458 } 1459 if ((m->flags & PG_ZERO) == 0) 1460 pmap_zero_page(m); 1461 1462 /* 1463 * Map the pagetable page into the process address space, if 1464 * it isn't already there. 
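 *
 * The level of the new page is encoded in ptepindex: indices below NUPDE
 * are page table (PT) pages, indices in [NUPDE, NUPDE + NUPDPE) are page
 * directory (PD) pages, and indices at or above NUPDE + NUPDPE are page
 * directory pointer (PDP) pages, matching the three cases below.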
1465 */ 1466 1467 pmap->pm_stats.resident_count++; 1468 1469 if (ptepindex >= (NUPDE + NUPDPE)) { 1470 pml4_entry_t *pml4; 1471 vm_pindex_t pml4index; 1472 1473 /* Wire up a new PDPE page */ 1474 pml4index = ptepindex - (NUPDE + NUPDPE); 1475 pml4 = &pmap->pm_pml4[pml4index]; 1476 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 1477 1478 } else if (ptepindex >= NUPDE) { 1479 vm_pindex_t pml4index; 1480 vm_pindex_t pdpindex; 1481 pml4_entry_t *pml4; 1482 pdp_entry_t *pdp; 1483 1484 /* Wire up a new PDE page */ 1485 pdpindex = ptepindex - NUPDE; 1486 pml4index = pdpindex >> NPML4EPGSHIFT; 1487 1488 pml4 = &pmap->pm_pml4[pml4index]; 1489 if ((*pml4 & PG_V) == 0) { 1490 /* Have to allocate a new pdp, recurse */ 1491 if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, 1492 flags) == NULL) { 1493 --m->wire_count; 1494 vm_page_free(m); 1495 return (NULL); 1496 } 1497 } else { 1498 /* Add reference to pdp page */ 1499 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); 1500 pdppg->wire_count++; 1501 } 1502 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 1503 1504 /* Now find the pdp page */ 1505 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 1506 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 1507 1508 } else { 1509 vm_pindex_t pml4index; 1510 vm_pindex_t pdpindex; 1511 pml4_entry_t *pml4; 1512 pdp_entry_t *pdp; 1513 pd_entry_t *pd; 1514 1515 /* Wire up a new PTE page */ 1516 pdpindex = ptepindex >> NPDPEPGSHIFT; 1517 pml4index = pdpindex >> NPML4EPGSHIFT; 1518 1519 /* First, find the pdp and check that its valid. */ 1520 pml4 = &pmap->pm_pml4[pml4index]; 1521 if ((*pml4 & PG_V) == 0) { 1522 /* Have to allocate a new pd, recurse */ 1523 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 1524 flags) == NULL) { 1525 --m->wire_count; 1526 vm_page_free(m); 1527 return (NULL); 1528 } 1529 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 1530 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 1531 } else { 1532 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 1533 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 1534 if ((*pdp & PG_V) == 0) { 1535 /* Have to allocate a new pd, recurse */ 1536 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 1537 flags) == NULL) { 1538 --m->wire_count; 1539 vm_page_free(m); 1540 return (NULL); 1541 } 1542 } else { 1543 /* Add reference to the pd page */ 1544 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 1545 pdpg->wire_count++; 1546 } 1547 } 1548 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); 1549 1550 /* Now we know where the page directory page is */ 1551 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; 1552 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 1553 } 1554 1555 return m; 1556} 1557 1558static vm_page_t 1559pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags) 1560{ 1561 vm_pindex_t pdpindex, ptepindex; 1562 pdp_entry_t *pdpe; 1563 vm_page_t pdpg; 1564 1565 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || 1566 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, 1567 ("pmap_allocpde: flags is neither M_NOWAIT nor M_WAITOK")); 1568retry: 1569 pdpe = pmap_pdpe(pmap, va); 1570 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 1571 /* Add a reference to the pd page. */ 1572 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 1573 pdpg->wire_count++; 1574 } else { 1575 /* Allocate a pd page. 
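 * Its index in the _pmap_allocpte() namespace is NUPDE + pdpindex, where
 * pdpindex is pmap_pde_pindex(va) >> NPDPEPGSHIFT, as computed below.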
*/ 1576 ptepindex = pmap_pde_pindex(va); 1577 pdpindex = ptepindex >> NPDPEPGSHIFT; 1578 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, flags); 1579 if (pdpg == NULL && (flags & M_WAITOK)) 1580 goto retry; 1581 } 1582 return (pdpg); 1583} 1584 1585static vm_page_t 1586pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags) 1587{ 1588 vm_pindex_t ptepindex; 1589 pd_entry_t *pd; 1590 vm_page_t m; 1591 1592 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || 1593 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, 1594 ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); 1595 1596 /* 1597 * Calculate pagetable page index 1598 */ 1599 ptepindex = pmap_pde_pindex(va); 1600retry: 1601 /* 1602 * Get the page directory entry 1603 */ 1604 pd = pmap_pde(pmap, va); 1605 1606 /* 1607 * This supports switching from a 2MB page to a 1608 * normal 4K page. 1609 */ 1610 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 1611 if (!pmap_demote_pde(pmap, pd, va)) { 1612 /* 1613 * Invalidation of the 2MB page mapping may have caused 1614 * the deallocation of the underlying PD page. 1615 */ 1616 pd = NULL; 1617 } 1618 } 1619 1620 /* 1621 * If the page table page is mapped, we just increment the 1622 * hold count, and activate it. 1623 */ 1624 if (pd != NULL && (*pd & PG_V) != 0) { 1625 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 1626 m->wire_count++; 1627 } else { 1628 /* 1629 * Here if the pte page isn't mapped, or if it has been 1630 * deallocated. 1631 */ 1632 m = _pmap_allocpte(pmap, ptepindex, flags); 1633 if (m == NULL && (flags & M_WAITOK)) 1634 goto retry; 1635 } 1636 return (m); 1637} 1638 1639 1640/*************************************************** 1641 * Pmap allocation/deallocation routines. 1642 ***************************************************/ 1643 1644/* 1645 * Release any resources held by the given physical map. 1646 * Called when a pmap initialized by pmap_pinit is being released. 1647 * Should only be called if the map contains no valid mappings. 
1648 */ 1649void 1650pmap_release(pmap_t pmap) 1651{ 1652 vm_page_t m; 1653 1654 KASSERT(pmap->pm_stats.resident_count == 0, 1655 ("pmap_release: pmap resident count %ld != 0", 1656 pmap->pm_stats.resident_count)); 1657 KASSERT(pmap->pm_root == NULL, 1658 ("pmap_release: pmap has reserved page table page(s)")); 1659 1660 m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME); 1661 1662 pmap->pm_pml4[KPML4I] = 0; /* KVA */ 1663 pmap->pm_pml4[DMPML4I] = 0; /* Direct Map */ 1664 pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ 1665 1666 m->wire_count--; 1667 atomic_subtract_int(&cnt.v_wire_count, 1); 1668 vm_page_free_zero(m); 1669 PMAP_LOCK_DESTROY(pmap); 1670} 1671 1672static int 1673kvm_size(SYSCTL_HANDLER_ARGS) 1674{ 1675 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; 1676 1677 return sysctl_handle_long(oidp, &ksize, 0, req); 1678} 1679SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 1680 0, 0, kvm_size, "LU", "Size of KVM"); 1681 1682static int 1683kvm_free(SYSCTL_HANDLER_ARGS) 1684{ 1685 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 1686 1687 return sysctl_handle_long(oidp, &kfree, 0, req); 1688} 1689SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 1690 0, 0, kvm_free, "LU", "Amount of KVM free"); 1691 1692/* 1693 * grow the number of kernel page table entries, if needed 1694 */ 1695void 1696pmap_growkernel(vm_offset_t addr) 1697{ 1698 vm_paddr_t paddr; 1699 vm_page_t nkpg; 1700 pd_entry_t *pde, newpdir; 1701 pdp_entry_t newpdp; 1702 1703 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 1704 if (kernel_vm_end == 0) { 1705 kernel_vm_end = KERNBASE; 1706 nkpt = 0; 1707 while ((*pmap_pde(kernel_pmap, kernel_vm_end) & PG_V) != 0) { 1708 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1709 nkpt++; 1710 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 1711 kernel_vm_end = kernel_map->max_offset; 1712 break; 1713 } 1714 } 1715 } 1716 addr = roundup2(addr, PAGE_SIZE * NPTEPG); 1717 if (addr - 1 >= kernel_map->max_offset) 1718 addr = kernel_map->max_offset; 1719 while (kernel_vm_end < addr) { 1720 pde = pmap_pde(kernel_pmap, kernel_vm_end); 1721 if (pde == NULL) { 1722 /* We need a new PDP entry */ 1723 nkpg = vm_page_alloc(NULL, nkpt, 1724 VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); 1725 if (nkpg == NULL) 1726 panic("pmap_growkernel: no memory to grow kernel"); 1727 pmap_zero_page(nkpg); 1728 paddr = VM_PAGE_TO_PHYS(nkpg); 1729 newpdp = (pdp_entry_t) 1730 (paddr | PG_V | PG_RW | PG_A | PG_M); 1731 *pmap_pdpe(kernel_pmap, kernel_vm_end) = newpdp; 1732 continue; /* try again */ 1733 } 1734 if ((*pde & PG_V) != 0) { 1735 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1736 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 1737 kernel_vm_end = kernel_map->max_offset; 1738 break; 1739 } 1740 continue; 1741 } 1742 1743 nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end), 1744 VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); 1745 if (nkpg == NULL) 1746 panic("pmap_growkernel: no memory to grow kernel"); 1747 1748 nkpt++; 1749 1750 pmap_zero_page(nkpg); 1751 paddr = VM_PAGE_TO_PHYS(nkpg); 1752 newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M); 1753 *pmap_pde(kernel_pmap, kernel_vm_end) = newpdir; 1754 1755 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1756 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 1757 kernel_vm_end = kernel_map->max_offset; 1758 break; 1759 } 1760 } 1761} 1762 1763 
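/*
 * Note that pmap_growkernel() above advances kernel_vm_end in units of
 * PAGE_SIZE * NPTEPG bytes, i.e. one 2MB page-directory entry (NBPDR) at a
 * time, allocating a new page table page (and, if needed, a new page
 * directory page) for each step and panicking if no memory is available.
 */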
1764/*************************************************** 1765 * page management routines. 1766 ***************************************************/ 1767 1768CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 1769CTASSERT(_NPCM == 3); 1770CTASSERT(_NPCPV == 168); 1771 1772static __inline struct pv_chunk * 1773pv_to_chunk(pv_entry_t pv) 1774{ 1775 1776 return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK); 1777} 1778 1779#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 1780 1781#define PC_FREE0 0xfffffffffffffffful 1782#define PC_FREE1 0xfffffffffffffffful 1783#define PC_FREE2 0x000000fffffffffful 1784 1785static uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; 1786 1787SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 1788 "Current number of pv entries"); 1789 1790#ifdef PV_STATS 1791static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 1792 1793SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 1794 "Current number of pv entry chunks"); 1795SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 1796 "Current number of pv entry chunks allocated"); 1797SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 1798 "Current number of pv entry chunks frees"); 1799SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 1800 "Number of times tried to get a chunk page but failed."); 1801 1802static long pv_entry_frees, pv_entry_allocs; 1803static int pv_entry_spare; 1804 1805SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 1806 "Current number of pv entry frees"); 1807SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 1808 "Current number of pv entry allocs"); 1809SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 1810 "Current number of spare pv entries"); 1811 1812static int pmap_collect_inactive, pmap_collect_active; 1813 1814SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0, 1815 "Current number times pmap_collect called on inactive queue"); 1816SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0, 1817 "Current number times pmap_collect called on active queue"); 1818#endif 1819 1820/* 1821 * We are in a serious low memory condition. Resort to 1822 * drastic measures to free some pages so we can allocate 1823 * another pv entry chunk. This is normally called to 1824 * unmap inactive pages, and if necessary, active pages. 1825 * 1826 * We do not, however, unmap 2mpages because subsequent accesses will 1827 * allocate per-page pv entries until repromotion occurs, thereby 1828 * exacerbating the shortage of free pv entries. 1829 */ 1830static void 1831pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq) 1832{ 1833 struct md_page *pvh; 1834 pd_entry_t *pde; 1835 pmap_t pmap; 1836 pt_entry_t *pte, tpte; 1837 pv_entry_t next_pv, pv; 1838 vm_offset_t va; 1839 vm_page_t m, free; 1840 1841 TAILQ_FOREACH(m, &vpq->pl, pageq) { 1842 if (m->hold_count || m->busy) 1843 continue; 1844 TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) { 1845 va = pv->pv_va; 1846 pmap = PV_PMAP(pv); 1847 /* Avoid deadlock and lock recursion. 
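 * The caller already holds locked_pmap.  A pmap that sorts after
 * locked_pmap (by address) can be locked unconditionally; any other pmap
 * is try-locked, and the pv entry is skipped if that fails.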
*/ 1848 if (pmap > locked_pmap) 1849 PMAP_LOCK(pmap); 1850 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) 1851 continue; 1852 pmap->pm_stats.resident_count--; 1853 pde = pmap_pde(pmap, va); 1854 KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found" 1855 " a 2mpage in page %p's pv list", m)); 1856 pte = pmap_pde_to_pte(pde, va); 1857 tpte = pte_load_clear(pte); 1858 KASSERT((tpte & PG_W) == 0, 1859 ("pmap_collect: wired pte %#lx", tpte)); 1860 if (tpte & PG_A) 1861 vm_page_flag_set(m, PG_REFERENCED); 1862 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 1863 vm_page_dirty(m); 1864 free = NULL; 1865 pmap_unuse_pt(pmap, va, *pde, &free); 1866 pmap_invalidate_page(pmap, va); 1867 pmap_free_zero_pages(free); 1868 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 1869 if (TAILQ_EMPTY(&m->md.pv_list)) { 1870 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 1871 if (TAILQ_EMPTY(&pvh->pv_list)) 1872 vm_page_flag_clear(m, PG_WRITEABLE); 1873 } 1874 free_pv_entry(pmap, pv); 1875 if (pmap != locked_pmap) 1876 PMAP_UNLOCK(pmap); 1877 } 1878 } 1879} 1880 1881 1882/* 1883 * free the pv_entry back to the free list 1884 */ 1885static void 1886free_pv_entry(pmap_t pmap, pv_entry_t pv) 1887{ 1888 vm_page_t m; 1889 struct pv_chunk *pc; 1890 int idx, field, bit; 1891 1892 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1893 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1894 PV_STAT(pv_entry_frees++); 1895 PV_STAT(pv_entry_spare++); 1896 pv_entry_count--; 1897 pc = pv_to_chunk(pv); 1898 idx = pv - &pc->pc_pventry[0]; 1899 field = idx / 64; 1900 bit = idx % 64; 1901 pc->pc_map[field] |= 1ul << bit; 1902 /* move to head of list */ 1903 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1904 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1905 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || 1906 pc->pc_map[2] != PC_FREE2) 1907 return; 1908 PV_STAT(pv_entry_spare -= _NPCPV); 1909 PV_STAT(pc_chunk_count--); 1910 PV_STAT(pc_chunk_frees++); 1911 /* entire chunk is free, return it */ 1912 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1913 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 1914 dump_drop_page(m->phys_addr); 1915 vm_page_unwire(m, 0); 1916 vm_page_free(m); 1917} 1918 1919/* 1920 * get a new pv_entry, allocating a block from the system 1921 * when needed. 
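 *
 * pv entries are allocated from per-pmap chunks (struct pv_chunk), each a
 * single page holding _NPCPV entries plus a three-word free bitmap
 * (pc_map).  Allocation finds the first set bit with bsfq(), clears it,
 * and uses entry field * 64 + bit; a chunk with no free entries left is
 * moved to the tail of the pm_pvchunk list.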
1922 */ 1923static pv_entry_t 1924get_pv_entry(pmap_t pmap, int try) 1925{ 1926 static const struct timeval printinterval = { 60, 0 }; 1927 static struct timeval lastprint; 1928 static vm_pindex_t colour; 1929 struct vpgqueues *pq; 1930 int bit, field; 1931 pv_entry_t pv; 1932 struct pv_chunk *pc; 1933 vm_page_t m; 1934 1935 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1936 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1937 PV_STAT(pv_entry_allocs++); 1938 pv_entry_count++; 1939 if (pv_entry_count > pv_entry_high_water) 1940 if (ratecheck(&lastprint, &printinterval)) 1941 printf("Approaching the limit on PV entries, consider " 1942 "increasing either the vm.pmap.shpgperproc or the " 1943 "vm.pmap.pv_entry_max sysctl.\n"); 1944 pq = NULL; 1945retry: 1946 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 1947 if (pc != NULL) { 1948 for (field = 0; field < _NPCM; field++) { 1949 if (pc->pc_map[field]) { 1950 bit = bsfq(pc->pc_map[field]); 1951 break; 1952 } 1953 } 1954 if (field < _NPCM) { 1955 pv = &pc->pc_pventry[field * 64 + bit]; 1956 pc->pc_map[field] &= ~(1ul << bit); 1957 /* If this was the last item, move it to tail */ 1958 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 1959 pc->pc_map[2] == 0) { 1960 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1961 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 1962 } 1963 PV_STAT(pv_entry_spare--); 1964 return (pv); 1965 } 1966 } 1967 /* No free items, allocate another chunk */ 1968 m = vm_page_alloc(NULL, colour, (pq == &vm_page_queues[PQ_ACTIVE] ? 1969 VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | 1970 VM_ALLOC_WIRED); 1971 if (m == NULL) { 1972 if (try) { 1973 pv_entry_count--; 1974 PV_STAT(pc_chunk_tryfail++); 1975 return (NULL); 1976 } 1977 /* 1978 * Reclaim pv entries: At first, destroy mappings to inactive 1979 * pages. After that, if a pv chunk entry is still needed, 1980 * destroy mappings to active pages. 1981 */ 1982 if (pq == NULL) { 1983 PV_STAT(pmap_collect_inactive++); 1984 pq = &vm_page_queues[PQ_INACTIVE]; 1985 } else if (pq == &vm_page_queues[PQ_INACTIVE]) { 1986 PV_STAT(pmap_collect_active++); 1987 pq = &vm_page_queues[PQ_ACTIVE]; 1988 } else 1989 panic("get_pv_entry: increase vm.pmap.shpgperproc"); 1990 pmap_collect(pmap, pq); 1991 goto retry; 1992 } 1993 PV_STAT(pc_chunk_count++); 1994 PV_STAT(pc_chunk_allocs++); 1995 colour++; 1996 dump_add_page(m->phys_addr); 1997 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 1998 pc->pc_pmap = pmap; 1999 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 2000 pc->pc_map[1] = PC_FREE1; 2001 pc->pc_map[2] = PC_FREE2; 2002 pv = &pc->pc_pventry[0]; 2003 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2004 PV_STAT(pv_entry_spare += _NPCPV - 1); 2005 return (pv); 2006} 2007 2008/* 2009 * First find and then remove the pv entry for the specified pmap and virtual 2010 * address from the specified pv list. Returns the pv entry if found and NULL 2011 * otherwise. This operation can be performed on pv lists for either 4KB or 2012 * 2MB page mappings. 
2013 */ 2014static __inline pv_entry_t 2015pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2016{ 2017 pv_entry_t pv; 2018 2019 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2020 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { 2021 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 2022 TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); 2023 break; 2024 } 2025 } 2026 return (pv); 2027} 2028 2029/* 2030 * After demotion from a 2MB page mapping to 512 4KB page mappings, 2031 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 2032 * entries for each of the 4KB page mappings. 2033 */ 2034static void 2035pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2036{ 2037 struct md_page *pvh; 2038 pv_entry_t pv; 2039 vm_offset_t va_last; 2040 vm_page_t m; 2041 2042 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2043 KASSERT((pa & PDRMASK) == 0, 2044 ("pmap_pv_demote_pde: pa is not 2mpage aligned")); 2045 2046 /* 2047 * Transfer the 2mpage's pv entry for this mapping to the first 2048 * page's pv list. 2049 */ 2050 pvh = pa_to_pvh(pa); 2051 va = trunc_2mpage(va); 2052 pv = pmap_pvh_remove(pvh, pmap, va); 2053 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 2054 m = PHYS_TO_VM_PAGE(pa); 2055 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2056 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 2057 va_last = va + NBPDR - PAGE_SIZE; 2058 do { 2059 m++; 2060 KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0, 2061 ("pmap_pv_demote_pde: page %p is not managed", m)); 2062 va += PAGE_SIZE; 2063 pmap_insert_entry(pmap, va, m); 2064 } while (va < va_last); 2065} 2066 2067/* 2068 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 2069 * replace the many pv entries for the 4KB page mappings by a single pv entry 2070 * for the 2MB page mapping. 2071 */ 2072static void 2073pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2074{ 2075 struct md_page *pvh; 2076 pv_entry_t pv; 2077 vm_offset_t va_last; 2078 vm_page_t m; 2079 2080 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2081 KASSERT((pa & PDRMASK) == 0, 2082 ("pmap_pv_promote_pde: pa is not 2mpage aligned")); 2083 2084 /* 2085 * Transfer the first page's pv entry for this mapping to the 2086 * 2mpage's pv list. Aside from avoiding the cost of a call 2087 * to get_pv_entry(), a transfer avoids the possibility that 2088 * get_pv_entry() calls pmap_collect() and that pmap_collect() 2089 * removes one of the mappings that is being promoted. 2090 */ 2091 m = PHYS_TO_VM_PAGE(pa); 2092 va = trunc_2mpage(va); 2093 pv = pmap_pvh_remove(&m->md, pmap, va); 2094 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 2095 pvh = pa_to_pvh(pa); 2096 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); 2097 /* Free the remaining NPTEPG - 1 pv entries. */ 2098 va_last = va + NBPDR - PAGE_SIZE; 2099 do { 2100 m++; 2101 va += PAGE_SIZE; 2102 pmap_pvh_free(&m->md, pmap, va); 2103 } while (va < va_last); 2104} 2105 2106/* 2107 * First find and then destroy the pv entry for the specified pmap and virtual 2108 * address. This operation can be performed on pv lists for either 4KB or 2MB 2109 * page mappings. 
2110 */ 2111static void 2112pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2113{ 2114 pv_entry_t pv; 2115 2116 pv = pmap_pvh_remove(pvh, pmap, va); 2117 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 2118 free_pv_entry(pmap, pv); 2119} 2120 2121static void 2122pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 2123{ 2124 struct md_page *pvh; 2125 2126 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2127 pmap_pvh_free(&m->md, pmap, va); 2128 if (TAILQ_EMPTY(&m->md.pv_list)) { 2129 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2130 if (TAILQ_EMPTY(&pvh->pv_list)) 2131 vm_page_flag_clear(m, PG_WRITEABLE); 2132 } 2133} 2134 2135/* 2136 * Create a pv entry for page at pa for 2137 * (pmap, va). 2138 */ 2139static void 2140pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2141{ 2142 pv_entry_t pv; 2143 2144 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2145 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2146 pv = get_pv_entry(pmap, FALSE); 2147 pv->pv_va = va; 2148 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2149} 2150 2151/* 2152 * Conditionally create a pv entry. 2153 */ 2154static boolean_t 2155pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2156{ 2157 pv_entry_t pv; 2158 2159 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2160 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2161 if (pv_entry_count < pv_entry_high_water && 2162 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2163 pv->pv_va = va; 2164 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2165 return (TRUE); 2166 } else 2167 return (FALSE); 2168} 2169 2170/* 2171 * Create the pv entry for a 2MB page mapping. 2172 */ 2173static boolean_t 2174pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_page_t m) 2175{ 2176 struct md_page *pvh; 2177 pv_entry_t pv; 2178 2179 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2180 if (pv_entry_count < pv_entry_high_water && 2181 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2182 pv->pv_va = va; 2183 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2184 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); 2185 return (TRUE); 2186 } else 2187 return (FALSE); 2188} 2189 2190/* 2191 * Tries to demote a 2MB page mapping. 
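 * On success the single PDE is replaced by a page table of NPTEPG
 * 4KB entries that preserve the original protection and attribute
 * bits.  If the saved page table page is missing (and the mapping
 * is not wired), the mapping is removed instead and FALSE is
 * returned.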
2192 */ 2193static boolean_t 2194pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2195{ 2196 pd_entry_t newpde, oldpde; 2197 pt_entry_t *firstpte, newpte, *pte; 2198 vm_paddr_t mptepa; 2199 vm_page_t free, mpte; 2200 2201 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2202 mpte = pmap_lookup_pt_page(pmap, va); 2203 if (mpte != NULL) 2204 pmap_remove_pt_page(pmap, mpte); 2205 else { 2206 KASSERT((*pde & PG_W) == 0, 2207 ("pmap_demote_pde: page table page for a wired mapping" 2208 " is missing")); 2209 free = NULL; 2210 pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free); 2211 pmap_invalidate_page(pmap, trunc_2mpage(va)); 2212 pmap_free_zero_pages(free); 2213 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx" 2214 " in pmap %p", va, pmap); 2215 return (FALSE); 2216 } 2217 mptepa = VM_PAGE_TO_PHYS(mpte); 2218 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 2219 oldpde = *pde; 2220 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 2221 KASSERT((oldpde & (PG_A | PG_V)) == (PG_A | PG_V), 2222 ("pmap_demote_pde: oldpde is missing PG_A and/or PG_V")); 2223 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 2224 ("pmap_demote_pde: oldpde is missing PG_M")); 2225 KASSERT((oldpde & PG_PS) != 0, 2226 ("pmap_demote_pde: oldpde is missing PG_PS")); 2227 newpte = oldpde & ~PG_PS; 2228 if ((newpte & PG_PDE_PAT) != 0) 2229 newpte ^= PG_PDE_PAT | PG_PTE_PAT; 2230 2231 /* 2232 * If the mapping has changed attributes, update the page table 2233 * entries. 2234 */ 2235 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 2236 ("pmap_demote_pde: firstpte and newpte map different physical" 2237 " addresses")); 2238 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 2239 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 2240 *pte = newpte; 2241 newpte += PAGE_SIZE; 2242 } 2243 2244 /* 2245 * Demote the mapping. This pmap is locked. The old PDE has 2246 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 2247 * set. Thus, there is no danger of a race with another 2248 * processor changing the setting of PG_A and/or PG_M between 2249 * the read above and the store below. 2250 */ 2251 pde_store(pde, newpde); 2252 2253 /* 2254 * Invalidate a stale mapping of the page table page. 2255 */ 2256 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 2257 2258 /* 2259 * Demote the pv entry. This depends on the earlier demotion 2260 * of the mapping. Specifically, the (re)creation of a per- 2261 * page pv entry might trigger the execution of pmap_collect(), 2262 * which might reclaim a newly (re)created per-page pv entry 2263 * and destroy the associated mapping. In order to destroy 2264 * the mapping, the PDE must have already changed from mapping 2265 * the 2mpage to referencing the page table page. 
2266 */ 2267 if ((oldpde & PG_MANAGED) != 0) 2268 pmap_pv_demote_pde(pmap, va, oldpde & PG_FRAME); 2269 2270 pmap_pde_demotions++; 2271 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx" 2272 " in pmap %p", va, pmap); 2273 return (TRUE); 2274} 2275 2276/* 2277 * pmap_remove_pde: do the things to unmap a superpage in a process 2278 */ 2279static int 2280pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 2281 vm_page_t *free) 2282{ 2283 struct md_page *pvh; 2284 pd_entry_t oldpde; 2285 vm_offset_t eva, va; 2286 vm_page_t m, mpte; 2287 2288 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2289 KASSERT((sva & PDRMASK) == 0, 2290 ("pmap_remove_pde: sva is not 2mpage aligned")); 2291 oldpde = pte_load_clear(pdq); 2292 if (oldpde & PG_W) 2293 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 2294 2295 /* 2296 * Machines that don't support invlpg, also don't support 2297 * PG_G. 2298 */ 2299 if (oldpde & PG_G) 2300 pmap_invalidate_page(kernel_pmap, sva); 2301 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2302 if (oldpde & PG_MANAGED) { 2303 pvh = pa_to_pvh(oldpde & PG_FRAME); 2304 pmap_pvh_free(pvh, pmap, sva); 2305 eva = sva + NBPDR; 2306 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_FRAME); 2307 va < eva; va += PAGE_SIZE, m++) { 2308 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2309 vm_page_dirty(m); 2310 if (oldpde & PG_A) 2311 vm_page_flag_set(m, PG_REFERENCED); 2312 if (TAILQ_EMPTY(&m->md.pv_list) && 2313 TAILQ_EMPTY(&pvh->pv_list)) 2314 vm_page_flag_clear(m, PG_WRITEABLE); 2315 } 2316 } 2317 if (pmap == kernel_pmap) { 2318 if (!pmap_demote_pde(pmap, pdq, sva)) 2319 panic("pmap_remove_pde: failed demotion"); 2320 } else { 2321 mpte = pmap_lookup_pt_page(pmap, sva); 2322 if (mpte != NULL) { 2323 pmap_remove_pt_page(pmap, mpte); 2324 KASSERT(mpte->wire_count == NPTEPG, 2325 ("pmap_remove_pde: pte page wire count error")); 2326 mpte->wire_count = 0; 2327 pmap_add_delayed_free_list(mpte, free, FALSE); 2328 atomic_subtract_int(&cnt.v_wire_count, 1); 2329 } 2330 } 2331 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); 2332} 2333 2334/* 2335 * pmap_remove_pte: do the things to unmap a page in a process 2336 */ 2337static int 2338pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 2339 pd_entry_t ptepde, vm_page_t *free) 2340{ 2341 pt_entry_t oldpte; 2342 vm_page_t m; 2343 2344 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2345 oldpte = pte_load_clear(ptq); 2346 if (oldpte & PG_W) 2347 pmap->pm_stats.wired_count -= 1; 2348 /* 2349 * Machines that don't support invlpg, also don't support 2350 * PG_G. 2351 */ 2352 if (oldpte & PG_G) 2353 pmap_invalidate_page(kernel_pmap, va); 2354 pmap->pm_stats.resident_count -= 1; 2355 if (oldpte & PG_MANAGED) { 2356 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 2357 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2358 vm_page_dirty(m); 2359 if (oldpte & PG_A) 2360 vm_page_flag_set(m, PG_REFERENCED); 2361 pmap_remove_entry(pmap, m, va); 2362 } 2363 return (pmap_unuse_pt(pmap, va, ptepde, free)); 2364} 2365 2366/* 2367 * Remove a single page from a process address space 2368 */ 2369static void 2370pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free) 2371{ 2372 pt_entry_t *pte; 2373 2374 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2375 if ((*pde & PG_V) == 0) 2376 return; 2377 pte = pmap_pde_to_pte(pde, va); 2378 if ((*pte & PG_V) == 0) 2379 return; 2380 pmap_remove_pte(pmap, pte, va, *pde, free); 2381 pmap_invalidate_page(pmap, va); 2382} 2383 2384/* 2385 * Remove the given range of addresses from the specified map. 
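 * 2MB page mappings that are only partially covered by the range
 * are demoted first, and TLB invalidations are batched and issued
 * once at the end whenever possible.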
2386 * 2387 * It is assumed that the start and end are properly 2388 * rounded to the page size. 2389 */ 2390void 2391pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2392{ 2393 vm_offset_t va_next; 2394 pml4_entry_t *pml4e; 2395 pdp_entry_t *pdpe; 2396 pd_entry_t ptpaddr, *pde; 2397 pt_entry_t *pte; 2398 vm_page_t free = NULL; 2399 int anyvalid; 2400 2401 /* 2402 * Perform an unsynchronized read. This is, however, safe. 2403 */ 2404 if (pmap->pm_stats.resident_count == 0) 2405 return; 2406 2407 anyvalid = 0; 2408 2409 vm_page_lock_queues(); 2410 PMAP_LOCK(pmap); 2411 2412 /* 2413 * special handling of removing one page. a very 2414 * common operation and easy to short circuit some 2415 * code. 2416 */ 2417 if (sva + PAGE_SIZE == eva) { 2418 pde = pmap_pde(pmap, sva); 2419 if (pde && (*pde & PG_PS) == 0) { 2420 pmap_remove_page(pmap, sva, pde, &free); 2421 goto out; 2422 } 2423 } 2424 2425 for (; sva < eva; sva = va_next) { 2426 2427 if (pmap->pm_stats.resident_count == 0) 2428 break; 2429 2430 pml4e = pmap_pml4e(pmap, sva); 2431 if ((*pml4e & PG_V) == 0) { 2432 va_next = (sva + NBPML4) & ~PML4MASK; 2433 if (va_next < sva) 2434 va_next = eva; 2435 continue; 2436 } 2437 2438 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2439 if ((*pdpe & PG_V) == 0) { 2440 va_next = (sva + NBPDP) & ~PDPMASK; 2441 if (va_next < sva) 2442 va_next = eva; 2443 continue; 2444 } 2445 2446 /* 2447 * Calculate index for next page table. 2448 */ 2449 va_next = (sva + NBPDR) & ~PDRMASK; 2450 if (va_next < sva) 2451 va_next = eva; 2452 2453 pde = pmap_pdpe_to_pde(pdpe, sva); 2454 ptpaddr = *pde; 2455 2456 /* 2457 * Weed out invalid mappings. 2458 */ 2459 if (ptpaddr == 0) 2460 continue; 2461 2462 /* 2463 * Check for large page. 2464 */ 2465 if ((ptpaddr & PG_PS) != 0) { 2466 /* 2467 * Are we removing the entire large page? If not, 2468 * demote the mapping and fall through. 2469 */ 2470 if (sva + NBPDR == va_next && eva >= va_next) { 2471 /* 2472 * The TLB entry for a PG_G mapping is 2473 * invalidated by pmap_remove_pde(). 2474 */ 2475 if ((ptpaddr & PG_G) == 0) 2476 anyvalid = 1; 2477 pmap_remove_pde(pmap, pde, sva, &free); 2478 continue; 2479 } else if (!pmap_demote_pde(pmap, pde, sva)) { 2480 /* The large page mapping was destroyed. */ 2481 continue; 2482 } else 2483 ptpaddr = *pde; 2484 } 2485 2486 /* 2487 * Limit our scan to either the end of the va represented 2488 * by the current page table page, or to the end of the 2489 * range being removed. 2490 */ 2491 if (va_next > eva) 2492 va_next = eva; 2493 2494 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2495 sva += PAGE_SIZE) { 2496 if (*pte == 0) 2497 continue; 2498 2499 /* 2500 * The TLB entry for a PG_G mapping is invalidated 2501 * by pmap_remove_pte(). 2502 */ 2503 if ((*pte & PG_G) == 0) 2504 anyvalid = 1; 2505 if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free)) 2506 break; 2507 } 2508 } 2509out: 2510 if (anyvalid) 2511 pmap_invalidate_all(pmap); 2512 vm_page_unlock_queues(); 2513 PMAP_UNLOCK(pmap); 2514 pmap_free_zero_pages(free); 2515} 2516 2517/* 2518 * Routine: pmap_remove_all 2519 * Function: 2520 * Removes this physical page from 2521 * all physical maps in which it resides. 2522 * Reflects back modify bits to the pager. 2523 * 2524 * Notes: 2525 * Original versions of this routine were very 2526 * inefficient because they iteratively called 2527 * pmap_remove (slow...) 
2528 */ 2529 2530void 2531pmap_remove_all(vm_page_t m) 2532{ 2533 struct md_page *pvh; 2534 pv_entry_t pv; 2535 pmap_t pmap; 2536 pt_entry_t *pte, tpte; 2537 pd_entry_t *pde; 2538 vm_offset_t va; 2539 vm_page_t free; 2540 2541 KASSERT((m->flags & PG_FICTITIOUS) == 0, 2542 ("pmap_remove_all: page %p is fictitious", m)); 2543 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2544 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2545 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 2546 va = pv->pv_va; 2547 pmap = PV_PMAP(pv); 2548 PMAP_LOCK(pmap); 2549 pde = pmap_pde(pmap, va); 2550 (void)pmap_demote_pde(pmap, pde, va); 2551 PMAP_UNLOCK(pmap); 2552 } 2553 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2554 pmap = PV_PMAP(pv); 2555 PMAP_LOCK(pmap); 2556 pmap->pm_stats.resident_count--; 2557 pde = pmap_pde(pmap, pv->pv_va); 2558 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 2559 " a 2mpage in page %p's pv list", m)); 2560 pte = pmap_pde_to_pte(pde, pv->pv_va); 2561 tpte = pte_load_clear(pte); 2562 if (tpte & PG_W) 2563 pmap->pm_stats.wired_count--; 2564 if (tpte & PG_A) 2565 vm_page_flag_set(m, PG_REFERENCED); 2566 2567 /* 2568 * Update the vm_page_t clean and reference bits. 2569 */ 2570 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2571 vm_page_dirty(m); 2572 free = NULL; 2573 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); 2574 pmap_invalidate_page(pmap, pv->pv_va); 2575 pmap_free_zero_pages(free); 2576 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2577 free_pv_entry(pmap, pv); 2578 PMAP_UNLOCK(pmap); 2579 } 2580 vm_page_flag_clear(m, PG_WRITEABLE); 2581} 2582 2583/* 2584 * pmap_protect_pde: do the things to protect a 2mpage in a process 2585 */ 2586static boolean_t 2587pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 2588{ 2589 pd_entry_t newpde, oldpde; 2590 vm_offset_t eva, va; 2591 vm_page_t m; 2592 boolean_t anychanged; 2593 2594 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2595 KASSERT((sva & PDRMASK) == 0, 2596 ("pmap_protect_pde: sva is not 2mpage aligned")); 2597 anychanged = FALSE; 2598retry: 2599 oldpde = newpde = *pde; 2600 if (oldpde & PG_MANAGED) { 2601 eva = sva + NBPDR; 2602 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_FRAME); 2603 va < eva; va += PAGE_SIZE, m++) { 2604 /* 2605 * In contrast to the analogous operation on a 4KB page 2606 * mapping, the mapping's PG_A flag is not cleared and 2607 * the page's PG_REFERENCED flag is not set. The 2608 * reason is that pmap_demote_pde() expects that a 2MB 2609 * page mapping with a stored page table page has PG_A 2610 * set. 2611 */ 2612 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2613 vm_page_dirty(m); 2614 } 2615 } 2616 if ((prot & VM_PROT_WRITE) == 0) 2617 newpde &= ~(PG_RW | PG_M); 2618 if ((prot & VM_PROT_EXECUTE) == 0) 2619 newpde |= pg_nx; 2620 if (newpde != oldpde) { 2621 if (!atomic_cmpset_long(pde, oldpde, newpde)) 2622 goto retry; 2623 if (oldpde & PG_G) 2624 pmap_invalidate_page(pmap, sva); 2625 else 2626 anychanged = TRUE; 2627 } 2628 return (anychanged); 2629} 2630 2631/* 2632 * Set the physical protection on the 2633 * specified range of this map as requested. 
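 * Removing all access rights degenerates into pmap_remove(); a
 * request that takes away neither write nor execute permission is
 * a no-op.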
2634 */ 2635void 2636pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2637{ 2638 vm_offset_t va_next; 2639 pml4_entry_t *pml4e; 2640 pdp_entry_t *pdpe; 2641 pd_entry_t ptpaddr, *pde; 2642 pt_entry_t *pte; 2643 int anychanged; 2644 2645 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2646 pmap_remove(pmap, sva, eva); 2647 return; 2648 } 2649 2650 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 2651 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 2652 return; 2653 2654 anychanged = 0; 2655 2656 vm_page_lock_queues(); 2657 PMAP_LOCK(pmap); 2658 for (; sva < eva; sva = va_next) { 2659 2660 pml4e = pmap_pml4e(pmap, sva); 2661 if ((*pml4e & PG_V) == 0) { 2662 va_next = (sva + NBPML4) & ~PML4MASK; 2663 if (va_next < sva) 2664 va_next = eva; 2665 continue; 2666 } 2667 2668 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2669 if ((*pdpe & PG_V) == 0) { 2670 va_next = (sva + NBPDP) & ~PDPMASK; 2671 if (va_next < sva) 2672 va_next = eva; 2673 continue; 2674 } 2675 2676 va_next = (sva + NBPDR) & ~PDRMASK; 2677 if (va_next < sva) 2678 va_next = eva; 2679 2680 pde = pmap_pdpe_to_pde(pdpe, sva); 2681 ptpaddr = *pde; 2682 2683 /* 2684 * Weed out invalid mappings. 2685 */ 2686 if (ptpaddr == 0) 2687 continue; 2688 2689 /* 2690 * Check for large page. 2691 */ 2692 if ((ptpaddr & PG_PS) != 0) { 2693 /* 2694 * Are we protecting the entire large page? If not, 2695 * demote the mapping and fall through. 2696 */ 2697 if (sva + NBPDR == va_next && eva >= va_next) { 2698 /* 2699 * The TLB entry for a PG_G mapping is 2700 * invalidated by pmap_protect_pde(). 2701 */ 2702 if (pmap_protect_pde(pmap, pde, sva, prot)) 2703 anychanged = 1; 2704 continue; 2705 } else if (!pmap_demote_pde(pmap, pde, sva)) { 2706 /* The large page mapping was destroyed. */ 2707 continue; 2708 } 2709 } 2710 2711 if (va_next > eva) 2712 va_next = eva; 2713 2714 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2715 sva += PAGE_SIZE) { 2716 pt_entry_t obits, pbits; 2717 vm_page_t m; 2718 2719retry: 2720 obits = pbits = *pte; 2721 if ((pbits & PG_V) == 0) 2722 continue; 2723 if (pbits & PG_MANAGED) { 2724 m = NULL; 2725 if (pbits & PG_A) { 2726 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 2727 vm_page_flag_set(m, PG_REFERENCED); 2728 pbits &= ~PG_A; 2729 } 2730 if ((pbits & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 2731 if (m == NULL) 2732 m = PHYS_TO_VM_PAGE(pbits & 2733 PG_FRAME); 2734 vm_page_dirty(m); 2735 } 2736 } 2737 2738 if ((prot & VM_PROT_WRITE) == 0) 2739 pbits &= ~(PG_RW | PG_M); 2740 if ((prot & VM_PROT_EXECUTE) == 0) 2741 pbits |= pg_nx; 2742 2743 if (pbits != obits) { 2744 if (!atomic_cmpset_long(pte, obits, pbits)) 2745 goto retry; 2746 if (obits & PG_G) 2747 pmap_invalidate_page(pmap, sva); 2748 else 2749 anychanged = 1; 2750 } 2751 } 2752 } 2753 if (anychanged) 2754 pmap_invalidate_all(pmap); 2755 vm_page_unlock_queues(); 2756 PMAP_UNLOCK(pmap); 2757} 2758 2759/* 2760 * Tries to promote the 512, contiguous 4KB page mappings that are within a 2761 * single page table page to a single 2MB page mapping. For promotion to 2762 * occur, two conditions must be met: (1) the 4KB page mappings must map 2763 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 2764 * identical characteristics. 
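 * In addition, the first PTE must have PG_A set, and the page
 * table page is saved with pmap_insert_pt_page() so that a later
 * demotion can recover it without allocating memory.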
2765 */ 2766static void 2767pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2768{ 2769 pd_entry_t newpde; 2770 pt_entry_t *firstpte, oldpte, *pte; 2771 vm_offset_t oldpteva; 2772 vm_paddr_t pa; 2773 vm_page_t mpte; 2774 2775 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2776 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 2777 KASSERT((*firstpte & PG_V) != 0, 2778 ("pmap_promote_pde: firstpte is missing PG_V")); 2779 if ((*firstpte & PG_A) == 0) { 2780 pmap_pde_p_failures++; 2781 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 2782 " in pmap %p", va, pmap); 2783 return; 2784 } 2785 pa = *firstpte & PG_PS_FRAME; 2786 newpde = *firstpte; 2787 if ((newpde & (PG_M | PG_RW)) == PG_RW) 2788 newpde &= ~PG_RW; 2789 2790 /* 2791 * Check all the ptes before promotion 2792 */ 2793 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 2794retry: 2795 oldpte = *pte; 2796 if ((oldpte & PG_FRAME) != pa) { 2797 pmap_pde_p_failures++; 2798 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 2799 " in pmap %p", va, pmap); 2800 return; 2801 } 2802 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 2803 /* 2804 * When PG_M is already clear, PG_RW can be cleared 2805 * without a TLB invalidation. 2806 */ 2807 if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW)) 2808 goto retry; 2809 oldpte &= ~PG_RW; 2810 oldpteva = (oldpte & PG_FRAME & PDRMASK) | 2811 (va & ~PDRMASK); 2812 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 2813 " in pmap %p", oldpteva, pmap); 2814 } 2815 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 2816 pmap_pde_p_failures++; 2817 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 2818 " in pmap %p", va, pmap); 2819 return; 2820 } 2821 pa += PAGE_SIZE; 2822 } 2823 2824 /* 2825 * Save the page table page in its current state until the PDE 2826 * mapping the superpage is demoted by pmap_demote_pde() or 2827 * destroyed by pmap_remove_pde(). 2828 */ 2829 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 2830 KASSERT(mpte >= vm_page_array && 2831 mpte < &vm_page_array[vm_page_array_size], 2832 ("pmap_promote_pde: page table page is out of range")); 2833 KASSERT(mpte->pindex == pmap_pde_pindex(va), 2834 ("pmap_promote_pde: page table page's pindex is wrong")); 2835 pmap_insert_pt_page(pmap, mpte); 2836 2837 /* 2838 * Promote the pv entries. 2839 */ 2840 if ((newpde & PG_MANAGED) != 0) 2841 pmap_pv_promote_pde(pmap, va, newpde & PG_FRAME); 2842 2843 /* 2844 * Propagate the PAT index to its proper position. 2845 */ 2846 if ((newpde & PG_PTE_PAT) != 0) 2847 newpde ^= PG_PDE_PAT | PG_PTE_PAT; 2848 2849 /* 2850 * Map the superpage. 2851 */ 2852 pde_store(pde, PG_PS | newpde); 2853 2854 pmap_pde_promotions++; 2855 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" 2856 " in pmap %p", va, pmap); 2857} 2858 2859/* 2860 * Insert the given physical page (p) at 2861 * the specified virtual address (v) in the 2862 * target physical map with the protection requested. 2863 * 2864 * If specified, the page will be wired down, meaning 2865 * that the related pte can not be reclaimed. 2866 * 2867 * NB: This is the only routine which MAY NOT lazy-evaluate 2868 * or lose information. That is, this routine must actually 2869 * insert this page into the given map NOW. 
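 * After a successful insertion, promotion to a 2MB page mapping is
 * attempted if the page table page and the page's reservation are
 * fully populated.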
2870 */ 2871void 2872pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m, 2873 vm_prot_t prot, boolean_t wired) 2874{ 2875 vm_paddr_t pa; 2876 pd_entry_t *pde; 2877 pt_entry_t *pte; 2878 vm_paddr_t opa; 2879 pt_entry_t origpte, newpte; 2880 vm_page_t mpte, om; 2881 boolean_t invlva; 2882 2883 va = trunc_page(va); 2884 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 2885 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 2886 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va)); 2887 2888 mpte = NULL; 2889 2890 vm_page_lock_queues(); 2891 PMAP_LOCK(pmap); 2892 2893 /* 2894 * In the case that a page table page is not 2895 * resident, we are creating it here. 2896 */ 2897 if (va < VM_MAXUSER_ADDRESS) { 2898 mpte = pmap_allocpte(pmap, va, M_WAITOK); 2899 } 2900 2901 pde = pmap_pde(pmap, va); 2902 if (pde != NULL && (*pde & PG_V) != 0) { 2903 if ((*pde & PG_PS) != 0) 2904 panic("pmap_enter: attempted pmap_enter on 2MB page"); 2905 pte = pmap_pde_to_pte(pde, va); 2906 } else 2907 pte = NULL; 2908 2909 /* 2910 * Page Directory table entry not valid, we need a new PT page 2911 */ 2912 if (pte == NULL) 2913 panic("pmap_enter: invalid page directory va=%#lx", va); 2914 2915 pa = VM_PAGE_TO_PHYS(m); 2916 om = NULL; 2917 origpte = *pte; 2918 opa = origpte & PG_FRAME; 2919 2920 /* 2921 * Mapping has not changed, must be protection or wiring change. 2922 */ 2923 if (origpte && (opa == pa)) { 2924 /* 2925 * Wiring change, just update stats. We don't worry about 2926 * wiring PT pages as they remain resident as long as there 2927 * are valid mappings in them. Hence, if a user page is wired, 2928 * the PT page will be also. 2929 */ 2930 if (wired && ((origpte & PG_W) == 0)) 2931 pmap->pm_stats.wired_count++; 2932 else if (!wired && (origpte & PG_W)) 2933 pmap->pm_stats.wired_count--; 2934 2935 /* 2936 * Remove extra pte reference 2937 */ 2938 if (mpte) 2939 mpte->wire_count--; 2940 2941 /* 2942 * We might be turning off write access to the page, 2943 * so we go ahead and sense modify status. 2944 */ 2945 if (origpte & PG_MANAGED) { 2946 om = m; 2947 pa |= PG_MANAGED; 2948 } 2949 goto validate; 2950 } 2951 /* 2952 * Mapping has changed, invalidate old range and fall through to 2953 * handle validating new mapping. 2954 */ 2955 if (opa) { 2956 if (origpte & PG_W) 2957 pmap->pm_stats.wired_count--; 2958 if (origpte & PG_MANAGED) { 2959 om = PHYS_TO_VM_PAGE(opa); 2960 pmap_remove_entry(pmap, om, va); 2961 } 2962 if (mpte != NULL) { 2963 mpte->wire_count--; 2964 KASSERT(mpte->wire_count > 0, 2965 ("pmap_enter: missing reference to page table page," 2966 " va: 0x%lx", va)); 2967 } 2968 } else 2969 pmap->pm_stats.resident_count++; 2970 2971 /* 2972 * Enter on the PV list if part of our managed memory. 2973 */ 2974 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { 2975 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, 2976 ("pmap_enter: managed mapping within the clean submap")); 2977 pmap_insert_entry(pmap, va, m); 2978 pa |= PG_MANAGED; 2979 } 2980 2981 /* 2982 * Increment counters 2983 */ 2984 if (wired) 2985 pmap->pm_stats.wired_count++; 2986 2987validate: 2988 /* 2989 * Now validate mapping with desired protection/wiring. 
2990 */ 2991 newpte = (pt_entry_t)(pa | PG_V); 2992 if ((prot & VM_PROT_WRITE) != 0) { 2993 newpte |= PG_RW; 2994 vm_page_flag_set(m, PG_WRITEABLE); 2995 } 2996 if ((prot & VM_PROT_EXECUTE) == 0) 2997 newpte |= pg_nx; 2998 if (wired) 2999 newpte |= PG_W; 3000 if (va < VM_MAXUSER_ADDRESS) 3001 newpte |= PG_U; 3002 if (pmap == kernel_pmap) 3003 newpte |= PG_G; 3004 3005 /* 3006 * if the mapping or permission bits are different, we need 3007 * to update the pte. 3008 */ 3009 if ((origpte & ~(PG_M|PG_A)) != newpte) { 3010 newpte |= PG_A; 3011 if ((access & VM_PROT_WRITE) != 0) 3012 newpte |= PG_M; 3013 if (origpte & PG_V) { 3014 invlva = FALSE; 3015 origpte = pte_load_store(pte, newpte); 3016 if (origpte & PG_A) { 3017 if (origpte & PG_MANAGED) 3018 vm_page_flag_set(om, PG_REFERENCED); 3019 if (opa != VM_PAGE_TO_PHYS(m) || ((origpte & 3020 PG_NX) == 0 && (newpte & PG_NX))) 3021 invlva = TRUE; 3022 } 3023 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 3024 if ((origpte & PG_MANAGED) != 0) 3025 vm_page_dirty(om); 3026 if ((newpte & PG_RW) == 0) 3027 invlva = TRUE; 3028 } 3029 if (invlva) 3030 pmap_invalidate_page(pmap, va); 3031 } else 3032 pte_store(pte, newpte); 3033 } 3034 3035 /* 3036 * If both the page table page and the reservation are fully 3037 * populated, then attempt promotion. 3038 */ 3039 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 3040 pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0) 3041 pmap_promote_pde(pmap, pde, va); 3042 3043 vm_page_unlock_queues(); 3044 PMAP_UNLOCK(pmap); 3045} 3046 3047/* 3048 * Tries to create a 2MB page mapping. Returns TRUE if successful and FALSE 3049 * otherwise. Fails if (1) a page table page cannot be allocated without 3050 * blocking, (2) a mapping already exists at the specified virtual address, or 3051 * (3) a pv entry cannot be allocated without reclaiming another pv entry. 3052 */ 3053static boolean_t 3054pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3055{ 3056 pd_entry_t *pde, newpde; 3057 vm_page_t free, mpde; 3058 3059 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3060 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3061 if ((mpde = pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) { 3062 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3063 " in pmap %p", va, pmap); 3064 return (FALSE); 3065 } 3066 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde)); 3067 pde = &pde[pmap_pde_index(va)]; 3068 if ((*pde & PG_V) != 0) { 3069 KASSERT(mpde->wire_count > 1, 3070 ("pmap_enter_pde: mpde's wire count is too low")); 3071 mpde->wire_count--; 3072 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3073 " in pmap %p", va, pmap); 3074 return (FALSE); 3075 } 3076 newpde = VM_PAGE_TO_PHYS(m) | PG_PS | PG_V; 3077 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { 3078 newpde |= PG_MANAGED; 3079 3080 /* 3081 * Abort this mapping if its PV entry could not be created. 3082 */ 3083 if (!pmap_pv_insert_pde(pmap, va, m)) { 3084 free = NULL; 3085 if (pmap_unwire_pte_hold(pmap, va, mpde, &free)) { 3086 pmap_invalidate_page(pmap, va); 3087 pmap_free_zero_pages(free); 3088 } 3089 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3090 " in pmap %p", va, pmap); 3091 return (FALSE); 3092 } 3093 } 3094 if ((prot & VM_PROT_EXECUTE) == 0) 3095 newpde |= pg_nx; 3096 if (va < VM_MAXUSER_ADDRESS) 3097 newpde |= PG_U; 3098 3099 /* 3100 * Increment counters. 3101 */ 3102 pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; 3103 3104 /* 3105 * Map the superpage. 
3106 */ 3107 pde_store(pde, newpde); 3108 3109 pmap_pde_mappings++; 3110 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 3111 " in pmap %p", va, pmap); 3112 return (TRUE); 3113} 3114 3115/* 3116 * Maps a sequence of resident pages belonging to the same object. 3117 * The sequence begins with the given page m_start. This page is 3118 * mapped at the given virtual address start. Each subsequent page is 3119 * mapped at a virtual address that is offset from start by the same 3120 * amount as the page is offset from m_start within the object. The 3121 * last page in the sequence is the page with the largest offset from 3122 * m_start that can be mapped at a virtual address less than the given 3123 * virtual address end. Not every virtual page between start and end 3124 * is mapped; only those for which a resident page exists with the 3125 * corresponding offset from m_start are mapped. 3126 */ 3127void 3128pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3129 vm_page_t m_start, vm_prot_t prot) 3130{ 3131 vm_offset_t va; 3132 vm_page_t m, mpte; 3133 vm_pindex_t diff, psize; 3134 3135 VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED); 3136 psize = atop(end - start); 3137 mpte = NULL; 3138 m = m_start; 3139 PMAP_LOCK(pmap); 3140 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3141 va = start + ptoa(diff); 3142 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 3143 (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 && 3144 pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 && 3145 pmap_enter_pde(pmap, va, m, prot)) 3146 m = &m[NBPDR / PAGE_SIZE - 1]; 3147 else 3148 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 3149 mpte); 3150 m = TAILQ_NEXT(m, listq); 3151 } 3152 PMAP_UNLOCK(pmap); 3153} 3154 3155/* 3156 * this code makes some *MAJOR* assumptions: 3157 * 1. Current pmap & pmap exists. 3158 * 2. Not wired. 3159 * 3. Read access. 3160 * 4. No page table pages. 3161 * but is *MUCH* faster than pmap_enter... 3162 */ 3163 3164void 3165pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3166{ 3167 3168 PMAP_LOCK(pmap); 3169 (void) pmap_enter_quick_locked(pmap, va, m, prot, NULL); 3170 PMAP_UNLOCK(pmap); 3171} 3172 3173static vm_page_t 3174pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3175 vm_prot_t prot, vm_page_t mpte) 3176{ 3177 vm_page_t free; 3178 pt_entry_t *pte; 3179 vm_paddr_t pa; 3180 3181 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 3182 (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0, 3183 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3184 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3185 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3186 3187 /* 3188 * In the case that a page table page is not 3189 * resident, we are creating it here. 3190 */ 3191 if (va < VM_MAXUSER_ADDRESS) { 3192 vm_pindex_t ptepindex; 3193 pd_entry_t *ptepa; 3194 3195 /* 3196 * Calculate pagetable page index 3197 */ 3198 ptepindex = pmap_pde_pindex(va); 3199 if (mpte && (mpte->pindex == ptepindex)) { 3200 mpte->wire_count++; 3201 } else { 3202 /* 3203 * Get the page directory entry 3204 */ 3205 ptepa = pmap_pde(pmap, va); 3206 3207 /* 3208 * If the page table page is mapped, we just increment 3209 * the hold count, and activate it. 
3210 */ 3211 if (ptepa && (*ptepa & PG_V) != 0) { 3212 if (*ptepa & PG_PS) 3213 return (NULL); 3214 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); 3215 mpte->wire_count++; 3216 } else { 3217 mpte = _pmap_allocpte(pmap, ptepindex, 3218 M_NOWAIT); 3219 if (mpte == NULL) 3220 return (mpte); 3221 } 3222 } 3223 } else { 3224 mpte = NULL; 3225 } 3226 3227 /* 3228 * This call to vtopte makes the assumption that we are 3229 * entering the page into the current pmap. In order to support 3230 * quick entry into any pmap, one would likely use pmap_pte. 3231 * But that isn't as quick as vtopte. 3232 */ 3233 pte = vtopte(va); 3234 if (*pte) { 3235 if (mpte != NULL) { 3236 mpte->wire_count--; 3237 mpte = NULL; 3238 } 3239 return (mpte); 3240 } 3241 3242 /* 3243 * Enter on the PV list if part of our managed memory. 3244 */ 3245 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 && 3246 !pmap_try_insert_pv_entry(pmap, va, m)) { 3247 if (mpte != NULL) { 3248 free = NULL; 3249 if (pmap_unwire_pte_hold(pmap, va, mpte, &free)) { 3250 pmap_invalidate_page(pmap, va); 3251 pmap_free_zero_pages(free); 3252 } 3253 mpte = NULL; 3254 } 3255 return (mpte); 3256 } 3257 3258 /* 3259 * Increment counters 3260 */ 3261 pmap->pm_stats.resident_count++; 3262 3263 pa = VM_PAGE_TO_PHYS(m); 3264 if ((prot & VM_PROT_EXECUTE) == 0) 3265 pa |= pg_nx; 3266 3267 /* 3268 * Now validate mapping with RO protection 3269 */ 3270 if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) 3271 pte_store(pte, pa | PG_V | PG_U); 3272 else 3273 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); 3274 return mpte; 3275} 3276 3277/* 3278 * Make a temporary mapping for a physical address. This is only intended 3279 * to be used for panic dumps. 3280 */ 3281void * 3282pmap_kenter_temporary(vm_paddr_t pa, int i) 3283{ 3284 vm_offset_t va; 3285 3286 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 3287 pmap_kenter(va, pa); 3288 invlpg(va); 3289 return ((void *)crashdumpmap); 3290} 3291 3292/* 3293 * This code maps large physical mmap regions into the 3294 * processor address space. Note that some shortcuts 3295 * are taken, but the code works. 
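 * Only device-backed (OBJT_DEVICE) objects are handled, and only
 * when both the address and the size are 2MB aligned; the region
 * is then mapped using 2MB page mappings.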
3296 */ 3297void 3298pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, 3299 vm_object_t object, vm_pindex_t pindex, 3300 vm_size_t size) 3301{ 3302 vm_offset_t va; 3303 vm_page_t p, pdpg; 3304 3305 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 3306 KASSERT(object->type == OBJT_DEVICE, 3307 ("pmap_object_init_pt: non-device object")); 3308 if (((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) { 3309 vm_page_t m[1]; 3310 pd_entry_t ptepa, *pde; 3311 3312 PMAP_LOCK(pmap); 3313 pde = pmap_pde(pmap, addr); 3314 if (pde != 0 && (*pde & PG_V) != 0) 3315 goto out; 3316 PMAP_UNLOCK(pmap); 3317retry: 3318 p = vm_page_lookup(object, pindex); 3319 if (p != NULL) { 3320 if (vm_page_sleep_if_busy(p, FALSE, "init4p")) 3321 goto retry; 3322 } else { 3323 p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); 3324 if (p == NULL) 3325 return; 3326 m[0] = p; 3327 3328 if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { 3329 vm_page_lock_queues(); 3330 vm_page_free(p); 3331 vm_page_unlock_queues(); 3332 return; 3333 } 3334 3335 p = vm_page_lookup(object, pindex); 3336 vm_page_lock_queues(); 3337 vm_page_wakeup(p); 3338 vm_page_unlock_queues(); 3339 } 3340 3341 ptepa = VM_PAGE_TO_PHYS(p); 3342 if (ptepa & (NBPDR - 1)) 3343 return; 3344 3345 p->valid = VM_PAGE_BITS_ALL; 3346 3347 PMAP_LOCK(pmap); 3348 for (va = addr; va < addr + size; va += NBPDR) { 3349 while ((pdpg = 3350 pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) { 3351 PMAP_UNLOCK(pmap); 3352 vm_page_lock_queues(); 3353 vm_page_busy(p); 3354 vm_page_unlock_queues(); 3355 VM_OBJECT_UNLOCK(object); 3356 VM_WAIT; 3357 VM_OBJECT_LOCK(object); 3358 vm_page_lock_queues(); 3359 vm_page_wakeup(p); 3360 vm_page_unlock_queues(); 3361 PMAP_LOCK(pmap); 3362 } 3363 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 3364 pde = &pde[pmap_pde_index(va)]; 3365 if ((*pde & PG_V) == 0) { 3366 pde_store(pde, ptepa | PG_PS | PG_M | PG_A | 3367 PG_U | PG_RW | PG_V); 3368 pmap->pm_stats.resident_count += 3369 NBPDR / PAGE_SIZE; 3370 } else { 3371 pdpg->wire_count--; 3372 KASSERT(pdpg->wire_count > 0, 3373 ("pmap_object_init_pt: missing reference " 3374 "to page directory page, va: 0x%lx", va)); 3375 } 3376 ptepa += NBPDR; 3377 } 3378 pmap_invalidate_all(pmap); 3379out: 3380 PMAP_UNLOCK(pmap); 3381 } 3382} 3383 3384/* 3385 * Routine: pmap_change_wiring 3386 * Function: Change the wiring attribute for a map/virtual-address 3387 * pair. 3388 * In/out conditions: 3389 * The mapping must already exist in the pmap. 3390 */ 3391void 3392pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) 3393{ 3394 pd_entry_t *pde; 3395 pt_entry_t *pte; 3396 boolean_t are_queues_locked; 3397 3398 are_queues_locked = FALSE; 3399 3400 /* 3401 * Wiring is not a hardware characteristic so there is no need to 3402 * invalidate TLB. 
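 * A 2MB page mapping whose wired state is changing is first
 * demoted so that the wired bit can be maintained on the
 * individual 4KB entries.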
3403 */ 3404retry: 3405 PMAP_LOCK(pmap); 3406 pde = pmap_pde(pmap, va); 3407 if ((*pde & PG_PS) != 0) { 3408 if (!wired != ((*pde & PG_W) == 0)) { 3409 if (!are_queues_locked) { 3410 are_queues_locked = TRUE; 3411 if (!mtx_trylock(&vm_page_queue_mtx)) { 3412 PMAP_UNLOCK(pmap); 3413 vm_page_lock_queues(); 3414 goto retry; 3415 } 3416 } 3417 if (!pmap_demote_pde(pmap, pde, va)) 3418 panic("pmap_change_wiring: demotion failed"); 3419 } else 3420 goto out; 3421 } 3422 pte = pmap_pde_to_pte(pde, va); 3423 if (wired && (*pte & PG_W) == 0) { 3424 pmap->pm_stats.wired_count++; 3425 atomic_set_long(pte, PG_W); 3426 } else if (!wired && (*pte & PG_W) != 0) { 3427 pmap->pm_stats.wired_count--; 3428 atomic_clear_long(pte, PG_W); 3429 } 3430out: 3431 if (are_queues_locked) 3432 vm_page_unlock_queues(); 3433 PMAP_UNLOCK(pmap); 3434} 3435 3436 3437 3438/* 3439 * Copy the range specified by src_addr/len 3440 * from the source map to the range dst_addr/len 3441 * in the destination map. 3442 * 3443 * This routine is only advisory and need not do anything. 3444 */ 3445 3446void 3447pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 3448 vm_offset_t src_addr) 3449{ 3450 vm_page_t free; 3451 vm_offset_t addr; 3452 vm_offset_t end_addr = src_addr + len; 3453 vm_offset_t va_next; 3454 3455 if (dst_addr != src_addr) 3456 return; 3457 3458 if (!pmap_is_current(src_pmap)) 3459 return; 3460 3461 vm_page_lock_queues(); 3462 if (dst_pmap < src_pmap) { 3463 PMAP_LOCK(dst_pmap); 3464 PMAP_LOCK(src_pmap); 3465 } else { 3466 PMAP_LOCK(src_pmap); 3467 PMAP_LOCK(dst_pmap); 3468 } 3469 for (addr = src_addr; addr < end_addr; addr = va_next) { 3470 pt_entry_t *src_pte, *dst_pte; 3471 vm_page_t dstmpde, dstmpte, srcmpte; 3472 pml4_entry_t *pml4e; 3473 pdp_entry_t *pdpe; 3474 pd_entry_t srcptepaddr, *pde; 3475 3476 KASSERT(addr < UPT_MIN_ADDRESS, 3477 ("pmap_copy: invalid to pmap_copy page tables")); 3478 3479 pml4e = pmap_pml4e(src_pmap, addr); 3480 if ((*pml4e & PG_V) == 0) { 3481 va_next = (addr + NBPML4) & ~PML4MASK; 3482 if (va_next < addr) 3483 va_next = end_addr; 3484 continue; 3485 } 3486 3487 pdpe = pmap_pml4e_to_pdpe(pml4e, addr); 3488 if ((*pdpe & PG_V) == 0) { 3489 va_next = (addr + NBPDP) & ~PDPMASK; 3490 if (va_next < addr) 3491 va_next = end_addr; 3492 continue; 3493 } 3494 3495 va_next = (addr + NBPDR) & ~PDRMASK; 3496 if (va_next < addr) 3497 va_next = end_addr; 3498 3499 pde = pmap_pdpe_to_pde(pdpe, addr); 3500 srcptepaddr = *pde; 3501 if (srcptepaddr == 0) 3502 continue; 3503 3504 if (srcptepaddr & PG_PS) { 3505 dstmpde = pmap_allocpde(dst_pmap, addr, M_NOWAIT); 3506 if (dstmpde == NULL) 3507 break; 3508 pde = (pd_entry_t *) 3509 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde)); 3510 pde = &pde[pmap_pde_index(addr)]; 3511 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 3512 pmap_pv_insert_pde(dst_pmap, addr, 3513 PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME)))) { 3514 *pde = srcptepaddr & ~PG_W; 3515 dst_pmap->pm_stats.resident_count += 3516 NBPDR / PAGE_SIZE; 3517 } else 3518 dstmpde->wire_count--; 3519 continue; 3520 } 3521 3522 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); 3523 KASSERT(srcmpte->wire_count > 0, 3524 ("pmap_copy: source page table page is unused")); 3525 3526 if (va_next > end_addr) 3527 va_next = end_addr; 3528 3529 src_pte = vtopte(addr); 3530 while (addr < va_next) { 3531 pt_entry_t ptetemp; 3532 ptetemp = *src_pte; 3533 /* 3534 * we only virtual copy managed pages 3535 */ 3536 if ((ptetemp & PG_MANAGED) != 0) { 3537 dstmpte = pmap_allocpte(dst_pmap, addr, 3538 
M_NOWAIT); 3539 if (dstmpte == NULL) 3540 break; 3541 dst_pte = (pt_entry_t *) 3542 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 3543 dst_pte = &dst_pte[pmap_pte_index(addr)]; 3544 if (*dst_pte == 0 && 3545 pmap_try_insert_pv_entry(dst_pmap, addr, 3546 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { 3547 /* 3548 * Clear the wired, modified, and 3549 * accessed (referenced) bits 3550 * during the copy. 3551 */ 3552 *dst_pte = ptetemp & ~(PG_W | PG_M | 3553 PG_A); 3554 dst_pmap->pm_stats.resident_count++; 3555 } else { 3556 free = NULL; 3557 if (pmap_unwire_pte_hold(dst_pmap, 3558 addr, dstmpte, &free)) { 3559 pmap_invalidate_page(dst_pmap, 3560 addr); 3561 pmap_free_zero_pages(free); 3562 } 3563 } 3564 if (dstmpte->wire_count >= srcmpte->wire_count) 3565 break; 3566 } 3567 addr += PAGE_SIZE; 3568 src_pte++; 3569 } 3570 } 3571 vm_page_unlock_queues(); 3572 PMAP_UNLOCK(src_pmap); 3573 PMAP_UNLOCK(dst_pmap); 3574} 3575 3576/* 3577 * pmap_zero_page zeros the specified hardware page by mapping 3578 * the page into KVM and using bzero to clear its contents. 3579 */ 3580void 3581pmap_zero_page(vm_page_t m) 3582{ 3583 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3584 3585 pagezero((void *)va); 3586} 3587 3588/* 3589 * pmap_zero_page_area zeros the specified hardware page by mapping 3590 * the page into KVM and using bzero to clear its contents. 3591 * 3592 * off and size may not cover an area beyond a single hardware page. 3593 */ 3594void 3595pmap_zero_page_area(vm_page_t m, int off, int size) 3596{ 3597 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3598 3599 if (off == 0 && size == PAGE_SIZE) 3600 pagezero((void *)va); 3601 else 3602 bzero((char *)va + off, size); 3603} 3604 3605/* 3606 * pmap_zero_page_idle zeros the specified hardware page by mapping 3607 * the page into KVM and using bzero to clear its contents. This 3608 * is intended to be called from the vm_pagezero process only and 3609 * outside of Giant. 3610 */ 3611void 3612pmap_zero_page_idle(vm_page_t m) 3613{ 3614 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3615 3616 pagezero((void *)va); 3617} 3618 3619/* 3620 * pmap_copy_page copies the specified (machine independent) 3621 * page by mapping the page into virtual memory and using 3622 * bcopy to copy the page, one machine dependent page at a 3623 * time. 3624 */ 3625void 3626pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 3627{ 3628 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 3629 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 3630 3631 pagecopy((void *)src, (void *)dst); 3632} 3633 3634/* 3635 * Returns true if the pmap's pv is one of the first 3636 * 16 pvs linked to from this page. This count may 3637 * be changed upwards or downwards in the future; it 3638 * is only necessary that true be returned for a small 3639 * subset of pmaps for proper page aging. 
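 * Both the page's own pv list and that of its containing 2mpage
 * are consulted, up to 16 pv entries in total.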
3640 */ 3641boolean_t 3642pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 3643{ 3644 struct md_page *pvh; 3645 pv_entry_t pv; 3646 int loops = 0; 3647 3648 if (m->flags & PG_FICTITIOUS) 3649 return FALSE; 3650 3651 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3652 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3653 if (PV_PMAP(pv) == pmap) { 3654 return TRUE; 3655 } 3656 loops++; 3657 if (loops >= 16) 3658 break; 3659 } 3660 if (loops < 16) { 3661 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3662 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { 3663 if (PV_PMAP(pv) == pmap) 3664 return (TRUE); 3665 loops++; 3666 if (loops >= 16) 3667 break; 3668 } 3669 } 3670 return (FALSE); 3671} 3672 3673/* 3674 * pmap_page_wired_mappings: 3675 * 3676 * Return the number of managed mappings to the given physical page 3677 * that are wired. 3678 */ 3679int 3680pmap_page_wired_mappings(vm_page_t m) 3681{ 3682 pv_entry_t pv; 3683 pt_entry_t *pte; 3684 pmap_t pmap; 3685 int count; 3686 3687 count = 0; 3688 if ((m->flags & PG_FICTITIOUS) != 0) 3689 return (count); 3690 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3691 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3692 pmap = PV_PMAP(pv); 3693 PMAP_LOCK(pmap); 3694 pte = pmap_pte(pmap, pv->pv_va); 3695 if ((*pte & PG_W) != 0) 3696 count++; 3697 PMAP_UNLOCK(pmap); 3698 } 3699 return (count); 3700} 3701 3702/* 3703 * Returns TRUE if the given page is mapped individually or as part of 3704 * a 2mpage. Otherwise, returns FALSE. 3705 */ 3706boolean_t 3707pmap_page_is_mapped(vm_page_t m) 3708{ 3709 struct md_page *pvh; 3710 3711 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0) 3712 return (FALSE); 3713 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3714 if (TAILQ_EMPTY(&m->md.pv_list)) { 3715 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3716 return (!TAILQ_EMPTY(&pvh->pv_list)); 3717 } else 3718 return (TRUE); 3719} 3720 3721/* 3722 * Remove all pages from specified address space 3723 * this aids process exit speeds. Also, this code 3724 * is special cased for current process only, but 3725 * can have the more generic (and slightly slower) 3726 * mode enabled. This is much faster than pmap_remove 3727 * in the case of running down an entire address space. 
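 * The pmap's pv entry chunks are scanned directly instead of the
 * page tables; wired mappings are skipped and keep their chunk
 * from being freed.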
3728 */ 3729void 3730pmap_remove_pages(pmap_t pmap) 3731{ 3732 pd_entry_t *pde; 3733 pt_entry_t *pte, tpte; 3734 vm_page_t free = NULL; 3735 vm_page_t m, mpte, mt; 3736 pv_entry_t pv; 3737 struct md_page *pvh; 3738 struct pv_chunk *pc, *npc; 3739 int field, idx; 3740 int64_t bit; 3741 uint64_t inuse, bitmask; 3742 int allfree; 3743 3744 if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { 3745 printf("warning: pmap_remove_pages called with non-current pmap\n"); 3746 return; 3747 } 3748 vm_page_lock_queues(); 3749 PMAP_LOCK(pmap); 3750 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 3751 allfree = 1; 3752 for (field = 0; field < _NPCM; field++) { 3753 inuse = (~(pc->pc_map[field])) & pc_freemask[field]; 3754 while (inuse != 0) { 3755 bit = bsfq(inuse); 3756 bitmask = 1UL << bit; 3757 idx = field * 64 + bit; 3758 pv = &pc->pc_pventry[idx]; 3759 inuse &= ~bitmask; 3760 3761 pde = vtopde(pv->pv_va); 3762 tpte = *pde; 3763 if ((tpte & PG_PS) != 0) 3764 pte = pde; 3765 else { 3766 pte = vtopte(pv->pv_va); 3767 tpte = *pte & ~PG_PTE_PAT; 3768 } 3769 3770 if (tpte == 0) { 3771 printf( 3772 "TPTE at %p IS ZERO @ VA %08lx\n", 3773 pte, pv->pv_va); 3774 panic("bad pte"); 3775 } 3776 3777/* 3778 * We cannot remove wired pages from a process' mapping at this time 3779 */ 3780 if (tpte & PG_W) { 3781 allfree = 0; 3782 continue; 3783 } 3784 3785 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 3786 KASSERT(m->phys_addr == (tpte & PG_FRAME), 3787 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 3788 m, (uintmax_t)m->phys_addr, 3789 (uintmax_t)tpte)); 3790 3791 KASSERT(m < &vm_page_array[vm_page_array_size], 3792 ("pmap_remove_pages: bad tpte %#jx", 3793 (uintmax_t)tpte)); 3794 3795 pte_clear(pte); 3796 3797 /* 3798 * Update the vm_page_t clean/reference bits. 
3799 */
3800 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3801 if ((tpte & PG_PS) != 0) {
3802 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
3803 vm_page_dirty(mt);
3804 } else
3805 vm_page_dirty(m);
3806 }
3807
3808 /* Mark free */
3809 PV_STAT(pv_entry_frees++);
3810 PV_STAT(pv_entry_spare++);
3811 pv_entry_count--;
3812 pc->pc_map[field] |= bitmask;
3813 if ((tpte & PG_PS) != 0) {
3814 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
3815 pvh = pa_to_pvh(tpte & PG_FRAME);
3816 TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
3817 if (TAILQ_EMPTY(&pvh->pv_list)) {
3818 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
3819 if (TAILQ_EMPTY(&mt->md.pv_list))
3820 vm_page_flag_clear(mt, PG_WRITEABLE);
3821 }
3822 mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
3823 if (mpte != NULL) {
3824 pmap_remove_pt_page(pmap, mpte);
3825 KASSERT(mpte->wire_count == NPTEPG,
3826 ("pmap_remove_pages: pte page wire count error"));
3827 mpte->wire_count = 0;
3828 pmap_add_delayed_free_list(mpte, &free, FALSE);
3829 atomic_subtract_int(&cnt.v_wire_count, 1);
3830 }
3831 pmap_unuse_pt(pmap, pv->pv_va,
3832 *pmap_pdpe(pmap, pv->pv_va), &free);
3833 } else {
3834 pmap->pm_stats.resident_count--;
3835 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3836 if (TAILQ_EMPTY(&m->md.pv_list)) {
3837 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3838 if (TAILQ_EMPTY(&pvh->pv_list))
3839 vm_page_flag_clear(m, PG_WRITEABLE);
3840 }
3841 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
3842 }
3843 }
3844 }
3845 if (allfree) {
3846 PV_STAT(pv_entry_spare -= _NPCPV);
3847 PV_STAT(pc_chunk_count--);
3848 PV_STAT(pc_chunk_frees++);
3849 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3850 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3851 dump_drop_page(m->phys_addr);
3852 vm_page_unwire(m, 0);
3853 vm_page_free(m);
3854 }
3855 }
3856 pmap_invalidate_all(pmap);
3857 vm_page_unlock_queues();
3858 PMAP_UNLOCK(pmap);
3859 pmap_free_zero_pages(free);
3860}
3861
3862/*
3863 * pmap_is_modified:
3864 *
3865 * Return whether or not the specified physical page was modified
3866 * in any physical maps.
3867 */
3868boolean_t
3869pmap_is_modified(vm_page_t m)
3870{
3871
3872 if (m->flags & PG_FICTITIOUS)
3873 return (FALSE);
3874 if (pmap_is_modified_pvh(&m->md))
3875 return (TRUE);
3876 return (pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
3877}
3878
3879/*
3880 * Returns TRUE if any of the given mappings were used to modify
3881 * physical memory.  Otherwise, returns FALSE.  Both page and 2mpage
3882 * mappings are supported.
3883 */
3884static boolean_t
3885pmap_is_modified_pvh(struct md_page *pvh)
3886{
3887 pv_entry_t pv;
3888 pt_entry_t *pte;
3889 pmap_t pmap;
3890 boolean_t rv;
3891
3892 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3893 rv = FALSE;
3894 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
3895 pmap = PV_PMAP(pv);
3896 PMAP_LOCK(pmap);
3897 pte = pmap_pte(pmap, pv->pv_va);
3898 rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
3899 PMAP_UNLOCK(pmap);
3900 if (rv)
3901 break;
3902 }
3903 return (rv);
3904}
3905
3906/*
3907 * pmap_is_prefaultable:
3908 *
3909 * Return whether or not the specified virtual address is eligible
3910 * for prefault.
3911 */
3912boolean_t
3913pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
3914{
3915 pd_entry_t *pde;
3916 pt_entry_t *pte;
3917 boolean_t rv;
3918
3919 rv = FALSE;
3920 PMAP_LOCK(pmap);
3921 pde = pmap_pde(pmap, addr);
3922 if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
3923 pte = pmap_pde_to_pte(pde, addr);
3924 rv = (*pte & PG_V) == 0;
3925 }
3926 PMAP_UNLOCK(pmap);
3927 return (rv);
3928}
3929
3930/*
3931 * Clear the write and modified bits in each of the given page's mappings.
3932 */
3933void
3934pmap_remove_write(vm_page_t m)
3935{
3936 struct md_page *pvh;
3937 pmap_t pmap;
3938 pv_entry_t next_pv, pv;
3939 pd_entry_t *pde;
3940 pt_entry_t oldpte, *pte;
3941 vm_offset_t va;
3942
3943 if ((m->flags & PG_FICTITIOUS) != 0 ||
3944 (m->flags & PG_WRITEABLE) == 0)
3945 return;
3946 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3947 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3948 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
3949 va = pv->pv_va;
3950 pmap = PV_PMAP(pv);
3951 PMAP_LOCK(pmap);
3952 pde = pmap_pde(pmap, va);
3953 if ((*pde & PG_RW) != 0)
3954 (void)pmap_demote_pde(pmap, pde, va);
3955 PMAP_UNLOCK(pmap);
3956 }
3957 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3958 pmap = PV_PMAP(pv);
3959 PMAP_LOCK(pmap);
3960 pde = pmap_pde(pmap, pv->pv_va);
3961 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
3962 " a 2mpage in page %p's pv list", m));
3963 pte = pmap_pde_to_pte(pde, pv->pv_va);
3964retry:
3965 oldpte = *pte;
3966 if (oldpte & PG_RW) {
3967 if (!atomic_cmpset_long(pte, oldpte, oldpte &
3968 ~(PG_RW | PG_M)))
3969 goto retry;
3970 if ((oldpte & PG_M) != 0)
3971 vm_page_dirty(m);
3972 pmap_invalidate_page(pmap, pv->pv_va);
3973 }
3974 PMAP_UNLOCK(pmap);
3975 }
3976 vm_page_flag_clear(m, PG_WRITEABLE);
3977}
3978
3979/*
3980 * pmap_ts_referenced:
3981 *
3982 * Return a count of reference bits for a page, clearing those bits.
3983 * It is not necessary for every reference bit to be cleared, but it
3984 * is necessary that 0 only be returned when there are truly no
3985 * reference bits set.
3986 *
3987 * XXX: The exact number of bits to check and clear is a matter that
3988 * should be tested and standardized at some point in the future for
3989 * optimal aging of shared pages.
3990 */
3991int
3992pmap_ts_referenced(vm_page_t m)
3993{
3994 struct md_page *pvh;
3995 pv_entry_t pv, pvf, pvn;
3996 pmap_t pmap;
3997 pd_entry_t oldpde, *pde;
3998 pt_entry_t *pte;
3999 vm_offset_t va;
4000 int rtval = 0;
4001
4002 if (m->flags & PG_FICTITIOUS)
4003 return (rtval);
4004 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4005 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4006 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
4007 va = pv->pv_va;
4008 pmap = PV_PMAP(pv);
4009 PMAP_LOCK(pmap);
4010 pde = pmap_pde(pmap, va);
4011 oldpde = *pde;
4012 if ((oldpde & PG_A) != 0) {
4013 if (pmap_demote_pde(pmap, pde, va)) {
4014 if ((oldpde & PG_W) == 0) {
4015 /*
4016 * Remove the mapping to a single page
4017 * so that a subsequent access may
4018 * repromote.  Since the underlying
4019 * page table page is fully populated,
4020 * this removal never frees a page
4021 * table page.
/*
 * Clear the modify bits on the specified physical page.
 */
void
pmap_clear_modify(vm_page_t m)
{
	struct md_page *pvh;
	pmap_t pmap;
	pv_entry_t next_pv, pv;
	pd_entry_t oldpde, *pde;
	pt_entry_t oldpte, *pte;
	vm_offset_t va;

	if ((m->flags & PG_FICTITIOUS) != 0)
		return;
	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
		va = pv->pv_va;
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, va);
		oldpde = *pde;
		if ((oldpde & PG_RW) != 0) {
			if (pmap_demote_pde(pmap, pde, va)) {
				if ((oldpde & PG_W) == 0) {
					/*
					 * Write protect the mapping to a
					 * single page so that a subsequent
					 * write access may repromote.
					 */
					va += VM_PAGE_TO_PHYS(m) - (oldpde &
					    PG_FRAME);
					pte = pmap_pde_to_pte(pde, va);
					oldpte = *pte;
					if ((oldpte & PG_V) != 0) {
						while (!atomic_cmpset_long(pte,
						    oldpte,
						    oldpte & ~(PG_M | PG_RW)))
							oldpte = *pte;
						vm_page_dirty(m);
						pmap_invalidate_page(pmap, va);
					}
				}
			}
		}
		PMAP_UNLOCK(pmap);
	}
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
		    " a 2mpage in page %p's pv list", m));
		pte = pmap_pde_to_pte(pde, pv->pv_va);
		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
			atomic_clear_long(pte, PG_M);
			pmap_invalidate_page(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
}

/*
 *	pmap_clear_reference:
 *
 *	Clear the reference bit on the specified physical page.
 */
void
pmap_clear_reference(vm_page_t m)
{
	struct md_page *pvh;
	pmap_t pmap;
	pv_entry_t next_pv, pv;
	pd_entry_t oldpde, *pde;
	pt_entry_t *pte;
	vm_offset_t va;

	if ((m->flags & PG_FICTITIOUS) != 0)
		return;
	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
		va = pv->pv_va;
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, va);
		oldpde = *pde;
		if ((oldpde & PG_A) != 0) {
			if (pmap_demote_pde(pmap, pde, va)) {
				/*
				 * Remove the mapping to a single page so
				 * that a subsequent access may repromote.
				 * Since the underlying page table page is
				 * fully populated, this removal never frees
				 * a page table page.
				 */
				va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_FRAME);
				pmap_remove_page(pmap, va, pde, NULL);
			}
		}
		PMAP_UNLOCK(pmap);
	}
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
		    " a 2mpage in page %p's pv list", m));
		pte = pmap_pde_to_pte(pde, pv->pv_va);
		if (*pte & PG_A) {
			atomic_clear_long(pte, PG_A);
			pmap_invalidate_page(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
}

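/*
 * Note on the va recomputation used in the demotion paths above: after a
 * successful demotion, "va" is still the 2MB-aligned base of the former
 * mapping, while the caller is interested in the one 4KB page "m".
 * Because the mapping is virtually and physically aligned, the page's
 * address within it is recovered by
 *
 *	va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_FRAME);
 *
 * For example (addresses are illustrative), if the demoted mapping
 * started at physical address 0x40000000 and m lies at 0x40003000, va is
 * advanced by 0x3000 to the 4KB page that maps m.
 */
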
/*
 * Miscellaneous support routines follow
 */

/* Adjust the cache mode for a 4KB page mapped via a PTE. */
static __inline void
pmap_pte_attr(vm_offset_t va, int mode)
{
	pt_entry_t *pte;
	u_int opte, npte;

	pte = vtopte(va);

	/*
	 * The cache mode bits are all in the low 32-bits of the
	 * PTE, so we can just spin on updating the low 32-bits.
	 */
	do {
		opte = *(u_int *)pte;
		npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT);
		npte |= pmap_cache_bits(mode, 0);
	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
}

/* Adjust the cache mode for a 2MB page mapped via a PDE. */
static __inline void
pmap_pde_attr(vm_offset_t va, int mode)
{
	pd_entry_t *pde;
	u_int opde, npde;

	pde = pmap_pde(kernel_pmap, va);

	/*
	 * The cache mode bits are all in the low 32-bits of the
	 * PDE, so we can just spin on updating the low 32-bits.
	 */
	do {
		opde = *(u_int *)pde;
		npde = opde & ~(PG_PDE_PAT | PG_NC_PCD | PG_NC_PWT);
		npde |= pmap_cache_bits(mode, 1);
	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
}

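/*
 * The 32-bit compare-and-set in the two routines above is sufficient
 * because every cache-control bit lives in the low word of the 64-bit
 * entry: PG_NC_PWT is bit 3, PG_NC_PCD is bit 4, and the PAT selector is
 * bit 7 in a PTE (PG_PTE_PAT) or bit 12 in a 2MB PDE (PG_PDE_PAT).  The
 * new value is derived from the old one with only those bits rewritten,
 * so the page frame, the permission bits, and PG_NX are left untouched.
 */
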
/*
 * Map a set of physical memory pages into the kernel virtual
 * address space.  Return a pointer to where it is mapped.  This
 * routine is intended to be used for mapping device memory,
 * NOT real memory.
 */
void *
pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
{
	vm_offset_t va, tmpva, offset;

	/*
	 * If this request fits within the direct map window and uses the
	 * WB caching mode, use the direct map.
	 */
	if (pa < dmaplimit && (pa + size) < dmaplimit && mode == PAT_WRITE_BACK)
		return ((void *)PHYS_TO_DMAP(pa));
	offset = pa & PAGE_MASK;
	size = roundup(offset + size, PAGE_SIZE);
	va = kmem_alloc_nofault(kernel_map, size);
	if (!va)
		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
	pa = trunc_page(pa);
	for (tmpva = va; size > 0; ) {
		pmap_kenter_attr(tmpva, pa, mode);
		size -= PAGE_SIZE;
		tmpva += PAGE_SIZE;
		pa += PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, va, tmpva);
	pmap_invalidate_cache();
	return ((void *)(va + offset));
}

void *
pmap_mapdev(vm_paddr_t pa, vm_size_t size)
{

	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
}

void *
pmap_mapbios(vm_paddr_t pa, vm_size_t size)
{

	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
}

void
pmap_unmapdev(vm_offset_t va, vm_size_t size)
{
	vm_offset_t base, offset, tmpva;

	/* If we gave out a direct map region in pmap_mapdev(), do nothing. */
	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
		return;
	base = trunc_page(va);
	offset = va & PAGE_MASK;
	size = roundup(offset + size, PAGE_SIZE);
	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
		pmap_kremove(tmpva);
	pmap_invalidate_range(kernel_pmap, va, tmpva);
	kmem_free(kernel_map, base, size);
}

int
pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
{
	vm_offset_t base, offset, tmpva;
	pd_entry_t *pde;
	pt_entry_t *pte;

	base = trunc_page(va);
	offset = va & PAGE_MASK;
	size = roundup(offset + size, PAGE_SIZE);

	/* Only supported on kernel virtual addresses. */
	if (base <= VM_MAXUSER_ADDRESS)
		return (EINVAL);

	/*
	 * XXX: We have to support tearing 2MB pages down into 4KB pages if
	 * needed here.
	 */
	/* Pages that aren't mapped aren't supported. */
	for (tmpva = base; tmpva < (base + size); ) {
		pde = pmap_pde(kernel_pmap, tmpva);
		if (*pde == 0)
			return (EINVAL);
		if (*pde & PG_PS) {
			/* Handle 2MB pages that are completely contained. */
			if (size >= NBPDR) {
				tmpva += NBPDR;
				continue;
			}
			return (EINVAL);
		}
		pte = vtopte(tmpva);
		if (*pte == 0)
			return (EINVAL);
		tmpva += PAGE_SIZE;
	}

	/*
	 * Ok, all the pages exist, so run through them updating their
	 * cache mode.
	 */
	for (tmpva = base; size > 0; ) {
		pde = pmap_pde(kernel_pmap, tmpva);
		if (*pde & PG_PS) {
			pmap_pde_attr(tmpva, mode);
			tmpva += NBPDR;
			size -= NBPDR;
		} else {
			pmap_pte_attr(tmpva, mode);
			tmpva += PAGE_SIZE;
			size -= PAGE_SIZE;
		}
	}

	/*
	 * Flush CPU caches to make sure any data isn't cached that shouldn't
	 * be, etc.
	 */
	pmap_invalidate_range(kernel_pmap, base, tmpva);
	pmap_invalidate_cache();
	return (0);
}

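/*
 * Usage sketch (illustrative only; "bar_pa" and "bar_size" stand in for a
 * caller's device addresses): a caller would pair these interfaces as
 * follows:
 *
 *	void *regs;
 *
 *	regs = pmap_mapdev_attr(bar_pa, bar_size, PAT_UNCACHEABLE);
 *	...
 *	pmap_unmapdev((vm_offset_t)regs, bar_size);
 *
 * pmap_mapdev() is shorthand for the PAT_UNCACHEABLE case and
 * pmap_mapbios() for PAT_WRITE_BACK.  Requests that fall inside the
 * direct map and ask for write-back caching are satisfied without
 * consuming kernel virtual address space, which is why pmap_unmapdev()
 * ignores direct map addresses.
 */
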
/*
 * Perform the pmap work for mincore(2).
 */
int
pmap_mincore(pmap_t pmap, vm_offset_t addr)
{
	pd_entry_t *pdep;
	pt_entry_t pte;
	vm_paddr_t pa;
	vm_page_t m;
	int val = 0;

	PMAP_LOCK(pmap);
	pdep = pmap_pde(pmap, addr);
	if (pdep != NULL && (*pdep & PG_V)) {
		if (*pdep & PG_PS) {
			pte = *pdep;
			val = MINCORE_SUPER;
			/* Compute the physical address of the 4KB page. */
			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
			    PG_FRAME;
		} else {
			pte = *pmap_pde_to_pte(pdep, addr);
			pa = pte & PG_FRAME;
		}
	} else {
		pte = 0;
		pa = 0;
	}
	PMAP_UNLOCK(pmap);

	if (pte != 0) {
		val |= MINCORE_INCORE;
		if ((pte & PG_MANAGED) == 0)
			return (val);

		m = PHYS_TO_VM_PAGE(pa);

		/*
		 * Modified by us
		 */
		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
		else {
			/*
			 * Modified by someone else
			 */
			vm_page_lock_queues();
			if (m->dirty || pmap_is_modified(m))
				val |= MINCORE_MODIFIED_OTHER;
			vm_page_unlock_queues();
		}
		/*
		 * Referenced by us
		 */
		if (pte & PG_A)
			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
		else {
			/*
			 * Referenced by someone else
			 */
			vm_page_lock_queues();
			if ((m->flags & PG_REFERENCED) ||
			    pmap_ts_referenced(m)) {
				val |= MINCORE_REFERENCED_OTHER;
				vm_page_flag_set(m, PG_REFERENCED);
			}
			vm_page_unlock_queues();
		}
	}
	return (val);
}

void
pmap_activate(struct thread *td)
{
	pmap_t pmap, oldpmap;
	u_int64_t cr3;

	critical_enter();
	pmap = vmspace_pmap(td->td_proc->p_vmspace);
	oldpmap = PCPU_GET(curpmap);
#ifdef SMP
	if (oldpmap)		/* XXX FIXME */
		atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
	atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
#else
	if (oldpmap)		/* XXX FIXME */
		oldpmap->pm_active &= ~PCPU_GET(cpumask);
	pmap->pm_active |= PCPU_GET(cpumask);
#endif
	cr3 = vtophys(pmap->pm_pml4);
	td->td_pcb->pcb_cr3 = cr3;
	load_cr3(cr3);
	critical_exit();
}

vm_offset_t
pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
{

	if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
		return (addr);
	}

	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
	return (addr);
}
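
/*
 * The rounding in pmap_addr_hint() aligns the hint to the next 2MB
 * boundary so that large device objects can later be mapped with 2MB
 * pages.  For example, with NBPDR equal to 2MB, a hint of 0x3ff000 is
 * rounded up to 0x400000.
 */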