pmap.c revision 193734
/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 * All rights reserved.
 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    This product includes software developed by the University of
 *    California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/amd64/amd64/pmap.c 193734 2009-06-08 18:23:43Z ed $");

/*
 *	Manages physical address maps.
 *
 *	In addition to hardware address maps, this
 *	module is called upon to provide software-use-only
 *	maps which may or may not be stored in the same
 *	form as hardware maps.  These pseudo-maps are
 *	used to store intermediate results from copy
 *	operations to and from address spaces.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidates expensive,
 *	this module may delay invalidate or reduced protection
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and to when physical maps must be made correct.
 */

#include "opt_msgbuf.h"
#include "opt_pmap.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sx.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#ifdef SMP
#include <sys/smp.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>

#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#ifdef SMP
#include <machine/smp.h>
#endif

#ifndef PMAP_SHPGPERPROC
#define PMAP_SHPGPERPROC 200
#endif

#if !defined(DIAGNOSTIC)
#define PMAP_INLINE	__gnu89_inline
#else
#define PMAP_INLINE
#endif

#define PV_STATS
#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif

#define pa_index(pa)	((pa) >> PDRSHIFT)
#define pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
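
/*
 * pv_table (declared below) has one entry per 2MB physical superpage
 * frame: with PDRSHIFT == 21, pa_index() maps, for example, pa
 * 0x40200000 to index 0x201, so every 4KB page within the same 2MB
 * frame shares the pv list head that pa_to_pvh() returns.
 */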

struct pmap kernel_pmap_store;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */

static int ndmpdp;
static vm_paddr_t dmaplimit;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
pt_entry_t pg_nx;

SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");

static int pg_ps_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RD, &pg_ps_enabled, 0,
    "Are large page mappings enabled?");

static u_int64_t KPTphys;	/* phys addr of kernel level 1 */
static u_int64_t KPDphys;	/* phys addr of kernel level 2 */
u_int64_t KPDPphys;		/* phys addr of kernel level 3 */
u_int64_t KPML4phys;		/* phys addr of kernel level 4 */

static u_int64_t DMPDphys;	/* phys addr of direct mapped level 2 */
static u_int64_t DMPDPphys;	/* phys addr of direct mapped level 3 */

/*
 * Data for the pv entry allocation mechanism
 */
static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
static struct md_page *pv_table;
static int shpgperproc = PMAP_SHPGPERPROC;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = 0;
caddr_t CADDR1 = 0;
struct msgbuf *msgbufp = 0;

/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
    vm_offset_t va);
static int pmap_pvh_wired_mappings(struct md_page *pvh, int count);

static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
    vm_offset_t va);
static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
    vm_prot_t prot);
static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    vm_page_t *free);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
    vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free);
static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    vm_page_t *free);
static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
    vm_offset_t va);
static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m);

static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags);
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);

static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags);
static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_page_t *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);

CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));

/*
 * Move the kernel virtual free pointer to the next
 * 2MB.  This is used to help improve performance
 * by using a large (2MB) page for much of the kernel
 * (.text, .data, .bss)
 */
static vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
        vm_offset_t newaddr = addr;

        newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
        return newaddr;
}

/********************/
/* Inline functions */
/********************/

/* Return a non-clipped PD index for a given VA */
static __inline vm_pindex_t
pmap_pde_pindex(vm_offset_t va)
{
        return va >> PDRSHIFT;
}

/* Return various clipped indexes for a given VA */
static __inline vm_pindex_t
pmap_pte_index(vm_offset_t va)
{

        return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
}

static __inline vm_pindex_t
pmap_pde_index(vm_offset_t va)
{

        return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
}

static __inline vm_pindex_t
pmap_pdpe_index(vm_offset_t va)
{

        return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
}

static __inline vm_pindex_t
pmap_pml4e_index(vm_offset_t va)
{

        return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
}
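
/*
 * For reference, the index functions above and the table walkers below
 * pick apart a canonical amd64 virtual address as follows:
 *
 *	bits 63..48	sign extension of bit 47
 *	bits 47..39	PML4 index (pmap_pml4e_index())
 *	bits 38..30	PDP index  (pmap_pdpe_index())
 *	bits 29..21	PD index   (pmap_pde_index())
 *	bits 20..12	PT index   (pmap_pte_index())
 *	bits 11..0	byte offset within the 4KB page
 */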

/* Return a pointer to the PML4 slot that corresponds to a VA */
static __inline pml4_entry_t *
pmap_pml4e(pmap_t pmap, vm_offset_t va)
{

        return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
{
        pdp_entry_t *pdpe;

        pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
        return (&pdpe[pmap_pdpe_index(va)]);
}

/* Return a pointer to the PDP slot that corresponds to a VA */
static __inline pdp_entry_t *
pmap_pdpe(pmap_t pmap, vm_offset_t va)
{
        pml4_entry_t *pml4e;

        pml4e = pmap_pml4e(pmap, va);
        if ((*pml4e & PG_V) == 0)
                return NULL;
        return (pmap_pml4e_to_pdpe(pml4e, va));
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
{
        pd_entry_t *pde;

        pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
        return (&pde[pmap_pde_index(va)]);
}

/* Return a pointer to the PD slot that corresponds to a VA */
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va)
{
        pdp_entry_t *pdpe;

        pdpe = pmap_pdpe(pmap, va);
        if (pdpe == NULL || (*pdpe & PG_V) == 0)
                return NULL;
        return (pmap_pdpe_to_pde(pdpe, va));
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
{
        pt_entry_t *pte;

        pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
        return (&pte[pmap_pte_index(va)]);
}

/* Return a pointer to the PT slot that corresponds to a VA */
static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
        pd_entry_t *pde;

        pde = pmap_pde(pmap, va);
        if (pde == NULL || (*pde & PG_V) == 0)
                return NULL;
        if ((*pde & PG_PS) != 0)	/* compat with i386 pmap_pte() */
                return ((pt_entry_t *)pde);
        return (pmap_pde_to_pte(pde, va));
}

PMAP_INLINE pt_entry_t *
vtopte(vm_offset_t va)
{
        u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);

        return (PTmap + ((va >> PAGE_SHIFT) & mask));
}
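
/*
 * vtopte() above and vtopde() below depend on the recursive entry that
 * create_pagetables() installs at PML4 slot PML4PML4I: because the PML4
 * maps itself, PTmap is a 512GB virtual window containing every PTE,
 * indexed by (va >> PAGE_SHIFT).  The masks keep only the low 36
 * (4 x 9) or 27 (3 x 9) index bits, discarding the canonical address's
 * sign-extension bits.
 */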

static __inline pd_entry_t *
vtopde(vm_offset_t va)
{
        u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);

        return (PDmap + ((va >> PDRSHIFT) & mask));
}

static u_int64_t
allocpages(vm_paddr_t *firstaddr, int n)
{
        u_int64_t ret;

        ret = *firstaddr;
        bzero((void *)ret, n * PAGE_SIZE);
        *firstaddr += n * PAGE_SIZE;
        return (ret);
}

static void
create_pagetables(vm_paddr_t *firstaddr)
{
        int i;

        /* Allocate pages */
        KPTphys = allocpages(firstaddr, NKPT);
        KPML4phys = allocpages(firstaddr, 1);
        KPDPphys = allocpages(firstaddr, NKPML4E);
        KPDphys = allocpages(firstaddr, NKPDPE);

        ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
        if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
                ndmpdp = 4;
        DMPDPphys = allocpages(firstaddr, NDMPML4E);
        if ((amd_feature & AMDID_PAGE1GB) == 0)
                DMPDphys = allocpages(firstaddr, ndmpdp);
        dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;

        /* Fill in the underlying page table pages */
        /* Read-only from zero to physfree */
        /* XXX not fully used, underneath 2M pages */
        for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
                ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
                ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
        }

        /* Now map the page tables at their location within PTmap */
        for (i = 0; i < NKPT; i++) {
                ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
                ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
        }

        /* Map from zero to end of allocations under 2M pages */
        /* This replaces some of the KPTphys entries above */
        for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
                ((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
                ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
        }

        /* And connect up the PD to the PDP */
        for (i = 0; i < NKPDPE; i++) {
                ((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys +
                    (i << PAGE_SHIFT);
                ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
        }

        /* Now set up the direct map space using either 2MB or 1GB pages */
        /* Preset PG_M and PG_A because demotion expects it */
        if ((amd_feature & AMDID_PAGE1GB) == 0) {
                for (i = 0; i < NPDEPG * ndmpdp; i++) {
                        ((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT;
                        ((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS |
                            PG_G | PG_M | PG_A;
                }
                /* And the direct map space's PDP */
                for (i = 0; i < ndmpdp; i++) {
                        ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys +
                            (i << PAGE_SHIFT);
                        ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
                }
        } else {
                for (i = 0; i < ndmpdp; i++) {
                        ((pdp_entry_t *)DMPDPphys)[i] =
                            (vm_paddr_t)i << PDPSHIFT;
                        ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS |
                            PG_G | PG_M | PG_A;
                }
        }

        /* And recursively map PML4 to itself in order to get PTmap */
        ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
        ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;

        /* Connect the Direct Map slot up to the PML4 */
        ((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys;
        ((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U;

        /* Connect the KVA slot up to the PML4 */
        ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
        ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
}
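
/*
 * A sizing example for the direct map built above: NBPDP is 1GB, so a
 * machine with 6GB of RAM gets ndmpdp = 6 (the enforced minimum is 4,
 * hence at least 4GB of direct map).  With 1GB-page support each PDP
 * entry maps physical memory directly; otherwise each PDP entry points
 * at a PD page whose 512 2MB entries cover the same gigabyte, with PG_M
 * and PG_A preset because demotion expects them.
 */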

/*
 *	Bootstrap the system enough to run with virtual memory.
 *
 *	On amd64 this is called after mapping has already been enabled
 *	and just syncs the pmap module with what has already been done.
 *	[We can't call it easily with mapping off since the kernel is not
 *	mapped with PA == VA, hence we would have to relocate every address
 *	from the linked base (virtual) address "KERNBASE" to the actual
 *	(physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t *firstaddr)
{
        vm_offset_t va;
        pt_entry_t *pte, *unused;

        /*
         * Create an initial set of page tables to run the kernel in.
         */
        create_pagetables(firstaddr);

        virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
        virtual_avail = pmap_kmem_choose(virtual_avail);

        virtual_end = VM_MAX_KERNEL_ADDRESS;

        /* XXX do %cr0 as well */
        load_cr4(rcr4() | CR4_PGE | CR4_PSE);
        load_cr3(KPML4phys);

        /*
         * Initialize the kernel pmap (which is statically allocated).
         */
        PMAP_LOCK_INIT(kernel_pmap);
        kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
        kernel_pmap->pm_root = NULL;
        kernel_pmap->pm_active = -1;	/* don't allow deactivation */
        TAILQ_INIT(&kernel_pmap->pm_pvchunk);

        /*
         * Reserve some special page table entries/VA space for temporary
         * mapping of pages.
         */
#define SYSMAP(c, p, v, n)	\
        v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

        va = virtual_avail;
        pte = vtopte(va);

        /*
         * CMAP1 is only used for the memory test.
         */
        SYSMAP(caddr_t, CMAP1, CADDR1, 1)

        /*
         * Crashdump maps.
         */
        SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)

        /*
         * msgbufp is used to map the system message buffer.
         */
        SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))

        virtual_avail = va;

        *CMAP1 = 0;

        invltlb();

        /* Initialize the PAT MSR. */
        pmap_init_pat();
}

/*
 * Set up the PAT MSR.
 */
void
pmap_init_pat(void)
{
        uint64_t pat_msr;

        /* Bail if this CPU doesn't implement PAT. */
        if (!(cpu_feature & CPUID_PAT))
                panic("no PAT??");

        /*
         * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
         * Program 4 and 5 as WP and WC.
         * Leave 6 and 7 as UC- and UC.
         */
        pat_msr = rdmsr(MSR_PAT);
        pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5));
        pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) |
            PAT_VALUE(5, PAT_WRITE_COMBINING);
        wrmsr(MSR_PAT, pat_msr);
}
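
/*
 * The resulting PAT layout, which pmap_cache_bits() below relies on:
 *
 *	index:	0	1	2	3	4	5	6	7
 *	type:	WB	WT	UC-	UC	WP	WC	UC-	UC
 *
 * Only entries 4 (WP) and 5 (WC) differ from the power-on defaults.
 */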

/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

        TAILQ_INIT(&m->md.pv_list);
}

/*
 *	Initialize the pmap module.
 *	Called by vm_init, to initialize any structures that the pmap
 *	system needs to map virtual memory.
 */
void
pmap_init(void)
{
        pd_entry_t *pd;
        vm_page_t mpte;
        vm_size_t s;
        int i, pv_npg;

        /*
         * Initialize the vm page array entries for the kernel pmap's
         * page table pages.
         */
        pd = pmap_pde(kernel_pmap, KERNBASE);
        for (i = 0; i < NKPT; i++) {
                if ((pd[i] & (PG_PS | PG_V)) == (PG_PS | PG_V))
                        continue;
                KASSERT((pd[i] & PG_V) != 0,
                    ("pmap_init: page table page is missing"));
                mpte = PHYS_TO_VM_PAGE(pd[i] & PG_FRAME);
                KASSERT(mpte >= vm_page_array &&
                    mpte < &vm_page_array[vm_page_array_size],
                    ("pmap_init: page table page is out of range"));
                mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
                mpte->phys_addr = pd[i] & PG_FRAME;
        }

        /*
         * Initialize the address space (zone) for the pv entries.  Set a
         * high water mark so that the system can recover from excessive
         * numbers of pv entries.
         */
        TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
        pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
        TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
        pv_entry_high_water = 9 * (pv_entry_max / 10);

        /*
         * Are large page mappings enabled?
         */
        TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);

        /*
         * Calculate the size of the pv head table for superpages.
         */
        for (i = 0; phys_avail[i + 1]; i += 2);
        pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR;

        /*
         * Allocate memory for the pv head table for superpages.
         */
        s = (vm_size_t)(pv_npg * sizeof(struct md_page));
        s = round_page(s);
        pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
        for (i = 0; i < pv_npg; i++)
                TAILQ_INIT(&pv_table[i].pv_list);
}

static int
pmap_pventry_proc(SYSCTL_HANDLER_ARGS)
{
        int error;

        error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
        if (error == 0 && req->newptr) {
                shpgperproc = (pv_entry_max - cnt.v_page_count) / maxproc;
                pv_entry_high_water = 9 * (pv_entry_max / 10);
        }
        return (error);
}
SYSCTL_PROC(_vm_pmap, OID_AUTO, pv_entry_max, CTLTYPE_INT|CTLFLAG_RW,
    &pv_entry_max, 0, pmap_pventry_proc, "IU", "Max number of PV entries");

static int
pmap_shpgperproc_proc(SYSCTL_HANDLER_ARGS)
{
        int error;

        error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
        if (error == 0 && req->newptr) {
                pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
                pv_entry_high_water = 9 * (pv_entry_max / 10);
        }
        return (error);
}
SYSCTL_PROC(_vm_pmap, OID_AUTO, shpgperproc, CTLTYPE_INT|CTLFLAG_RW,
    &shpgperproc, 0, pmap_shpgperproc_proc, "IU", "Page share factor per proc");

SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
    "2MB page mapping counters");

static u_long pmap_pde_demotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pde_demotions, 0, "2MB page demotions");

static u_long pmap_pde_mappings;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_pde_mappings, 0, "2MB page mappings");

static u_long pmap_pde_p_failures;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_pde_p_failures, 0, "2MB page promotion failures");

static u_long pmap_pde_promotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_pde_promotions, 0, "2MB page promotions");

SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
    "1GB page mapping counters");

static u_long pmap_pdpe_demotions;
SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pdpe_demotions, 0, "1GB page demotions");

/***************************************************
 * Low level helper routines.....
 ***************************************************/

/*
 * Determine the appropriate bits to set in a PTE or PDE for a specified
 * caching mode.
 */
static int
pmap_cache_bits(int mode, boolean_t is_pde)
{
        int pat_flag, pat_index, cache_bits;

        /* The PAT bit is different for PTE's and PDE's. */
        pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;

        /* If we don't support PAT, map extended modes to older ones. */
        if (!(cpu_feature & CPUID_PAT)) {
                switch (mode) {
                case PAT_UNCACHEABLE:
                case PAT_WRITE_THROUGH:
                case PAT_WRITE_BACK:
                        break;
                case PAT_UNCACHED:
                case PAT_WRITE_COMBINING:
                case PAT_WRITE_PROTECTED:
                        mode = PAT_UNCACHEABLE;
                        break;
                }
        }

        /* Map the caching mode to a PAT index. */
        switch (mode) {
        case PAT_UNCACHEABLE:
                pat_index = 3;
                break;
        case PAT_WRITE_THROUGH:
                pat_index = 1;
                break;
        case PAT_WRITE_BACK:
                pat_index = 0;
                break;
        case PAT_UNCACHED:
                pat_index = 2;
                break;
        case PAT_WRITE_COMBINING:
                pat_index = 5;
                break;
        case PAT_WRITE_PROTECTED:
                pat_index = 4;
                break;
        default:
                panic("Unknown caching mode %d\n", mode);
        }

        /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
        cache_bits = 0;
        if (pat_index & 0x4)
                cache_bits |= pat_flag;
        if (pat_index & 0x2)
                cache_bits |= PG_NC_PCD;
        if (pat_index & 0x1)
                cache_bits |= PG_NC_PWT;
        return (cache_bits);
}
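
/*
 * For example, write-combining memory uses PAT index 5 (binary 101), so
 * a WC PTE gets PG_PTE_PAT | PG_NC_PWT, while write-back (index 0)
 * leaves the PAT, PCD, and PWT bits all clear.
 */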
#ifdef SMP
/*
 * For SMP, these functions have to use the IPI mechanism for coherence.
 *
 * N.B.: Before calling any of the following TLB invalidation functions,
 * the calling processor must ensure that all stores updating a non-
 * kernel page table are globally performed.  Otherwise, another
 * processor could cache an old, pre-update entry without being
 * invalidated.  This can happen one of two ways: (1) The pmap becomes
 * active on another processor after its pm_active field is checked by
 * one of the following functions but before a store updating the page
 * table is globally performed. (2) The pmap becomes active on another
 * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions still reads the
 * pmap as inactive on the other processor.
 *
 * The kernel page table is exempt because its pm_active field is
 * immutable.  The kernel page table is always active on every
 * processor.
 */
void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
        u_int cpumask;
        u_int other_cpus;

        sched_pin();
        if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
                invlpg(va);
                smp_invlpg(va);
        } else {
                cpumask = PCPU_GET(cpumask);
                other_cpus = PCPU_GET(other_cpus);
                if (pmap->pm_active & cpumask)
                        invlpg(va);
                if (pmap->pm_active & other_cpus)
                        smp_masked_invlpg(pmap->pm_active & other_cpus, va);
        }
        sched_unpin();
}

void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
        u_int cpumask;
        u_int other_cpus;
        vm_offset_t addr;

        sched_pin();
        if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
                for (addr = sva; addr < eva; addr += PAGE_SIZE)
                        invlpg(addr);
                smp_invlpg_range(sva, eva);
        } else {
                cpumask = PCPU_GET(cpumask);
                other_cpus = PCPU_GET(other_cpus);
                if (pmap->pm_active & cpumask)
                        for (addr = sva; addr < eva; addr += PAGE_SIZE)
                                invlpg(addr);
                if (pmap->pm_active & other_cpus)
                        smp_masked_invlpg_range(pmap->pm_active & other_cpus,
                            sva, eva);
        }
        sched_unpin();
}

void
pmap_invalidate_all(pmap_t pmap)
{
        u_int cpumask;
        u_int other_cpus;

        sched_pin();
        if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
                invltlb();
                smp_invltlb();
        } else {
                cpumask = PCPU_GET(cpumask);
                other_cpus = PCPU_GET(other_cpus);
                if (pmap->pm_active & cpumask)
                        invltlb();
                if (pmap->pm_active & other_cpus)
                        smp_masked_invltlb(pmap->pm_active & other_cpus);
        }
        sched_unpin();
}

void
pmap_invalidate_cache(void)
{

        sched_pin();
        wbinvd();
        smp_cache_flush();
        sched_unpin();
}
#else /* !SMP */
/*
 * Normal, non-SMP, invalidation functions.
 * We inline these within pmap.c for speed.
 */
PMAP_INLINE void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{

        if (pmap == kernel_pmap || pmap->pm_active)
                invlpg(va);
}

PMAP_INLINE void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
        vm_offset_t addr;

        if (pmap == kernel_pmap || pmap->pm_active)
                for (addr = sva; addr < eva; addr += PAGE_SIZE)
                        invlpg(addr);
}

PMAP_INLINE void
pmap_invalidate_all(pmap_t pmap)
{

        if (pmap == kernel_pmap || pmap->pm_active)
                invltlb();
}

PMAP_INLINE void
pmap_invalidate_cache(void)
{

        wbinvd();
}
#endif /* !SMP */

/*
 * Are we current address space or kernel?
 */
static __inline int
pmap_is_current(pmap_t pmap)
{
        return (pmap == kernel_pmap ||
            (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME));
}

/*
 *	Routine:	pmap_extract
 *	Function:
 *		Extract the physical page address associated
 *		with the given map/virtual_address pair.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
        vm_paddr_t rtval;
        pt_entry_t *pte;
        pd_entry_t pde, *pdep;

        rtval = 0;
        PMAP_LOCK(pmap);
        pdep = pmap_pde(pmap, va);
        if (pdep != NULL) {
                pde = *pdep;
                if (pde) {
                        if ((pde & PG_PS) != 0)
                                rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
                        else {
                                pte = pmap_pde_to_pte(pdep, va);
                                rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
                        }
                }
        }
        PMAP_UNLOCK(pmap);
        return (rtval);
}

/*
 *	Routine:	pmap_extract_and_hold
 *	Function:
 *		Atomically extract and hold the physical page
 *		with the given pmap and virtual address pair
 *		if that mapping permits the given protection.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
        pd_entry_t pde, *pdep;
        pt_entry_t pte;
        vm_page_t m;

        m = NULL;
        vm_page_lock_queues();
        PMAP_LOCK(pmap);
        pdep = pmap_pde(pmap, va);
        if (pdep != NULL && (pde = *pdep)) {
                if (pde & PG_PS) {
                        if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
                                m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
                                    (va & PDRMASK));
                                vm_page_hold(m);
                        }
                } else {
                        pte = *pmap_pde_to_pte(pdep, va);
                        if ((pte & PG_V) &&
                            ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
                                m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
                                vm_page_hold(m);
                        }
                }
        }
        vm_page_unlock_queues();
        PMAP_UNLOCK(pmap);
        return (m);
}

vm_paddr_t
pmap_kextract(vm_offset_t va)
{
        pd_entry_t pde;
        vm_paddr_t pa;

        if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
                pa = DMAP_TO_PHYS(va);
        } else {
                pde = *vtopde(va);
                if (pde & PG_PS) {
                        pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
                } else {
                        /*
                         * Beware of a concurrent promotion that changes the
                         * PDE at this point!  For example, vtopte() must not
                         * be used to access the PTE because it would use the
                         * new PDE.  It is, however, safe to use the old PDE
                         * because the page table page is preserved by the
                         * promotion.
                         */
                        pa = *pmap_pde_to_pte(&pde, va);
                        pa = (pa & PG_FRAME) | (va & PAGE_MASK);
                }
        }
        return pa;
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

/*
 * Add a wired page to the kva.
 * Note: not SMP coherent.
 */
PMAP_INLINE void
pmap_kenter(vm_offset_t va, vm_paddr_t pa)
{
        pt_entry_t *pte;

        pte = vtopte(va);
        pte_store(pte, pa | PG_RW | PG_V | PG_G);
}

static __inline void
pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
{
        pt_entry_t *pte;

        pte = vtopte(va);
        pte_store(pte, pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0));
}

/*
 * Remove a page from the kernel pagetables.
 * Note: not SMP coherent.
 */
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
        pt_entry_t *pte;

        pte = vtopte(va);
        pte_clear(pte);
}

/*
 * Used to map a range of physical addresses into kernel
 * virtual address space.
 *
 * The value passed in '*virt' is a suggested virtual address for
 * the mapping.  Architectures which can support a direct-mapped
 * physical to virtual region can return the appropriate address
 * within that region, leaving '*virt' unchanged.  Other
 * architectures should map the pages starting at '*virt' and
 * update '*virt' with the first usable address after the mapped
 * region.
 */
vm_offset_t
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{
        return PHYS_TO_DMAP(start);
}

/*
 * Add a list of wired pages to the kva.  This routine is only used for
 * temporary kernel mappings that do not need to have page modification
 * or references recorded.  Note that old mappings are simply written
 * over.  The page *must* be wired.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
{
        pt_entry_t *endpte, oldpte, *pte;

        oldpte = 0;
        pte = vtopte(sva);
        endpte = pte + count;
        while (pte < endpte) {
                oldpte |= *pte;
                pte_store(pte, VM_PAGE_TO_PHYS(*ma) | PG_G | PG_RW | PG_V);
                pte++;
                ma++;
        }
        if ((oldpte & PG_V) != 0)
                pmap_invalidate_range(kernel_pmap, sva, sva + count *
                    PAGE_SIZE);
}

/*
 * This routine tears out page mappings from the
 * kernel -- it is meant only for temporary mappings.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qremove(vm_offset_t sva, int count)
{
        vm_offset_t va;

        va = sva;
        while (count-- > 0) {
                pmap_kremove(va);
                va += PAGE_SIZE;
        }
        pmap_invalidate_range(kernel_pmap, sva, va);
}

/***************************************************
 * Page table page management routines.....
 ***************************************************/
static __inline void
pmap_free_zero_pages(vm_page_t free)
{
        vm_page_t m;

        while (free != NULL) {
                m = free;
                free = m->right;
                /* Preserve the page's PG_ZERO setting. */
                vm_page_free_toq(m);
        }
}

/*
 * Schedule the specified unused page table page to be freed.  Specifically,
 * add the page to the specified list of pages that will be released to the
 * physical memory manager after the TLB has been updated.
 */
static __inline void
pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
{

        if (set_PG_ZERO)
                m->flags |= PG_ZERO;
        else
                m->flags &= ~PG_ZERO;
        m->right = *free;
        *free = m;
}

/*
 * Inserts the specified page table page into the specified pmap's collection
 * of idle page table pages.  Each of a pmap's page table pages is responsible
 * for mapping a distinct range of virtual addresses.  The pmap's collection is
 * ordered by this virtual address range.
 */
static void
pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
{
        vm_page_t root;

        PMAP_LOCK_ASSERT(pmap, MA_OWNED);
        root = pmap->pm_root;
        if (root == NULL) {
                mpte->left = NULL;
                mpte->right = NULL;
        } else {
                root = vm_page_splay(mpte->pindex, root);
                if (mpte->pindex < root->pindex) {
                        mpte->left = root->left;
                        mpte->right = root;
                        root->left = NULL;
                } else if (mpte->pindex == root->pindex)
                        panic("pmap_insert_pt_page: pindex already inserted");
                else {
                        mpte->right = root->right;
                        mpte->left = root;
                        root->right = NULL;
                }
        }
        pmap->pm_root = mpte;
}

/*
 * Looks for a page table page mapping the specified virtual address in the
 * specified pmap's collection of idle page table pages.  Returns NULL if there
 * is no page table page corresponding to the specified virtual address.
 */
static vm_page_t
pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
{
        vm_page_t mpte;
        vm_pindex_t pindex = pmap_pde_pindex(va);

        PMAP_LOCK_ASSERT(pmap, MA_OWNED);
        if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
                mpte = vm_page_splay(pindex, mpte);
                if ((pmap->pm_root = mpte)->pindex != pindex)
                        mpte = NULL;
        }
        return (mpte);
}

/*
 * Removes the specified page table page from the specified pmap's collection
 * of idle page table pages.  The specified page table page must be a member of
 * the pmap's collection.
 */
static void
pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
{
        vm_page_t root;

        PMAP_LOCK_ASSERT(pmap, MA_OWNED);
        if (mpte != pmap->pm_root) {
                root = vm_page_splay(mpte->pindex, pmap->pm_root);
                KASSERT(mpte == root,
                    ("pmap_remove_pt_page: mpte %p is missing from pmap %p",
                    mpte, pmap));
        }
        if (mpte->left == NULL)
                root = mpte->right;
        else {
                root = vm_page_splay(mpte->pindex, mpte->left);
                root->right = mpte->right;
        }
        pmap->pm_root = root;
}
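
/*
 * Page table pages are named by pindex: indices 0 .. NUPDE - 1 denote
 * PT pages (one per 2MB of address space, see pmap_pde_pindex()), the
 * next NUPDPE indices denote PD pages, and indices from NUPDE + NUPDPE
 * up denote PDP pages.  _pmap_unwire_pte_hold() and _pmap_allocpte()
 * below decode a page's level from its pindex using exactly these
 * ranges.
 */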

/*
 * This routine unholds page table pages, and if the hold count
 * drops to zero, then it decrements the wire count.
 */
static __inline int
pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free)
{

        --m->wire_count;
        if (m->wire_count == 0)
                return _pmap_unwire_pte_hold(pmap, va, m, free);
        else
                return 0;
}

static int
_pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_page_t *free)
{

        /*
         * unmap the page table page
         */
        if (m->pindex >= (NUPDE + NUPDPE)) {
                /* PDP page */
                pml4_entry_t *pml4;
                pml4 = pmap_pml4e(pmap, va);
                *pml4 = 0;
        } else if (m->pindex >= NUPDE) {
                /* PD page */
                pdp_entry_t *pdp;
                pdp = pmap_pdpe(pmap, va);
                *pdp = 0;
        } else {
                /* PTE page */
                pd_entry_t *pd;
                pd = pmap_pde(pmap, va);
                *pd = 0;
        }
        --pmap->pm_stats.resident_count;
        if (m->pindex < NUPDE) {
                /* We just released a PT, unhold the matching PD */
                vm_page_t pdpg;

                pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
                pmap_unwire_pte_hold(pmap, va, pdpg, free);
        }
        if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
                /* We just released a PD, unhold the matching PDP */
                vm_page_t pdppg;

                pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
                pmap_unwire_pte_hold(pmap, va, pdppg, free);
        }

        /*
         * This is a release store so that the ordinary store unmapping
         * the page table page is globally performed before TLB shoot-
         * down is begun.
         */
        atomic_subtract_rel_int(&cnt.v_wire_count, 1);

        /*
         * Put page on a list so that it is released after
         * *ALL* TLB shootdown is done
         */
        pmap_add_delayed_free_list(m, free, TRUE);

        return 1;
}

/*
 * After removing a page table entry, this routine is used to
 * conditionally free the page, and manage the hold/wire counts.
 */
static int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, vm_page_t *free)
{
        vm_page_t mpte;

        if (va >= VM_MAXUSER_ADDRESS)
                return 0;
        KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
        mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
        return pmap_unwire_pte_hold(pmap, va, mpte, free);
}

void
pmap_pinit0(pmap_t pmap)
{

        PMAP_LOCK_INIT(pmap);
        pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
        pmap->pm_root = NULL;
        pmap->pm_active = 0;
        TAILQ_INIT(&pmap->pm_pvchunk);
        bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
}

/*
 * Initialize a preallocated and zeroed pmap structure,
 * such as one in a vmspace structure.
 */
int
pmap_pinit(pmap_t pmap)
{
        vm_page_t pml4pg;
        static vm_pindex_t color;

        PMAP_LOCK_INIT(pmap);

        /*
         * allocate the page directory page
         */
        while ((pml4pg = vm_page_alloc(NULL, color++, VM_ALLOC_NOOBJ |
            VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
                VM_WAIT;

        pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));

        if ((pml4pg->flags & PG_ZERO) == 0)
                pagezero(pmap->pm_pml4);

        /* Wire in kernel global address entries. */
        pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
        pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U;

        /* install self-referential address mapping entry(s) */
        pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;

        pmap->pm_root = NULL;
        pmap->pm_active = 0;
        TAILQ_INIT(&pmap->pm_pvchunk);
        bzero(&pmap->pm_stats, sizeof pmap->pm_stats);

        return (1);
}

/*
 * This routine is called if the page table page is not
 * mapped correctly.
 *
 * Note: If a page allocation fails at page table level two or three,
 * one or two pages may be held during the wait, only to be released
 * afterwards.  This conservative approach is easily argued to avoid
 * race conditions.
 */
static vm_page_t
_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags)
{
        vm_page_t m, pdppg, pdpg;

        KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
            (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
            ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));

        /*
         * Allocate a page table page.
         */
        if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
            VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
                if (flags & M_WAITOK) {
                        PMAP_UNLOCK(pmap);
                        vm_page_unlock_queues();
                        VM_WAIT;
                        vm_page_lock_queues();
                        PMAP_LOCK(pmap);
                }

                /*
                 * Indicate the need to retry.  While waiting, the page table
                 * page may have been allocated.
                 */
                return (NULL);
        }
        if ((m->flags & PG_ZERO) == 0)
                pmap_zero_page(m);

        /*
         * Map the pagetable page into the process address space, if
         * it isn't already there.
         */

        if (ptepindex >= (NUPDE + NUPDPE)) {
                pml4_entry_t *pml4;
                vm_pindex_t pml4index;

                /* Wire up a new PDPE page */
                pml4index = ptepindex - (NUPDE + NUPDPE);
                pml4 = &pmap->pm_pml4[pml4index];
                *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;

        } else if (ptepindex >= NUPDE) {
                vm_pindex_t pml4index;
                vm_pindex_t pdpindex;
                pml4_entry_t *pml4;
                pdp_entry_t *pdp;

                /* Wire up a new PDE page */
                pdpindex = ptepindex - NUPDE;
                pml4index = pdpindex >> NPML4EPGSHIFT;

                pml4 = &pmap->pm_pml4[pml4index];
                if ((*pml4 & PG_V) == 0) {
                        /* Have to allocate a new pdp, recurse */
                        if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
                            flags) == NULL) {
                                --m->wire_count;
                                atomic_subtract_int(&cnt.v_wire_count, 1);
                                vm_page_free_zero(m);
                                return (NULL);
                        }
                } else {
                        /* Add reference to pdp page */
                        pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
                        pdppg->wire_count++;
                }
                pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);

                /* Now find the pdp page */
                pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
                *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;

        } else {
                vm_pindex_t pml4index;
                vm_pindex_t pdpindex;
                pml4_entry_t *pml4;
                pdp_entry_t *pdp;
                pd_entry_t *pd;

                /* Wire up a new PTE page */
                pdpindex = ptepindex >> NPDPEPGSHIFT;
                pml4index = pdpindex >> NPML4EPGSHIFT;

                /* First, find the pdp and check that it's valid. */
                pml4 = &pmap->pm_pml4[pml4index];
                if ((*pml4 & PG_V) == 0) {
                        /* Have to allocate a new pd, recurse */
                        if (_pmap_allocpte(pmap, NUPDE + pdpindex,
                            flags) == NULL) {
                                --m->wire_count;
                                atomic_subtract_int(&cnt.v_wire_count, 1);
                                vm_page_free_zero(m);
                                return (NULL);
                        }
                        pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
                        pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
                } else {
                        pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
                        pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
                        if ((*pdp & PG_V) == 0) {
                                /* Have to allocate a new pd, recurse */
                                if (_pmap_allocpte(pmap, NUPDE + pdpindex,
                                    flags) == NULL) {
                                        --m->wire_count;
                                        atomic_subtract_int(&cnt.v_wire_count,
                                            1);
                                        vm_page_free_zero(m);
                                        return (NULL);
                                }
                        } else {
                                /* Add reference to the pd page */
                                pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
                                pdpg->wire_count++;
                        }
                }
                pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);

                /* Now we know where the page directory page is */
                pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
                *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
        }

        pmap->pm_stats.resident_count++;

        return m;
}

static vm_page_t
pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags)
{
        vm_pindex_t pdpindex, ptepindex;
        pdp_entry_t *pdpe;
        vm_page_t pdpg;

        KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
            (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
            ("pmap_allocpde: flags is neither M_NOWAIT nor M_WAITOK"));
retry:
        pdpe = pmap_pdpe(pmap, va);
        if (pdpe != NULL && (*pdpe & PG_V) != 0) {
                /* Add a reference to the pd page. */
                pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
                pdpg->wire_count++;
        } else {
                /* Allocate a pd page. */
                ptepindex = pmap_pde_pindex(va);
                pdpindex = ptepindex >> NPDPEPGSHIFT;
                pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, flags);
                if (pdpg == NULL && (flags & M_WAITOK))
                        goto retry;
        }
        return (pdpg);
}

static vm_page_t
pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
{
        vm_pindex_t ptepindex;
        pd_entry_t *pd;
        vm_page_t m;

        KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
            (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
            ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));

        /*
         * Calculate pagetable page index
         */
        ptepindex = pmap_pde_pindex(va);
retry:
        /*
         * Get the page directory entry
         */
        pd = pmap_pde(pmap, va);

        /*
         * This supports switching from a 2MB page to a
         * normal 4K page.
         */
        if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
                if (!pmap_demote_pde(pmap, pd, va)) {
                        /*
                         * Invalidation of the 2MB page mapping may have caused
                         * the deallocation of the underlying PD page.
                         */
                        pd = NULL;
                }
        }

        /*
         * If the page table page is mapped, we just increment the
         * hold count, and activate it.
         */
        if (pd != NULL && (*pd & PG_V) != 0) {
                m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
                m->wire_count++;
        } else {
                /*
                 * Here if the pte page isn't mapped, or if it has been
                 * deallocated.
                 */
                m = _pmap_allocpte(pmap, ptepindex, flags);
                if (m == NULL && (flags & M_WAITOK))
                        goto retry;
        }
        return (m);
}

/***************************************************
 * Pmap allocation/deallocation routines.
 ***************************************************/

/*
 * Release any resources held by the given physical map.
 * Called when a pmap initialized by pmap_pinit is being released.
 * Should only be called if the map contains no valid mappings.
 */
void
pmap_release(pmap_t pmap)
{
        vm_page_t m;

        KASSERT(pmap->pm_stats.resident_count == 0,
            ("pmap_release: pmap resident count %ld != 0",
            pmap->pm_stats.resident_count));
        KASSERT(pmap->pm_root == NULL,
            ("pmap_release: pmap has reserved page table page(s)"));

        m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);

        pmap->pm_pml4[KPML4I] = 0;	/* KVA */
        pmap->pm_pml4[DMPML4I] = 0;	/* Direct Map */
        pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */

        m->wire_count--;
        atomic_subtract_int(&cnt.v_wire_count, 1);
        vm_page_free_zero(m);
        PMAP_LOCK_DESTROY(pmap);
}

static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
        unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;

        return sysctl_handle_long(oidp, &ksize, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
    0, 0, kvm_size, "LU", "Size of KVM");

static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
        unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;

        return sysctl_handle_long(oidp, &kfree, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
    0, 0, kvm_free, "LU", "Amount of KVM free");

/*
 * grow the number of kernel page table entries, if needed
 */
void
pmap_growkernel(vm_offset_t addr)
{
        vm_paddr_t paddr;
        vm_page_t nkpg;
        pd_entry_t *pde, newpdir;
        pdp_entry_t *pdpe;

        mtx_assert(&kernel_map->system_mtx, MA_OWNED);

        /*
         * Return if "addr" is within the range of kernel page table pages
         * that were preallocated during pmap bootstrap.  Moreover, leave
         * "kernel_vm_end" and the kernel page table as they were.
         *
         * The correctness of this action is based on the following
         * argument: vm_map_findspace() allocates contiguous ranges of the
         * kernel virtual address space.  It calls this function if a range
         * ends after "kernel_vm_end".  If the kernel is mapped between
         * "kernel_vm_end" and "addr", then the range cannot begin at
         * "kernel_vm_end".  In fact, its beginning address cannot be less
         * than the start of the kernel.  Thus, there is no immediate need
         * to allocate any new kernel page table pages between
         * "kernel_vm_end" and "KERNBASE".
         */
        if (KERNBASE < addr && addr <= KERNBASE + NKPT * NBPDR)
                return;

        addr = roundup2(addr, NBPDR);
        if (addr - 1 >= kernel_map->max_offset)
                addr = kernel_map->max_offset;
        while (kernel_vm_end < addr) {
                pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
                if ((*pdpe & PG_V) == 0) {
                        /* We need a new PDP entry */
                        nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
                            VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
                            VM_ALLOC_WIRED | VM_ALLOC_ZERO);
                        if (nkpg == NULL)
                                panic("pmap_growkernel: no memory to grow kernel");
                        if ((nkpg->flags & PG_ZERO) == 0)
                                pmap_zero_page(nkpg);
                        paddr = VM_PAGE_TO_PHYS(nkpg);
                        *pdpe = (pdp_entry_t)
                            (paddr | PG_V | PG_RW | PG_A | PG_M);
                        continue; /* try again */
                }
                pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
                if ((*pde & PG_V) != 0) {
                        kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
                        if (kernel_vm_end - 1 >= kernel_map->max_offset) {
                                kernel_vm_end = kernel_map->max_offset;
                                break;
                        }
                        continue;
                }

                nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
                    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
                    VM_ALLOC_ZERO);
                if (nkpg == NULL)
                        panic("pmap_growkernel: no memory to grow kernel");
                if ((nkpg->flags & PG_ZERO) == 0)
                        pmap_zero_page(nkpg);
                paddr = VM_PAGE_TO_PHYS(nkpg);
                newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M);
                pde_store(pde, newpdir);

                kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
                if (kernel_vm_end - 1 >= kernel_map->max_offset) {
                        kernel_vm_end = kernel_map->max_offset;
                        break;
                }
        }
}
1756 ***************************************************/ 1757 1758CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 1759CTASSERT(_NPCM == 3); 1760CTASSERT(_NPCPV == 168); 1761 1762static __inline struct pv_chunk * 1763pv_to_chunk(pv_entry_t pv) 1764{ 1765 1766 return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK); 1767} 1768 1769#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 1770 1771#define PC_FREE0 0xfffffffffffffffful 1772#define PC_FREE1 0xfffffffffffffffful 1773#define PC_FREE2 0x000000fffffffffful 1774 1775static uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; 1776 1777SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 1778 "Current number of pv entries"); 1779 1780#ifdef PV_STATS 1781static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 1782 1783SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 1784 "Current number of pv entry chunks"); 1785SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 1786 "Current number of pv entry chunks allocated"); 1787SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 1788 "Current number of pv entry chunks frees"); 1789SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 1790 "Number of times tried to get a chunk page but failed."); 1791 1792static long pv_entry_frees, pv_entry_allocs; 1793static int pv_entry_spare; 1794 1795SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 1796 "Current number of pv entry frees"); 1797SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 1798 "Current number of pv entry allocs"); 1799SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 1800 "Current number of spare pv entries"); 1801 1802static int pmap_collect_inactive, pmap_collect_active; 1803 1804SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0, 1805 "Current number times pmap_collect called on inactive queue"); 1806SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0, 1807 "Current number times pmap_collect called on active queue"); 1808#endif 1809 1810/* 1811 * We are in a serious low memory condition. Resort to 1812 * drastic measures to free some pages so we can allocate 1813 * another pv entry chunk. This is normally called to 1814 * unmap inactive pages, and if necessary, active pages. 1815 * 1816 * We do not, however, unmap 2mpages because subsequent accesses will 1817 * allocate per-page pv entries until repromotion occurs, thereby 1818 * exacerbating the shortage of free pv entries. 1819 */ 1820static void 1821pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq) 1822{ 1823 struct md_page *pvh; 1824 pd_entry_t *pde; 1825 pmap_t pmap; 1826 pt_entry_t *pte, tpte; 1827 pv_entry_t next_pv, pv; 1828 vm_offset_t va; 1829 vm_page_t m, free; 1830 1831 TAILQ_FOREACH(m, &vpq->pl, pageq) { 1832 if (m->hold_count || m->busy) 1833 continue; 1834 TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) { 1835 va = pv->pv_va; 1836 pmap = PV_PMAP(pv); 1837 /* Avoid deadlock and lock recursion. 

SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
    "Current number of pv entries");

#ifdef PV_STATS
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
    "Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
    "Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
    "Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
    "Number of times tried to get a chunk page but failed.");

static long pv_entry_frees, pv_entry_allocs;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
    "Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
    "Current number of pv entry allocs");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
    "Current number of spare pv entries");

static int pmap_collect_inactive, pmap_collect_active;

SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
    "Current number times pmap_collect called on inactive queue");
SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
    "Current number times pmap_collect called on active queue");
#endif

/*
 * We are in a serious low memory condition.  Resort to
 * drastic measures to free some pages so we can allocate
 * another pv entry chunk.  This is normally called to
 * unmap inactive pages, and if necessary, active pages.
 *
 * We do not, however, unmap 2mpages because subsequent accesses will
 * allocate per-page pv entries until repromotion occurs, thereby
 * exacerbating the shortage of free pv entries.
 */
static void
pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
{
        struct md_page *pvh;
        pd_entry_t *pde;
        pmap_t pmap;
        pt_entry_t *pte, tpte;
        pv_entry_t next_pv, pv;
        vm_offset_t va;
        vm_page_t m, free;

        TAILQ_FOREACH(m, &vpq->pl, pageq) {
                if (m->hold_count || m->busy)
                        continue;
                TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
                        va = pv->pv_va;
                        pmap = PV_PMAP(pv);
                        /* Avoid deadlock and lock recursion. */
                        if (pmap > locked_pmap)
                                PMAP_LOCK(pmap);
                        else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
                                continue;
                        pmap->pm_stats.resident_count--;
                        pde = pmap_pde(pmap, va);
                        KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found"
                            " a 2mpage in page %p's pv list", m));
                        pte = pmap_pde_to_pte(pde, va);
                        tpte = pte_load_clear(pte);
                        KASSERT((tpte & PG_W) == 0,
                            ("pmap_collect: wired pte %#lx", tpte));
                        if (tpte & PG_A)
                                vm_page_flag_set(m, PG_REFERENCED);
                        if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
                                vm_page_dirty(m);
                        free = NULL;
                        pmap_unuse_pt(pmap, va, *pde, &free);
                        pmap_invalidate_page(pmap, va);
                        pmap_free_zero_pages(free);
                        TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
                        if (TAILQ_EMPTY(&m->md.pv_list)) {
                                pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
                                if (TAILQ_EMPTY(&pvh->pv_list))
                                        vm_page_flag_clear(m, PG_WRITEABLE);
                        }
                        free_pv_entry(pmap, pv);
                        if (pmap != locked_pmap)
                                PMAP_UNLOCK(pmap);
                }
        }
}

/*
 * free the pv_entry back to the free list
 */
static void
free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
        vm_page_t m;
        struct pv_chunk *pc;
        int idx, field, bit;

        mtx_assert(&vm_page_queue_mtx, MA_OWNED);
        PMAP_LOCK_ASSERT(pmap, MA_OWNED);
        PV_STAT(pv_entry_frees++);
        PV_STAT(pv_entry_spare++);
        pv_entry_count--;
        pc = pv_to_chunk(pv);
        idx = pv - &pc->pc_pventry[0];
        field = idx / 64;
        bit = idx % 64;
        pc->pc_map[field] |= 1ul << bit;
        /* move to head of list */
        TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
        if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
            pc->pc_map[2] != PC_FREE2) {
                TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
                return;
        }
        PV_STAT(pv_entry_spare -= _NPCPV);
        PV_STAT(pc_chunk_count--);
        PV_STAT(pc_chunk_frees++);
        /* entire chunk is free, return it */
        m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
        dump_drop_page(m->phys_addr);
        vm_page_unwire(m, 0);
        vm_page_free(m);
}

/*
 * get a new pv_entry, allocating a block from the system
 * when needed.
 */
1912 */ 1913static pv_entry_t 1914get_pv_entry(pmap_t pmap, int try) 1915{ 1916 static const struct timeval printinterval = { 60, 0 }; 1917 static struct timeval lastprint; 1918 static vm_pindex_t colour; 1919 struct vpgqueues *pq; 1920 int bit, field; 1921 pv_entry_t pv; 1922 struct pv_chunk *pc; 1923 vm_page_t m; 1924 1925 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1926 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1927 PV_STAT(pv_entry_allocs++); 1928 pv_entry_count++; 1929 if (pv_entry_count > pv_entry_high_water) 1930 if (ratecheck(&lastprint, &printinterval)) 1931 printf("Approaching the limit on PV entries, consider " 1932 "increasing either the vm.pmap.shpgperproc or the " 1933 "vm.pmap.pv_entry_max sysctl.\n"); 1934 pq = NULL; 1935retry: 1936 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 1937 if (pc != NULL) { 1938 for (field = 0; field < _NPCM; field++) { 1939 if (pc->pc_map[field]) { 1940 bit = bsfq(pc->pc_map[field]); 1941 break; 1942 } 1943 } 1944 if (field < _NPCM) { 1945 pv = &pc->pc_pventry[field * 64 + bit]; 1946 pc->pc_map[field] &= ~(1ul << bit); 1947 /* If this was the last item, move it to tail */ 1948 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 1949 pc->pc_map[2] == 0) { 1950 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1951 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 1952 } 1953 PV_STAT(pv_entry_spare--); 1954 return (pv); 1955 } 1956 } 1957 /* No free items, allocate another chunk */ 1958 m = vm_page_alloc(NULL, colour, (pq == &vm_page_queues[PQ_ACTIVE] ? 1959 VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | 1960 VM_ALLOC_WIRED); 1961 if (m == NULL) { 1962 if (try) { 1963 pv_entry_count--; 1964 PV_STAT(pc_chunk_tryfail++); 1965 return (NULL); 1966 } 1967 /* 1968 * Reclaim pv entries: At first, destroy mappings to inactive 1969 * pages. After that, if a pv chunk entry is still needed, 1970 * destroy mappings to active pages. 1971 */ 1972 if (pq == NULL) { 1973 PV_STAT(pmap_collect_inactive++); 1974 pq = &vm_page_queues[PQ_INACTIVE]; 1975 } else if (pq == &vm_page_queues[PQ_INACTIVE]) { 1976 PV_STAT(pmap_collect_active++); 1977 pq = &vm_page_queues[PQ_ACTIVE]; 1978 } else 1979 panic("get_pv_entry: increase vm.pmap.shpgperproc"); 1980 pmap_collect(pmap, pq); 1981 goto retry; 1982 } 1983 PV_STAT(pc_chunk_count++); 1984 PV_STAT(pc_chunk_allocs++); 1985 colour++; 1986 dump_add_page(m->phys_addr); 1987 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 1988 pc->pc_pmap = pmap; 1989 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 1990 pc->pc_map[1] = PC_FREE1; 1991 pc->pc_map[2] = PC_FREE2; 1992 pv = &pc->pc_pventry[0]; 1993 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1994 PV_STAT(pv_entry_spare += _NPCPV - 1); 1995 return (pv); 1996} 1997 1998/* 1999 * First find and then remove the pv entry for the specified pmap and virtual 2000 * address from the specified pv list. Returns the pv entry if found and NULL 2001 * otherwise. This operation can be performed on pv lists for either 4KB or 2002 * 2MB page mappings. 
2003 */ 2004static __inline pv_entry_t 2005pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2006{ 2007 pv_entry_t pv; 2008 2009 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2010 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { 2011 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 2012 TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); 2013 break; 2014 } 2015 } 2016 return (pv); 2017} 2018 2019/* 2020 * After demotion from a 2MB page mapping to 512 4KB page mappings, 2021 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 2022 * entries for each of the 4KB page mappings. 2023 */ 2024static void 2025pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2026{ 2027 struct md_page *pvh; 2028 pv_entry_t pv; 2029 vm_offset_t va_last; 2030 vm_page_t m; 2031 2032 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2033 KASSERT((pa & PDRMASK) == 0, 2034 ("pmap_pv_demote_pde: pa is not 2mpage aligned")); 2035 2036 /* 2037 * Transfer the 2mpage's pv entry for this mapping to the first 2038 * page's pv list. 2039 */ 2040 pvh = pa_to_pvh(pa); 2041 va = trunc_2mpage(va); 2042 pv = pmap_pvh_remove(pvh, pmap, va); 2043 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 2044 m = PHYS_TO_VM_PAGE(pa); 2045 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2046 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 2047 va_last = va + NBPDR - PAGE_SIZE; 2048 do { 2049 m++; 2050 KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0, 2051 ("pmap_pv_demote_pde: page %p is not managed", m)); 2052 va += PAGE_SIZE; 2053 pmap_insert_entry(pmap, va, m); 2054 } while (va < va_last); 2055} 2056 2057/* 2058 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 2059 * replace the many pv entries for the 4KB page mappings by a single pv entry 2060 * for the 2MB page mapping. 2061 */ 2062static void 2063pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2064{ 2065 struct md_page *pvh; 2066 pv_entry_t pv; 2067 vm_offset_t va_last; 2068 vm_page_t m; 2069 2070 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2071 KASSERT((pa & PDRMASK) == 0, 2072 ("pmap_pv_promote_pde: pa is not 2mpage aligned")); 2073 2074 /* 2075 * Transfer the first page's pv entry for this mapping to the 2076 * 2mpage's pv list. Aside from avoiding the cost of a call 2077 * to get_pv_entry(), a transfer avoids the possibility that 2078 * get_pv_entry() calls pmap_collect() and that pmap_collect() 2079 * removes one of the mappings that is being promoted. 2080 */ 2081 m = PHYS_TO_VM_PAGE(pa); 2082 va = trunc_2mpage(va); 2083 pv = pmap_pvh_remove(&m->md, pmap, va); 2084 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 2085 pvh = pa_to_pvh(pa); 2086 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); 2087 /* Free the remaining NPTEPG - 1 pv entries. */ 2088 va_last = va + NBPDR - PAGE_SIZE; 2089 do { 2090 m++; 2091 va += PAGE_SIZE; 2092 pmap_pvh_free(&m->md, pmap, va); 2093 } while (va < va_last); 2094} 2095 2096/* 2097 * First find and then destroy the pv entry for the specified pmap and virtual 2098 * address. This operation can be performed on pv lists for either 4KB or 2MB 2099 * page mappings. 
2100 */ 2101static void 2102pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2103{ 2104 pv_entry_t pv; 2105 2106 pv = pmap_pvh_remove(pvh, pmap, va); 2107 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 2108 free_pv_entry(pmap, pv); 2109} 2110 2111static void 2112pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 2113{ 2114 struct md_page *pvh; 2115 2116 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2117 pmap_pvh_free(&m->md, pmap, va); 2118 if (TAILQ_EMPTY(&m->md.pv_list)) { 2119 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2120 if (TAILQ_EMPTY(&pvh->pv_list)) 2121 vm_page_flag_clear(m, PG_WRITEABLE); 2122 } 2123} 2124 2125/* 2126 * Create a pv entry for page at pa for 2127 * (pmap, va). 2128 */ 2129static void 2130pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2131{ 2132 pv_entry_t pv; 2133 2134 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2135 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2136 pv = get_pv_entry(pmap, FALSE); 2137 pv->pv_va = va; 2138 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2139} 2140 2141/* 2142 * Conditionally create a pv entry. 2143 */ 2144static boolean_t 2145pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2146{ 2147 pv_entry_t pv; 2148 2149 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2150 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2151 if (pv_entry_count < pv_entry_high_water && 2152 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2153 pv->pv_va = va; 2154 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2155 return (TRUE); 2156 } else 2157 return (FALSE); 2158} 2159 2160/* 2161 * Create the pv entry for a 2MB page mapping. 2162 */ 2163static boolean_t 2164pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2165{ 2166 struct md_page *pvh; 2167 pv_entry_t pv; 2168 2169 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2170 if (pv_entry_count < pv_entry_high_water && 2171 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2172 pv->pv_va = va; 2173 pvh = pa_to_pvh(pa); 2174 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); 2175 return (TRUE); 2176 } else 2177 return (FALSE); 2178} 2179 2180/* 2181 * Fills a page table page with mappings to consecutive physical pages. 2182 */ 2183static void 2184pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 2185{ 2186 pt_entry_t *pte; 2187 2188 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 2189 *pte = newpte; 2190 newpte += PAGE_SIZE; 2191 } 2192} 2193 2194/* 2195 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 2196 * mapping is invalidated. 2197 */ 2198static boolean_t 2199pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2200{ 2201 pd_entry_t newpde, oldpde; 2202 pt_entry_t *firstpte, newpte; 2203 vm_paddr_t mptepa; 2204 vm_page_t free, mpte; 2205 2206 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2207 oldpde = *pde; 2208 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 2209 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 2210 mpte = pmap_lookup_pt_page(pmap, va); 2211 if (mpte != NULL) 2212 pmap_remove_pt_page(pmap, mpte); 2213 else { 2214 KASSERT((oldpde & PG_W) == 0, 2215 ("pmap_demote_pde: page table page for a wired mapping" 2216 " is missing")); 2217 2218 /* 2219 * Invalidate the 2MB page mapping and return "failure" if the 2220 * mapping was never accessed or the allocation of the new 2221 * page table page fails. 
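 * A 2MB mapping that was never accessed can simply be destroyed
 * here and recreated on demand by a later page fault, which is
 * cheaper than allocating a page table page just to preserve it.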
2222 */
2223 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
2224 pmap_pde_pindex(va), VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
2225 VM_ALLOC_WIRED)) == NULL) {
2226 free = NULL;
2227 pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free);
2228 pmap_invalidate_page(pmap, trunc_2mpage(va));
2229 pmap_free_zero_pages(free);
2230 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
2231 " in pmap %p", va, pmap);
2232 return (FALSE);
2233 }
2234 }
2235 mptepa = VM_PAGE_TO_PHYS(mpte);
2236 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
2237 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
2238 KASSERT((oldpde & PG_A) != 0,
2239 ("pmap_demote_pde: oldpde is missing PG_A"));
2240 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
2241 ("pmap_demote_pde: oldpde is missing PG_M"));
2242 newpte = oldpde & ~PG_PS;
2243 if ((newpte & PG_PDE_PAT) != 0)
2244 newpte ^= PG_PDE_PAT | PG_PTE_PAT;
2245
2246 /*
2247 * If the page table page is new, initialize it.
2248 */
2249 if (mpte->wire_count == 1) {
2250 mpte->wire_count = NPTEPG;
2251 pmap_fill_ptp(firstpte, newpte);
2252 }
2253 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
2254 ("pmap_demote_pde: firstpte and newpte map different physical"
2255 " addresses"));
2256
2257 /*
2258 * If the mapping has changed attributes, update the page table
2259 * entries.
2260 */
2261 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
2262 pmap_fill_ptp(firstpte, newpte);
2263
2264 /*
2265 * Demote the mapping. This pmap is locked. The old PDE has
2266 * PG_A set. If the old PDE has PG_RW set, it also has PG_M
2267 * set. Thus, there is no danger of a race with another
2268 * processor changing the setting of PG_A and/or PG_M between
2269 * the read above and the store below.
2270 */
2271 pde_store(pde, newpde);
2272
2273 /*
2274 * Invalidate a stale recursive mapping of the page table page.
2275 */
2276 if (va >= VM_MAXUSER_ADDRESS)
2277 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
2278
2279 /*
2280 * Demote the pv entry. This depends on the earlier demotion
2281 * of the mapping. Specifically, the (re)creation of a per-
2282 * page pv entry might trigger the execution of pmap_collect(),
2283 * which might reclaim a newly (re)created per-page pv entry
2284 * and destroy the associated mapping. In order to destroy
2285 * the mapping, the PDE must have already changed from mapping
2286 * the 2mpage to referencing the page table page.
2287 */
2288 if ((oldpde & PG_MANAGED) != 0)
2289 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
2290
2291 pmap_pde_demotions++;
2292 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
2293 " in pmap %p", va, pmap);
2294 return (TRUE);
2295}
2296
2297/*
2298 * pmap_remove_pde: unmap a 2MB superpage from a process's address space
2299 */
2300static int
2301pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
2302 vm_page_t *free)
2303{
2304 struct md_page *pvh;
2305 pd_entry_t oldpde;
2306 vm_offset_t eva, va;
2307 vm_page_t m, mpte;
2308
2309 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2310 KASSERT((sva & PDRMASK) == 0,
2311 ("pmap_remove_pde: sva is not 2mpage aligned"));
2312 oldpde = pte_load_clear(pdq);
2313 if (oldpde & PG_W)
2314 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
2315
2316 /*
2317 * Machines that don't support invlpg also don't support
2318 * PG_G.
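 * Conversely, a PG_G mapping is global to all address spaces and is
 * not flushed by an ordinary reload of %cr3, so its TLB entry must
 * be invalidated explicitly, against the kernel pmap.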
2319 */
2320 if (oldpde & PG_G)
2321 pmap_invalidate_page(kernel_pmap, sva);
2322 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2323 if (oldpde & PG_MANAGED) {
2324 pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
2325 pmap_pvh_free(pvh, pmap, sva);
2326 eva = sva + NBPDR;
2327 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
2328 va < eva; va += PAGE_SIZE, m++) {
2329 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
2330 vm_page_dirty(m);
2331 if (oldpde & PG_A)
2332 vm_page_flag_set(m, PG_REFERENCED);
2333 if (TAILQ_EMPTY(&m->md.pv_list) &&
2334 TAILQ_EMPTY(&pvh->pv_list))
2335 vm_page_flag_clear(m, PG_WRITEABLE);
2336 }
2337 }
2338 if (pmap == kernel_pmap) {
2339 if (!pmap_demote_pde(pmap, pdq, sva))
2340 panic("pmap_remove_pde: failed demotion");
2341 } else {
2342 mpte = pmap_lookup_pt_page(pmap, sva);
2343 if (mpte != NULL) {
2344 pmap_remove_pt_page(pmap, mpte);
2345 pmap->pm_stats.resident_count--;
2346 KASSERT(mpte->wire_count == NPTEPG,
2347 ("pmap_remove_pde: pte page wire count error"));
2348 mpte->wire_count = 0;
2349 pmap_add_delayed_free_list(mpte, free, FALSE);
2350 atomic_subtract_int(&cnt.v_wire_count, 1);
2351 }
2352 }
2353 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
2354}
2355
2356/*
2357 * pmap_remove_pte: unmap a single page from a process's address space
2358 */
2359static int
2360pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
2361 pd_entry_t ptepde, vm_page_t *free)
2362{
2363 pt_entry_t oldpte;
2364 vm_page_t m;
2365
2366 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2367 oldpte = pte_load_clear(ptq);
2368 if (oldpte & PG_W)
2369 pmap->pm_stats.wired_count -= 1;
2370 /*
2371 * Machines that don't support invlpg also don't support
2372 * PG_G.
2373 */
2374 if (oldpte & PG_G)
2375 pmap_invalidate_page(kernel_pmap, va);
2376 pmap->pm_stats.resident_count -= 1;
2377 if (oldpte & PG_MANAGED) {
2378 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
2379 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2380 vm_page_dirty(m);
2381 if (oldpte & PG_A)
2382 vm_page_flag_set(m, PG_REFERENCED);
2383 pmap_remove_entry(pmap, m, va);
2384 }
2385 return (pmap_unuse_pt(pmap, va, ptepde, free));
2386}
2387
2388/*
2389 * Remove a single page from a process address space
2390 */
2391static void
2392pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free)
2393{
2394 pt_entry_t *pte;
2395
2396 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2397 if ((*pde & PG_V) == 0)
2398 return;
2399 pte = pmap_pde_to_pte(pde, va);
2400 if ((*pte & PG_V) == 0)
2401 return;
2402 pmap_remove_pte(pmap, pte, va, *pde, free);
2403 pmap_invalidate_page(pmap, va);
2404}
2405
2406/*
2407 * Remove the given range of addresses from the specified map.
2408 *
2409 * It is assumed that the start and end are properly
2410 * rounded to the page size.
2411 */
2412void
2413pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2414{
2415 vm_offset_t va_next;
2416 pml4_entry_t *pml4e;
2417 pdp_entry_t *pdpe;
2418 pd_entry_t ptpaddr, *pde;
2419 pt_entry_t *pte;
2420 vm_page_t free = NULL;
2421 int anyvalid;
2422
2423 /*
2424 * Perform an unsynchronized read. This is, however, safe.
2425 */
2426 if (pmap->pm_stats.resident_count == 0)
2427 return;
2428
2429 anyvalid = 0;
2430
2431 vm_page_lock_queues();
2432 PMAP_LOCK(pmap);
2433
2434 /*
2435 * Special handling for removing a single page: it is a very
2436 * common operation, and much of the code below can be
2437 * short-circuited.
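 * The fast path applies only when the PDE maps a page table rather
 * than a 2MB page; otherwise we fall through to the general loop
 * below.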
2438 */ 2439 if (sva + PAGE_SIZE == eva) { 2440 pde = pmap_pde(pmap, sva); 2441 if (pde && (*pde & PG_PS) == 0) { 2442 pmap_remove_page(pmap, sva, pde, &free); 2443 goto out; 2444 } 2445 } 2446 2447 for (; sva < eva; sva = va_next) { 2448 2449 if (pmap->pm_stats.resident_count == 0) 2450 break; 2451 2452 pml4e = pmap_pml4e(pmap, sva); 2453 if ((*pml4e & PG_V) == 0) { 2454 va_next = (sva + NBPML4) & ~PML4MASK; 2455 if (va_next < sva) 2456 va_next = eva; 2457 continue; 2458 } 2459 2460 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2461 if ((*pdpe & PG_V) == 0) { 2462 va_next = (sva + NBPDP) & ~PDPMASK; 2463 if (va_next < sva) 2464 va_next = eva; 2465 continue; 2466 } 2467 2468 /* 2469 * Calculate index for next page table. 2470 */ 2471 va_next = (sva + NBPDR) & ~PDRMASK; 2472 if (va_next < sva) 2473 va_next = eva; 2474 2475 pde = pmap_pdpe_to_pde(pdpe, sva); 2476 ptpaddr = *pde; 2477 2478 /* 2479 * Weed out invalid mappings. 2480 */ 2481 if (ptpaddr == 0) 2482 continue; 2483 2484 /* 2485 * Check for large page. 2486 */ 2487 if ((ptpaddr & PG_PS) != 0) { 2488 /* 2489 * Are we removing the entire large page? If not, 2490 * demote the mapping and fall through. 2491 */ 2492 if (sva + NBPDR == va_next && eva >= va_next) { 2493 /* 2494 * The TLB entry for a PG_G mapping is 2495 * invalidated by pmap_remove_pde(). 2496 */ 2497 if ((ptpaddr & PG_G) == 0) 2498 anyvalid = 1; 2499 pmap_remove_pde(pmap, pde, sva, &free); 2500 continue; 2501 } else if (!pmap_demote_pde(pmap, pde, sva)) { 2502 /* The large page mapping was destroyed. */ 2503 continue; 2504 } else 2505 ptpaddr = *pde; 2506 } 2507 2508 /* 2509 * Limit our scan to either the end of the va represented 2510 * by the current page table page, or to the end of the 2511 * range being removed. 2512 */ 2513 if (va_next > eva) 2514 va_next = eva; 2515 2516 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2517 sva += PAGE_SIZE) { 2518 if (*pte == 0) 2519 continue; 2520 2521 /* 2522 * The TLB entry for a PG_G mapping is invalidated 2523 * by pmap_remove_pte(). 2524 */ 2525 if ((*pte & PG_G) == 0) 2526 anyvalid = 1; 2527 if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free)) 2528 break; 2529 } 2530 } 2531out: 2532 if (anyvalid) 2533 pmap_invalidate_all(pmap); 2534 vm_page_unlock_queues(); 2535 PMAP_UNLOCK(pmap); 2536 pmap_free_zero_pages(free); 2537} 2538 2539/* 2540 * Routine: pmap_remove_all 2541 * Function: 2542 * Removes this physical page from 2543 * all physical maps in which it resides. 2544 * Reflects back modify bits to the pager. 2545 * 2546 * Notes: 2547 * Original versions of this routine were very 2548 * inefficient because they iteratively called 2549 * pmap_remove (slow...) 
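 *	The present version instead walks the page's pv lists
 *	directly: any 2mpage mappings are first demoted to 4KB
 *	mappings, and each remaining 4KB mapping is then removed
 *	individually.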
2550 */ 2551 2552void 2553pmap_remove_all(vm_page_t m) 2554{ 2555 struct md_page *pvh; 2556 pv_entry_t pv; 2557 pmap_t pmap; 2558 pt_entry_t *pte, tpte; 2559 pd_entry_t *pde; 2560 vm_offset_t va; 2561 vm_page_t free; 2562 2563 KASSERT((m->flags & PG_FICTITIOUS) == 0, 2564 ("pmap_remove_all: page %p is fictitious", m)); 2565 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2566 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2567 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 2568 va = pv->pv_va; 2569 pmap = PV_PMAP(pv); 2570 PMAP_LOCK(pmap); 2571 pde = pmap_pde(pmap, va); 2572 (void)pmap_demote_pde(pmap, pde, va); 2573 PMAP_UNLOCK(pmap); 2574 } 2575 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2576 pmap = PV_PMAP(pv); 2577 PMAP_LOCK(pmap); 2578 pmap->pm_stats.resident_count--; 2579 pde = pmap_pde(pmap, pv->pv_va); 2580 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 2581 " a 2mpage in page %p's pv list", m)); 2582 pte = pmap_pde_to_pte(pde, pv->pv_va); 2583 tpte = pte_load_clear(pte); 2584 if (tpte & PG_W) 2585 pmap->pm_stats.wired_count--; 2586 if (tpte & PG_A) 2587 vm_page_flag_set(m, PG_REFERENCED); 2588 2589 /* 2590 * Update the vm_page_t clean and reference bits. 2591 */ 2592 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2593 vm_page_dirty(m); 2594 free = NULL; 2595 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); 2596 pmap_invalidate_page(pmap, pv->pv_va); 2597 pmap_free_zero_pages(free); 2598 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2599 free_pv_entry(pmap, pv); 2600 PMAP_UNLOCK(pmap); 2601 } 2602 vm_page_flag_clear(m, PG_WRITEABLE); 2603} 2604 2605/* 2606 * pmap_protect_pde: do the things to protect a 2mpage in a process 2607 */ 2608static boolean_t 2609pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 2610{ 2611 pd_entry_t newpde, oldpde; 2612 vm_offset_t eva, va; 2613 vm_page_t m; 2614 boolean_t anychanged; 2615 2616 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2617 KASSERT((sva & PDRMASK) == 0, 2618 ("pmap_protect_pde: sva is not 2mpage aligned")); 2619 anychanged = FALSE; 2620retry: 2621 oldpde = newpde = *pde; 2622 if (oldpde & PG_MANAGED) { 2623 eva = sva + NBPDR; 2624 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 2625 va < eva; va += PAGE_SIZE, m++) { 2626 /* 2627 * In contrast to the analogous operation on a 4KB page 2628 * mapping, the mapping's PG_A flag is not cleared and 2629 * the page's PG_REFERENCED flag is not set. The 2630 * reason is that pmap_demote_pde() expects that a 2MB 2631 * page mapping with a stored page table page has PG_A 2632 * set. 2633 */ 2634 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2635 vm_page_dirty(m); 2636 } 2637 } 2638 if ((prot & VM_PROT_WRITE) == 0) 2639 newpde &= ~(PG_RW | PG_M); 2640 if ((prot & VM_PROT_EXECUTE) == 0) 2641 newpde |= pg_nx; 2642 if (newpde != oldpde) { 2643 if (!atomic_cmpset_long(pde, oldpde, newpde)) 2644 goto retry; 2645 if (oldpde & PG_G) 2646 pmap_invalidate_page(pmap, sva); 2647 else 2648 anychanged = TRUE; 2649 } 2650 return (anychanged); 2651} 2652 2653/* 2654 * Set the physical protection on the 2655 * specified range of this map as requested. 
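 *	Two cases are short-circuited below: removing all access
 *	simply unmaps the range, and a request that removes neither
 *	write nor execute access is a no-op, because pmap_protect()
 *	only ever narrows permissions.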
2656 */ 2657void 2658pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2659{ 2660 vm_offset_t va_next; 2661 pml4_entry_t *pml4e; 2662 pdp_entry_t *pdpe; 2663 pd_entry_t ptpaddr, *pde; 2664 pt_entry_t *pte; 2665 int anychanged; 2666 2667 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2668 pmap_remove(pmap, sva, eva); 2669 return; 2670 } 2671 2672 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 2673 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 2674 return; 2675 2676 anychanged = 0; 2677 2678 vm_page_lock_queues(); 2679 PMAP_LOCK(pmap); 2680 for (; sva < eva; sva = va_next) { 2681 2682 pml4e = pmap_pml4e(pmap, sva); 2683 if ((*pml4e & PG_V) == 0) { 2684 va_next = (sva + NBPML4) & ~PML4MASK; 2685 if (va_next < sva) 2686 va_next = eva; 2687 continue; 2688 } 2689 2690 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2691 if ((*pdpe & PG_V) == 0) { 2692 va_next = (sva + NBPDP) & ~PDPMASK; 2693 if (va_next < sva) 2694 va_next = eva; 2695 continue; 2696 } 2697 2698 va_next = (sva + NBPDR) & ~PDRMASK; 2699 if (va_next < sva) 2700 va_next = eva; 2701 2702 pde = pmap_pdpe_to_pde(pdpe, sva); 2703 ptpaddr = *pde; 2704 2705 /* 2706 * Weed out invalid mappings. 2707 */ 2708 if (ptpaddr == 0) 2709 continue; 2710 2711 /* 2712 * Check for large page. 2713 */ 2714 if ((ptpaddr & PG_PS) != 0) { 2715 /* 2716 * Are we protecting the entire large page? If not, 2717 * demote the mapping and fall through. 2718 */ 2719 if (sva + NBPDR == va_next && eva >= va_next) { 2720 /* 2721 * The TLB entry for a PG_G mapping is 2722 * invalidated by pmap_protect_pde(). 2723 */ 2724 if (pmap_protect_pde(pmap, pde, sva, prot)) 2725 anychanged = 1; 2726 continue; 2727 } else if (!pmap_demote_pde(pmap, pde, sva)) { 2728 /* The large page mapping was destroyed. */ 2729 continue; 2730 } 2731 } 2732 2733 if (va_next > eva) 2734 va_next = eva; 2735 2736 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 2737 sva += PAGE_SIZE) { 2738 pt_entry_t obits, pbits; 2739 vm_page_t m; 2740 2741retry: 2742 obits = pbits = *pte; 2743 if ((pbits & PG_V) == 0) 2744 continue; 2745 if (pbits & PG_MANAGED) { 2746 m = NULL; 2747 if (pbits & PG_A) { 2748 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 2749 vm_page_flag_set(m, PG_REFERENCED); 2750 pbits &= ~PG_A; 2751 } 2752 if ((pbits & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 2753 if (m == NULL) 2754 m = PHYS_TO_VM_PAGE(pbits & 2755 PG_FRAME); 2756 vm_page_dirty(m); 2757 } 2758 } 2759 2760 if ((prot & VM_PROT_WRITE) == 0) 2761 pbits &= ~(PG_RW | PG_M); 2762 if ((prot & VM_PROT_EXECUTE) == 0) 2763 pbits |= pg_nx; 2764 2765 if (pbits != obits) { 2766 if (!atomic_cmpset_long(pte, obits, pbits)) 2767 goto retry; 2768 if (obits & PG_G) 2769 pmap_invalidate_page(pmap, sva); 2770 else 2771 anychanged = 1; 2772 } 2773 } 2774 } 2775 if (anychanged) 2776 pmap_invalidate_all(pmap); 2777 vm_page_unlock_queues(); 2778 PMAP_UNLOCK(pmap); 2779} 2780 2781/* 2782 * Tries to promote the 512, contiguous 4KB page mappings that are within a 2783 * single page table page (PTP) to a single 2MB page mapping. For promotion 2784 * to occur, two conditions must be met: (1) the 4KB page mappings must map 2785 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 2786 * identical characteristics. 
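 * Condition (1) is cheap to verify: the first PTE must map the first
 * 4KB page of some 2MB frame, i.e. (*firstpte & (PG_FRAME & PDRMASK))
 * must be zero, and every other PTE in the PTP must then map the
 * physically corresponding 4KB page with identical attributes.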
2787 */ 2788static void 2789pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2790{ 2791 pd_entry_t newpde; 2792 pt_entry_t *firstpte, oldpte, pa, *pte; 2793 vm_offset_t oldpteva; 2794 vm_page_t mpte; 2795 2796 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2797 2798 /* 2799 * Examine the first PTE in the specified PTP. Abort if this PTE is 2800 * either invalid, unused, or does not map the first 4KB physical page 2801 * within a 2MB page. 2802 */ 2803 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 2804setpde: 2805 newpde = *firstpte; 2806 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 2807 pmap_pde_p_failures++; 2808 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 2809 " in pmap %p", va, pmap); 2810 return; 2811 } 2812 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 2813 /* 2814 * When PG_M is already clear, PG_RW can be cleared without 2815 * a TLB invalidation. 2816 */ 2817 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW)) 2818 goto setpde; 2819 newpde &= ~PG_RW; 2820 } 2821 2822 /* 2823 * Examine each of the other PTEs in the specified PTP. Abort if this 2824 * PTE maps an unexpected 4KB physical page or does not have identical 2825 * characteristics to the first PTE. 2826 */ 2827 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; 2828 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 2829setpte: 2830 oldpte = *pte; 2831 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 2832 pmap_pde_p_failures++; 2833 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 2834 " in pmap %p", va, pmap); 2835 return; 2836 } 2837 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 2838 /* 2839 * When PG_M is already clear, PG_RW can be cleared 2840 * without a TLB invalidation. 2841 */ 2842 if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW)) 2843 goto setpte; 2844 oldpte &= ~PG_RW; 2845 oldpteva = (oldpte & PG_FRAME & PDRMASK) | 2846 (va & ~PDRMASK); 2847 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 2848 " in pmap %p", oldpteva, pmap); 2849 } 2850 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 2851 pmap_pde_p_failures++; 2852 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 2853 " in pmap %p", va, pmap); 2854 return; 2855 } 2856 pa -= PAGE_SIZE; 2857 } 2858 2859 /* 2860 * Save the page table page in its current state until the PDE 2861 * mapping the superpage is demoted by pmap_demote_pde() or 2862 * destroyed by pmap_remove_pde(). 2863 */ 2864 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 2865 KASSERT(mpte >= vm_page_array && 2866 mpte < &vm_page_array[vm_page_array_size], 2867 ("pmap_promote_pde: page table page is out of range")); 2868 KASSERT(mpte->pindex == pmap_pde_pindex(va), 2869 ("pmap_promote_pde: page table page's pindex is wrong")); 2870 pmap_insert_pt_page(pmap, mpte); 2871 2872 /* 2873 * Promote the pv entries. 2874 */ 2875 if ((newpde & PG_MANAGED) != 0) 2876 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); 2877 2878 /* 2879 * Propagate the PAT index to its proper position. 2880 */ 2881 if ((newpde & PG_PTE_PAT) != 0) 2882 newpde ^= PG_PDE_PAT | PG_PTE_PAT; 2883 2884 /* 2885 * Map the superpage. 2886 */ 2887 pde_store(pde, PG_PS | newpde); 2888 2889 pmap_pde_promotions++; 2890 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" 2891 " in pmap %p", va, pmap); 2892} 2893 2894/* 2895 * Insert the given physical page (p) at 2896 * the specified virtual address (v) in the 2897 * target physical map with the protection requested. 
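 *	The mapping is always created with 4KB granularity; if it
 *	completes a fully populated page table page and reservation,
 *	promotion to a 2MB page mapping is attempted afterward.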
2898 * 2899 * If specified, the page will be wired down, meaning 2900 * that the related pte can not be reclaimed. 2901 * 2902 * NB: This is the only routine which MAY NOT lazy-evaluate 2903 * or lose information. That is, this routine must actually 2904 * insert this page into the given map NOW. 2905 */ 2906void 2907pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m, 2908 vm_prot_t prot, boolean_t wired) 2909{ 2910 vm_paddr_t pa; 2911 pd_entry_t *pde; 2912 pt_entry_t *pte; 2913 vm_paddr_t opa; 2914 pt_entry_t origpte, newpte; 2915 vm_page_t mpte, om; 2916 boolean_t invlva; 2917 2918 va = trunc_page(va); 2919 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 2920 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 2921 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va)); 2922 2923 mpte = NULL; 2924 2925 vm_page_lock_queues(); 2926 PMAP_LOCK(pmap); 2927 2928 /* 2929 * In the case that a page table page is not 2930 * resident, we are creating it here. 2931 */ 2932 if (va < VM_MAXUSER_ADDRESS) { 2933 mpte = pmap_allocpte(pmap, va, M_WAITOK); 2934 } 2935 2936 pde = pmap_pde(pmap, va); 2937 if (pde != NULL && (*pde & PG_V) != 0) { 2938 if ((*pde & PG_PS) != 0) 2939 panic("pmap_enter: attempted pmap_enter on 2MB page"); 2940 pte = pmap_pde_to_pte(pde, va); 2941 } else 2942 panic("pmap_enter: invalid page directory va=%#lx", va); 2943 2944 pa = VM_PAGE_TO_PHYS(m); 2945 om = NULL; 2946 origpte = *pte; 2947 opa = origpte & PG_FRAME; 2948 2949 /* 2950 * Mapping has not changed, must be protection or wiring change. 2951 */ 2952 if (origpte && (opa == pa)) { 2953 /* 2954 * Wiring change, just update stats. We don't worry about 2955 * wiring PT pages as they remain resident as long as there 2956 * are valid mappings in them. Hence, if a user page is wired, 2957 * the PT page will be also. 2958 */ 2959 if (wired && ((origpte & PG_W) == 0)) 2960 pmap->pm_stats.wired_count++; 2961 else if (!wired && (origpte & PG_W)) 2962 pmap->pm_stats.wired_count--; 2963 2964 /* 2965 * Remove extra pte reference 2966 */ 2967 if (mpte) 2968 mpte->wire_count--; 2969 2970 /* 2971 * We might be turning off write access to the page, 2972 * so we go ahead and sense modify status. 2973 */ 2974 if (origpte & PG_MANAGED) { 2975 om = m; 2976 pa |= PG_MANAGED; 2977 } 2978 goto validate; 2979 } 2980 /* 2981 * Mapping has changed, invalidate old range and fall through to 2982 * handle validating new mapping. 2983 */ 2984 if (opa) { 2985 if (origpte & PG_W) 2986 pmap->pm_stats.wired_count--; 2987 if (origpte & PG_MANAGED) { 2988 om = PHYS_TO_VM_PAGE(opa); 2989 pmap_remove_entry(pmap, om, va); 2990 } 2991 if (mpte != NULL) { 2992 mpte->wire_count--; 2993 KASSERT(mpte->wire_count > 0, 2994 ("pmap_enter: missing reference to page table page," 2995 " va: 0x%lx", va)); 2996 } 2997 } else 2998 pmap->pm_stats.resident_count++; 2999 3000 /* 3001 * Enter on the PV list if part of our managed memory. 3002 */ 3003 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { 3004 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, 3005 ("pmap_enter: managed mapping within the clean submap")); 3006 pmap_insert_entry(pmap, va, m); 3007 pa |= PG_MANAGED; 3008 } 3009 3010 /* 3011 * Increment counters 3012 */ 3013 if (wired) 3014 pmap->pm_stats.wired_count++; 3015 3016validate: 3017 /* 3018 * Now validate mapping with desired protection/wiring. 
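 * When a valid mapping is being replaced, the old TLB entry needs to
 * be flushed only if it may have been cached: a referenced (PG_A)
 * entry whose physical frame or execute permission changed, or a
 * dirty (PG_M) writable entry that is losing PG_RW.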
3019 */ 3020 newpte = (pt_entry_t)(pa | PG_V); 3021 if ((prot & VM_PROT_WRITE) != 0) { 3022 newpte |= PG_RW; 3023 vm_page_flag_set(m, PG_WRITEABLE); 3024 } 3025 if ((prot & VM_PROT_EXECUTE) == 0) 3026 newpte |= pg_nx; 3027 if (wired) 3028 newpte |= PG_W; 3029 if (va < VM_MAXUSER_ADDRESS) 3030 newpte |= PG_U; 3031 if (pmap == kernel_pmap) 3032 newpte |= PG_G; 3033 3034 /* 3035 * if the mapping or permission bits are different, we need 3036 * to update the pte. 3037 */ 3038 if ((origpte & ~(PG_M|PG_A)) != newpte) { 3039 newpte |= PG_A; 3040 if ((access & VM_PROT_WRITE) != 0) 3041 newpte |= PG_M; 3042 if (origpte & PG_V) { 3043 invlva = FALSE; 3044 origpte = pte_load_store(pte, newpte); 3045 if (origpte & PG_A) { 3046 if (origpte & PG_MANAGED) 3047 vm_page_flag_set(om, PG_REFERENCED); 3048 if (opa != VM_PAGE_TO_PHYS(m) || ((origpte & 3049 PG_NX) == 0 && (newpte & PG_NX))) 3050 invlva = TRUE; 3051 } 3052 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 3053 if ((origpte & PG_MANAGED) != 0) 3054 vm_page_dirty(om); 3055 if ((newpte & PG_RW) == 0) 3056 invlva = TRUE; 3057 } 3058 if (invlva) 3059 pmap_invalidate_page(pmap, va); 3060 } else 3061 pte_store(pte, newpte); 3062 } 3063 3064 /* 3065 * If both the page table page and the reservation are fully 3066 * populated, then attempt promotion. 3067 */ 3068 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 3069 pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0) 3070 pmap_promote_pde(pmap, pde, va); 3071 3072 vm_page_unlock_queues(); 3073 PMAP_UNLOCK(pmap); 3074} 3075 3076/* 3077 * Tries to create a 2MB page mapping. Returns TRUE if successful and FALSE 3078 * otherwise. Fails if (1) a page table page cannot be allocated without 3079 * blocking, (2) a mapping already exists at the specified virtual address, or 3080 * (3) a pv entry cannot be allocated without reclaiming another pv entry. 3081 */ 3082static boolean_t 3083pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3084{ 3085 pd_entry_t *pde, newpde; 3086 vm_page_t free, mpde; 3087 3088 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3089 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3090 if ((mpde = pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) { 3091 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3092 " in pmap %p", va, pmap); 3093 return (FALSE); 3094 } 3095 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde)); 3096 pde = &pde[pmap_pde_index(va)]; 3097 if ((*pde & PG_V) != 0) { 3098 KASSERT(mpde->wire_count > 1, 3099 ("pmap_enter_pde: mpde's wire count is too low")); 3100 mpde->wire_count--; 3101 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3102 " in pmap %p", va, pmap); 3103 return (FALSE); 3104 } 3105 newpde = VM_PAGE_TO_PHYS(m) | PG_PS | PG_V; 3106 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { 3107 newpde |= PG_MANAGED; 3108 3109 /* 3110 * Abort this mapping if its PV entry could not be created. 3111 */ 3112 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) { 3113 free = NULL; 3114 if (pmap_unwire_pte_hold(pmap, va, mpde, &free)) { 3115 pmap_invalidate_page(pmap, va); 3116 pmap_free_zero_pages(free); 3117 } 3118 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3119 " in pmap %p", va, pmap); 3120 return (FALSE); 3121 } 3122 } 3123 if ((prot & VM_PROT_EXECUTE) == 0) 3124 newpde |= pg_nx; 3125 if (va < VM_MAXUSER_ADDRESS) 3126 newpde |= PG_U; 3127 3128 /* 3129 * Increment counters. 3130 */ 3131 pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; 3132 3133 /* 3134 * Map the superpage. 
3135 */
3136 pde_store(pde, newpde);
3137
3138 pmap_pde_mappings++;
3139 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3140 " in pmap %p", va, pmap);
3141 return (TRUE);
3142}
3143
3144/*
3145 * Maps a sequence of resident pages belonging to the same object.
3146 * The sequence begins with the given page m_start. This page is
3147 * mapped at the given virtual address start. Each subsequent page is
3148 * mapped at a virtual address that is offset from start by the same
3149 * amount as the page is offset from m_start within the object. The
3150 * last page in the sequence is the page with the largest offset from
3151 * m_start that can be mapped at a virtual address less than the given
3152 * virtual address end. Not every virtual page between start and end
3153 * is mapped; only those for which a resident page exists with the
3154 * corresponding offset from m_start are mapped.
3155 */
3156void
3157pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3158 vm_page_t m_start, vm_prot_t prot)
3159{
3160 vm_offset_t va;
3161 vm_page_t m, mpte;
3162 vm_pindex_t diff, psize;
3163
3164 VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
3165 psize = atop(end - start);
3166 mpte = NULL;
3167 m = m_start;
3168 PMAP_LOCK(pmap);
3169 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3170 va = start + ptoa(diff);
3171 if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
3172 (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 &&
3173 pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 &&
3174 pmap_enter_pde(pmap, va, m, prot))
3175 m = &m[NBPDR / PAGE_SIZE - 1];
3176 else
3177 mpte = pmap_enter_quick_locked(pmap, va, m, prot,
3178 mpte);
3179 m = TAILQ_NEXT(m, listq);
3180 }
3181 PMAP_UNLOCK(pmap);
3182}
3183
3184/*
3185 * This code makes some *MAJOR* assumptions:
3186 * 1. The current pmap and the given pmap exist.
3187 * 2. Not wired.
3188 * 3. Read access.
3189 * 4. No page table pages.
3190 * But it is *MUCH* faster than pmap_enter...
3191 */
3192
3193void
3194pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3195{
3196
3197 PMAP_LOCK(pmap);
3198 (void) pmap_enter_quick_locked(pmap, va, m, prot, NULL);
3199 PMAP_UNLOCK(pmap);
3200}
3201
3202static vm_page_t
3203pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3204 vm_prot_t prot, vm_page_t mpte)
3205{
3206 vm_page_t free;
3207 pt_entry_t *pte;
3208 vm_paddr_t pa;
3209
3210 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3211 (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0,
3212 ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3213 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3214 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3215
3216 /*
3217 * In the case that a page table page is not
3218 * resident, we are creating it here.
3219 */
3220 if (va < VM_MAXUSER_ADDRESS) {
3221 vm_pindex_t ptepindex;
3222 pd_entry_t *ptepa;
3223
3224 /*
3225 * Calculate pagetable page index
3226 */
3227 ptepindex = pmap_pde_pindex(va);
3228 if (mpte && (mpte->pindex == ptepindex)) {
3229 mpte->wire_count++;
3230 } else {
3231 /*
3232 * Get the page directory entry
3233 */
3234 ptepa = pmap_pde(pmap, va);
3235
3236 /*
3237 * If the page table page is mapped, we just increment
3238 * the hold count, and activate it.
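 * If the PDE maps a 2MB page instead, the address is already
 * covered and the quick entry is abandoned.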
3239 */
3240 if (ptepa && (*ptepa & PG_V) != 0) {
3241 if (*ptepa & PG_PS)
3242 return (NULL);
3243 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
3244 mpte->wire_count++;
3245 } else {
3246 mpte = _pmap_allocpte(pmap, ptepindex,
3247 M_NOWAIT);
3248 if (mpte == NULL)
3249 return (mpte);
3250 }
3251 }
3252 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
3253 pte = &pte[pmap_pte_index(va)];
3254 } else {
3255 mpte = NULL;
3256 pte = vtopte(va);
3257 }
3258 if (*pte) {
3259 if (mpte != NULL) {
3260 mpte->wire_count--;
3261 mpte = NULL;
3262 }
3263 return (mpte);
3264 }
3265
3266 /*
3267 * Enter on the PV list if part of our managed memory.
3268 */
3269 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 &&
3270 !pmap_try_insert_pv_entry(pmap, va, m)) {
3271 if (mpte != NULL) {
3272 free = NULL;
3273 if (pmap_unwire_pte_hold(pmap, va, mpte, &free)) {
3274 pmap_invalidate_page(pmap, va);
3275 pmap_free_zero_pages(free);
3276 }
3277 mpte = NULL;
3278 }
3279 return (mpte);
3280 }
3281
3282 /*
3283 * Increment counters
3284 */
3285 pmap->pm_stats.resident_count++;
3286
3287 pa = VM_PAGE_TO_PHYS(m);
3288 if ((prot & VM_PROT_EXECUTE) == 0)
3289 pa |= pg_nx;
3290
3291 /*
3292 * Now validate mapping with RO protection
3293 */
3294 if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
3295 pte_store(pte, pa | PG_V | PG_U);
3296 else
3297 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
3298 return (mpte);
3299}
3300
3301/*
3302 * Make a temporary mapping for a physical address. This is only intended
3303 * to be used for panic dumps.
3304 */
3305void *
3306pmap_kenter_temporary(vm_paddr_t pa, int i)
3307{
3308 vm_offset_t va;
3309
3310 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
3311 pmap_kenter(va, pa);
3312 invlpg(va);
3313 return ((void *)crashdumpmap);
3314}
3315
3316/*
3317 * This code maps large physical mmap regions into the
3318 * processor address space. Note that some shortcuts
3319 * are taken, but the code works.
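 * In particular, only device objects are supported, and only when
 * both the address and the size are 2MB aligned, so that the region
 * can be mapped entirely with 2MB page mappings.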
3320 */ 3321void 3322pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3323 vm_pindex_t pindex, vm_size_t size) 3324{ 3325 vm_offset_t va; 3326 vm_page_t p, pdpg; 3327 3328 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 3329 KASSERT(object->type == OBJT_DEVICE, 3330 ("pmap_object_init_pt: non-device object")); 3331 if (((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) { 3332 vm_page_t m[1]; 3333 pd_entry_t ptepa, *pde; 3334 3335 PMAP_LOCK(pmap); 3336 pde = pmap_pde(pmap, addr); 3337 if (pde != 0 && (*pde & PG_V) != 0) 3338 goto out; 3339 PMAP_UNLOCK(pmap); 3340retry: 3341 p = vm_page_lookup(object, pindex); 3342 if (p != NULL) { 3343 if (vm_page_sleep_if_busy(p, FALSE, "init4p")) 3344 goto retry; 3345 } else { 3346 p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); 3347 if (p == NULL) 3348 return; 3349 m[0] = p; 3350 3351 if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { 3352 vm_page_lock_queues(); 3353 vm_page_free(p); 3354 vm_page_unlock_queues(); 3355 return; 3356 } 3357 3358 p = vm_page_lookup(object, pindex); 3359 vm_page_wakeup(p); 3360 } 3361 3362 ptepa = VM_PAGE_TO_PHYS(p); 3363 if (ptepa & (NBPDR - 1)) 3364 return; 3365 3366 p->valid = VM_PAGE_BITS_ALL; 3367 3368 PMAP_LOCK(pmap); 3369 for (va = addr; va < addr + size; va += NBPDR) { 3370 while ((pdpg = 3371 pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) { 3372 PMAP_UNLOCK(pmap); 3373 vm_page_busy(p); 3374 VM_OBJECT_UNLOCK(object); 3375 VM_WAIT; 3376 VM_OBJECT_LOCK(object); 3377 vm_page_wakeup(p); 3378 PMAP_LOCK(pmap); 3379 } 3380 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 3381 pde = &pde[pmap_pde_index(va)]; 3382 if ((*pde & PG_V) == 0) { 3383 pde_store(pde, ptepa | PG_PS | PG_M | PG_A | 3384 PG_U | PG_RW | PG_V); 3385 pmap->pm_stats.resident_count += 3386 NBPDR / PAGE_SIZE; 3387 } else { 3388 pdpg->wire_count--; 3389 KASSERT(pdpg->wire_count > 0, 3390 ("pmap_object_init_pt: missing reference " 3391 "to page directory page, va: 0x%lx", va)); 3392 } 3393 ptepa += NBPDR; 3394 } 3395 pmap_invalidate_all(pmap); 3396out: 3397 PMAP_UNLOCK(pmap); 3398 } 3399} 3400 3401/* 3402 * Routine: pmap_change_wiring 3403 * Function: Change the wiring attribute for a map/virtual-address 3404 * pair. 3405 * In/out conditions: 3406 * The mapping must already exist in the pmap. 3407 */ 3408void 3409pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) 3410{ 3411 pd_entry_t *pde; 3412 pt_entry_t *pte; 3413 boolean_t are_queues_locked; 3414 3415 are_queues_locked = FALSE; 3416 3417 /* 3418 * Wiring is not a hardware characteristic so there is no need to 3419 * invalidate TLB. 
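 * PG_W is one of the PTE bits that the MMU ignores, so it can be
 * updated with a single atomic operation and no TLB shootdown.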
3420 */ 3421retry: 3422 PMAP_LOCK(pmap); 3423 pde = pmap_pde(pmap, va); 3424 if ((*pde & PG_PS) != 0) { 3425 if (!wired != ((*pde & PG_W) == 0)) { 3426 if (!are_queues_locked) { 3427 are_queues_locked = TRUE; 3428 if (!mtx_trylock(&vm_page_queue_mtx)) { 3429 PMAP_UNLOCK(pmap); 3430 vm_page_lock_queues(); 3431 goto retry; 3432 } 3433 } 3434 if (!pmap_demote_pde(pmap, pde, va)) 3435 panic("pmap_change_wiring: demotion failed"); 3436 } else 3437 goto out; 3438 } 3439 pte = pmap_pde_to_pte(pde, va); 3440 if (wired && (*pte & PG_W) == 0) { 3441 pmap->pm_stats.wired_count++; 3442 atomic_set_long(pte, PG_W); 3443 } else if (!wired && (*pte & PG_W) != 0) { 3444 pmap->pm_stats.wired_count--; 3445 atomic_clear_long(pte, PG_W); 3446 } 3447out: 3448 if (are_queues_locked) 3449 vm_page_unlock_queues(); 3450 PMAP_UNLOCK(pmap); 3451} 3452 3453 3454 3455/* 3456 * Copy the range specified by src_addr/len 3457 * from the source map to the range dst_addr/len 3458 * in the destination map. 3459 * 3460 * This routine is only advisory and need not do anything. 3461 */ 3462 3463void 3464pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 3465 vm_offset_t src_addr) 3466{ 3467 vm_page_t free; 3468 vm_offset_t addr; 3469 vm_offset_t end_addr = src_addr + len; 3470 vm_offset_t va_next; 3471 3472 if (dst_addr != src_addr) 3473 return; 3474 3475 vm_page_lock_queues(); 3476 if (dst_pmap < src_pmap) { 3477 PMAP_LOCK(dst_pmap); 3478 PMAP_LOCK(src_pmap); 3479 } else { 3480 PMAP_LOCK(src_pmap); 3481 PMAP_LOCK(dst_pmap); 3482 } 3483 for (addr = src_addr; addr < end_addr; addr = va_next) { 3484 pt_entry_t *src_pte, *dst_pte; 3485 vm_page_t dstmpde, dstmpte, srcmpte; 3486 pml4_entry_t *pml4e; 3487 pdp_entry_t *pdpe; 3488 pd_entry_t srcptepaddr, *pde; 3489 3490 KASSERT(addr < UPT_MIN_ADDRESS, 3491 ("pmap_copy: invalid to pmap_copy page tables")); 3492 3493 pml4e = pmap_pml4e(src_pmap, addr); 3494 if ((*pml4e & PG_V) == 0) { 3495 va_next = (addr + NBPML4) & ~PML4MASK; 3496 if (va_next < addr) 3497 va_next = end_addr; 3498 continue; 3499 } 3500 3501 pdpe = pmap_pml4e_to_pdpe(pml4e, addr); 3502 if ((*pdpe & PG_V) == 0) { 3503 va_next = (addr + NBPDP) & ~PDPMASK; 3504 if (va_next < addr) 3505 va_next = end_addr; 3506 continue; 3507 } 3508 3509 va_next = (addr + NBPDR) & ~PDRMASK; 3510 if (va_next < addr) 3511 va_next = end_addr; 3512 3513 pde = pmap_pdpe_to_pde(pdpe, addr); 3514 srcptepaddr = *pde; 3515 if (srcptepaddr == 0) 3516 continue; 3517 3518 if (srcptepaddr & PG_PS) { 3519 dstmpde = pmap_allocpde(dst_pmap, addr, M_NOWAIT); 3520 if (dstmpde == NULL) 3521 break; 3522 pde = (pd_entry_t *) 3523 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde)); 3524 pde = &pde[pmap_pde_index(addr)]; 3525 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 3526 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & 3527 PG_PS_FRAME))) { 3528 *pde = srcptepaddr & ~PG_W; 3529 dst_pmap->pm_stats.resident_count += 3530 NBPDR / PAGE_SIZE; 3531 } else 3532 dstmpde->wire_count--; 3533 continue; 3534 } 3535 3536 srcptepaddr &= PG_FRAME; 3537 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 3538 KASSERT(srcmpte->wire_count > 0, 3539 ("pmap_copy: source page table page is unused")); 3540 3541 if (va_next > end_addr) 3542 va_next = end_addr; 3543 3544 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 3545 src_pte = &src_pte[pmap_pte_index(addr)]; 3546 dstmpte = NULL; 3547 while (addr < va_next) { 3548 pt_entry_t ptetemp; 3549 ptetemp = *src_pte; 3550 /* 3551 * we only virtual copy managed pages 3552 */ 3553 if ((ptetemp & PG_MANAGED) != 0) { 3554 if 
(dstmpte != NULL && 3555 dstmpte->pindex == pmap_pde_pindex(addr)) 3556 dstmpte->wire_count++; 3557 else if ((dstmpte = pmap_allocpte(dst_pmap, 3558 addr, M_NOWAIT)) == NULL) 3559 goto out; 3560 dst_pte = (pt_entry_t *) 3561 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 3562 dst_pte = &dst_pte[pmap_pte_index(addr)]; 3563 if (*dst_pte == 0 && 3564 pmap_try_insert_pv_entry(dst_pmap, addr, 3565 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { 3566 /* 3567 * Clear the wired, modified, and 3568 * accessed (referenced) bits 3569 * during the copy. 3570 */ 3571 *dst_pte = ptetemp & ~(PG_W | PG_M | 3572 PG_A); 3573 dst_pmap->pm_stats.resident_count++; 3574 } else { 3575 free = NULL; 3576 if (pmap_unwire_pte_hold(dst_pmap, 3577 addr, dstmpte, &free)) { 3578 pmap_invalidate_page(dst_pmap, 3579 addr); 3580 pmap_free_zero_pages(free); 3581 } 3582 goto out; 3583 } 3584 if (dstmpte->wire_count >= srcmpte->wire_count) 3585 break; 3586 } 3587 addr += PAGE_SIZE; 3588 src_pte++; 3589 } 3590 } 3591out: 3592 vm_page_unlock_queues(); 3593 PMAP_UNLOCK(src_pmap); 3594 PMAP_UNLOCK(dst_pmap); 3595} 3596 3597/* 3598 * pmap_zero_page zeros the specified hardware page by mapping 3599 * the page into KVM and using bzero to clear its contents. 3600 */ 3601void 3602pmap_zero_page(vm_page_t m) 3603{ 3604 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3605 3606 pagezero((void *)va); 3607} 3608 3609/* 3610 * pmap_zero_page_area zeros the specified hardware page by mapping 3611 * the page into KVM and using bzero to clear its contents. 3612 * 3613 * off and size may not cover an area beyond a single hardware page. 3614 */ 3615void 3616pmap_zero_page_area(vm_page_t m, int off, int size) 3617{ 3618 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3619 3620 if (off == 0 && size == PAGE_SIZE) 3621 pagezero((void *)va); 3622 else 3623 bzero((char *)va + off, size); 3624} 3625 3626/* 3627 * pmap_zero_page_idle zeros the specified hardware page by mapping 3628 * the page into KVM and using bzero to clear its contents. This 3629 * is intended to be called from the vm_pagezero process only and 3630 * outside of Giant. 3631 */ 3632void 3633pmap_zero_page_idle(vm_page_t m) 3634{ 3635 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 3636 3637 pagezero((void *)va); 3638} 3639 3640/* 3641 * pmap_copy_page copies the specified (machine independent) 3642 * page by mapping the page into virtual memory and using 3643 * bcopy to copy the page, one machine dependent page at a 3644 * time. 3645 */ 3646void 3647pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 3648{ 3649 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 3650 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 3651 3652 pagecopy((void *)src, (void *)dst); 3653} 3654 3655/* 3656 * Returns true if the pmap's pv is one of the first 3657 * 16 pvs linked to from this page. This count may 3658 * be changed upwards or downwards in the future; it 3659 * is only necessary that true be returned for a small 3660 * subset of pmaps for proper page aging. 
3661 */
3662boolean_t
3663pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
3664{
3665 struct md_page *pvh;
3666 pv_entry_t pv;
3667 int loops = 0;
3668
3669 if (m->flags & PG_FICTITIOUS)
3670 return (FALSE);
3671
3672 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3673 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3674 if (PV_PMAP(pv) == pmap) {
3675 return (TRUE);
3676 }
3677 loops++;
3678 if (loops >= 16)
3679 break;
3680 }
3681 if (loops < 16) {
3682 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3683 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
3684 if (PV_PMAP(pv) == pmap)
3685 return (TRUE);
3686 loops++;
3687 if (loops >= 16)
3688 break;
3689 }
3690 }
3691 return (FALSE);
3692}
3693
3694/*
3695 * pmap_page_wired_mappings:
3696 *
3697 * Return the number of managed mappings to the given physical page
3698 * that are wired.
3699 */
3700int
3701pmap_page_wired_mappings(vm_page_t m)
3702{
3703 int count;
3704
3705 count = 0;
3706 if ((m->flags & PG_FICTITIOUS) != 0)
3707 return (count);
3708 count = pmap_pvh_wired_mappings(&m->md, count);
3709 return (pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count));
3710}
3711
3712/*
3713 * pmap_pvh_wired_mappings:
3714 *
3715 * Return the updated number "count" of managed mappings that are wired.
3716 */
3717static int
3718pmap_pvh_wired_mappings(struct md_page *pvh, int count)
3719{
3720 pmap_t pmap;
3721 pt_entry_t *pte;
3722 pv_entry_t pv;
3723
3724 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3725 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
3726 pmap = PV_PMAP(pv);
3727 PMAP_LOCK(pmap);
3728 pte = pmap_pte(pmap, pv->pv_va);
3729 if ((*pte & PG_W) != 0)
3730 count++;
3731 PMAP_UNLOCK(pmap);
3732 }
3733 return (count);
3734}
3735
3736/*
3737 * Returns TRUE if the given page is mapped individually or as part of
3738 * a 2mpage. Otherwise, returns FALSE.
3739 */
3740boolean_t
3741pmap_page_is_mapped(vm_page_t m)
3742{
3743 struct md_page *pvh;
3744
3745 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
3746 return (FALSE);
3747 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3748 if (TAILQ_EMPTY(&m->md.pv_list)) {
3749 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3750 return (!TAILQ_EMPTY(&pvh->pv_list));
3751 } else
3752 return (TRUE);
3753}
3754
3755/*
3756 * Remove all pages from the specified address space;
3757 * this aids process exit speeds. Also, this code
3758 * is special cased for the current process only, but
3759 * can have the more generic (and slightly slower)
3760 * mode enabled. This is much faster than pmap_remove
3761 * in the case of running down an entire address space.
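 * Rather than walking the page tables, this function scans the
 * pmap's pv chunks, so its cost is proportional to the number of
 * managed mappings; wired mappings are skipped and their pv entries
 * are retained.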
3762 */ 3763void 3764pmap_remove_pages(pmap_t pmap) 3765{ 3766 pd_entry_t ptepde; 3767 pt_entry_t *pte, tpte; 3768 vm_page_t free = NULL; 3769 vm_page_t m, mpte, mt; 3770 pv_entry_t pv; 3771 struct md_page *pvh; 3772 struct pv_chunk *pc, *npc; 3773 int field, idx; 3774 int64_t bit; 3775 uint64_t inuse, bitmask; 3776 int allfree; 3777 3778 if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { 3779 printf("warning: pmap_remove_pages called with non-current pmap\n"); 3780 return; 3781 } 3782 vm_page_lock_queues(); 3783 PMAP_LOCK(pmap); 3784 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 3785 allfree = 1; 3786 for (field = 0; field < _NPCM; field++) { 3787 inuse = (~(pc->pc_map[field])) & pc_freemask[field]; 3788 while (inuse != 0) { 3789 bit = bsfq(inuse); 3790 bitmask = 1UL << bit; 3791 idx = field * 64 + bit; 3792 pv = &pc->pc_pventry[idx]; 3793 inuse &= ~bitmask; 3794 3795 pte = pmap_pdpe(pmap, pv->pv_va); 3796 ptepde = *pte; 3797 pte = pmap_pdpe_to_pde(pte, pv->pv_va); 3798 tpte = *pte; 3799 if ((tpte & (PG_PS | PG_V)) == PG_V) { 3800 ptepde = tpte; 3801 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & 3802 PG_FRAME); 3803 pte = &pte[pmap_pte_index(pv->pv_va)]; 3804 tpte = *pte & ~PG_PTE_PAT; 3805 } 3806 if ((tpte & PG_V) == 0) 3807 panic("bad pte"); 3808 3809/* 3810 * We cannot remove wired pages from a process' mapping at this time 3811 */ 3812 if (tpte & PG_W) { 3813 allfree = 0; 3814 continue; 3815 } 3816 3817 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 3818 KASSERT(m->phys_addr == (tpte & PG_FRAME), 3819 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 3820 m, (uintmax_t)m->phys_addr, 3821 (uintmax_t)tpte)); 3822 3823 KASSERT(m < &vm_page_array[vm_page_array_size], 3824 ("pmap_remove_pages: bad tpte %#jx", 3825 (uintmax_t)tpte)); 3826 3827 pte_clear(pte); 3828 3829 /* 3830 * Update the vm_page_t clean/reference bits. 
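 * For a 2mpage mapping the modified bit covers the entire 2MB
 * range, so every constituent 4KB page is dirtied.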
3831 */
3832 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3833 if ((tpte & PG_PS) != 0) {
3834 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
3835 vm_page_dirty(mt);
3836 } else
3837 vm_page_dirty(m);
3838 }
3839
3840 /* Mark free */
3841 PV_STAT(pv_entry_frees++);
3842 PV_STAT(pv_entry_spare++);
3843 pv_entry_count--;
3844 pc->pc_map[field] |= bitmask;
3845 if ((tpte & PG_PS) != 0) {
3846 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
3847 pvh = pa_to_pvh(tpte & PG_PS_FRAME);
3848 TAILQ_REMOVE(&pvh->pv_list, pv, pv_list);
3849 if (TAILQ_EMPTY(&pvh->pv_list)) {
3850 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
3851 if (TAILQ_EMPTY(&mt->md.pv_list))
3852 vm_page_flag_clear(mt, PG_WRITEABLE);
3853 }
3854 mpte = pmap_lookup_pt_page(pmap, pv->pv_va);
3855 if (mpte != NULL) {
3856 pmap_remove_pt_page(pmap, mpte);
3857 pmap->pm_stats.resident_count--;
3858 KASSERT(mpte->wire_count == NPTEPG,
3859 ("pmap_remove_pages: pte page wire count error"));
3860 mpte->wire_count = 0;
3861 pmap_add_delayed_free_list(mpte, &free, FALSE);
3862 atomic_subtract_int(&cnt.v_wire_count, 1);
3863 }
3864 } else {
3865 pmap->pm_stats.resident_count--;
3866 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3867 if (TAILQ_EMPTY(&m->md.pv_list)) {
3868 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3869 if (TAILQ_EMPTY(&pvh->pv_list))
3870 vm_page_flag_clear(m, PG_WRITEABLE);
3871 }
3872 }
3873 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
3874 }
3875 }
3876 if (allfree) {
3877 PV_STAT(pv_entry_spare -= _NPCPV);
3878 PV_STAT(pc_chunk_count--);
3879 PV_STAT(pc_chunk_frees++);
3880 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3881 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3882 dump_drop_page(m->phys_addr);
3883 vm_page_unwire(m, 0);
3884 vm_page_free(m);
3885 }
3886 }
3887 pmap_invalidate_all(pmap);
3888 vm_page_unlock_queues();
3889 PMAP_UNLOCK(pmap);
3890 pmap_free_zero_pages(free);
3891}
3892
3893/*
3894 * pmap_is_modified:
3895 *
3896 * Return whether or not the specified physical page was modified
3897 * in any physical maps.
3898 */
3899boolean_t
3900pmap_is_modified(vm_page_t m)
3901{
3902
3903 if (m->flags & PG_FICTITIOUS)
3904 return (FALSE);
3905 if (pmap_is_modified_pvh(&m->md))
3906 return (TRUE);
3907 return (pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
3908}
3909
3910/*
3911 * Returns TRUE if any of the given mappings were used to modify
3912 * physical memory. Otherwise, returns FALSE. Both page and 2mpage
3913 * mappings are supported.
3914 */
3915static boolean_t
3916pmap_is_modified_pvh(struct md_page *pvh)
3917{
3918 pv_entry_t pv;
3919 pt_entry_t *pte;
3920 pmap_t pmap;
3921 boolean_t rv;
3922
3923 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3924 rv = FALSE;
3925 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
3926 pmap = PV_PMAP(pv);
3927 PMAP_LOCK(pmap);
3928 pte = pmap_pte(pmap, pv->pv_va);
3929 rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
3930 PMAP_UNLOCK(pmap);
3931 if (rv)
3932 break;
3933 }
3934 return (rv);
3935}
3936
3937/*
3938 * pmap_is_prefaultable:
3939 *
3940 * Return whether or not the specified virtual address is eligible
3941 * for prefault.
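 *	A 4KB mapping is prefaultable when its page table already
 *	exists, that is, the PDE is valid and does not map a 2MB
 *	page, and the PTE itself is still invalid.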
	pmap_invalidate_all(pmap);
	vm_page_unlock_queues();
	PMAP_UNLOCK(pmap);
	pmap_free_zero_pages(free);
}

/*
 * pmap_is_modified:
 *
 *	Return whether or not the specified physical page was modified
 *	in any physical maps.
 */
boolean_t
pmap_is_modified(vm_page_t m)
{

	if (m->flags & PG_FICTITIOUS)
		return (FALSE);
	if (pmap_is_modified_pvh(&m->md))
		return (TRUE);
	return (pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))));
}

/*
 * Returns TRUE if any of the given mappings were used to modify
 * physical memory.  Otherwise, returns FALSE.  Both page and 2mpage
 * mappings are supported.
 */
static boolean_t
pmap_is_modified_pvh(struct md_page *pvh)
{
	pv_entry_t pv;
	pt_entry_t *pte;
	pmap_t pmap;
	boolean_t rv;

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	rv = FALSE;
	TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pte = pmap_pte(pmap, pv->pv_va);
		rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
		PMAP_UNLOCK(pmap);
		if (rv)
			break;
	}
	return (rv);
}

/*
 * pmap_is_prefaultable:
 *
 *	Return whether or not the specified virtual address is eligible
 *	for prefault.
 */
boolean_t
pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
{
	pd_entry_t *pde;
	pt_entry_t *pte;
	boolean_t rv;

	rv = FALSE;
	PMAP_LOCK(pmap);
	pde = pmap_pde(pmap, addr);
	if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
		pte = pmap_pde_to_pte(pde, addr);
		rv = (*pte & PG_V) == 0;
	}
	PMAP_UNLOCK(pmap);
	return (rv);
}

/*
 * Clear the write and modified bits in each of the given page's mappings.
 */
void
pmap_remove_write(vm_page_t m)
{
	struct md_page *pvh;
	pmap_t pmap;
	pv_entry_t next_pv, pv;
	pd_entry_t *pde;
	pt_entry_t oldpte, *pte;
	vm_offset_t va;

	if ((m->flags & PG_FICTITIOUS) != 0 ||
	    (m->flags & PG_WRITEABLE) == 0)
		return;
	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
		va = pv->pv_va;
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, va);
		if ((*pde & PG_RW) != 0)
			(void)pmap_demote_pde(pmap, pde, va);
		PMAP_UNLOCK(pmap);
	}
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
		    " a 2mpage in page %p's pv list", m));
		pte = pmap_pde_to_pte(pde, pv->pv_va);
retry:
		oldpte = *pte;
		if (oldpte & PG_RW) {
			if (!atomic_cmpset_long(pte, oldpte, oldpte &
			    ~(PG_RW | PG_M)))
				goto retry;
			if ((oldpte & PG_M) != 0)
				vm_page_dirty(m);
			pmap_invalidate_page(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
	vm_page_flag_clear(m, PG_WRITEABLE);
}

/*
 * pmap_ts_referenced:
 *
 *	Return a count of reference bits for a page, clearing those bits.
 *	It is not necessary for every reference bit to be cleared, but it
 *	is necessary that 0 only be returned when there are truly no
 *	reference bits set.
 *
 *	XXX: The exact number of bits to check and clear is a matter that
 *	should be tested and standardized at some point in the future for
 *	optimal aging of shared pages.
 */
int
pmap_ts_referenced(vm_page_t m)
{
	struct md_page *pvh;
	pv_entry_t pv, pvf, pvn;
	pmap_t pmap;
	pd_entry_t oldpde, *pde;
	pt_entry_t *pte;
	vm_offset_t va;
	int rtval = 0;

	if (m->flags & PG_FICTITIOUS)
		return (rtval);
	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
		va = pv->pv_va;
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, va);
		oldpde = *pde;
		if ((oldpde & PG_A) != 0) {
			if (pmap_demote_pde(pmap, pde, va)) {
				if ((oldpde & PG_W) == 0) {
					/*
					 * Remove the mapping to a single page
					 * so that a subsequent access may
					 * repromote.  Since the underlying
					 * page table page is fully populated,
					 * this removal never frees a page
					 * table page.
					 */
					va += VM_PAGE_TO_PHYS(m) - (oldpde &
					    PG_PS_FRAME);
					pmap_remove_page(pmap, va, pde, NULL);
					rtval++;
					if (rtval > 4) {
						PMAP_UNLOCK(pmap);
						return (rtval);
					}
				}
			}
		}
		PMAP_UNLOCK(pmap);
	}
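	/*
	 * Scan the 4KB mappings, rotating each examined pv entry to the
	 * tail of the list so that successive calls sample different
	 * mappings of a widely shared page.  Stop once more than four
	 * reference bits have been counted.
	 */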
	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
		pvf = pv;
		do {
			pvn = TAILQ_NEXT(pv, pv_list);
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
			pmap = PV_PMAP(pv);
			PMAP_LOCK(pmap);
			pde = pmap_pde(pmap, pv->pv_va);
			KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:"
			    " found a 2mpage in page %p's pv list", m));
			pte = pmap_pde_to_pte(pde, pv->pv_va);
			if ((*pte & PG_A) != 0) {
				atomic_clear_long(pte, PG_A);
				pmap_invalidate_page(pmap, pv->pv_va);
				rtval++;
				if (rtval > 4)
					pvn = NULL;
			}
			PMAP_UNLOCK(pmap);
		} while ((pv = pvn) != NULL && pv != pvf);
	}
	return (rtval);
}

/*
 * Clear the modify bits on the specified physical page.
 */
void
pmap_clear_modify(vm_page_t m)
{
	struct md_page *pvh;
	pmap_t pmap;
	pv_entry_t next_pv, pv;
	pd_entry_t oldpde, *pde;
	pt_entry_t oldpte, *pte;
	vm_offset_t va;

	if ((m->flags & PG_FICTITIOUS) != 0)
		return;
	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
		va = pv->pv_va;
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, va);
		oldpde = *pde;
		if ((oldpde & PG_RW) != 0) {
			if (pmap_demote_pde(pmap, pde, va)) {
				if ((oldpde & PG_W) == 0) {
					/*
					 * Write protect the mapping to a
					 * single page so that a subsequent
					 * write access may repromote.
					 */
					va += VM_PAGE_TO_PHYS(m) - (oldpde &
					    PG_PS_FRAME);
					pte = pmap_pde_to_pte(pde, va);
					oldpte = *pte;
					if ((oldpte & PG_V) != 0) {
						while (!atomic_cmpset_long(pte,
						    oldpte,
						    oldpte & ~(PG_M | PG_RW)))
							oldpte = *pte;
						vm_page_dirty(m);
						pmap_invalidate_page(pmap, va);
					}
				}
			}
		}
		PMAP_UNLOCK(pmap);
	}
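	/*
	 * The 2MB mappings were handled above; now clear PG_M on each
	 * remaining 4KB mapping that is both modified and writeable.
	 */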
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
		    " a 2mpage in page %p's pv list", m));
		pte = pmap_pde_to_pte(pde, pv->pv_va);
		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
			atomic_clear_long(pte, PG_M);
			pmap_invalidate_page(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
}

/*
 * pmap_clear_reference:
 *
 *	Clear the reference bit on the specified physical page.
 */
void
pmap_clear_reference(vm_page_t m)
{
	struct md_page *pvh;
	pmap_t pmap;
	pv_entry_t next_pv, pv;
	pd_entry_t oldpde, *pde;
	pt_entry_t *pte;
	vm_offset_t va;

	if ((m->flags & PG_FICTITIOUS) != 0)
		return;
	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
		va = pv->pv_va;
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, va);
		oldpde = *pde;
		if ((oldpde & PG_A) != 0) {
			if (pmap_demote_pde(pmap, pde, va)) {
				/*
				 * Remove the mapping to a single page so
				 * that a subsequent access may repromote.
				 * Since the underlying page table page is
				 * fully populated, this removal never frees
				 * a page table page.
				 */
				va += VM_PAGE_TO_PHYS(m) - (oldpde &
				    PG_PS_FRAME);
				pmap_remove_page(pmap, va, pde, NULL);
			}
		}
		PMAP_UNLOCK(pmap);
	}
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pde = pmap_pde(pmap, pv->pv_va);
		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found"
		    " a 2mpage in page %p's pv list", m));
		pte = pmap_pde_to_pte(pde, pv->pv_va);
		if (*pte & PG_A) {
			atomic_clear_long(pte, PG_A);
			pmap_invalidate_page(pmap, pv->pv_va);
		}
		PMAP_UNLOCK(pmap);
	}
}

/*
 * Miscellaneous support routines follow
 */

/* Adjust the cache mode for a 4KB page mapped via a PTE. */
static __inline void
pmap_pte_attr(pt_entry_t *pte, int cache_bits)
{
	u_int opte, npte;

	/*
	 * The cache mode bits are all in the low 32-bits of the
	 * PTE, so we can just spin on updating the low 32-bits.
	 */
	do {
		opte = *(u_int *)pte;
		npte = opte & ~PG_PTE_CACHE;
		npte |= cache_bits;
	} while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
}

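/*
 * The same 32-bit spin works for PDEs below: the cache-control bits,
 * like PG_A and PG_M, live in the entry's low 32 bits, so a 32-bit
 * compare-and-swap leaves the upper half (high physical-address bits
 * and PG_NX) untouched and retries if the processor concurrently sets
 * the accessed or modified bit.
 */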
/* Adjust the cache mode for a 2MB page mapped via a PDE. */
static __inline void
pmap_pde_attr(pd_entry_t *pde, int cache_bits)
{
	u_int opde, npde;

	/*
	 * The cache mode bits are all in the low 32-bits of the
	 * PDE, so we can just spin on updating the low 32-bits.
	 */
	do {
		opde = *(u_int *)pde;
		npde = opde & ~PG_PDE_CACHE;
		npde |= cache_bits;
	} while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
}

/*
 * Map a set of physical memory pages into the kernel virtual
 * address space.  Return a pointer to where it is mapped.  This
 * routine is intended to be used for mapping device memory,
 * NOT real memory.
 */
void *
pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
{
	vm_offset_t va, tmpva, offset;

	/*
	 * If the specified range of physical addresses fits within the direct
	 * map window, use the direct map.
	 */
	if (pa < dmaplimit && pa + size < dmaplimit) {
		va = PHYS_TO_DMAP(pa);
		if (!pmap_change_attr(va, size, mode))
			return ((void *)va);
	}
	offset = pa & PAGE_MASK;
	size = roundup(offset + size, PAGE_SIZE);
	va = kmem_alloc_nofault(kernel_map, size);
	if (!va)
		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
	pa = trunc_page(pa);
	for (tmpva = va; size > 0; ) {
		pmap_kenter_attr(tmpva, pa, mode);
		size -= PAGE_SIZE;
		tmpva += PAGE_SIZE;
		pa += PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, va, tmpva);
	pmap_invalidate_cache();
	return ((void *)(va + offset));
}

void *
pmap_mapdev(vm_paddr_t pa, vm_size_t size)
{

	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
}

void *
pmap_mapbios(vm_paddr_t pa, vm_size_t size)
{

	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
}

void
pmap_unmapdev(vm_offset_t va, vm_size_t size)
{
	vm_offset_t base, offset, tmpva;

	/* If we gave out a direct map region in pmap_mapdev, do nothing */
	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
		return;
	base = trunc_page(va);
	offset = va & PAGE_MASK;
	size = roundup(offset + size, PAGE_SIZE);
	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
		pmap_kremove(tmpva);
	pmap_invalidate_range(kernel_pmap, va, tmpva);
	kmem_free(kernel_map, base, size);
}

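/*
 * Example (a sketch, not taken from this file): a driver wanting a
 * write-combined mapping of a frame buffer at physical address fb_pa
 * (an illustrative name) might do:
 *
 *	va = pmap_mapdev_attr(fb_pa, fb_size, PAT_WRITE_COMBINING);
 *	...
 *	pmap_unmapdev((vm_offset_t)va, fb_size);
 *
 * When fb_pa lies below dmaplimit, the returned pointer is simply the
 * direct map address, with its attributes changed in place, and
 * pmap_unmapdev() is then a no-op.
 */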
/*
 * Tries to demote a 1GB page mapping.
 */
static boolean_t
pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
{
	pdp_entry_t newpdpe, oldpdpe;
	pd_entry_t *firstpde, newpde, *pde;
	vm_paddr_t mpdepa;
	vm_page_t mpde;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	oldpdpe = *pdpe;
	KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
	    ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
	if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
		CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
		    " in pmap %p", va, pmap);
		return (FALSE);
	}
	mpdepa = VM_PAGE_TO_PHYS(mpde);
	firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa);
	newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
	KASSERT((oldpdpe & PG_A) != 0,
	    ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
	KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
	    ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
	newpde = oldpdpe;

	/*
	 * Initialize the page directory page.
	 */
	for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
		*pde = newpde;
		newpde += NBPDR;
	}

	/*
	 * Demote the mapping.
	 */
	*pdpe = newpdpe;

	/*
	 * Invalidate a stale recursive mapping of the page directory page.
	 */
	pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));

	pmap_pdpe_demotions++;
	CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
	    " in pmap %p", va, pmap);
	return (TRUE);
}

/*
 * Changes the specified virtual address range's memory type to that given by
 * the parameter "mode".  The specified virtual address range must be
 * completely contained within either the direct map or the kernel map.  If
 * the virtual address range is contained within the kernel map, then the
 * memory type for each of the corresponding ranges of the direct map is also
 * changed.  (The corresponding ranges of the direct map are those ranges that
 * map the same physical pages as the specified virtual address range.)  These
 * changes to the direct map are necessary because Intel describes the
 * behavior of their processors as "undefined" if two or more mappings to the
 * same physical page have different memory types.
 *
 * Returns zero if the change completed successfully, and either EINVAL or
 * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
 * of the virtual address range was not mapped, and ENOMEM is returned if
 * there was insufficient memory available to complete the change.  In the
 * latter case, the memory type may have been changed on some part of the
 * virtual address range or the direct map.
 */
int
pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
{
	int error;

	PMAP_LOCK(kernel_pmap);
	error = pmap_change_attr_locked(va, size, mode);
	PMAP_UNLOCK(kernel_pmap);
	return (error);
}

static int
pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
{
	vm_offset_t base, offset, tmpva;
	vm_paddr_t pa_start, pa_end;
	pdp_entry_t *pdpe;
	pd_entry_t *pde;
	pt_entry_t *pte;
	int cache_bits_pte, cache_bits_pde, error;
	boolean_t changed;

	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
	base = trunc_page(va);
	offset = va & PAGE_MASK;
	size = roundup(offset + size, PAGE_SIZE);

	/*
	 * Only supported on kernel virtual addresses, including the direct
	 * map but excluding the recursive map.
	 */
	if (base < DMAP_MIN_ADDRESS)
		return (EINVAL);

	cache_bits_pde = cache_bits_pte = -1;
	changed = FALSE;

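	/*
	 * The change is made in two passes.  The first pass verifies that
	 * every page in the range is mapped, demoting 1GB and 2MB pages
	 * only where a partial change requires it.  The second pass
	 * rewrites the cache bits and, for ranges within the kernel map,
	 * recursively applies the same change to the direct map, batching
	 * physically contiguous pages into as few recursive calls as
	 * possible.
	 */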
	/*
	 * Pages that aren't mapped aren't supported.  Also break down 2MB
	 * pages into 4KB pages if required.
	 */
	for (tmpva = base; tmpva < base + size; ) {
		pdpe = pmap_pdpe(kernel_pmap, tmpva);
		if (*pdpe == 0)
			return (EINVAL);
		if (*pdpe & PG_PS) {
			/*
			 * If the current 1GB page already has the required
			 * memory type, then we need not demote this page.
			 * Just increment tmpva to the next 1GB page frame.
			 */
			if (cache_bits_pde < 0)
				cache_bits_pde = pmap_cache_bits(mode, 1);
			if ((*pdpe & PG_PDE_CACHE) == cache_bits_pde) {
				tmpva = trunc_1gpage(tmpva) + NBPDP;
				continue;
			}

			/*
			 * If the current offset aligns with a 1GB page frame
			 * and there is at least 1GB left within the range,
			 * then we need not break down this page into 2MB
			 * pages.
			 */
			if ((tmpva & PDPMASK) == 0 &&
			    tmpva + PDPMASK < base + size) {
				tmpva += NBPDP;
				continue;
			}
			if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
				return (ENOMEM);
		}
		pde = pmap_pdpe_to_pde(pdpe, tmpva);
		if (*pde == 0)
			return (EINVAL);
		if (*pde & PG_PS) {
			/*
			 * If the current 2MB page already has the required
			 * memory type, then we need not demote this page.
			 * Just increment tmpva to the next 2MB page frame.
			 */
			if (cache_bits_pde < 0)
				cache_bits_pde = pmap_cache_bits(mode, 1);
			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
				tmpva = trunc_2mpage(tmpva) + NBPDR;
				continue;
			}

			/*
			 * If the current offset aligns with a 2MB page frame
			 * and there is at least 2MB left within the range,
			 * then we need not break down this page into 4KB
			 * pages.
			 */
			if ((tmpva & PDRMASK) == 0 &&
			    tmpva + PDRMASK < base + size) {
				tmpva += NBPDR;
				continue;
			}
			if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
				return (ENOMEM);
		}
		pte = pmap_pde_to_pte(pde, tmpva);
		if (*pte == 0)
			return (EINVAL);
		tmpva += PAGE_SIZE;
	}
	error = 0;

	/*
	 * Ok, all the pages exist, so run through them updating their
	 * cache mode if required.
	 */
	pa_start = pa_end = 0;
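	/*
	 * pa_start == pa_end means that no run of physical pages is
	 * currently open; an open run is flushed to the direct map as soon
	 * as the next page examined is not physically contiguous with it.
	 */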
	for (tmpva = base; tmpva < base + size; ) {
		pdpe = pmap_pdpe(kernel_pmap, tmpva);
		if (*pdpe & PG_PS) {
			if (cache_bits_pde < 0)
				cache_bits_pde = pmap_cache_bits(mode, 1);
			if ((*pdpe & PG_PDE_CACHE) != cache_bits_pde) {
				pmap_pde_attr(pdpe, cache_bits_pde);
				if (!changed)
					changed = TRUE;
			}
			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
				if (pa_start == pa_end) {
					/* Start physical address run. */
					pa_start = *pdpe & PG_PS_FRAME;
					pa_end = pa_start + NBPDP;
				} else if (pa_end == (*pdpe & PG_PS_FRAME))
					pa_end += NBPDP;
				else {
					/* Run ended, update direct map. */
					error = pmap_change_attr_locked(
					    PHYS_TO_DMAP(pa_start),
					    pa_end - pa_start, mode);
					if (error != 0)
						break;
					/* Start physical address run. */
					pa_start = *pdpe & PG_PS_FRAME;
					pa_end = pa_start + NBPDP;
				}
			}
			tmpva = trunc_1gpage(tmpva) + NBPDP;
			continue;
		}
		pde = pmap_pdpe_to_pde(pdpe, tmpva);
		if (*pde & PG_PS) {
			if (cache_bits_pde < 0)
				cache_bits_pde = pmap_cache_bits(mode, 1);
			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
				pmap_pde_attr(pde, cache_bits_pde);
				if (!changed)
					changed = TRUE;
			}
			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
				if (pa_start == pa_end) {
					/* Start physical address run. */
					pa_start = *pde & PG_PS_FRAME;
					pa_end = pa_start + NBPDR;
				} else if (pa_end == (*pde & PG_PS_FRAME))
					pa_end += NBPDR;
				else {
					/* Run ended, update direct map. */
					error = pmap_change_attr_locked(
					    PHYS_TO_DMAP(pa_start),
					    pa_end - pa_start, mode);
					if (error != 0)
						break;
					/* Start physical address run. */
					pa_start = *pde & PG_PS_FRAME;
					pa_end = pa_start + NBPDR;
				}
			}
			tmpva = trunc_2mpage(tmpva) + NBPDR;
		} else {
			if (cache_bits_pte < 0)
				cache_bits_pte = pmap_cache_bits(mode, 0);
			pte = pmap_pde_to_pte(pde, tmpva);
			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
				pmap_pte_attr(pte, cache_bits_pte);
				if (!changed)
					changed = TRUE;
			}
			if (tmpva >= VM_MIN_KERNEL_ADDRESS) {
				if (pa_start == pa_end) {
					/* Start physical address run. */
					pa_start = *pte & PG_FRAME;
					pa_end = pa_start + PAGE_SIZE;
				} else if (pa_end == (*pte & PG_FRAME))
					pa_end += PAGE_SIZE;
				else {
					/* Run ended, update direct map. */
					error = pmap_change_attr_locked(
					    PHYS_TO_DMAP(pa_start),
					    pa_end - pa_start, mode);
					if (error != 0)
						break;
					/* Start physical address run. */
					pa_start = *pte & PG_FRAME;
					pa_end = pa_start + PAGE_SIZE;
				}
			}
			tmpva += PAGE_SIZE;
		}
	}
	if (error == 0 && pa_start != pa_end)
		error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
		    pa_end - pa_start, mode);

	/*
	 * Flush CPU caches if required to make sure any data isn't cached that
	 * shouldn't be, etc.
	 */
	if (changed) {
		pmap_invalidate_range(kernel_pmap, base, tmpva);
		pmap_invalidate_cache();
	}
	return (error);
}

/*
 * perform the pmap work for mincore
 */
int
pmap_mincore(pmap_t pmap, vm_offset_t addr)
{
	pd_entry_t *pdep;
	pt_entry_t pte;
	vm_paddr_t pa;
	vm_page_t m;
	int val = 0;

	PMAP_LOCK(pmap);
	pdep = pmap_pde(pmap, addr);
	if (pdep != NULL && (*pdep & PG_V)) {
		if (*pdep & PG_PS) {
			pte = *pdep;
			val = MINCORE_SUPER;
			/* Compute the physical address of the 4KB page. */
			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
			    PG_FRAME;
		} else {
			pte = *pmap_pde_to_pte(pdep, addr);
			pa = pte & PG_FRAME;
		}
	} else {
		pte = 0;
		pa = 0;
	}
	PMAP_UNLOCK(pmap);

	if (pte != 0) {
		val |= MINCORE_INCORE;
		if ((pte & PG_MANAGED) == 0)
			return (val);

		m = PHYS_TO_VM_PAGE(pa);

		/*
		 * Modified by us
		 */
		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
		else {
			/*
			 * Modified by someone else
			 */
			vm_page_lock_queues();
			if (m->dirty || pmap_is_modified(m))
				val |= MINCORE_MODIFIED_OTHER;
			vm_page_unlock_queues();
		}
		/*
		 * Referenced by us
		 */
		if (pte & PG_A)
			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
		else {
			/*
			 * Referenced by someone else
			 */
			vm_page_lock_queues();
			if ((m->flags & PG_REFERENCED) ||
			    pmap_ts_referenced(m)) {
				val |= MINCORE_REFERENCED_OTHER;
				vm_page_flag_set(m, PG_REFERENCED);
			}
			vm_page_unlock_queues();
		}
	}
	return (val);
}

void
pmap_activate(struct thread *td)
{
	pmap_t pmap, oldpmap;
	u_int64_t cr3;

	critical_enter();
	pmap = vmspace_pmap(td->td_proc->p_vmspace);
	oldpmap = PCPU_GET(curpmap);
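	/*
	 * Update the pmaps' CPU-active sets before loading %cr3 so that
	 * TLB shootdowns are directed at the right processors; the
	 * critical section prevents migration between these steps.
	 */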
#ifdef SMP
	if (oldpmap)				/* XXX FIXME */
		atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
	atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
#else
	if (oldpmap)				/* XXX FIXME */
		oldpmap->pm_active &= ~PCPU_GET(cpumask);
	pmap->pm_active |= PCPU_GET(cpumask);
#endif
	cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4);
	td->td_pcb->pcb_cr3 = cr3;
	load_cr3(cr3);
	critical_exit();
}

/*
 * Increase the starting virtual address of the given mapping if a
 * different alignment might result in more superpage mappings.
 */
void
pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
    vm_offset_t *addr, vm_size_t size)
{
	vm_offset_t superpage_offset;

	if (size < NBPDR)
		return;
	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
		offset += ptoa(object->pg_color);
	superpage_offset = offset & PDRMASK;
	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
	    (*addr & PDRMASK) == superpage_offset)
		return;
	if ((*addr & PDRMASK) < superpage_offset)
		*addr = (*addr & ~PDRMASK) + superpage_offset;
	else
		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
}
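/*
 * Worked example for pmap_align_superpage(): with NBPDR = 2MB and
 * superpage_offset = 0x1000, a hint of *addr = 0x20000000 for a 4MB
 * mapping becomes 0x20001000, so that offsets within the object and
 * virtual addresses share the same position within a 2MB frame and
 * the mapping remains eligible for promotion.
 */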