sys/amd64/amd64/pmap.c — FreeBSD head, revision 252646
1/*- 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * Copyright (c) 2003 Peter Wemm 9 * All rights reserved. 10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 11 * All rights reserved. 12 * 13 * This code is derived from software contributed to Berkeley by 14 * the Systems Programming Group of the University of Utah Computer 15 * Science Department and William Jolitz of UUNET Technologies Inc. 16 * 17 * Redistribution and use in source and binary forms, with or without 18 * modification, are permitted provided that the following conditions 19 * are met: 20 * 1. Redistributions of source code must retain the above copyright 21 * notice, this list of conditions and the following disclaimer. 22 * 2. Redistributions in binary form must reproduce the above copyright 23 * notice, this list of conditions and the following disclaimer in the 24 * documentation and/or other materials provided with the distribution. 25 * 3. All advertising materials mentioning features or use of this software 26 * must display the following acknowledgement: 27 * This product includes software developed by the University of 28 * California, Berkeley and its contributors. 29 * 4. Neither the name of the University nor the names of its contributors 30 * may be used to endorse or promote products derived from this software 31 * without specific prior written permission. 32 * 33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 36 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 43 * SUCH DAMAGE. 44 * 45 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 46 */ 47/*- 48 * Copyright (c) 2003 Networks Associates Technology, Inc. 49 * All rights reserved. 50 * 51 * This software was developed for the FreeBSD Project by Jake Burkholder, 52 * Safeport Network Services, and Network Associates Laboratories, the 53 * Security Research Division of Network Associates, Inc. under 54 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 55 * CHATS research program. 56 * 57 * Redistribution and use in source and binary forms, with or without 58 * modification, are permitted provided that the following conditions 59 * are met: 60 * 1. Redistributions of source code must retain the above copyright 61 * notice, this list of conditions and the following disclaimer. 62 * 2. Redistributions in binary form must reproduce the above copyright 63 * notice, this list of conditions and the following disclaimer in the 64 * documentation and/or other materials provided with the distribution. 
65 * 66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 67 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 68 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 69 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 70 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 71 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 72 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 73 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 74 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 75 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 76 * SUCH DAMAGE. 77 */ 78 79#include <sys/cdefs.h> 80__FBSDID("$FreeBSD: head/sys/amd64/amd64/pmap.c 252646 2013-07-03 23:21:25Z neel $"); 81 82/* 83 * Manages physical address maps. 84 * 85 * Since the information managed by this module is 86 * also stored by the logical address mapping module, 87 * this module may throw away valid virtual-to-physical 88 * mappings at almost any time. However, invalidations 89 * of virtual-to-physical mappings must be done as 90 * requested. 91 * 92 * In order to cope with hardware architectures which 93 * make virtual-to-physical map invalidates expensive, 94 * this module may delay invalidate or reduced protection 95 * operations until such time as they are actually 96 * necessary. This module is given full information as 97 * to which processors are currently using which maps, 98 * and to when physical maps must be made correct. 99 */ 100 101#include "opt_pmap.h" 102#include "opt_vm.h" 103 104#include <sys/param.h> 105#include <sys/bus.h> 106#include <sys/systm.h> 107#include <sys/kernel.h> 108#include <sys/ktr.h> 109#include <sys/lock.h> 110#include <sys/malloc.h> 111#include <sys/mman.h> 112#include <sys/mutex.h> 113#include <sys/proc.h> 114#include <sys/rwlock.h> 115#include <sys/sx.h> 116#include <sys/vmmeter.h> 117#include <sys/sched.h> 118#include <sys/sysctl.h> 119#ifdef SMP 120#include <sys/smp.h> 121#else 122#include <sys/cpuset.h> 123#endif 124 125#include <vm/vm.h> 126#include <vm/vm_param.h> 127#include <vm/vm_kern.h> 128#include <vm/vm_page.h> 129#include <vm/vm_map.h> 130#include <vm/vm_object.h> 131#include <vm/vm_extern.h> 132#include <vm/vm_pageout.h> 133#include <vm/vm_pager.h> 134#include <vm/vm_radix.h> 135#include <vm/vm_reserv.h> 136#include <vm/uma.h> 137 138#include <machine/intr_machdep.h> 139#include <machine/apicvar.h> 140#include <machine/cpu.h> 141#include <machine/cputypes.h> 142#include <machine/md_var.h> 143#include <machine/pcb.h> 144#include <machine/specialreg.h> 145#ifdef SMP 146#include <machine/smp.h> 147#endif 148 149#if !defined(DIAGNOSTIC) 150#ifdef __GNUC_GNU_INLINE__ 151#define PMAP_INLINE __attribute__((__gnu_inline__)) inline 152#else 153#define PMAP_INLINE extern inline 154#endif 155#else 156#define PMAP_INLINE 157#endif 158 159#ifdef PV_STATS 160#define PV_STAT(x) do { x ; } while (0) 161#else 162#define PV_STAT(x) do { } while (0) 163#endif 164 165#define pa_index(pa) ((pa) >> PDRSHIFT) 166#define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 167 168#define NPV_LIST_LOCKS MAXCPU 169 170#define PHYS_TO_PV_LIST_LOCK(pa) \ 171 (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) 172 173#define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ 174 struct rwlock **_lockp = (lockp); \ 175 struct rwlock *_new_lock; \ 176 \ 177 _new_lock = 
PHYS_TO_PV_LIST_LOCK(pa); \ 178 if (_new_lock != *_lockp) { \ 179 if (*_lockp != NULL) \ 180 rw_wunlock(*_lockp); \ 181 *_lockp = _new_lock; \ 182 rw_wlock(*_lockp); \ 183 } \ 184} while (0) 185 186#define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 187 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) 188 189#define RELEASE_PV_LIST_LOCK(lockp) do { \ 190 struct rwlock **_lockp = (lockp); \ 191 \ 192 if (*_lockp != NULL) { \ 193 rw_wunlock(*_lockp); \ 194 *_lockp = NULL; \ 195 } \ 196} while (0) 197 198#define VM_PAGE_TO_PV_LIST_LOCK(m) \ 199 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) 200 201struct pmap kernel_pmap_store; 202 203vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 204vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 205 206int nkpt; 207SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, 208 "Number of kernel page table pages allocated on bootup"); 209 210static int ndmpdp; 211static vm_paddr_t dmaplimit; 212vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 213pt_entry_t pg_nx; 214 215static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); 216 217static int pat_works = 1; 218SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1, 219 "Is page attribute table fully functional?"); 220 221static int pg_ps_enabled = 1; 222SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0, 223 "Are large page mappings enabled?"); 224 225#define PAT_INDEX_SIZE 8 226static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ 227 228static u_int64_t KPTphys; /* phys addr of kernel level 1 */ 229static u_int64_t KPDphys; /* phys addr of kernel level 2 */ 230u_int64_t KPDPphys; /* phys addr of kernel level 3 */ 231u_int64_t KPML4phys; /* phys addr of kernel level 4 */ 232 233static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ 234static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ 235 236static struct rwlock_padalign pvh_global_lock; 237 238/* 239 * Data for the pv entry allocation mechanism 240 */ 241static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 242static struct mtx pv_chunks_mutex; 243static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; 244static struct md_page *pv_table; 245 246/* 247 * All those kernel PT submaps that BSD is so fond of 248 */ 249pt_entry_t *CMAP1 = 0; 250caddr_t CADDR1 = 0; 251 252/* 253 * Crashdump maps. 
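 * crashdumpmap reserves MAXDUMPPGS pages of kernel VA (see the SYSMAP()
 * call in pmap_bootstrap() below); as far as this file shows, the intent
 * is to give the crash dump code a scratch window for mapping arbitrary
 * physical pages while a dump is being written.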
254 */ 255static caddr_t crashdumpmap; 256 257static void free_pv_chunk(struct pv_chunk *pc); 258static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 259static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); 260static int popcnt_pc_map_elem(uint64_t elem); 261static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 262static void reserve_pv_entries(pmap_t pmap, int needed, 263 struct rwlock **lockp); 264static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 265 struct rwlock **lockp); 266static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 267 struct rwlock **lockp); 268static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 269 struct rwlock **lockp); 270static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 271static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 272 vm_offset_t va); 273static int pmap_pvh_wired_mappings(struct md_page *pvh, int count); 274 275static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode); 276static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 277static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, 278 vm_offset_t va, struct rwlock **lockp); 279static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, 280 vm_offset_t va); 281static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, 282 vm_prot_t prot, struct rwlock **lockp); 283static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 284 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); 285static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); 286static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); 287static boolean_t pmap_is_modified_pvh(struct md_page *pvh); 288static boolean_t pmap_is_referenced_pvh(struct md_page *pvh); 289static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); 290static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va); 291static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); 292static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 293 struct rwlock **lockp); 294static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, 295 vm_prot_t prot); 296static void pmap_pte_attr(pt_entry_t *pte, int cache_bits); 297static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 298 vm_page_t *free, struct rwlock **lockp); 299static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, 300 vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free, 301 struct rwlock **lockp); 302static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte); 303static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 304 vm_page_t *free); 305static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 306 vm_page_t m, struct rwlock **lockp); 307static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 308 pd_entry_t newpde); 309static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); 310 311static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, 312 struct rwlock **lockp); 313static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, 314 struct rwlock **lockp); 315static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, 316 struct rwlock **lockp); 317 318static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, 319 vm_page_t *free); 320static int pmap_unuse_pt(pmap_t, vm_offset_t, 
pd_entry_t, vm_page_t *); 321static vm_offset_t pmap_kmem_choose(vm_offset_t addr); 322 323/* 324 * Move the kernel virtual free pointer to the next 325 * 2MB. This is used to help improve performance 326 * by using a large (2MB) page for much of the kernel 327 * (.text, .data, .bss) 328 */ 329static vm_offset_t 330pmap_kmem_choose(vm_offset_t addr) 331{ 332 vm_offset_t newaddr = addr; 333 334 newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); 335 return (newaddr); 336} 337 338/********************/ 339/* Inline functions */ 340/********************/ 341 342/* Return a non-clipped PD index for a given VA */ 343static __inline vm_pindex_t 344pmap_pde_pindex(vm_offset_t va) 345{ 346 return (va >> PDRSHIFT); 347} 348 349 350/* Return various clipped indexes for a given VA */ 351static __inline vm_pindex_t 352pmap_pte_index(vm_offset_t va) 353{ 354 355 return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); 356} 357 358static __inline vm_pindex_t 359pmap_pde_index(vm_offset_t va) 360{ 361 362 return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); 363} 364 365static __inline vm_pindex_t 366pmap_pdpe_index(vm_offset_t va) 367{ 368 369 return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); 370} 371 372static __inline vm_pindex_t 373pmap_pml4e_index(vm_offset_t va) 374{ 375 376 return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); 377} 378 379/* Return a pointer to the PML4 slot that corresponds to a VA */ 380static __inline pml4_entry_t * 381pmap_pml4e(pmap_t pmap, vm_offset_t va) 382{ 383 384 return (&pmap->pm_pml4[pmap_pml4e_index(va)]); 385} 386 387/* Return a pointer to the PDP slot that corresponds to a VA */ 388static __inline pdp_entry_t * 389pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) 390{ 391 pdp_entry_t *pdpe; 392 393 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); 394 return (&pdpe[pmap_pdpe_index(va)]); 395} 396 397/* Return a pointer to the PDP slot that corresponds to a VA */ 398static __inline pdp_entry_t * 399pmap_pdpe(pmap_t pmap, vm_offset_t va) 400{ 401 pml4_entry_t *pml4e; 402 403 pml4e = pmap_pml4e(pmap, va); 404 if ((*pml4e & PG_V) == 0) 405 return (NULL); 406 return (pmap_pml4e_to_pdpe(pml4e, va)); 407} 408 409/* Return a pointer to the PD slot that corresponds to a VA */ 410static __inline pd_entry_t * 411pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) 412{ 413 pd_entry_t *pde; 414 415 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); 416 return (&pde[pmap_pde_index(va)]); 417} 418 419/* Return a pointer to the PD slot that corresponds to a VA */ 420static __inline pd_entry_t * 421pmap_pde(pmap_t pmap, vm_offset_t va) 422{ 423 pdp_entry_t *pdpe; 424 425 pdpe = pmap_pdpe(pmap, va); 426 if (pdpe == NULL || (*pdpe & PG_V) == 0) 427 return (NULL); 428 return (pmap_pdpe_to_pde(pdpe, va)); 429} 430 431/* Return a pointer to the PT slot that corresponds to a VA */ 432static __inline pt_entry_t * 433pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) 434{ 435 pt_entry_t *pte; 436 437 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 438 return (&pte[pmap_pte_index(va)]); 439} 440 441/* Return a pointer to the PT slot that corresponds to a VA */ 442static __inline pt_entry_t * 443pmap_pte(pmap_t pmap, vm_offset_t va) 444{ 445 pd_entry_t *pde; 446 447 pde = pmap_pde(pmap, va); 448 if (pde == NULL || (*pde & PG_V) == 0) 449 return (NULL); 450 if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ 451 return ((pt_entry_t *)pde); 452 return (pmap_pde_to_pte(pde, va)); 453} 454 455static __inline void 456pmap_resident_count_inc(pmap_t pmap, int count) 
457{ 458 459 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 460 pmap->pm_stats.resident_count += count; 461} 462 463static __inline void 464pmap_resident_count_dec(pmap_t pmap, int count) 465{ 466 467 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 468 pmap->pm_stats.resident_count -= count; 469} 470 471PMAP_INLINE pt_entry_t * 472vtopte(vm_offset_t va) 473{ 474 u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 475 476 return (PTmap + ((va >> PAGE_SHIFT) & mask)); 477} 478 479static __inline pd_entry_t * 480vtopde(vm_offset_t va) 481{ 482 u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 483 484 return (PDmap + ((va >> PDRSHIFT) & mask)); 485} 486 487static u_int64_t 488allocpages(vm_paddr_t *firstaddr, int n) 489{ 490 u_int64_t ret; 491 492 ret = *firstaddr; 493 bzero((void *)ret, n * PAGE_SIZE); 494 *firstaddr += n * PAGE_SIZE; 495 return (ret); 496} 497 498CTASSERT(powerof2(NDMPML4E)); 499 500/* number of kernel PDP slots */ 501#define NKPDPE(ptpgs) howmany((ptpgs), NPDEPG) 502 503static void 504nkpt_init(vm_paddr_t addr) 505{ 506 int pt_pages; 507 508#ifdef NKPT 509 pt_pages = NKPT; 510#else 511 pt_pages = howmany(addr, 1 << PDRSHIFT); 512 pt_pages += NKPDPE(pt_pages); 513 514 /* 515 * Add some slop beyond the bare minimum required for bootstrapping 516 * the kernel. 517 * 518 * This is quite important when allocating KVA for kernel modules. 519 * The modules are required to be linked in the negative 2GB of 520 * the address space. If we run out of KVA in this region then 521 * pmap_growkernel() will need to allocate page table pages to map 522 * the entire 512GB of KVA space which is an unnecessary tax on 523 * physical memory. 524 */ 525 pt_pages += 8; /* 16MB additional slop for kernel modules */ 526#endif 527 nkpt = pt_pages; 528} 529 530static void 531create_pagetables(vm_paddr_t *firstaddr) 532{ 533 int i, j, ndm1g, nkpdpe; 534 535 /* Allocate page table pages for the direct map */ 536 ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT; 537 if (ndmpdp < 4) /* Minimum 4GB of dirmap */ 538 ndmpdp = 4; 539 DMPDPphys = allocpages(firstaddr, NDMPML4E); 540 ndm1g = 0; 541 if ((amd_feature & AMDID_PAGE1GB) != 0) 542 ndm1g = ptoa(Maxmem) >> PDPSHIFT; 543 if (ndm1g < ndmpdp) 544 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); 545 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; 546 547 /* Allocate pages */ 548 KPML4phys = allocpages(firstaddr, 1); 549 KPDPphys = allocpages(firstaddr, NKPML4E); 550 551 /* 552 * Allocate the initial number of kernel page table pages required to 553 * bootstrap. We defer this until after all memory-size dependent 554 * allocations are done (e.g. direct map), so that we don't have to 555 * build in too much slop in our estimate. 
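	 *
	 * Worked example of nkpt_init()'s estimate (illustrative numbers
	 * only): if the boot-time allocations end at 64MB, pt_pages starts
	 * at howmany(64MB, 2MB) = 32 page table pages, NKPDPE(32) =
	 * howmany(32, 512) adds 1, and the 8-page slop for kernel modules
	 * brings nkpt to 41.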
556 */ 557 nkpt_init(*firstaddr); 558 nkpdpe = NKPDPE(nkpt); 559 560 KPTphys = allocpages(firstaddr, nkpt); 561 KPDphys = allocpages(firstaddr, nkpdpe); 562 563 /* Fill in the underlying page table pages */ 564 /* Read-only from zero to physfree */ 565 /* XXX not fully used, underneath 2M pages */ 566 for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) { 567 ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT; 568 ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G; 569 } 570 571 /* Now map the page tables at their location within PTmap */ 572 for (i = 0; i < nkpt; i++) { 573 ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT); 574 ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V; 575 } 576 577 /* Map from zero to end of allocations under 2M pages */ 578 /* This replaces some of the KPTphys entries above */ 579 for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) { 580 ((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT; 581 ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G; 582 } 583 584 /* And connect up the PD to the PDP */ 585 for (i = 0; i < nkpdpe; i++) { 586 ((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys + 587 (i << PAGE_SHIFT); 588 ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U; 589 } 590 591 /* 592 * Now, set up the direct map region using 2MB and/or 1GB pages. If 593 * the end of physical memory is not aligned to a 1GB page boundary, 594 * then the residual physical memory is mapped with 2MB pages. Later, 595 * if pmap_mapdev{_attr}() uses the direct map for non-write-back 596 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings 597 * that are partially used. 598 */ 599 for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { 600 ((pd_entry_t *)DMPDphys)[j] = (vm_paddr_t)i << PDRSHIFT; 601 /* Preset PG_M and PG_A because demotion expects it. */ 602 ((pd_entry_t *)DMPDphys)[j] |= PG_RW | PG_V | PG_PS | PG_G | 603 PG_M | PG_A; 604 } 605 for (i = 0; i < ndm1g; i++) { 606 ((pdp_entry_t *)DMPDPphys)[i] = (vm_paddr_t)i << PDPSHIFT; 607 /* Preset PG_M and PG_A because demotion expects it. */ 608 ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS | PG_G | 609 PG_M | PG_A; 610 } 611 for (j = 0; i < ndmpdp; i++, j++) { 612 ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (j << PAGE_SHIFT); 613 ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U; 614 } 615 616 /* And recursively map PML4 to itself in order to get PTmap */ 617 ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys; 618 ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U; 619 620 /* Connect the Direct Map slot(s) up to the PML4. */ 621 for (i = 0; i < NDMPML4E; i++) { 622 ((pdp_entry_t *)KPML4phys)[DMPML4I + i] = DMPDPphys + 623 (i << PAGE_SHIFT); 624 ((pdp_entry_t *)KPML4phys)[DMPML4I + i] |= PG_RW | PG_V | PG_U; 625 } 626 627 /* Connect the KVA slot up to the PML4 */ 628 ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys; 629 ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U; 630} 631 632/* 633 * Bootstrap the system enough to run with virtual memory. 634 * 635 * On amd64 this is called after mapping has already been enabled 636 * and just syncs the pmap module with what has already been done. 
637 * [We can't call it easily with mapping off since the kernel is not 638 * mapped with PA == VA, hence we would have to relocate every address 639 * from the linked base (virtual) address "KERNBASE" to the actual 640 * (physical) address starting relative to 0] 641 */ 642void 643pmap_bootstrap(vm_paddr_t *firstaddr) 644{ 645 vm_offset_t va; 646 pt_entry_t *pte, *unused; 647 648 /* 649 * Create an initial set of page tables to run the kernel in. 650 */ 651 create_pagetables(firstaddr); 652 653 virtual_avail = (vm_offset_t) KERNBASE + *firstaddr; 654 virtual_avail = pmap_kmem_choose(virtual_avail); 655 656 virtual_end = VM_MAX_KERNEL_ADDRESS; 657 658 659 /* XXX do %cr0 as well */ 660 load_cr4(rcr4() | CR4_PGE | CR4_PSE); 661 load_cr3(KPML4phys); 662 if (cpu_stdext_feature & CPUID_STDEXT_SMEP) 663 load_cr4(rcr4() | CR4_SMEP); 664 665 /* 666 * Initialize the kernel pmap (which is statically allocated). 667 */ 668 PMAP_LOCK_INIT(kernel_pmap); 669 kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys); 670 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ 671 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 672 673 /* 674 * Initialize the global pv list lock. 675 */ 676 rw_init(&pvh_global_lock, "pmap pv global"); 677 678 /* 679 * Reserve some special page table entries/VA space for temporary 680 * mapping of pages. 681 */ 682#define SYSMAP(c, p, v, n) \ 683 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 684 685 va = virtual_avail; 686 pte = vtopte(va); 687 688 /* 689 * CMAP1 is only used for the memory test. 690 */ 691 SYSMAP(caddr_t, CMAP1, CADDR1, 1) 692 693 /* 694 * Crashdump maps. 695 */ 696 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) 697 698 virtual_avail = va; 699 700 /* Initialize the PAT MSR. */ 701 pmap_init_pat(); 702} 703 704/* 705 * Setup the PAT MSR. 706 */ 707void 708pmap_init_pat(void) 709{ 710 int pat_table[PAT_INDEX_SIZE]; 711 uint64_t pat_msr; 712 u_long cr0, cr4; 713 int i; 714 715 /* Bail if this CPU doesn't implement PAT. */ 716 if ((cpu_feature & CPUID_PAT) == 0) 717 panic("no PAT??"); 718 719 /* Set default PAT index table. */ 720 for (i = 0; i < PAT_INDEX_SIZE; i++) 721 pat_table[i] = -1; 722 pat_table[PAT_WRITE_BACK] = 0; 723 pat_table[PAT_WRITE_THROUGH] = 1; 724 pat_table[PAT_UNCACHEABLE] = 3; 725 pat_table[PAT_WRITE_COMBINING] = 3; 726 pat_table[PAT_WRITE_PROTECTED] = 3; 727 pat_table[PAT_UNCACHED] = 3; 728 729 /* Initialize default PAT entries. */ 730 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | 731 PAT_VALUE(1, PAT_WRITE_THROUGH) | 732 PAT_VALUE(2, PAT_UNCACHED) | 733 PAT_VALUE(3, PAT_UNCACHEABLE) | 734 PAT_VALUE(4, PAT_WRITE_BACK) | 735 PAT_VALUE(5, PAT_WRITE_THROUGH) | 736 PAT_VALUE(6, PAT_UNCACHED) | 737 PAT_VALUE(7, PAT_UNCACHEABLE); 738 739 if (pat_works) { 740 /* 741 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. 742 * Program 5 and 6 as WP and WC. 743 * Leave 4 and 7 as WB and UC. 744 */ 745 pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); 746 pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | 747 PAT_VALUE(6, PAT_WRITE_COMBINING); 748 pat_table[PAT_UNCACHED] = 2; 749 pat_table[PAT_WRITE_PROTECTED] = 5; 750 pat_table[PAT_WRITE_COMBINING] = 6; 751 } else { 752 /* 753 * Just replace PAT Index 2 with WC instead of UC-. 754 */ 755 pat_msr &= ~PAT_MASK(2); 756 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); 757 pat_table[PAT_WRITE_COMBINING] = 2; 758 } 759 760 /* Disable PGE. */ 761 cr4 = rcr4(); 762 load_cr4(cr4 & ~CR4_PGE); 763 764 /* Disable caches (CD = 1, NW = 0). 
*/ 765 cr0 = rcr0(); 766 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 767 768 /* Flushes caches and TLBs. */ 769 wbinvd(); 770 invltlb(); 771 772 /* Update PAT and index table. */ 773 wrmsr(MSR_PAT, pat_msr); 774 for (i = 0; i < PAT_INDEX_SIZE; i++) 775 pat_index[i] = pat_table[i]; 776 777 /* Flush caches and TLBs again. */ 778 wbinvd(); 779 invltlb(); 780 781 /* Restore caches and PGE. */ 782 load_cr0(cr0); 783 load_cr4(cr4); 784} 785 786/* 787 * Initialize a vm_page's machine-dependent fields. 788 */ 789void 790pmap_page_init(vm_page_t m) 791{ 792 793 TAILQ_INIT(&m->md.pv_list); 794 m->md.pat_mode = PAT_WRITE_BACK; 795} 796 797/* 798 * Initialize the pmap module. 799 * Called by vm_init, to initialize any structures that the pmap 800 * system needs to map virtual memory. 801 */ 802void 803pmap_init(void) 804{ 805 vm_page_t mpte; 806 vm_size_t s; 807 int i, pv_npg; 808 809 /* 810 * Initialize the vm page array entries for the kernel pmap's 811 * page table pages. 812 */ 813 for (i = 0; i < nkpt; i++) { 814 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); 815 KASSERT(mpte >= vm_page_array && 816 mpte < &vm_page_array[vm_page_array_size], 817 ("pmap_init: page table page is out of range")); 818 mpte->pindex = pmap_pde_pindex(KERNBASE) + i; 819 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); 820 } 821 822 /* 823 * If the kernel is running in a virtual machine on an AMD Family 10h 824 * processor, then it must assume that MCA is enabled by the virtual 825 * machine monitor. 826 */ 827 if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD && 828 CPUID_TO_FAMILY(cpu_id) == 0x10) 829 workaround_erratum383 = 1; 830 831 /* 832 * Are large page mappings enabled? 833 */ 834 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); 835 if (pg_ps_enabled) { 836 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 837 ("pmap_init: can't assign to pagesizes[1]")); 838 pagesizes[1] = NBPDR; 839 } 840 841 /* 842 * Initialize the pv chunk list mutex. 843 */ 844 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); 845 846 /* 847 * Initialize the pool of pv list locks. 848 */ 849 for (i = 0; i < NPV_LIST_LOCKS; i++) 850 rw_init(&pv_list_locks[i], "pmap pv list"); 851 852 /* 853 * Calculate the size of the pv head table for superpages. 854 */ 855 for (i = 0; phys_avail[i + 1]; i += 2); 856 pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR; 857 858 /* 859 * Allocate memory for the pv head table for superpages. 
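	 *
	 * Sizing sketch (illustrative figures only): pv_npg counts 2MB
	 * regions up to the end of the last phys_avail[] segment, so a
	 * machine whose physical memory ends at 8GB needs roughly
	 * 8GB / 2MB = 4096 entries, i.e. round_page(4096 *
	 * sizeof(struct md_page)) bytes of KVA for pv_table.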
860 */ 861 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 862 s = round_page(s); 863 pv_table = (struct md_page *)kmem_alloc(kernel_map, s); 864 for (i = 0; i < pv_npg; i++) 865 TAILQ_INIT(&pv_table[i].pv_list); 866} 867 868static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, 869 "2MB page mapping counters"); 870 871static u_long pmap_pde_demotions; 872SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, 873 &pmap_pde_demotions, 0, "2MB page demotions"); 874 875static u_long pmap_pde_mappings; 876SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, 877 &pmap_pde_mappings, 0, "2MB page mappings"); 878 879static u_long pmap_pde_p_failures; 880SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, 881 &pmap_pde_p_failures, 0, "2MB page promotion failures"); 882 883static u_long pmap_pde_promotions; 884SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, 885 &pmap_pde_promotions, 0, "2MB page promotions"); 886 887static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0, 888 "1GB page mapping counters"); 889 890static u_long pmap_pdpe_demotions; 891SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, 892 &pmap_pdpe_demotions, 0, "1GB page demotions"); 893 894/*************************************************** 895 * Low level helper routines..... 896 ***************************************************/ 897 898/* 899 * Determine the appropriate bits to set in a PTE or PDE for a specified 900 * caching mode. 901 */ 902static int 903pmap_cache_bits(int mode, boolean_t is_pde) 904{ 905 int cache_bits, pat_flag, pat_idx; 906 907 if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) 908 panic("Unknown caching mode %d\n", mode); 909 910 /* The PAT bit is different for PTE's and PDE's. */ 911 pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; 912 913 /* Map the caching mode to a PAT index. */ 914 pat_idx = pat_index[mode]; 915 916 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ 917 cache_bits = 0; 918 if (pat_idx & 0x4) 919 cache_bits |= pat_flag; 920 if (pat_idx & 0x2) 921 cache_bits |= PG_NC_PCD; 922 if (pat_idx & 0x1) 923 cache_bits |= PG_NC_PWT; 924 return (cache_bits); 925} 926 927/* 928 * After changing the page size for the specified virtual address in the page 929 * table, flush the corresponding entries from the processor's TLB. Only the 930 * calling processor's TLB is affected. 931 * 932 * The calling thread must be pinned to a processor. 933 */ 934static void 935pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) 936{ 937 u_long cr4; 938 939 if ((newpde & PG_PS) == 0) 940 /* Demotion: flush a specific 2MB page mapping. */ 941 invlpg(va); 942 else if ((newpde & PG_G) == 0) 943 /* 944 * Promotion: flush every 4KB page mapping from the TLB 945 * because there are too many to flush individually. 946 */ 947 invltlb(); 948 else { 949 /* 950 * Promotion: flush every 4KB page mapping from the TLB, 951 * including any global (PG_G) mappings. 952 */ 953 cr4 = rcr4(); 954 load_cr4(cr4 & ~CR4_PGE); 955 /* 956 * Although preemption at this point could be detrimental to 957 * performance, it would not lead to an error. PG_G is simply 958 * ignored if CR4.PGE is clear. Moreover, in case this block 959 * is re-entered, the load_cr4() either above or below will 960 * modify CR4.PGE flushing the TLB. 961 */ 962 load_cr4(cr4 | CR4_PGE); 963 } 964} 965#ifdef SMP 966/* 967 * For SMP, these functions have to use the IPI mechanism for coherence. 
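 * In outline, each function below pins the calling thread, performs the
 * invalidation locally when this CPU has the pmap active, sends an
 * smp_invlpg()/smp_masked_invlpg()-style IPI to the remaining CPUs in
 * pm_active, and then unpins.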
 *
 * N.B.: Before calling any of the following TLB invalidation functions,
 * the calling processor must ensure that all stores updating a non-
 * kernel page table are globally performed.  Otherwise, another
 * processor could cache an old, pre-update entry without being
 * invalidated.  This can happen one of two ways: (1) The pmap becomes
 * active on another processor after its pm_active field is checked by
 * one of the following functions but before a store updating the page
 * table is globally performed.  (2) The pmap becomes active on another
 * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions still reads the
 * pmap as inactive on the other processor.
 *
 * The kernel page table is exempt because its pm_active field is
 * immutable.  The kernel page table is always active on every
 * processor.
 */
void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	cpuset_t other_cpus;
	u_int cpuid;

	sched_pin();
	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
		invlpg(va);
		smp_invlpg(va);
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		if (CPU_ISSET(cpuid, &pmap->pm_active))
			invlpg(va);
		CPU_AND(&other_cpus, &pmap->pm_active);
		if (!CPU_EMPTY(&other_cpus))
			smp_masked_invlpg(other_cpus, va);
	}
	sched_unpin();
}

void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	cpuset_t other_cpus;
	vm_offset_t addr;
	u_int cpuid;

	sched_pin();
	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
		smp_invlpg_range(sva, eva);
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		if (CPU_ISSET(cpuid, &pmap->pm_active))
			for (addr = sva; addr < eva; addr += PAGE_SIZE)
				invlpg(addr);
		CPU_AND(&other_cpus, &pmap->pm_active);
		if (!CPU_EMPTY(&other_cpus))
			smp_masked_invlpg_range(other_cpus, sva, eva);
	}
	sched_unpin();
}

void
pmap_invalidate_all(pmap_t pmap)
{
	cpuset_t other_cpus;
	u_int cpuid;

	sched_pin();
	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
		invltlb();
		smp_invltlb();
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		if (CPU_ISSET(cpuid, &pmap->pm_active))
			invltlb();
		CPU_AND(&other_cpus, &pmap->pm_active);
		if (!CPU_EMPTY(&other_cpus))
			smp_masked_invltlb(other_cpus);
	}
	sched_unpin();
}

void
pmap_invalidate_cache(void)
{

	sched_pin();
	wbinvd();
	smp_cache_flush();
	sched_unpin();
}

struct pde_action {
	cpuset_t invalidate;	/* processors that invalidate their TLB */
	vm_offset_t va;
	pd_entry_t *pde;
	pd_entry_t newpde;
	u_int store;		/* processor that updates the PDE */
};

static void
pmap_update_pde_action(void *arg)
{
	struct pde_action *act = arg;

	if (act->store == PCPU_GET(cpuid))
		pde_store(act->pde, act->newpde);
}

static void
pmap_update_pde_teardown(void *arg)
{
	struct pde_action *act = arg;

	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
		pmap_update_pde_invalidate(act->va, act->newpde);
1091} 1092 1093/* 1094 * Change the page size for the specified virtual address in a way that 1095 * prevents any possibility of the TLB ever having two entries that map the 1096 * same virtual address using different page sizes. This is the recommended 1097 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a 1098 * machine check exception for a TLB state that is improperly diagnosed as a 1099 * hardware error. 1100 */ 1101static void 1102pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1103{ 1104 struct pde_action act; 1105 cpuset_t active, other_cpus; 1106 u_int cpuid; 1107 1108 sched_pin(); 1109 cpuid = PCPU_GET(cpuid); 1110 other_cpus = all_cpus; 1111 CPU_CLR(cpuid, &other_cpus); 1112 if (pmap == kernel_pmap) 1113 active = all_cpus; 1114 else 1115 active = pmap->pm_active; 1116 if (CPU_OVERLAP(&active, &other_cpus)) { 1117 act.store = cpuid; 1118 act.invalidate = active; 1119 act.va = va; 1120 act.pde = pde; 1121 act.newpde = newpde; 1122 CPU_SET(cpuid, &active); 1123 smp_rendezvous_cpus(active, 1124 smp_no_rendevous_barrier, pmap_update_pde_action, 1125 pmap_update_pde_teardown, &act); 1126 } else { 1127 pde_store(pde, newpde); 1128 if (CPU_ISSET(cpuid, &active)) 1129 pmap_update_pde_invalidate(va, newpde); 1130 } 1131 sched_unpin(); 1132} 1133#else /* !SMP */ 1134/* 1135 * Normal, non-SMP, invalidation functions. 1136 * We inline these within pmap.c for speed. 1137 */ 1138PMAP_INLINE void 1139pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1140{ 1141 1142 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1143 invlpg(va); 1144} 1145 1146PMAP_INLINE void 1147pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1148{ 1149 vm_offset_t addr; 1150 1151 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1152 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1153 invlpg(addr); 1154} 1155 1156PMAP_INLINE void 1157pmap_invalidate_all(pmap_t pmap) 1158{ 1159 1160 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1161 invltlb(); 1162} 1163 1164PMAP_INLINE void 1165pmap_invalidate_cache(void) 1166{ 1167 1168 wbinvd(); 1169} 1170 1171static void 1172pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1173{ 1174 1175 pde_store(pde, newpde); 1176 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1177 pmap_update_pde_invalidate(va, newpde); 1178} 1179#endif /* !SMP */ 1180 1181#define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 1182 1183void 1184pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 1185{ 1186 1187 KASSERT((sva & PAGE_MASK) == 0, 1188 ("pmap_invalidate_cache_range: sva not page-aligned")); 1189 KASSERT((eva & PAGE_MASK) == 0, 1190 ("pmap_invalidate_cache_range: eva not page-aligned")); 1191 1192 if (cpu_feature & CPUID_SS) 1193 ; /* If "Self Snoop" is supported, do nothing. */ 1194 else if ((cpu_feature & CPUID_CLFSH) != 0 && 1195 eva - sva < PMAP_CLFLUSH_THRESHOLD) { 1196 1197 /* 1198 * XXX: Some CPUs fault, hang, or trash the local APIC 1199 * registers if we use CLFLUSH on the local APIC 1200 * range. The local APIC is always uncached, so we 1201 * don't need to flush for that range anyway. 1202 */ 1203 if (pmap_kextract(sva) == lapic_paddr) 1204 return; 1205 1206 /* 1207 * Otherwise, do per-cache line flush. Use the mfence 1208 * instruction to insure that previous stores are 1209 * included in the write-back. The processor 1210 * propagates flush to other processors in the cache 1211 * coherence domain. 
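	 *
	 * Rough cost illustration (not from this file): with 64-byte
	 * cache lines, flushing a 16KB range issues 16384 / 64 = 256
	 * CLFLUSH instructions; ranges of PMAP_CLFLUSH_THRESHOLD (2MB)
	 * or more take the global pmap_invalidate_cache() path instead.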
1212 */ 1213 mfence(); 1214 for (; sva < eva; sva += cpu_clflush_line_size) 1215 clflush(sva); 1216 mfence(); 1217 } else { 1218 1219 /* 1220 * No targeted cache flush methods are supported by CPU, 1221 * or the supplied range is bigger than 2MB. 1222 * Globally invalidate cache. 1223 */ 1224 pmap_invalidate_cache(); 1225 } 1226} 1227 1228/* 1229 * Remove the specified set of pages from the data and instruction caches. 1230 * 1231 * In contrast to pmap_invalidate_cache_range(), this function does not 1232 * rely on the CPU's self-snoop feature, because it is intended for use 1233 * when moving pages into a different cache domain. 1234 */ 1235void 1236pmap_invalidate_cache_pages(vm_page_t *pages, int count) 1237{ 1238 vm_offset_t daddr, eva; 1239 int i; 1240 1241 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 1242 (cpu_feature & CPUID_CLFSH) == 0) 1243 pmap_invalidate_cache(); 1244 else { 1245 mfence(); 1246 for (i = 0; i < count; i++) { 1247 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 1248 eva = daddr + PAGE_SIZE; 1249 for (; daddr < eva; daddr += cpu_clflush_line_size) 1250 clflush(daddr); 1251 } 1252 mfence(); 1253 } 1254} 1255 1256/* 1257 * Are we current address space or kernel? 1258 */ 1259static __inline int 1260pmap_is_current(pmap_t pmap) 1261{ 1262 return (pmap == kernel_pmap || 1263 (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME)); 1264} 1265 1266/* 1267 * Routine: pmap_extract 1268 * Function: 1269 * Extract the physical page address associated 1270 * with the given map/virtual_address pair. 1271 */ 1272vm_paddr_t 1273pmap_extract(pmap_t pmap, vm_offset_t va) 1274{ 1275 pdp_entry_t *pdpe; 1276 pd_entry_t *pde; 1277 pt_entry_t *pte; 1278 vm_paddr_t pa; 1279 1280 pa = 0; 1281 PMAP_LOCK(pmap); 1282 pdpe = pmap_pdpe(pmap, va); 1283 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 1284 if ((*pdpe & PG_PS) != 0) 1285 pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); 1286 else { 1287 pde = pmap_pdpe_to_pde(pdpe, va); 1288 if ((*pde & PG_V) != 0) { 1289 if ((*pde & PG_PS) != 0) { 1290 pa = (*pde & PG_PS_FRAME) | 1291 (va & PDRMASK); 1292 } else { 1293 pte = pmap_pde_to_pte(pde, va); 1294 pa = (*pte & PG_FRAME) | 1295 (va & PAGE_MASK); 1296 } 1297 } 1298 } 1299 } 1300 PMAP_UNLOCK(pmap); 1301 return (pa); 1302} 1303 1304/* 1305 * Routine: pmap_extract_and_hold 1306 * Function: 1307 * Atomically extract and hold the physical page 1308 * with the given pmap and virtual address pair 1309 * if that mapping permits the given protection. 
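 *
 * Hypothetical usage sketch (not part of this file), assuming the
 * vm_page hold API of this era, in which the page lock is taken
 * around vm_page_unhold():
 *
 *	vm_page_t m;
 *
 *	m = pmap_extract_and_hold(pmap, va, VM_PROT_READ);
 *	if (m != NULL) {
 *		... read via PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) ...
 *		vm_page_lock(m);
 *		vm_page_unhold(m);
 *		vm_page_unlock(m);
 *	}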
1310 */ 1311vm_page_t 1312pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1313{ 1314 pd_entry_t pde, *pdep; 1315 pt_entry_t pte; 1316 vm_paddr_t pa; 1317 vm_page_t m; 1318 1319 pa = 0; 1320 m = NULL; 1321 PMAP_LOCK(pmap); 1322retry: 1323 pdep = pmap_pde(pmap, va); 1324 if (pdep != NULL && (pde = *pdep)) { 1325 if (pde & PG_PS) { 1326 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { 1327 if (vm_page_pa_tryrelock(pmap, (pde & 1328 PG_PS_FRAME) | (va & PDRMASK), &pa)) 1329 goto retry; 1330 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | 1331 (va & PDRMASK)); 1332 vm_page_hold(m); 1333 } 1334 } else { 1335 pte = *pmap_pde_to_pte(pdep, va); 1336 if ((pte & PG_V) && 1337 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { 1338 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, 1339 &pa)) 1340 goto retry; 1341 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 1342 vm_page_hold(m); 1343 } 1344 } 1345 } 1346 PA_UNLOCK_COND(pa); 1347 PMAP_UNLOCK(pmap); 1348 return (m); 1349} 1350 1351vm_paddr_t 1352pmap_kextract(vm_offset_t va) 1353{ 1354 pd_entry_t pde; 1355 vm_paddr_t pa; 1356 1357 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 1358 pa = DMAP_TO_PHYS(va); 1359 } else { 1360 pde = *vtopde(va); 1361 if (pde & PG_PS) { 1362 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 1363 } else { 1364 /* 1365 * Beware of a concurrent promotion that changes the 1366 * PDE at this point! For example, vtopte() must not 1367 * be used to access the PTE because it would use the 1368 * new PDE. It is, however, safe to use the old PDE 1369 * because the page table page is preserved by the 1370 * promotion. 1371 */ 1372 pa = *pmap_pde_to_pte(&pde, va); 1373 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 1374 } 1375 } 1376 return (pa); 1377} 1378 1379/*************************************************** 1380 * Low level mapping routines..... 1381 ***************************************************/ 1382 1383/* 1384 * Add a wired page to the kva. 1385 * Note: not SMP coherent. 1386 */ 1387PMAP_INLINE void 1388pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1389{ 1390 pt_entry_t *pte; 1391 1392 pte = vtopte(va); 1393 pte_store(pte, pa | PG_RW | PG_V | PG_G); 1394} 1395 1396static __inline void 1397pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 1398{ 1399 pt_entry_t *pte; 1400 1401 pte = vtopte(va); 1402 pte_store(pte, pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0)); 1403} 1404 1405/* 1406 * Remove a page from the kernel pagetables. 1407 * Note: not SMP coherent. 1408 */ 1409PMAP_INLINE void 1410pmap_kremove(vm_offset_t va) 1411{ 1412 pt_entry_t *pte; 1413 1414 pte = vtopte(va); 1415 pte_clear(pte); 1416} 1417 1418/* 1419 * Used to map a range of physical addresses into kernel 1420 * virtual address space. 1421 * 1422 * The value passed in '*virt' is a suggested virtual address for 1423 * the mapping. Architectures which can support a direct-mapped 1424 * physical to virtual region can return the appropriate address 1425 * within that region, leaving '*virt' unchanged. Other 1426 * architectures should map the pages starting at '*virt' and 1427 * update '*virt' with the first usable address after the mapped 1428 * region. 1429 */ 1430vm_offset_t 1431pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1432{ 1433 return PHYS_TO_DMAP(start); 1434} 1435 1436 1437/* 1438 * Add a list of wired pages to the kva 1439 * this routine is only used for temporary 1440 * kernel mappings that do not need to have 1441 * page modification or references recorded. 
1442 * Note that old mappings are simply written 1443 * over. The page *must* be wired. 1444 * Note: SMP coherent. Uses a ranged shootdown IPI. 1445 */ 1446void 1447pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1448{ 1449 pt_entry_t *endpte, oldpte, pa, *pte; 1450 vm_page_t m; 1451 1452 oldpte = 0; 1453 pte = vtopte(sva); 1454 endpte = pte + count; 1455 while (pte < endpte) { 1456 m = *ma++; 1457 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); 1458 if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { 1459 oldpte |= *pte; 1460 pte_store(pte, pa | PG_G | PG_RW | PG_V); 1461 } 1462 pte++; 1463 } 1464 if (__predict_false((oldpte & PG_V) != 0)) 1465 pmap_invalidate_range(kernel_pmap, sva, sva + count * 1466 PAGE_SIZE); 1467} 1468 1469/* 1470 * This routine tears out page mappings from the 1471 * kernel -- it is meant only for temporary mappings. 1472 * Note: SMP coherent. Uses a ranged shootdown IPI. 1473 */ 1474void 1475pmap_qremove(vm_offset_t sva, int count) 1476{ 1477 vm_offset_t va; 1478 1479 va = sva; 1480 while (count-- > 0) { 1481 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); 1482 pmap_kremove(va); 1483 va += PAGE_SIZE; 1484 } 1485 pmap_invalidate_range(kernel_pmap, sva, va); 1486} 1487 1488/*************************************************** 1489 * Page table page management routines..... 1490 ***************************************************/ 1491static __inline void 1492pmap_free_zero_pages(vm_page_t free) 1493{ 1494 vm_page_t m; 1495 1496 while (free != NULL) { 1497 m = free; 1498 free = (void *)m->object; 1499 m->object = NULL; 1500 /* Preserve the page's PG_ZERO setting. */ 1501 vm_page_free_toq(m); 1502 } 1503} 1504 1505/* 1506 * Schedule the specified unused page table page to be freed. Specifically, 1507 * add the page to the specified list of pages that will be released to the 1508 * physical memory manager after the TLB has been updated. 1509 */ 1510static __inline void 1511pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO) 1512{ 1513 1514 if (set_PG_ZERO) 1515 m->flags |= PG_ZERO; 1516 else 1517 m->flags &= ~PG_ZERO; 1518 m->object = (void *)*free; 1519 *free = m; 1520} 1521 1522/* 1523 * Inserts the specified page table page into the specified pmap's collection 1524 * of idle page table pages. Each of a pmap's page table pages is responsible 1525 * for mapping a distinct range of virtual addresses. The pmap's collection is 1526 * ordered by this virtual address range. 1527 */ 1528static __inline void 1529pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) 1530{ 1531 1532 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1533 vm_radix_insert(&pmap->pm_root, mpte); 1534} 1535 1536/* 1537 * Looks for a page table page mapping the specified virtual address in the 1538 * specified pmap's collection of idle page table pages. Returns NULL if there 1539 * is no page table page corresponding to the specified virtual address. 1540 */ 1541static __inline vm_page_t 1542pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) 1543{ 1544 1545 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1546 return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va))); 1547} 1548 1549/* 1550 * Removes the specified page table page from the specified pmap's collection 1551 * of idle page table pages. The specified page table page must be a member of 1552 * the pmap's collection. 
1553 */ 1554static __inline void 1555pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) 1556{ 1557 1558 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1559 vm_radix_remove(&pmap->pm_root, mpte->pindex); 1560} 1561 1562/* 1563 * Decrements a page table page's wire count, which is used to record the 1564 * number of valid page table entries within the page. If the wire count 1565 * drops to zero, then the page table page is unmapped. Returns TRUE if the 1566 * page table page was unmapped and FALSE otherwise. 1567 */ 1568static inline boolean_t 1569pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free) 1570{ 1571 1572 --m->wire_count; 1573 if (m->wire_count == 0) { 1574 _pmap_unwire_ptp(pmap, va, m, free); 1575 return (TRUE); 1576 } else 1577 return (FALSE); 1578} 1579 1580static void 1581_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free) 1582{ 1583 1584 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1585 /* 1586 * unmap the page table page 1587 */ 1588 if (m->pindex >= (NUPDE + NUPDPE)) { 1589 /* PDP page */ 1590 pml4_entry_t *pml4; 1591 pml4 = pmap_pml4e(pmap, va); 1592 *pml4 = 0; 1593 } else if (m->pindex >= NUPDE) { 1594 /* PD page */ 1595 pdp_entry_t *pdp; 1596 pdp = pmap_pdpe(pmap, va); 1597 *pdp = 0; 1598 } else { 1599 /* PTE page */ 1600 pd_entry_t *pd; 1601 pd = pmap_pde(pmap, va); 1602 *pd = 0; 1603 } 1604 pmap_resident_count_dec(pmap, 1); 1605 if (m->pindex < NUPDE) { 1606 /* We just released a PT, unhold the matching PD */ 1607 vm_page_t pdpg; 1608 1609 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 1610 pmap_unwire_ptp(pmap, va, pdpg, free); 1611 } 1612 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { 1613 /* We just released a PD, unhold the matching PDP */ 1614 vm_page_t pdppg; 1615 1616 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 1617 pmap_unwire_ptp(pmap, va, pdppg, free); 1618 } 1619 1620 /* 1621 * This is a release store so that the ordinary store unmapping 1622 * the page table page is globally performed before TLB shoot- 1623 * down is begun. 1624 */ 1625 atomic_subtract_rel_int(&cnt.v_wire_count, 1); 1626 1627 /* 1628 * Put page on a list so that it is released after 1629 * *ALL* TLB shootdown is done 1630 */ 1631 pmap_add_delayed_free_list(m, free, TRUE); 1632} 1633 1634/* 1635 * After removing a page table entry, this routine is used to 1636 * conditionally free the page, and manage the hold/wire counts. 1637 */ 1638static int 1639pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, vm_page_t *free) 1640{ 1641 vm_page_t mpte; 1642 1643 if (va >= VM_MAXUSER_ADDRESS) 1644 return (0); 1645 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 1646 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 1647 return (pmap_unwire_ptp(pmap, va, mpte, free)); 1648} 1649 1650void 1651pmap_pinit0(pmap_t pmap) 1652{ 1653 1654 PMAP_LOCK_INIT(pmap); 1655 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 1656 pmap->pm_root.rt_root = 0; 1657 CPU_ZERO(&pmap->pm_active); 1658 PCPU_SET(curpmap, pmap); 1659 TAILQ_INIT(&pmap->pm_pvchunk); 1660 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1661} 1662 1663/* 1664 * Initialize a preallocated and zeroed pmap structure, 1665 * such as one in a vmspace structure. 
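 *
 * In outline, the routine below allocates a single PML4 page, installs
 * the kernel (KPML4I) and direct map (DMPML4I .. DMPML4I + NDMPML4E - 1)
 * slots so that kernel addresses resolve in every address space, and
 * points the PML4PML4I slot back at the PML4 page itself, creating the
 * recursive mapping that vtopte() and vtopde() rely on.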
1666 */ 1667int 1668pmap_pinit(pmap_t pmap) 1669{ 1670 vm_page_t pml4pg; 1671 int i; 1672 1673 PMAP_LOCK_INIT(pmap); 1674 1675 /* 1676 * allocate the page directory page 1677 */ 1678 while ((pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 1679 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) 1680 VM_WAIT; 1681 1682 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); 1683 1684 if ((pml4pg->flags & PG_ZERO) == 0) 1685 pagezero(pmap->pm_pml4); 1686 1687 /* Wire in kernel global address entries. */ 1688 pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U; 1689 for (i = 0; i < NDMPML4E; i++) { 1690 pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + (i << PAGE_SHIFT)) | 1691 PG_RW | PG_V | PG_U; 1692 } 1693 1694 /* install self-referential address mapping entry(s) */ 1695 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M; 1696 1697 pmap->pm_root.rt_root = 0; 1698 CPU_ZERO(&pmap->pm_active); 1699 TAILQ_INIT(&pmap->pm_pvchunk); 1700 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1701 1702 return (1); 1703} 1704 1705/* 1706 * This routine is called if the desired page table page does not exist. 1707 * 1708 * If page table page allocation fails, this routine may sleep before 1709 * returning NULL. It sleeps only if a lock pointer was given. 1710 * 1711 * Note: If a page allocation fails at page table level two or three, 1712 * one or two pages may be held during the wait, only to be released 1713 * afterwards. This conservative approach is easily argued to avoid 1714 * race conditions. 1715 */ 1716static vm_page_t 1717_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 1718{ 1719 vm_page_t m, pdppg, pdpg; 1720 1721 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1722 1723 /* 1724 * Allocate a page table page. 1725 */ 1726 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 1727 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 1728 if (lockp != NULL) { 1729 RELEASE_PV_LIST_LOCK(lockp); 1730 PMAP_UNLOCK(pmap); 1731 rw_runlock(&pvh_global_lock); 1732 VM_WAIT; 1733 rw_rlock(&pvh_global_lock); 1734 PMAP_LOCK(pmap); 1735 } 1736 1737 /* 1738 * Indicate the need to retry. While waiting, the page table 1739 * page may have been allocated. 1740 */ 1741 return (NULL); 1742 } 1743 if ((m->flags & PG_ZERO) == 0) 1744 pmap_zero_page(m); 1745 1746 /* 1747 * Map the pagetable page into the process address space, if 1748 * it isn't already there. 
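	 *
	 * The ptepindex encodes the level of the page being installed:
	 * indices below NUPDE name page table pages, indices in
	 * [NUPDE, NUPDE + NUPDPE) name page directory pages, and indices
	 * of NUPDE + NUPDPE or above name PDP pages, which is how the
	 * three cases below are distinguished.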
1749 */ 1750 1751 if (ptepindex >= (NUPDE + NUPDPE)) { 1752 pml4_entry_t *pml4; 1753 vm_pindex_t pml4index; 1754 1755 /* Wire up a new PDPE page */ 1756 pml4index = ptepindex - (NUPDE + NUPDPE); 1757 pml4 = &pmap->pm_pml4[pml4index]; 1758 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 1759 1760 } else if (ptepindex >= NUPDE) { 1761 vm_pindex_t pml4index; 1762 vm_pindex_t pdpindex; 1763 pml4_entry_t *pml4; 1764 pdp_entry_t *pdp; 1765 1766 /* Wire up a new PDE page */ 1767 pdpindex = ptepindex - NUPDE; 1768 pml4index = pdpindex >> NPML4EPGSHIFT; 1769 1770 pml4 = &pmap->pm_pml4[pml4index]; 1771 if ((*pml4 & PG_V) == 0) { 1772 /* Have to allocate a new pdp, recurse */ 1773 if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, 1774 lockp) == NULL) { 1775 --m->wire_count; 1776 atomic_subtract_int(&cnt.v_wire_count, 1); 1777 vm_page_free_zero(m); 1778 return (NULL); 1779 } 1780 } else { 1781 /* Add reference to pdp page */ 1782 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); 1783 pdppg->wire_count++; 1784 } 1785 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 1786 1787 /* Now find the pdp page */ 1788 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 1789 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 1790 1791 } else { 1792 vm_pindex_t pml4index; 1793 vm_pindex_t pdpindex; 1794 pml4_entry_t *pml4; 1795 pdp_entry_t *pdp; 1796 pd_entry_t *pd; 1797 1798 /* Wire up a new PTE page */ 1799 pdpindex = ptepindex >> NPDPEPGSHIFT; 1800 pml4index = pdpindex >> NPML4EPGSHIFT; 1801 1802 /* First, find the pdp and check that its valid. */ 1803 pml4 = &pmap->pm_pml4[pml4index]; 1804 if ((*pml4 & PG_V) == 0) { 1805 /* Have to allocate a new pd, recurse */ 1806 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 1807 lockp) == NULL) { 1808 --m->wire_count; 1809 atomic_subtract_int(&cnt.v_wire_count, 1); 1810 vm_page_free_zero(m); 1811 return (NULL); 1812 } 1813 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 1814 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 1815 } else { 1816 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 1817 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 1818 if ((*pdp & PG_V) == 0) { 1819 /* Have to allocate a new pd, recurse */ 1820 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 1821 lockp) == NULL) { 1822 --m->wire_count; 1823 atomic_subtract_int(&cnt.v_wire_count, 1824 1); 1825 vm_page_free_zero(m); 1826 return (NULL); 1827 } 1828 } else { 1829 /* Add reference to the pd page */ 1830 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 1831 pdpg->wire_count++; 1832 } 1833 } 1834 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); 1835 1836 /* Now we know where the page directory page is */ 1837 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; 1838 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 1839 } 1840 1841 pmap_resident_count_inc(pmap, 1); 1842 1843 return (m); 1844} 1845 1846static vm_page_t 1847pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 1848{ 1849 vm_pindex_t pdpindex, ptepindex; 1850 pdp_entry_t *pdpe; 1851 vm_page_t pdpg; 1852 1853retry: 1854 pdpe = pmap_pdpe(pmap, va); 1855 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 1856 /* Add a reference to the pd page. */ 1857 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 1858 pdpg->wire_count++; 1859 } else { 1860 /* Allocate a pd page. 
*/ 1861 ptepindex = pmap_pde_pindex(va); 1862 pdpindex = ptepindex >> NPDPEPGSHIFT; 1863 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); 1864 if (pdpg == NULL && lockp != NULL) 1865 goto retry; 1866 } 1867 return (pdpg); 1868} 1869 1870static vm_page_t 1871pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 1872{ 1873 vm_pindex_t ptepindex; 1874 pd_entry_t *pd; 1875 vm_page_t m; 1876 1877 /* 1878 * Calculate pagetable page index 1879 */ 1880 ptepindex = pmap_pde_pindex(va); 1881retry: 1882 /* 1883 * Get the page directory entry 1884 */ 1885 pd = pmap_pde(pmap, va); 1886 1887 /* 1888 * This supports switching from a 2MB page to a 1889 * normal 4K page. 1890 */ 1891 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 1892 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { 1893 /* 1894 * Invalidation of the 2MB page mapping may have caused 1895 * the deallocation of the underlying PD page. 1896 */ 1897 pd = NULL; 1898 } 1899 } 1900 1901 /* 1902 * If the page table page is mapped, we just increment the 1903 * hold count, and activate it. 1904 */ 1905 if (pd != NULL && (*pd & PG_V) != 0) { 1906 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 1907 m->wire_count++; 1908 } else { 1909 /* 1910 * Here if the pte page isn't mapped, or if it has been 1911 * deallocated. 1912 */ 1913 m = _pmap_allocpte(pmap, ptepindex, lockp); 1914 if (m == NULL && lockp != NULL) 1915 goto retry; 1916 } 1917 return (m); 1918} 1919 1920 1921/*************************************************** 1922 * Pmap allocation/deallocation routines. 1923 ***************************************************/ 1924 1925/* 1926 * Release any resources held by the given physical map. 1927 * Called when a pmap initialized by pmap_pinit is being released. 1928 * Should only be called if the map contains no valid mappings. 
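 *
 * Only the PML4 entries installed by pmap_pinit() remain at this
 * point: the kernel map entry, the direct map entries, and the
 * recursive self-mapping entry.  They are cleared below before the
 * PML4 page itself is unwired and freed.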
1929 */ 1930void 1931pmap_release(pmap_t pmap) 1932{ 1933 vm_page_t m; 1934 int i; 1935 1936 KASSERT(pmap->pm_stats.resident_count == 0, 1937 ("pmap_release: pmap resident count %ld != 0", 1938 pmap->pm_stats.resident_count)); 1939 KASSERT(vm_radix_is_empty(&pmap->pm_root), 1940 ("pmap_release: pmap has reserved page table page(s)")); 1941 1942 m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME); 1943 1944 pmap->pm_pml4[KPML4I] = 0; /* KVA */ 1945 for (i = 0; i < NDMPML4E; i++) /* Direct Map */ 1946 pmap->pm_pml4[DMPML4I + i] = 0; 1947 pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ 1948 1949 m->wire_count--; 1950 atomic_subtract_int(&cnt.v_wire_count, 1); 1951 vm_page_free_zero(m); 1952 PMAP_LOCK_DESTROY(pmap); 1953} 1954 1955static int 1956kvm_size(SYSCTL_HANDLER_ARGS) 1957{ 1958 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 1959 1960 return sysctl_handle_long(oidp, &ksize, 0, req); 1961} 1962SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 1963 0, 0, kvm_size, "LU", "Size of KVM"); 1964 1965static int 1966kvm_free(SYSCTL_HANDLER_ARGS) 1967{ 1968 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 1969 1970 return sysctl_handle_long(oidp, &kfree, 0, req); 1971} 1972SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 1973 0, 0, kvm_free, "LU", "Amount of KVM free"); 1974 1975/* 1976 * grow the number of kernel page table entries, if needed 1977 */ 1978void 1979pmap_growkernel(vm_offset_t addr) 1980{ 1981 vm_paddr_t paddr; 1982 vm_page_t nkpg; 1983 pd_entry_t *pde, newpdir; 1984 pdp_entry_t *pdpe; 1985 1986 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 1987 1988 /* 1989 * Return if "addr" is within the range of kernel page table pages 1990 * that were preallocated during pmap bootstrap. Moreover, leave 1991 * "kernel_vm_end" and the kernel page table as they were. 1992 * 1993 * The correctness of this action is based on the following 1994 * argument: vm_map_findspace() allocates contiguous ranges of the 1995 * kernel virtual address space. It calls this function if a range 1996 * ends after "kernel_vm_end". If the kernel is mapped between 1997 * "kernel_vm_end" and "addr", then the range cannot begin at 1998 * "kernel_vm_end". In fact, its beginning address cannot be less 1999 * than the kernel. Thus, there is no immediate need to allocate 2000 * any new kernel page table pages between "kernel_vm_end" and 2001 * "KERNBASE". 
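 *
 * Beyond that preallocated range the loop below grows the kernel
 * page table in NBPDR (2MB) steps: a missing page directory pointer
 * entry is filled with a newly allocated, zeroed page directory
 * page, and a missing page directory entry is filled with a new
 * page table page.  Failure of either allocation is treated as
 * fatal, since the kernel map cannot otherwise be grown to "addr".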
2002 */ 2003 if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR) 2004 return; 2005 2006 addr = roundup2(addr, NBPDR); 2007 if (addr - 1 >= kernel_map->max_offset) 2008 addr = kernel_map->max_offset; 2009 while (kernel_vm_end < addr) { 2010 pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); 2011 if ((*pdpe & PG_V) == 0) { 2012 /* We need a new PDP entry */ 2013 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT, 2014 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 2015 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2016 if (nkpg == NULL) 2017 panic("pmap_growkernel: no memory to grow kernel"); 2018 if ((nkpg->flags & PG_ZERO) == 0) 2019 pmap_zero_page(nkpg); 2020 paddr = VM_PAGE_TO_PHYS(nkpg); 2021 *pdpe = (pdp_entry_t) 2022 (paddr | PG_V | PG_RW | PG_A | PG_M); 2023 continue; /* try again */ 2024 } 2025 pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); 2026 if ((*pde & PG_V) != 0) { 2027 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2028 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2029 kernel_vm_end = kernel_map->max_offset; 2030 break; 2031 } 2032 continue; 2033 } 2034 2035 nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end), 2036 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 2037 VM_ALLOC_ZERO); 2038 if (nkpg == NULL) 2039 panic("pmap_growkernel: no memory to grow kernel"); 2040 if ((nkpg->flags & PG_ZERO) == 0) 2041 pmap_zero_page(nkpg); 2042 paddr = VM_PAGE_TO_PHYS(nkpg); 2043 newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M); 2044 pde_store(pde, newpdir); 2045 2046 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2047 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2048 kernel_vm_end = kernel_map->max_offset; 2049 break; 2050 } 2051 } 2052} 2053 2054 2055/*************************************************** 2056 * page management routines. 
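 *
 * A pv chunk occupies exactly one page and holds _NPCPV (168) pv
 * entries, tracked by a three-word bitmap: two fully used 64-bit
 * words plus 40 bits of the third, which is why PC_FREE2 below is
 * 0x000000fffffffffful rather than all ones.  A set bit denotes a
 * free entry.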
2057 ***************************************************/ 2058 2059CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 2060CTASSERT(_NPCM == 3); 2061CTASSERT(_NPCPV == 168); 2062 2063static __inline struct pv_chunk * 2064pv_to_chunk(pv_entry_t pv) 2065{ 2066 2067 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 2068} 2069 2070#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2071 2072#define PC_FREE0 0xfffffffffffffffful 2073#define PC_FREE1 0xfffffffffffffffful 2074#define PC_FREE2 0x000000fffffffffful 2075 2076static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; 2077 2078#ifdef PV_STATS 2079static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2080 2081SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2082 "Current number of pv entry chunks"); 2083SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2084 "Current number of pv entry chunks allocated"); 2085SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2086 "Current number of pv entry chunks frees"); 2087SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2088 "Number of times tried to get a chunk page but failed."); 2089 2090static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 2091static int pv_entry_spare; 2092 2093SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2094 "Current number of pv entry frees"); 2095SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2096 "Current number of pv entry allocs"); 2097SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2098 "Current number of pv entries"); 2099SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2100 "Current number of spare pv entries"); 2101#endif 2102 2103/* 2104 * We are in a serious low memory condition. Resort to 2105 * drastic measures to free some pages so we can allocate 2106 * another pv entry chunk. 2107 * 2108 * Returns NULL if PV entries were reclaimed from the specified pmap. 2109 * 2110 * We do not, however, unmap 2mpages because subsequent accesses will 2111 * allocate per-page pv entries until repromotion occurs, thereby 2112 * exacerbating the shortage of free pv entries. 2113 */ 2114static vm_page_t 2115reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 2116{ 2117 struct pch new_tail; 2118 struct pv_chunk *pc; 2119 struct md_page *pvh; 2120 pd_entry_t *pde; 2121 pmap_t pmap; 2122 pt_entry_t *pte, tpte; 2123 pv_entry_t pv; 2124 vm_offset_t va; 2125 vm_page_t free, m, m_pc; 2126 uint64_t inuse; 2127 int bit, field, freed; 2128 2129 rw_assert(&pvh_global_lock, RA_LOCKED); 2130 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2131 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 2132 pmap = NULL; 2133 free = m_pc = NULL; 2134 TAILQ_INIT(&new_tail); 2135 mtx_lock(&pv_chunks_mutex); 2136 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && free == NULL) { 2137 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2138 mtx_unlock(&pv_chunks_mutex); 2139 if (pmap != pc->pc_pmap) { 2140 if (pmap != NULL) { 2141 pmap_invalidate_all(pmap); 2142 if (pmap != locked_pmap) 2143 PMAP_UNLOCK(pmap); 2144 } 2145 pmap = pc->pc_pmap; 2146 /* Avoid deadlock and lock recursion. 
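 * Pmap locks are taken in address order: a pmap whose address is
 * greater than locked_pmap's may be locked while locked_pmap is
 * held, a pmap with a lower address is only trylocked, and
 * locked_pmap itself is already held.  On trylock failure the chunk
 * is simply requeued and skipped.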
*/ 2147 if (pmap > locked_pmap) { 2148 RELEASE_PV_LIST_LOCK(lockp); 2149 PMAP_LOCK(pmap); 2150 } else if (pmap != locked_pmap && 2151 !PMAP_TRYLOCK(pmap)) { 2152 pmap = NULL; 2153 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2154 mtx_lock(&pv_chunks_mutex); 2155 continue; 2156 } 2157 } 2158 2159 /* 2160 * Destroy every non-wired, 4 KB page mapping in the chunk. 2161 */ 2162 freed = 0; 2163 for (field = 0; field < _NPCM; field++) { 2164 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2165 inuse != 0; inuse &= ~(1UL << bit)) { 2166 bit = bsfq(inuse); 2167 pv = &pc->pc_pventry[field * 64 + bit]; 2168 va = pv->pv_va; 2169 pde = pmap_pde(pmap, va); 2170 if ((*pde & PG_PS) != 0) 2171 continue; 2172 pte = pmap_pde_to_pte(pde, va); 2173 if ((*pte & PG_W) != 0) 2174 continue; 2175 tpte = pte_load_clear(pte); 2176 if ((tpte & PG_G) != 0) 2177 pmap_invalidate_page(pmap, va); 2178 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 2179 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2180 vm_page_dirty(m); 2181 if ((tpte & PG_A) != 0) 2182 vm_page_aflag_set(m, PGA_REFERENCED); 2183 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2184 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2185 if (TAILQ_EMPTY(&m->md.pv_list) && 2186 (m->flags & PG_FICTITIOUS) == 0) { 2187 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2188 if (TAILQ_EMPTY(&pvh->pv_list)) { 2189 vm_page_aflag_clear(m, 2190 PGA_WRITEABLE); 2191 } 2192 } 2193 pc->pc_map[field] |= 1UL << bit; 2194 pmap_unuse_pt(pmap, va, *pde, &free); 2195 freed++; 2196 } 2197 } 2198 if (freed == 0) { 2199 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2200 mtx_lock(&pv_chunks_mutex); 2201 continue; 2202 } 2203 /* Every freed mapping is for a 4 KB page. */ 2204 pmap_resident_count_dec(pmap, freed); 2205 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 2206 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 2207 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 2208 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2209 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && 2210 pc->pc_map[2] == PC_FREE2) { 2211 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 2212 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 2213 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 2214 /* Entire chunk is free; return it. */ 2215 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2216 dump_drop_page(m_pc->phys_addr); 2217 mtx_lock(&pv_chunks_mutex); 2218 break; 2219 } 2220 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2221 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2222 mtx_lock(&pv_chunks_mutex); 2223 /* One freed pv entry in locked_pmap is sufficient. */ 2224 if (pmap == locked_pmap) 2225 break; 2226 } 2227 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 2228 mtx_unlock(&pv_chunks_mutex); 2229 if (pmap != NULL) { 2230 pmap_invalidate_all(pmap); 2231 if (pmap != locked_pmap) 2232 PMAP_UNLOCK(pmap); 2233 } 2234 if (m_pc == NULL && free != NULL) { 2235 m_pc = free; 2236 free = (void *)m_pc->object; 2237 /* Recycle a freed page table page. 
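 * The delayed-free list built by pmap_unuse_pt() is threaded through
 * the pages' object fields; its head is detached here, re-wired, and
 * handed back to the caller as the new pv chunk page rather than
 * being released with the rest of the list.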
*/ 2238 m_pc->wire_count = 1; 2239 atomic_add_int(&cnt.v_wire_count, 1); 2240 } 2241 pmap_free_zero_pages(free); 2242 return (m_pc); 2243} 2244 2245/* 2246 * free the pv_entry back to the free list 2247 */ 2248static void 2249free_pv_entry(pmap_t pmap, pv_entry_t pv) 2250{ 2251 struct pv_chunk *pc; 2252 int idx, field, bit; 2253 2254 rw_assert(&pvh_global_lock, RA_LOCKED); 2255 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2256 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 2257 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 2258 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 2259 pc = pv_to_chunk(pv); 2260 idx = pv - &pc->pc_pventry[0]; 2261 field = idx / 64; 2262 bit = idx % 64; 2263 pc->pc_map[field] |= 1ul << bit; 2264 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || 2265 pc->pc_map[2] != PC_FREE2) { 2266 /* 98% of the time, pc is already at the head of the list. */ 2267 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 2268 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2269 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2270 } 2271 return; 2272 } 2273 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2274 free_pv_chunk(pc); 2275} 2276 2277static void 2278free_pv_chunk(struct pv_chunk *pc) 2279{ 2280 vm_page_t m; 2281 2282 mtx_lock(&pv_chunks_mutex); 2283 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2284 mtx_unlock(&pv_chunks_mutex); 2285 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 2286 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 2287 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 2288 /* entire chunk is free, return it */ 2289 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2290 dump_drop_page(m->phys_addr); 2291 vm_page_unwire(m, 0); 2292 vm_page_free(m); 2293} 2294 2295/* 2296 * Returns a new PV entry, allocating a new PV chunk from the system when 2297 * needed. If this PV chunk allocation fails and a PV list lock pointer was 2298 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 2299 * returned. 2300 * 2301 * The given PV list lock may be released. 
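 *
 * Allocation scans the pmap's chunk list for a set bit (a free
 * slot), clears it, and returns the corresponding entry.  When a new
 * chunk page must be allocated, bit 0 is consumed immediately by the
 * entry being returned, and the chunk is appended to the global
 * pv_chunks list that reclaim_pv_chunk() scans.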
2302 */ 2303static pv_entry_t 2304get_pv_entry(pmap_t pmap, struct rwlock **lockp) 2305{ 2306 int bit, field; 2307 pv_entry_t pv; 2308 struct pv_chunk *pc; 2309 vm_page_t m; 2310 2311 rw_assert(&pvh_global_lock, RA_LOCKED); 2312 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2313 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 2314retry: 2315 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2316 if (pc != NULL) { 2317 for (field = 0; field < _NPCM; field++) { 2318 if (pc->pc_map[field]) { 2319 bit = bsfq(pc->pc_map[field]); 2320 break; 2321 } 2322 } 2323 if (field < _NPCM) { 2324 pv = &pc->pc_pventry[field * 64 + bit]; 2325 pc->pc_map[field] &= ~(1ul << bit); 2326 /* If this was the last item, move it to tail */ 2327 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 2328 pc->pc_map[2] == 0) { 2329 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2330 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 2331 pc_list); 2332 } 2333 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 2334 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 2335 return (pv); 2336 } 2337 } 2338 /* No free items, allocate another chunk */ 2339 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 2340 VM_ALLOC_WIRED); 2341 if (m == NULL) { 2342 if (lockp == NULL) { 2343 PV_STAT(pc_chunk_tryfail++); 2344 return (NULL); 2345 } 2346 m = reclaim_pv_chunk(pmap, lockp); 2347 if (m == NULL) 2348 goto retry; 2349 } 2350 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 2351 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 2352 dump_add_page(m->phys_addr); 2353 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 2354 pc->pc_pmap = pmap; 2355 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 2356 pc->pc_map[1] = PC_FREE1; 2357 pc->pc_map[2] = PC_FREE2; 2358 mtx_lock(&pv_chunks_mutex); 2359 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 2360 mtx_unlock(&pv_chunks_mutex); 2361 pv = &pc->pc_pventry[0]; 2362 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2363 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 2364 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 2365 return (pv); 2366} 2367 2368/* 2369 * Returns the number of one bits within the given PV chunk map element. 2370 */ 2371static int 2372popcnt_pc_map_elem(uint64_t elem) 2373{ 2374 int count; 2375 2376 /* 2377 * This simple method of counting the one bits performs well because 2378 * the given element typically contains more zero bits than one bits. 2379 */ 2380 count = 0; 2381 for (; elem != 0; elem &= elem - 1) 2382 count++; 2383 return (count); 2384} 2385 2386/* 2387 * Ensure that the number of spare PV entries in the specified pmap meets or 2388 * exceeds the given count, "needed". 2389 * 2390 * The given PV list lock may be released. 2391 */ 2392static void 2393reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 2394{ 2395 struct pch new_tail; 2396 struct pv_chunk *pc; 2397 int avail, free; 2398 vm_page_t m; 2399 2400 rw_assert(&pvh_global_lock, RA_LOCKED); 2401 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2402 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 2403 2404 /* 2405 * Newly allocated PV chunks must be stored in a private list until 2406 * the required number of PV chunks have been allocated. Otherwise, 2407 * reclaim_pv_chunk() could recycle one of these chunks. In 2408 * contrast, these chunks must be added to the pmap upon allocation. 
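 * Concretely, each new chunk is placed on the pmap's pm_pvchunk list
 * right away but is kept on the private "new_tail" list; only after
 * the whole reservation has been made is new_tail concatenated onto
 * the global pv_chunks list that reclaim_pv_chunk() scans.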
2409 */ 2410 TAILQ_INIT(&new_tail); 2411retry: 2412 avail = 0; 2413 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 2414 if ((cpu_feature2 & CPUID2_POPCNT) == 0) { 2415 free = popcnt_pc_map_elem(pc->pc_map[0]); 2416 free += popcnt_pc_map_elem(pc->pc_map[1]); 2417 free += popcnt_pc_map_elem(pc->pc_map[2]); 2418 } else { 2419 free = popcntq(pc->pc_map[0]); 2420 free += popcntq(pc->pc_map[1]); 2421 free += popcntq(pc->pc_map[2]); 2422 } 2423 if (free == 0) 2424 break; 2425 avail += free; 2426 if (avail >= needed) 2427 break; 2428 } 2429 for (; avail < needed; avail += _NPCPV) { 2430 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 2431 VM_ALLOC_WIRED); 2432 if (m == NULL) { 2433 m = reclaim_pv_chunk(pmap, lockp); 2434 if (m == NULL) 2435 goto retry; 2436 } 2437 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 2438 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 2439 dump_add_page(m->phys_addr); 2440 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 2441 pc->pc_pmap = pmap; 2442 pc->pc_map[0] = PC_FREE0; 2443 pc->pc_map[1] = PC_FREE1; 2444 pc->pc_map[2] = PC_FREE2; 2445 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2446 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2447 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 2448 } 2449 if (!TAILQ_EMPTY(&new_tail)) { 2450 mtx_lock(&pv_chunks_mutex); 2451 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 2452 mtx_unlock(&pv_chunks_mutex); 2453 } 2454} 2455 2456/* 2457 * First find and then remove the pv entry for the specified pmap and virtual 2458 * address from the specified pv list. Returns the pv entry if found and NULL 2459 * otherwise. This operation can be performed on pv lists for either 4KB or 2460 * 2MB page mappings. 2461 */ 2462static __inline pv_entry_t 2463pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2464{ 2465 pv_entry_t pv; 2466 2467 rw_assert(&pvh_global_lock, RA_LOCKED); 2468 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 2469 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 2470 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 2471 break; 2472 } 2473 } 2474 return (pv); 2475} 2476 2477/* 2478 * After demotion from a 2MB page mapping to 512 4KB page mappings, 2479 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 2480 * entries for each of the 4KB page mappings. 2481 */ 2482static void 2483pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 2484 struct rwlock **lockp) 2485{ 2486 struct md_page *pvh; 2487 struct pv_chunk *pc; 2488 pv_entry_t pv; 2489 vm_offset_t va_last; 2490 vm_page_t m; 2491 int bit, field; 2492 2493 rw_assert(&pvh_global_lock, RA_LOCKED); 2494 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2495 KASSERT((pa & PDRMASK) == 0, 2496 ("pmap_pv_demote_pde: pa is not 2mpage aligned")); 2497 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2498 2499 /* 2500 * Transfer the 2mpage's pv entry for this mapping to the first 2501 * page's pv list. Once this transfer begins, the pv list lock 2502 * must not be released until the last pv entry is reinstantiated. 2503 */ 2504 pvh = pa_to_pvh(pa); 2505 va = trunc_2mpage(va); 2506 pv = pmap_pvh_remove(pvh, pmap, va); 2507 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 2508 m = PHYS_TO_VM_PAGE(pa); 2509 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2510 /* Instantiate the remaining NPTEPG - 1 pv entries. 
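 * They are drawn from the spare entries that the caller reserved
 * with reserve_pv_entries() before changing the PDE; the loop below
 * advances "va" and "m" in lockstep one 4KB page at a time until
 * va_last, the last page of the 2MB range, has its entry.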
*/ 2511 PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); 2512 va_last = va + NBPDR - PAGE_SIZE; 2513 for (;;) { 2514 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2515 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 2516 pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); 2517 for (field = 0; field < _NPCM; field++) { 2518 while (pc->pc_map[field]) { 2519 bit = bsfq(pc->pc_map[field]); 2520 pc->pc_map[field] &= ~(1ul << bit); 2521 pv = &pc->pc_pventry[field * 64 + bit]; 2522 va += PAGE_SIZE; 2523 pv->pv_va = va; 2524 m++; 2525 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2526 ("pmap_pv_demote_pde: page %p is not managed", m)); 2527 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2528 if (va == va_last) 2529 goto out; 2530 } 2531 } 2532 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2533 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2534 } 2535out: 2536 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 2537 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2538 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2539 } 2540 PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); 2541 PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); 2542} 2543 2544/* 2545 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 2546 * replace the many pv entries for the 4KB page mappings by a single pv entry 2547 * for the 2MB page mapping. 2548 */ 2549static void 2550pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 2551 struct rwlock **lockp) 2552{ 2553 struct md_page *pvh; 2554 pv_entry_t pv; 2555 vm_offset_t va_last; 2556 vm_page_t m; 2557 2558 rw_assert(&pvh_global_lock, RA_LOCKED); 2559 KASSERT((pa & PDRMASK) == 0, 2560 ("pmap_pv_promote_pde: pa is not 2mpage aligned")); 2561 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2562 2563 /* 2564 * Transfer the first page's pv entry for this mapping to the 2mpage's 2565 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 2566 * a transfer avoids the possibility that get_pv_entry() calls 2567 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 2568 * mappings that is being promoted. 2569 */ 2570 m = PHYS_TO_VM_PAGE(pa); 2571 va = trunc_2mpage(va); 2572 pv = pmap_pvh_remove(&m->md, pmap, va); 2573 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 2574 pvh = pa_to_pvh(pa); 2575 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2576 /* Free the remaining NPTEPG - 1 pv entries. */ 2577 va_last = va + NBPDR - PAGE_SIZE; 2578 do { 2579 m++; 2580 va += PAGE_SIZE; 2581 pmap_pvh_free(&m->md, pmap, va); 2582 } while (va < va_last); 2583} 2584 2585/* 2586 * First find and then destroy the pv entry for the specified pmap and virtual 2587 * address. This operation can be performed on pv lists for either 4KB or 2MB 2588 * page mappings. 2589 */ 2590static void 2591pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2592{ 2593 pv_entry_t pv; 2594 2595 pv = pmap_pvh_remove(pvh, pmap, va); 2596 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 2597 free_pv_entry(pmap, pv); 2598} 2599 2600/* 2601 * Conditionally create the PV entry for a 4KB page mapping if the required 2602 * memory can be allocated without resorting to reclamation. 2603 */ 2604static boolean_t 2605pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 2606 struct rwlock **lockp) 2607{ 2608 pv_entry_t pv; 2609 2610 rw_assert(&pvh_global_lock, RA_LOCKED); 2611 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2612 /* Pass NULL instead of the lock pointer to disable reclamation. 
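 * With a NULL lock pointer, get_pv_entry() returns NULL instead of
 * calling reclaim_pv_chunk() when no chunk page can be allocated, so
 * this function simply reports failure and leaves it to the caller
 * to decline the mapping.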
*/ 2613 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 2614 pv->pv_va = va; 2615 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2616 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2617 return (TRUE); 2618 } else 2619 return (FALSE); 2620} 2621 2622/* 2623 * Conditionally create the PV entry for a 2MB page mapping if the required 2624 * memory can be allocated without resorting to reclamation. 2625 */ 2626static boolean_t 2627pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 2628 struct rwlock **lockp) 2629{ 2630 struct md_page *pvh; 2631 pv_entry_t pv; 2632 2633 rw_assert(&pvh_global_lock, RA_LOCKED); 2634 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2635 /* Pass NULL instead of the lock pointer to disable reclamation. */ 2636 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 2637 pv->pv_va = va; 2638 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 2639 pvh = pa_to_pvh(pa); 2640 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2641 return (TRUE); 2642 } else 2643 return (FALSE); 2644} 2645 2646/* 2647 * Fills a page table page with mappings to consecutive physical pages. 2648 */ 2649static void 2650pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 2651{ 2652 pt_entry_t *pte; 2653 2654 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 2655 *pte = newpte; 2656 newpte += PAGE_SIZE; 2657 } 2658} 2659 2660/* 2661 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 2662 * mapping is invalidated. 2663 */ 2664static boolean_t 2665pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2666{ 2667 struct rwlock *lock; 2668 boolean_t rv; 2669 2670 lock = NULL; 2671 rv = pmap_demote_pde_locked(pmap, pde, va, &lock); 2672 if (lock != NULL) 2673 rw_wunlock(lock); 2674 return (rv); 2675} 2676 2677static boolean_t 2678pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 2679 struct rwlock **lockp) 2680{ 2681 pd_entry_t newpde, oldpde; 2682 pt_entry_t *firstpte, newpte; 2683 vm_paddr_t mptepa; 2684 vm_page_t free, mpte; 2685 2686 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2687 oldpde = *pde; 2688 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 2689 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 2690 mpte = pmap_lookup_pt_page(pmap, va); 2691 if (mpte != NULL) 2692 pmap_remove_pt_page(pmap, mpte); 2693 else { 2694 KASSERT((oldpde & PG_W) == 0, 2695 ("pmap_demote_pde: page table page for a wired mapping" 2696 " is missing")); 2697 2698 /* 2699 * Invalidate the 2MB page mapping and return "failure" if the 2700 * mapping was never accessed or the allocation of the new 2701 * page table page fails. If the 2MB page mapping belongs to 2702 * the direct map region of the kernel's address space, then 2703 * the page allocation request specifies the highest possible 2704 * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is 2705 * normal. Page table pages are preallocated for every other 2706 * part of the kernel address space, so the direct map region 2707 * is the only part of the kernel address space that must be 2708 * handled here. 2709 */ 2710 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 2711 pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va < 2712 DMAP_MAX_ADDRESS ? 
VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | 2713 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 2714 free = NULL; 2715 pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free, 2716 lockp); 2717 pmap_invalidate_page(pmap, trunc_2mpage(va)); 2718 pmap_free_zero_pages(free); 2719 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx" 2720 " in pmap %p", va, pmap); 2721 return (FALSE); 2722 } 2723 if (va < VM_MAXUSER_ADDRESS) 2724 pmap_resident_count_inc(pmap, 1); 2725 } 2726 mptepa = VM_PAGE_TO_PHYS(mpte); 2727 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 2728 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 2729 KASSERT((oldpde & PG_A) != 0, 2730 ("pmap_demote_pde: oldpde is missing PG_A")); 2731 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 2732 ("pmap_demote_pde: oldpde is missing PG_M")); 2733 newpte = oldpde & ~PG_PS; 2734 if ((newpte & PG_PDE_PAT) != 0) 2735 newpte ^= PG_PDE_PAT | PG_PTE_PAT; 2736 2737 /* 2738 * If the page table page is new, initialize it. 2739 */ 2740 if (mpte->wire_count == 1) { 2741 mpte->wire_count = NPTEPG; 2742 pmap_fill_ptp(firstpte, newpte); 2743 } 2744 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 2745 ("pmap_demote_pde: firstpte and newpte map different physical" 2746 " addresses")); 2747 2748 /* 2749 * If the mapping has changed attributes, update the page table 2750 * entries. 2751 */ 2752 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 2753 pmap_fill_ptp(firstpte, newpte); 2754 2755 /* 2756 * The spare PV entries must be reserved prior to demoting the 2757 * mapping, that is, prior to changing the PDE. Otherwise, the state 2758 * of the PDE and the PV lists will be inconsistent, which can result 2759 * in reclaim_pv_chunk() attempting to remove a PV entry from the 2760 * wrong PV list and pmap_pv_demote_pde() failing to find the expected 2761 * PV entry for the 2MB page mapping that is being demoted. 2762 */ 2763 if ((oldpde & PG_MANAGED) != 0) 2764 reserve_pv_entries(pmap, NPTEPG - 1, lockp); 2765 2766 /* 2767 * Demote the mapping. This pmap is locked. The old PDE has 2768 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 2769 * set. Thus, there is no danger of a race with another 2770 * processor changing the setting of PG_A and/or PG_M between 2771 * the read above and the store below. 2772 */ 2773 if (workaround_erratum383) 2774 pmap_update_pde(pmap, va, pde, newpde); 2775 else 2776 pde_store(pde, newpde); 2777 2778 /* 2779 * Invalidate a stale recursive mapping of the page table page. 2780 */ 2781 if (va >= VM_MAXUSER_ADDRESS) 2782 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 2783 2784 /* 2785 * Demote the PV entry. 
2786 */ 2787 if ((oldpde & PG_MANAGED) != 0) 2788 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); 2789 2790 atomic_add_long(&pmap_pde_demotions, 1); 2791 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx" 2792 " in pmap %p", va, pmap); 2793 return (TRUE); 2794} 2795 2796/* 2797 * pmap_remove_pde: do the things to unmap a superpage in a process 2798 */ 2799static int 2800pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 2801 vm_page_t *free, struct rwlock **lockp) 2802{ 2803 struct md_page *pvh; 2804 pd_entry_t oldpde; 2805 vm_offset_t eva, va; 2806 vm_page_t m, mpte; 2807 2808 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2809 KASSERT((sva & PDRMASK) == 0, 2810 ("pmap_remove_pde: sva is not 2mpage aligned")); 2811 oldpde = pte_load_clear(pdq); 2812 if (oldpde & PG_W) 2813 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 2814 2815 /* 2816 * Machines that don't support invlpg, also don't support 2817 * PG_G. 2818 */ 2819 if (oldpde & PG_G) 2820 pmap_invalidate_page(kernel_pmap, sva); 2821 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); 2822 if (oldpde & PG_MANAGED) { 2823 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); 2824 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 2825 pmap_pvh_free(pvh, pmap, sva); 2826 eva = sva + NBPDR; 2827 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 2828 va < eva; va += PAGE_SIZE, m++) { 2829 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2830 vm_page_dirty(m); 2831 if (oldpde & PG_A) 2832 vm_page_aflag_set(m, PGA_REFERENCED); 2833 if (TAILQ_EMPTY(&m->md.pv_list) && 2834 TAILQ_EMPTY(&pvh->pv_list)) 2835 vm_page_aflag_clear(m, PGA_WRITEABLE); 2836 } 2837 } 2838 if (pmap == kernel_pmap) { 2839 if (!pmap_demote_pde_locked(pmap, pdq, sva, lockp)) 2840 panic("pmap_remove_pde: failed demotion"); 2841 } else { 2842 mpte = pmap_lookup_pt_page(pmap, sva); 2843 if (mpte != NULL) { 2844 pmap_remove_pt_page(pmap, mpte); 2845 pmap_resident_count_dec(pmap, 1); 2846 KASSERT(mpte->wire_count == NPTEPG, 2847 ("pmap_remove_pde: pte page wire count error")); 2848 mpte->wire_count = 0; 2849 pmap_add_delayed_free_list(mpte, free, FALSE); 2850 atomic_subtract_int(&cnt.v_wire_count, 1); 2851 } 2852 } 2853 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); 2854} 2855 2856/* 2857 * pmap_remove_pte: do the things to unmap a page in a process 2858 */ 2859static int 2860pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 2861 pd_entry_t ptepde, vm_page_t *free, struct rwlock **lockp) 2862{ 2863 struct md_page *pvh; 2864 pt_entry_t oldpte; 2865 vm_page_t m; 2866 2867 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2868 oldpte = pte_load_clear(ptq); 2869 if (oldpte & PG_W) 2870 pmap->pm_stats.wired_count -= 1; 2871 pmap_resident_count_dec(pmap, 1); 2872 if (oldpte & PG_MANAGED) { 2873 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 2874 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2875 vm_page_dirty(m); 2876 if (oldpte & PG_A) 2877 vm_page_aflag_set(m, PGA_REFERENCED); 2878 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2879 pmap_pvh_free(&m->md, pmap, va); 2880 if (TAILQ_EMPTY(&m->md.pv_list) && 2881 (m->flags & PG_FICTITIOUS) == 0) { 2882 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2883 if (TAILQ_EMPTY(&pvh->pv_list)) 2884 vm_page_aflag_clear(m, PGA_WRITEABLE); 2885 } 2886 } 2887 return (pmap_unuse_pt(pmap, va, ptepde, free)); 2888} 2889 2890/* 2891 * Remove a single page from a process address space 2892 */ 2893static void 2894pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free) 2895{ 2896 struct rwlock *lock; 2897 pt_entry_t *pte; 2898 
2899 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2900 if ((*pde & PG_V) == 0) 2901 return; 2902 pte = pmap_pde_to_pte(pde, va); 2903 if ((*pte & PG_V) == 0) 2904 return; 2905 lock = NULL; 2906 pmap_remove_pte(pmap, pte, va, *pde, free, &lock); 2907 if (lock != NULL) 2908 rw_wunlock(lock); 2909 pmap_invalidate_page(pmap, va); 2910} 2911 2912/* 2913 * Remove the given range of addresses from the specified map. 2914 * 2915 * It is assumed that the start and end are properly 2916 * rounded to the page size. 2917 */ 2918void 2919pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2920{ 2921 struct rwlock *lock; 2922 vm_offset_t va, va_next; 2923 pml4_entry_t *pml4e; 2924 pdp_entry_t *pdpe; 2925 pd_entry_t ptpaddr, *pde; 2926 pt_entry_t *pte; 2927 vm_page_t free = NULL; 2928 int anyvalid; 2929 2930 /* 2931 * Perform an unsynchronized read. This is, however, safe. 2932 */ 2933 if (pmap->pm_stats.resident_count == 0) 2934 return; 2935 2936 anyvalid = 0; 2937 2938 rw_rlock(&pvh_global_lock); 2939 PMAP_LOCK(pmap); 2940 2941 /* 2942 * special handling of removing one page. a very 2943 * common operation and easy to short circuit some 2944 * code. 2945 */ 2946 if (sva + PAGE_SIZE == eva) { 2947 pde = pmap_pde(pmap, sva); 2948 if (pde && (*pde & PG_PS) == 0) { 2949 pmap_remove_page(pmap, sva, pde, &free); 2950 goto out; 2951 } 2952 } 2953 2954 lock = NULL; 2955 for (; sva < eva; sva = va_next) { 2956 2957 if (pmap->pm_stats.resident_count == 0) 2958 break; 2959 2960 pml4e = pmap_pml4e(pmap, sva); 2961 if ((*pml4e & PG_V) == 0) { 2962 va_next = (sva + NBPML4) & ~PML4MASK; 2963 if (va_next < sva) 2964 va_next = eva; 2965 continue; 2966 } 2967 2968 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 2969 if ((*pdpe & PG_V) == 0) { 2970 va_next = (sva + NBPDP) & ~PDPMASK; 2971 if (va_next < sva) 2972 va_next = eva; 2973 continue; 2974 } 2975 2976 /* 2977 * Calculate index for next page table. 2978 */ 2979 va_next = (sva + NBPDR) & ~PDRMASK; 2980 if (va_next < sva) 2981 va_next = eva; 2982 2983 pde = pmap_pdpe_to_pde(pdpe, sva); 2984 ptpaddr = *pde; 2985 2986 /* 2987 * Weed out invalid mappings. 2988 */ 2989 if (ptpaddr == 0) 2990 continue; 2991 2992 /* 2993 * Check for large page. 2994 */ 2995 if ((ptpaddr & PG_PS) != 0) { 2996 /* 2997 * Are we removing the entire large page? If not, 2998 * demote the mapping and fall through. 2999 */ 3000 if (sva + NBPDR == va_next && eva >= va_next) { 3001 /* 3002 * The TLB entry for a PG_G mapping is 3003 * invalidated by pmap_remove_pde(). 3004 */ 3005 if ((ptpaddr & PG_G) == 0) 3006 anyvalid = 1; 3007 pmap_remove_pde(pmap, pde, sva, &free, &lock); 3008 continue; 3009 } else if (!pmap_demote_pde_locked(pmap, pde, sva, 3010 &lock)) { 3011 /* The large page mapping was destroyed. */ 3012 continue; 3013 } else 3014 ptpaddr = *pde; 3015 } 3016 3017 /* 3018 * Limit our scan to either the end of the va represented 3019 * by the current page table page, or to the end of the 3020 * range being removed. 
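 *
 * Within that span the loop batches TLB invalidations: removing a
 * global (PG_G) mapping opens a run starting at "va" that is later
 * flushed with pmap_invalidate_range(), whereas removing an ordinary
 * mapping only sets "anyvalid", so that the single
 * pmap_invalidate_all() at the end of pmap_remove() covers it.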
3021 */ 3022 if (va_next > eva) 3023 va_next = eva; 3024 3025 va = va_next; 3026 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 3027 sva += PAGE_SIZE) { 3028 if (*pte == 0) { 3029 if (va != va_next) { 3030 pmap_invalidate_range(pmap, va, sva); 3031 va = va_next; 3032 } 3033 continue; 3034 } 3035 if ((*pte & PG_G) == 0) 3036 anyvalid = 1; 3037 else if (va == va_next) 3038 va = sva; 3039 if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free, 3040 &lock)) { 3041 sva += PAGE_SIZE; 3042 break; 3043 } 3044 } 3045 if (va != va_next) 3046 pmap_invalidate_range(pmap, va, sva); 3047 } 3048 if (lock != NULL) 3049 rw_wunlock(lock); 3050out: 3051 if (anyvalid) 3052 pmap_invalidate_all(pmap); 3053 rw_runlock(&pvh_global_lock); 3054 PMAP_UNLOCK(pmap); 3055 pmap_free_zero_pages(free); 3056} 3057 3058/* 3059 * Routine: pmap_remove_all 3060 * Function: 3061 * Removes this physical page from 3062 * all physical maps in which it resides. 3063 * Reflects back modify bits to the pager. 3064 * 3065 * Notes: 3066 * Original versions of this routine were very 3067 * inefficient because they iteratively called 3068 * pmap_remove (slow...) 3069 */ 3070 3071void 3072pmap_remove_all(vm_page_t m) 3073{ 3074 struct md_page *pvh; 3075 pv_entry_t pv; 3076 pmap_t pmap; 3077 pt_entry_t *pte, tpte; 3078 pd_entry_t *pde; 3079 vm_offset_t va; 3080 vm_page_t free; 3081 3082 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3083 ("pmap_remove_all: page %p is not managed", m)); 3084 free = NULL; 3085 rw_wlock(&pvh_global_lock); 3086 if ((m->flags & PG_FICTITIOUS) != 0) 3087 goto small_mappings; 3088 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3089 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 3090 pmap = PV_PMAP(pv); 3091 PMAP_LOCK(pmap); 3092 va = pv->pv_va; 3093 pde = pmap_pde(pmap, va); 3094 (void)pmap_demote_pde(pmap, pde, va); 3095 PMAP_UNLOCK(pmap); 3096 } 3097small_mappings: 3098 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3099 pmap = PV_PMAP(pv); 3100 PMAP_LOCK(pmap); 3101 pmap_resident_count_dec(pmap, 1); 3102 pde = pmap_pde(pmap, pv->pv_va); 3103 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 3104 " a 2mpage in page %p's pv list", m)); 3105 pte = pmap_pde_to_pte(pde, pv->pv_va); 3106 tpte = pte_load_clear(pte); 3107 if (tpte & PG_W) 3108 pmap->pm_stats.wired_count--; 3109 if (tpte & PG_A) 3110 vm_page_aflag_set(m, PGA_REFERENCED); 3111 3112 /* 3113 * Update the vm_page_t clean and reference bits. 
3114 */ 3115 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3116 vm_page_dirty(m); 3117 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); 3118 pmap_invalidate_page(pmap, pv->pv_va); 3119 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3120 free_pv_entry(pmap, pv); 3121 PMAP_UNLOCK(pmap); 3122 } 3123 vm_page_aflag_clear(m, PGA_WRITEABLE); 3124 rw_wunlock(&pvh_global_lock); 3125 pmap_free_zero_pages(free); 3126} 3127 3128/* 3129 * pmap_protect_pde: do the things to protect a 2mpage in a process 3130 */ 3131static boolean_t 3132pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 3133{ 3134 pd_entry_t newpde, oldpde; 3135 vm_offset_t eva, va; 3136 vm_page_t m; 3137 boolean_t anychanged; 3138 3139 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3140 KASSERT((sva & PDRMASK) == 0, 3141 ("pmap_protect_pde: sva is not 2mpage aligned")); 3142 anychanged = FALSE; 3143retry: 3144 oldpde = newpde = *pde; 3145 if (oldpde & PG_MANAGED) { 3146 eva = sva + NBPDR; 3147 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 3148 va < eva; va += PAGE_SIZE, m++) 3149 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3150 vm_page_dirty(m); 3151 } 3152 if ((prot & VM_PROT_WRITE) == 0) 3153 newpde &= ~(PG_RW | PG_M); 3154 if ((prot & VM_PROT_EXECUTE) == 0) 3155 newpde |= pg_nx; 3156 if (newpde != oldpde) { 3157 if (!atomic_cmpset_long(pde, oldpde, newpde)) 3158 goto retry; 3159 if (oldpde & PG_G) 3160 pmap_invalidate_page(pmap, sva); 3161 else 3162 anychanged = TRUE; 3163 } 3164 return (anychanged); 3165} 3166 3167/* 3168 * Set the physical protection on the 3169 * specified range of this map as requested. 3170 */ 3171void 3172pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 3173{ 3174 vm_offset_t va_next; 3175 pml4_entry_t *pml4e; 3176 pdp_entry_t *pdpe; 3177 pd_entry_t ptpaddr, *pde; 3178 pt_entry_t *pte; 3179 boolean_t anychanged, pv_lists_locked; 3180 3181 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 3182 pmap_remove(pmap, sva, eva); 3183 return; 3184 } 3185 3186 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 3187 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 3188 return; 3189 3190 pv_lists_locked = FALSE; 3191resume: 3192 anychanged = FALSE; 3193 3194 PMAP_LOCK(pmap); 3195 for (; sva < eva; sva = va_next) { 3196 3197 pml4e = pmap_pml4e(pmap, sva); 3198 if ((*pml4e & PG_V) == 0) { 3199 va_next = (sva + NBPML4) & ~PML4MASK; 3200 if (va_next < sva) 3201 va_next = eva; 3202 continue; 3203 } 3204 3205 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 3206 if ((*pdpe & PG_V) == 0) { 3207 va_next = (sva + NBPDP) & ~PDPMASK; 3208 if (va_next < sva) 3209 va_next = eva; 3210 continue; 3211 } 3212 3213 va_next = (sva + NBPDR) & ~PDRMASK; 3214 if (va_next < sva) 3215 va_next = eva; 3216 3217 pde = pmap_pdpe_to_pde(pdpe, sva); 3218 ptpaddr = *pde; 3219 3220 /* 3221 * Weed out invalid mappings. 3222 */ 3223 if (ptpaddr == 0) 3224 continue; 3225 3226 /* 3227 * Check for large page. 3228 */ 3229 if ((ptpaddr & PG_PS) != 0) { 3230 /* 3231 * Are we protecting the entire large page? If not, 3232 * demote the mapping and fall through. 3233 */ 3234 if (sva + NBPDR == va_next && eva >= va_next) { 3235 /* 3236 * The TLB entry for a PG_G mapping is 3237 * invalidated by pmap_protect_pde(). 
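 * If instead the large page must be demoted, the pv lists have to be
 * locked first.  rw_try_rlock() is used for that, presumably to
 * avoid sleeping on the global lock while the pmap lock is held; on
 * failure any pending invalidations are issued, the pmap lock is
 * dropped, and the scan restarts at "resume" with pvh_global_lock
 * held.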
3238 */ 3239 if (pmap_protect_pde(pmap, pde, sva, prot)) 3240 anychanged = TRUE; 3241 continue; 3242 } else { 3243 if (!pv_lists_locked) { 3244 pv_lists_locked = TRUE; 3245 if (!rw_try_rlock(&pvh_global_lock)) { 3246 if (anychanged) 3247 pmap_invalidate_all( 3248 pmap); 3249 PMAP_UNLOCK(pmap); 3250 rw_rlock(&pvh_global_lock); 3251 goto resume; 3252 } 3253 } 3254 if (!pmap_demote_pde(pmap, pde, sva)) { 3255 /* 3256 * The large page mapping was 3257 * destroyed. 3258 */ 3259 continue; 3260 } 3261 } 3262 } 3263 3264 if (va_next > eva) 3265 va_next = eva; 3266 3267 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 3268 sva += PAGE_SIZE) { 3269 pt_entry_t obits, pbits; 3270 vm_page_t m; 3271 3272retry: 3273 obits = pbits = *pte; 3274 if ((pbits & PG_V) == 0) 3275 continue; 3276 3277 if ((prot & VM_PROT_WRITE) == 0) { 3278 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 3279 (PG_MANAGED | PG_M | PG_RW)) { 3280 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 3281 vm_page_dirty(m); 3282 } 3283 pbits &= ~(PG_RW | PG_M); 3284 } 3285 if ((prot & VM_PROT_EXECUTE) == 0) 3286 pbits |= pg_nx; 3287 3288 if (pbits != obits) { 3289 if (!atomic_cmpset_long(pte, obits, pbits)) 3290 goto retry; 3291 if (obits & PG_G) 3292 pmap_invalidate_page(pmap, sva); 3293 else 3294 anychanged = TRUE; 3295 } 3296 } 3297 } 3298 if (anychanged) 3299 pmap_invalidate_all(pmap); 3300 if (pv_lists_locked) 3301 rw_runlock(&pvh_global_lock); 3302 PMAP_UNLOCK(pmap); 3303} 3304 3305/* 3306 * Tries to promote the 512, contiguous 4KB page mappings that are within a 3307 * single page table page (PTP) to a single 2MB page mapping. For promotion 3308 * to occur, two conditions must be met: (1) the 4KB page mappings must map 3309 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 3310 * identical characteristics. 3311 */ 3312static void 3313pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 3314 struct rwlock **lockp) 3315{ 3316 pd_entry_t newpde; 3317 pt_entry_t *firstpte, oldpte, pa, *pte; 3318 vm_offset_t oldpteva; 3319 vm_page_t mpte; 3320 3321 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3322 3323 /* 3324 * Examine the first PTE in the specified PTP. Abort if this PTE is 3325 * either invalid, unused, or does not map the first 4KB physical page 3326 * within a 2MB page. 3327 */ 3328 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 3329setpde: 3330 newpde = *firstpte; 3331 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 3332 atomic_add_long(&pmap_pde_p_failures, 1); 3333 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 3334 " in pmap %p", va, pmap); 3335 return; 3336 } 3337 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 3338 /* 3339 * When PG_M is already clear, PG_RW can be cleared without 3340 * a TLB invalidation. 3341 */ 3342 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW)) 3343 goto setpde; 3344 newpde &= ~PG_RW; 3345 } 3346 3347 /* 3348 * Examine each of the other PTEs in the specified PTP. Abort if this 3349 * PTE maps an unexpected 4KB physical page or does not have identical 3350 * characteristics to the first PTE. 
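 *
 * The scan runs from the last PTE in the PTP down to the second.
 * "pa" starts at the physical frame expected in that last PTE (the
 * first page's frame plus NBPDR - PAGE_SIZE, with PG_A and PG_V
 * folded in) and is decremented by PAGE_SIZE at each step, so a
 * single equality test catches both a discontiguous frame and a
 * missing PG_A or PG_V bit.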
3351 */ 3352 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; 3353 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 3354setpte: 3355 oldpte = *pte; 3356 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 3357 atomic_add_long(&pmap_pde_p_failures, 1); 3358 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 3359 " in pmap %p", va, pmap); 3360 return; 3361 } 3362 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 3363 /* 3364 * When PG_M is already clear, PG_RW can be cleared 3365 * without a TLB invalidation. 3366 */ 3367 if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW)) 3368 goto setpte; 3369 oldpte &= ~PG_RW; 3370 oldpteva = (oldpte & PG_FRAME & PDRMASK) | 3371 (va & ~PDRMASK); 3372 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 3373 " in pmap %p", oldpteva, pmap); 3374 } 3375 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 3376 atomic_add_long(&pmap_pde_p_failures, 1); 3377 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 3378 " in pmap %p", va, pmap); 3379 return; 3380 } 3381 pa -= PAGE_SIZE; 3382 } 3383 3384 /* 3385 * Save the page table page in its current state until the PDE 3386 * mapping the superpage is demoted by pmap_demote_pde() or 3387 * destroyed by pmap_remove_pde(). 3388 */ 3389 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 3390 KASSERT(mpte >= vm_page_array && 3391 mpte < &vm_page_array[vm_page_array_size], 3392 ("pmap_promote_pde: page table page is out of range")); 3393 KASSERT(mpte->pindex == pmap_pde_pindex(va), 3394 ("pmap_promote_pde: page table page's pindex is wrong")); 3395 pmap_insert_pt_page(pmap, mpte); 3396 3397 /* 3398 * Promote the pv entries. 3399 */ 3400 if ((newpde & PG_MANAGED) != 0) 3401 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); 3402 3403 /* 3404 * Propagate the PAT index to its proper position. 3405 */ 3406 if ((newpde & PG_PTE_PAT) != 0) 3407 newpde ^= PG_PDE_PAT | PG_PTE_PAT; 3408 3409 /* 3410 * Map the superpage. 3411 */ 3412 if (workaround_erratum383) 3413 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 3414 else 3415 pde_store(pde, PG_PS | newpde); 3416 3417 atomic_add_long(&pmap_pde_promotions, 1); 3418 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" 3419 " in pmap %p", va, pmap); 3420} 3421 3422/* 3423 * Insert the given physical page (p) at 3424 * the specified virtual address (v) in the 3425 * target physical map with the protection requested. 3426 * 3427 * If specified, the page will be wired down, meaning 3428 * that the related pte can not be reclaimed. 3429 * 3430 * NB: This is the only routine which MAY NOT lazy-evaluate 3431 * or lose information. That is, this routine must actually 3432 * insert this page into the given map NOW. 
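 *
 * When a valid mapping already exists at "va", the code below
 * distinguishes a pure wiring or protection change from the
 * replacement of one physical page by another, and it invalidates
 * the old TLB entry only when the old mapping may actually be
 * cached (PG_A set) and the change requires it.  After a successful
 * insertion, promotion to a 2MB page mapping is attempted if the
 * page table page and the page's reservation are fully populated.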
3433 */ 3434void 3435pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m, 3436 vm_prot_t prot, boolean_t wired) 3437{ 3438 struct rwlock *lock; 3439 pd_entry_t *pde; 3440 pt_entry_t *pte; 3441 pt_entry_t newpte, origpte; 3442 pv_entry_t pv; 3443 vm_paddr_t opa, pa; 3444 vm_page_t mpte, om; 3445 3446 va = trunc_page(va); 3447 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 3448 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 3449 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", 3450 va)); 3451 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || 3452 va >= kmi.clean_eva, 3453 ("pmap_enter: managed mapping within the clean submap")); 3454 if ((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) == 0) 3455 VM_OBJECT_ASSERT_WLOCKED(m->object); 3456 pa = VM_PAGE_TO_PHYS(m); 3457 newpte = (pt_entry_t)(pa | PG_A | PG_V); 3458 if ((access & VM_PROT_WRITE) != 0) 3459 newpte |= PG_M; 3460 if ((prot & VM_PROT_WRITE) != 0) 3461 newpte |= PG_RW; 3462 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 3463 ("pmap_enter: access includes VM_PROT_WRITE but prot doesn't")); 3464 if ((prot & VM_PROT_EXECUTE) == 0) 3465 newpte |= pg_nx; 3466 if (wired) 3467 newpte |= PG_W; 3468 if (va < VM_MAXUSER_ADDRESS) 3469 newpte |= PG_U; 3470 if (pmap == kernel_pmap) 3471 newpte |= PG_G; 3472 newpte |= pmap_cache_bits(m->md.pat_mode, 0); 3473 3474 mpte = NULL; 3475 3476 lock = NULL; 3477 rw_rlock(&pvh_global_lock); 3478 PMAP_LOCK(pmap); 3479 3480 /* 3481 * In the case that a page table page is not 3482 * resident, we are creating it here. 3483 */ 3484retry: 3485 pde = pmap_pde(pmap, va); 3486 if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || 3487 pmap_demote_pde_locked(pmap, pde, va, &lock))) { 3488 pte = pmap_pde_to_pte(pde, va); 3489 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 3490 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 3491 mpte->wire_count++; 3492 } 3493 } else if (va < VM_MAXUSER_ADDRESS) { 3494 /* 3495 * Here if the pte page isn't mapped, or if it has been 3496 * deallocated. 3497 */ 3498 mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), &lock); 3499 goto retry; 3500 } else 3501 panic("pmap_enter: invalid page directory va=%#lx", va); 3502 3503 origpte = *pte; 3504 3505 /* 3506 * Is the specified virtual address already mapped? 3507 */ 3508 if ((origpte & PG_V) != 0) { 3509 /* 3510 * Wiring change, just update stats. We don't worry about 3511 * wiring PT pages as they remain resident as long as there 3512 * are valid mappings in them. Hence, if a user page is wired, 3513 * the PT page will be also. 3514 */ 3515 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 3516 pmap->pm_stats.wired_count++; 3517 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 3518 pmap->pm_stats.wired_count--; 3519 3520 /* 3521 * Remove the extra PT page reference. 3522 */ 3523 if (mpte != NULL) { 3524 mpte->wire_count--; 3525 KASSERT(mpte->wire_count > 0, 3526 ("pmap_enter: missing reference to page table page," 3527 " va: 0x%lx", va)); 3528 } 3529 3530 /* 3531 * Has the physical page changed? 3532 */ 3533 opa = origpte & PG_FRAME; 3534 if (opa == pa) { 3535 /* 3536 * No, might be a protection or wiring change. 3537 */ 3538 if ((origpte & PG_MANAGED) != 0) { 3539 newpte |= PG_MANAGED; 3540 if ((newpte & PG_RW) != 0) 3541 vm_page_aflag_set(m, PGA_WRITEABLE); 3542 } 3543 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 3544 goto unchanged; 3545 goto validate; 3546 } 3547 } else { 3548 /* 3549 * Increment the counters. 
3550 */ 3551 if ((newpte & PG_W) != 0) 3552 pmap->pm_stats.wired_count++; 3553 pmap_resident_count_inc(pmap, 1); 3554 } 3555 3556 /* 3557 * Enter on the PV list if part of our managed memory. 3558 */ 3559 if ((m->oflags & VPO_UNMANAGED) == 0) { 3560 newpte |= PG_MANAGED; 3561 pv = get_pv_entry(pmap, &lock); 3562 pv->pv_va = va; 3563 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 3564 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3565 if ((newpte & PG_RW) != 0) 3566 vm_page_aflag_set(m, PGA_WRITEABLE); 3567 } 3568 3569 /* 3570 * Update the PTE. 3571 */ 3572 if ((origpte & PG_V) != 0) { 3573validate: 3574 origpte = pte_load_store(pte, newpte); 3575 opa = origpte & PG_FRAME; 3576 if (opa != pa) { 3577 if ((origpte & PG_MANAGED) != 0) { 3578 om = PHYS_TO_VM_PAGE(opa); 3579 if ((origpte & (PG_M | PG_RW)) == (PG_M | 3580 PG_RW)) 3581 vm_page_dirty(om); 3582 if ((origpte & PG_A) != 0) 3583 vm_page_aflag_set(om, PGA_REFERENCED); 3584 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 3585 pmap_pvh_free(&om->md, pmap, va); 3586 if ((om->aflags & PGA_WRITEABLE) != 0 && 3587 TAILQ_EMPTY(&om->md.pv_list) && 3588 ((om->flags & PG_FICTITIOUS) != 0 || 3589 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 3590 vm_page_aflag_clear(om, PGA_WRITEABLE); 3591 } 3592 } else if ((newpte & PG_M) == 0 && (origpte & (PG_M | 3593 PG_RW)) == (PG_M | PG_RW)) { 3594 if ((origpte & PG_MANAGED) != 0) 3595 vm_page_dirty(m); 3596 3597 /* 3598 * Although the PTE may still have PG_RW set, TLB 3599 * invalidation may nonetheless be required because 3600 * the PTE no longer has PG_M set. 3601 */ 3602 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { 3603 /* 3604 * This PTE change does not require TLB invalidation. 3605 */ 3606 goto unchanged; 3607 } 3608 if ((origpte & PG_A) != 0) 3609 pmap_invalidate_page(pmap, va); 3610 } else 3611 pte_store(pte, newpte); 3612 3613unchanged: 3614 3615 /* 3616 * If both the page table page and the reservation are fully 3617 * populated, then attempt promotion. 3618 */ 3619 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 3620 pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && 3621 vm_reserv_level_iffullpop(m) == 0) 3622 pmap_promote_pde(pmap, pde, va, &lock); 3623 3624 if (lock != NULL) 3625 rw_wunlock(lock); 3626 rw_runlock(&pvh_global_lock); 3627 PMAP_UNLOCK(pmap); 3628} 3629 3630/* 3631 * Tries to create a 2MB page mapping. Returns TRUE if successful and FALSE 3632 * otherwise. Fails if (1) a page table page cannot be allocated without 3633 * blocking, (2) a mapping already exists at the specified virtual address, or 3634 * (3) a pv entry cannot be allocated without reclaiming another pv entry. 
3635 */ 3636static boolean_t 3637pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3638 struct rwlock **lockp) 3639{ 3640 pd_entry_t *pde, newpde; 3641 vm_page_t free, mpde; 3642 3643 rw_assert(&pvh_global_lock, RA_LOCKED); 3644 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3645 if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) { 3646 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3647 " in pmap %p", va, pmap); 3648 return (FALSE); 3649 } 3650 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde)); 3651 pde = &pde[pmap_pde_index(va)]; 3652 if ((*pde & PG_V) != 0) { 3653 KASSERT(mpde->wire_count > 1, 3654 ("pmap_enter_pde: mpde's wire count is too low")); 3655 mpde->wire_count--; 3656 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3657 " in pmap %p", va, pmap); 3658 return (FALSE); 3659 } 3660 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) | 3661 PG_PS | PG_V; 3662 if ((m->oflags & VPO_UNMANAGED) == 0) { 3663 newpde |= PG_MANAGED; 3664 3665 /* 3666 * Abort this mapping if its PV entry could not be created. 3667 */ 3668 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m), 3669 lockp)) { 3670 free = NULL; 3671 if (pmap_unwire_ptp(pmap, va, mpde, &free)) { 3672 pmap_invalidate_page(pmap, va); 3673 pmap_free_zero_pages(free); 3674 } 3675 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3676 " in pmap %p", va, pmap); 3677 return (FALSE); 3678 } 3679 } 3680 if ((prot & VM_PROT_EXECUTE) == 0) 3681 newpde |= pg_nx; 3682 if (va < VM_MAXUSER_ADDRESS) 3683 newpde |= PG_U; 3684 3685 /* 3686 * Increment counters. 3687 */ 3688 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); 3689 3690 /* 3691 * Map the superpage. 3692 */ 3693 pde_store(pde, newpde); 3694 3695 atomic_add_long(&pmap_pde_mappings, 1); 3696 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 3697 " in pmap %p", va, pmap); 3698 return (TRUE); 3699} 3700 3701/* 3702 * Maps a sequence of resident pages belonging to the same object. 3703 * The sequence begins with the given page m_start. This page is 3704 * mapped at the given virtual address start. Each subsequent page is 3705 * mapped at a virtual address that is offset from start by the same 3706 * amount as the page is offset from m_start within the object. The 3707 * last page in the sequence is the page with the largest offset from 3708 * m_start that can be mapped at a virtual address less than the given 3709 * virtual address end. Not every virtual page between start and end 3710 * is mapped; only those for which a resident page exists with the 3711 * corresponding offset from m_start are mapped. 
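 *
 * When a 2MB-aligned virtual run that still fits below "end" lines
 * up with a 2MB-aligned physical page, superpages are enabled, and
 * the page's reservation is fully populated, the loop below maps the
 * whole run with a single pmap_enter_pde() call and advances "m"
 * past the entire 2MB worth of pages; otherwise it falls back to
 * pmap_enter_quick_locked() one page at a time.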
3712 */ 3713void 3714pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3715 vm_page_t m_start, vm_prot_t prot) 3716{ 3717 struct rwlock *lock; 3718 vm_offset_t va; 3719 vm_page_t m, mpte; 3720 vm_pindex_t diff, psize; 3721 3722 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3723 3724 psize = atop(end - start); 3725 mpte = NULL; 3726 m = m_start; 3727 lock = NULL; 3728 rw_rlock(&pvh_global_lock); 3729 PMAP_LOCK(pmap); 3730 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3731 va = start + ptoa(diff); 3732 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 3733 (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 && 3734 pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 && 3735 pmap_enter_pde(pmap, va, m, prot, &lock)) 3736 m = &m[NBPDR / PAGE_SIZE - 1]; 3737 else 3738 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 3739 mpte, &lock); 3740 m = TAILQ_NEXT(m, listq); 3741 } 3742 if (lock != NULL) 3743 rw_wunlock(lock); 3744 rw_runlock(&pvh_global_lock); 3745 PMAP_UNLOCK(pmap); 3746} 3747 3748/* 3749 * this code makes some *MAJOR* assumptions: 3750 * 1. Current pmap & pmap exists. 3751 * 2. Not wired. 3752 * 3. Read access. 3753 * 4. No page table pages. 3754 * but is *MUCH* faster than pmap_enter... 3755 */ 3756 3757void 3758pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3759{ 3760 struct rwlock *lock; 3761 3762 lock = NULL; 3763 rw_rlock(&pvh_global_lock); 3764 PMAP_LOCK(pmap); 3765 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 3766 if (lock != NULL) 3767 rw_wunlock(lock); 3768 rw_runlock(&pvh_global_lock); 3769 PMAP_UNLOCK(pmap); 3770} 3771 3772static vm_page_t 3773pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3774 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 3775{ 3776 vm_page_t free; 3777 pt_entry_t *pte; 3778 vm_paddr_t pa; 3779 3780 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 3781 (m->oflags & VPO_UNMANAGED) != 0, 3782 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3783 rw_assert(&pvh_global_lock, RA_LOCKED); 3784 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3785 3786 /* 3787 * In the case that a page table page is not 3788 * resident, we are creating it here. 3789 */ 3790 if (va < VM_MAXUSER_ADDRESS) { 3791 vm_pindex_t ptepindex; 3792 pd_entry_t *ptepa; 3793 3794 /* 3795 * Calculate pagetable page index 3796 */ 3797 ptepindex = pmap_pde_pindex(va); 3798 if (mpte && (mpte->pindex == ptepindex)) { 3799 mpte->wire_count++; 3800 } else { 3801 /* 3802 * Get the page directory entry 3803 */ 3804 ptepa = pmap_pde(pmap, va); 3805 3806 /* 3807 * If the page table page is mapped, we just increment 3808 * the hold count, and activate it. Otherwise, we 3809 * attempt to allocate a page table page. If this 3810 * attempt fails, we don't retry. Instead, we give up. 3811 */ 3812 if (ptepa && (*ptepa & PG_V) != 0) { 3813 if (*ptepa & PG_PS) 3814 return (NULL); 3815 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); 3816 mpte->wire_count++; 3817 } else { 3818 /* 3819 * Pass NULL instead of the PV list lock 3820 * pointer, because we don't intend to sleep. 
3821 */ 3822 mpte = _pmap_allocpte(pmap, ptepindex, NULL); 3823 if (mpte == NULL) 3824 return (mpte); 3825 } 3826 } 3827 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 3828 pte = &pte[pmap_pte_index(va)]; 3829 } else { 3830 mpte = NULL; 3831 pte = vtopte(va); 3832 } 3833 if (*pte) { 3834 if (mpte != NULL) { 3835 mpte->wire_count--; 3836 mpte = NULL; 3837 } 3838 return (mpte); 3839 } 3840 3841 /* 3842 * Enter on the PV list if part of our managed memory. 3843 */ 3844 if ((m->oflags & VPO_UNMANAGED) == 0 && 3845 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 3846 if (mpte != NULL) { 3847 free = NULL; 3848 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 3849 pmap_invalidate_page(pmap, va); 3850 pmap_free_zero_pages(free); 3851 } 3852 mpte = NULL; 3853 } 3854 return (mpte); 3855 } 3856 3857 /* 3858 * Increment counters 3859 */ 3860 pmap_resident_count_inc(pmap, 1); 3861 3862 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); 3863 if ((prot & VM_PROT_EXECUTE) == 0) 3864 pa |= pg_nx; 3865 3866 /* 3867 * Now validate mapping with RO protection 3868 */ 3869 if ((m->oflags & VPO_UNMANAGED) != 0) 3870 pte_store(pte, pa | PG_V | PG_U); 3871 else 3872 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); 3873 return (mpte); 3874} 3875 3876/* 3877 * Make a temporary mapping for a physical address. This is only intended 3878 * to be used for panic dumps. 3879 */ 3880void * 3881pmap_kenter_temporary(vm_paddr_t pa, int i) 3882{ 3883 vm_offset_t va; 3884 3885 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 3886 pmap_kenter(va, pa); 3887 invlpg(va); 3888 return ((void *)crashdumpmap); 3889} 3890 3891/* 3892 * This code maps large physical mmap regions into the 3893 * processor address space. Note that some shortcuts 3894 * are taken, but the code works. 3895 */ 3896void 3897pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3898 vm_pindex_t pindex, vm_size_t size) 3899{ 3900 pd_entry_t *pde; 3901 vm_paddr_t pa, ptepa; 3902 vm_page_t p, pdpg; 3903 int pat_mode; 3904 3905 VM_OBJECT_ASSERT_WLOCKED(object); 3906 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3907 ("pmap_object_init_pt: non-device object")); 3908 if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 3909 if (!vm_object_populate(object, pindex, pindex + atop(size))) 3910 return; 3911 p = vm_page_lookup(object, pindex); 3912 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3913 ("pmap_object_init_pt: invalid page %p", p)); 3914 pat_mode = p->md.pat_mode; 3915 3916 /* 3917 * Abort the mapping if the first page is not physically 3918 * aligned to a 2MB page boundary. 3919 */ 3920 ptepa = VM_PAGE_TO_PHYS(p); 3921 if (ptepa & (NBPDR - 1)) 3922 return; 3923 3924 /* 3925 * Skip the first page. Abort the mapping if the rest of 3926 * the pages are not physically contiguous or have differing 3927 * memory attributes. 3928 */ 3929 p = TAILQ_NEXT(p, listq); 3930 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 3931 pa += PAGE_SIZE) { 3932 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3933 ("pmap_object_init_pt: invalid page %p", p)); 3934 if (pa != VM_PAGE_TO_PHYS(p) || 3935 pat_mode != p->md.pat_mode) 3936 return; 3937 p = TAILQ_NEXT(p, listq); 3938 } 3939 3940 /* 3941 * Map using 2MB pages. Since "ptepa" is 2M aligned and 3942 * "size" is a multiple of 2M, adding the PAT setting to "pa" 3943 * will not affect the termination of this loop. 
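 * More precisely, the cache bits returned by pmap_cache_bits() occupy
 * low-order bits of the entry, far below the 2MB boundary, so OR-ing them
 * into "pa" leaves the comparison against "ptepa + size" terminating after
 * exactly size / NBPDR iterations.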
3944 */ 3945 PMAP_LOCK(pmap); 3946 for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa + 3947 size; pa += NBPDR) { 3948 pdpg = pmap_allocpde(pmap, addr, NULL); 3949 if (pdpg == NULL) { 3950 /* 3951 * The creation of mappings below is only an 3952 * optimization. If a page directory page 3953 * cannot be allocated without blocking, 3954 * continue on to the next mapping rather than 3955 * blocking. 3956 */ 3957 addr += NBPDR; 3958 continue; 3959 } 3960 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 3961 pde = &pde[pmap_pde_index(addr)]; 3962 if ((*pde & PG_V) == 0) { 3963 pde_store(pde, pa | PG_PS | PG_M | PG_A | 3964 PG_U | PG_RW | PG_V); 3965 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); 3966 atomic_add_long(&pmap_pde_mappings, 1); 3967 } else { 3968 /* Continue on if the PDE is already valid. */ 3969 pdpg->wire_count--; 3970 KASSERT(pdpg->wire_count > 0, 3971 ("pmap_object_init_pt: missing reference " 3972 "to page directory page, va: 0x%lx", addr)); 3973 } 3974 addr += NBPDR; 3975 } 3976 PMAP_UNLOCK(pmap); 3977 } 3978} 3979 3980/* 3981 * Routine: pmap_change_wiring 3982 * Function: Change the wiring attribute for a map/virtual-address 3983 * pair. 3984 * In/out conditions: 3985 * The mapping must already exist in the pmap. 3986 */ 3987void 3988pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) 3989{ 3990 pd_entry_t *pde; 3991 pt_entry_t *pte; 3992 boolean_t pv_lists_locked; 3993 3994 pv_lists_locked = FALSE; 3995 3996 /* 3997 * Wiring is not a hardware characteristic so there is no need to 3998 * invalidate TLB. 3999 */ 4000retry: 4001 PMAP_LOCK(pmap); 4002 pde = pmap_pde(pmap, va); 4003 if ((*pde & PG_PS) != 0) { 4004 if (!wired != ((*pde & PG_W) == 0)) { 4005 if (!pv_lists_locked) { 4006 pv_lists_locked = TRUE; 4007 if (!rw_try_rlock(&pvh_global_lock)) { 4008 PMAP_UNLOCK(pmap); 4009 rw_rlock(&pvh_global_lock); 4010 goto retry; 4011 } 4012 } 4013 if (!pmap_demote_pde(pmap, pde, va)) 4014 panic("pmap_change_wiring: demotion failed"); 4015 } else 4016 goto out; 4017 } 4018 pte = pmap_pde_to_pte(pde, va); 4019 if (wired && (*pte & PG_W) == 0) { 4020 pmap->pm_stats.wired_count++; 4021 atomic_set_long(pte, PG_W); 4022 } else if (!wired && (*pte & PG_W) != 0) { 4023 pmap->pm_stats.wired_count--; 4024 atomic_clear_long(pte, PG_W); 4025 } 4026out: 4027 if (pv_lists_locked) 4028 rw_runlock(&pvh_global_lock); 4029 PMAP_UNLOCK(pmap); 4030} 4031 4032/* 4033 * Copy the range specified by src_addr/len 4034 * from the source map to the range dst_addr/len 4035 * in the destination map. 4036 * 4037 * This routine is only advisory and need not do anything. 
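 *
 * In this implementation the copy is attempted only when the source and
 * destination ranges coincide (dst_addr == src_addr), as is the case for
 * fork(); only managed mappings are copied, and the wired, modified, and
 * accessed bits are cleared in the destination entries.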
4038 */ 4039 4040void 4041pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 4042 vm_offset_t src_addr) 4043{ 4044 struct rwlock *lock; 4045 vm_page_t free; 4046 vm_offset_t addr; 4047 vm_offset_t end_addr = src_addr + len; 4048 vm_offset_t va_next; 4049 4050 if (dst_addr != src_addr) 4051 return; 4052 4053 lock = NULL; 4054 rw_rlock(&pvh_global_lock); 4055 if (dst_pmap < src_pmap) { 4056 PMAP_LOCK(dst_pmap); 4057 PMAP_LOCK(src_pmap); 4058 } else { 4059 PMAP_LOCK(src_pmap); 4060 PMAP_LOCK(dst_pmap); 4061 } 4062 for (addr = src_addr; addr < end_addr; addr = va_next) { 4063 pt_entry_t *src_pte, *dst_pte; 4064 vm_page_t dstmpde, dstmpte, srcmpte; 4065 pml4_entry_t *pml4e; 4066 pdp_entry_t *pdpe; 4067 pd_entry_t srcptepaddr, *pde; 4068 4069 KASSERT(addr < UPT_MIN_ADDRESS, 4070 ("pmap_copy: invalid to pmap_copy page tables")); 4071 4072 pml4e = pmap_pml4e(src_pmap, addr); 4073 if ((*pml4e & PG_V) == 0) { 4074 va_next = (addr + NBPML4) & ~PML4MASK; 4075 if (va_next < addr) 4076 va_next = end_addr; 4077 continue; 4078 } 4079 4080 pdpe = pmap_pml4e_to_pdpe(pml4e, addr); 4081 if ((*pdpe & PG_V) == 0) { 4082 va_next = (addr + NBPDP) & ~PDPMASK; 4083 if (va_next < addr) 4084 va_next = end_addr; 4085 continue; 4086 } 4087 4088 va_next = (addr + NBPDR) & ~PDRMASK; 4089 if (va_next < addr) 4090 va_next = end_addr; 4091 4092 pde = pmap_pdpe_to_pde(pdpe, addr); 4093 srcptepaddr = *pde; 4094 if (srcptepaddr == 0) 4095 continue; 4096 4097 if (srcptepaddr & PG_PS) { 4098 dstmpde = pmap_allocpde(dst_pmap, addr, NULL); 4099 if (dstmpde == NULL) 4100 break; 4101 pde = (pd_entry_t *) 4102 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde)); 4103 pde = &pde[pmap_pde_index(addr)]; 4104 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 4105 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & 4106 PG_PS_FRAME, &lock))) { 4107 *pde = srcptepaddr & ~PG_W; 4108 pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE); 4109 } else 4110 dstmpde->wire_count--; 4111 continue; 4112 } 4113 4114 srcptepaddr &= PG_FRAME; 4115 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 4116 KASSERT(srcmpte->wire_count > 0, 4117 ("pmap_copy: source page table page is unused")); 4118 4119 if (va_next > end_addr) 4120 va_next = end_addr; 4121 4122 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 4123 src_pte = &src_pte[pmap_pte_index(addr)]; 4124 dstmpte = NULL; 4125 while (addr < va_next) { 4126 pt_entry_t ptetemp; 4127 ptetemp = *src_pte; 4128 /* 4129 * we only virtual copy managed pages 4130 */ 4131 if ((ptetemp & PG_MANAGED) != 0) { 4132 if (dstmpte != NULL && 4133 dstmpte->pindex == pmap_pde_pindex(addr)) 4134 dstmpte->wire_count++; 4135 else if ((dstmpte = pmap_allocpte(dst_pmap, 4136 addr, NULL)) == NULL) 4137 goto out; 4138 dst_pte = (pt_entry_t *) 4139 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 4140 dst_pte = &dst_pte[pmap_pte_index(addr)]; 4141 if (*dst_pte == 0 && 4142 pmap_try_insert_pv_entry(dst_pmap, addr, 4143 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), 4144 &lock)) { 4145 /* 4146 * Clear the wired, modified, and 4147 * accessed (referenced) bits 4148 * during the copy. 
4149 */ 4150 *dst_pte = ptetemp & ~(PG_W | PG_M | 4151 PG_A); 4152 pmap_resident_count_inc(dst_pmap, 1); 4153 } else { 4154 free = NULL; 4155 if (pmap_unwire_ptp(dst_pmap, addr, 4156 dstmpte, &free)) { 4157 pmap_invalidate_page(dst_pmap, 4158 addr); 4159 pmap_free_zero_pages(free); 4160 } 4161 goto out; 4162 } 4163 if (dstmpte->wire_count >= srcmpte->wire_count) 4164 break; 4165 } 4166 addr += PAGE_SIZE; 4167 src_pte++; 4168 } 4169 } 4170out: 4171 if (lock != NULL) 4172 rw_wunlock(lock); 4173 rw_runlock(&pvh_global_lock); 4174 PMAP_UNLOCK(src_pmap); 4175 PMAP_UNLOCK(dst_pmap); 4176} 4177 4178/* 4179 * pmap_zero_page zeros the specified hardware page by mapping 4180 * the page into KVM and using bzero to clear its contents. 4181 */ 4182void 4183pmap_zero_page(vm_page_t m) 4184{ 4185 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 4186 4187 pagezero((void *)va); 4188} 4189 4190/* 4191 * pmap_zero_page_area zeros the specified hardware page by mapping 4192 * the page into KVM and using bzero to clear its contents. 4193 * 4194 * off and size may not cover an area beyond a single hardware page. 4195 */ 4196void 4197pmap_zero_page_area(vm_page_t m, int off, int size) 4198{ 4199 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 4200 4201 if (off == 0 && size == PAGE_SIZE) 4202 pagezero((void *)va); 4203 else 4204 bzero((char *)va + off, size); 4205} 4206 4207/* 4208 * pmap_zero_page_idle zeros the specified hardware page by mapping 4209 * the page into KVM and using bzero to clear its contents. This 4210 * is intended to be called from the vm_pagezero process only and 4211 * outside of Giant. 4212 */ 4213void 4214pmap_zero_page_idle(vm_page_t m) 4215{ 4216 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 4217 4218 pagezero((void *)va); 4219} 4220 4221/* 4222 * pmap_copy_page copies the specified (machine independent) 4223 * page by mapping the page into virtual memory and using 4224 * bcopy to copy the page, one machine dependent page at a 4225 * time. 4226 */ 4227void 4228pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 4229{ 4230 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 4231 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 4232 4233 pagecopy((void *)src, (void *)dst); 4234} 4235 4236int unmapped_buf_allowed = 1; 4237 4238void 4239pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 4240 vm_offset_t b_offset, int xfersize) 4241{ 4242 void *a_cp, *b_cp; 4243 vm_offset_t a_pg_offset, b_pg_offset; 4244 int cnt; 4245 4246 while (xfersize > 0) { 4247 a_pg_offset = a_offset & PAGE_MASK; 4248 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 4249 a_cp = (char *)PHYS_TO_DMAP(ma[a_offset >> PAGE_SHIFT]-> 4250 phys_addr) + a_pg_offset; 4251 b_pg_offset = b_offset & PAGE_MASK; 4252 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 4253 b_cp = (char *)PHYS_TO_DMAP(mb[b_offset >> PAGE_SHIFT]-> 4254 phys_addr) + b_pg_offset; 4255 bcopy(a_cp, b_cp, cnt); 4256 a_offset += cnt; 4257 b_offset += cnt; 4258 xfersize -= cnt; 4259 } 4260} 4261 4262/* 4263 * Returns true if the pmap's pv is one of the first 4264 * 16 pvs linked to from this page. This count may 4265 * be changed upwards or downwards in the future; it 4266 * is only necessary that true be returned for a small 4267 * subset of pmaps for proper page aging. 
4268 */ 4269boolean_t 4270pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 4271{ 4272 struct md_page *pvh; 4273 struct rwlock *lock; 4274 pv_entry_t pv; 4275 int loops = 0; 4276 boolean_t rv; 4277 4278 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4279 ("pmap_page_exists_quick: page %p is not managed", m)); 4280 rv = FALSE; 4281 rw_rlock(&pvh_global_lock); 4282 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4283 rw_rlock(lock); 4284 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4285 if (PV_PMAP(pv) == pmap) { 4286 rv = TRUE; 4287 break; 4288 } 4289 loops++; 4290 if (loops >= 16) 4291 break; 4292 } 4293 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 4294 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4295 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4296 if (PV_PMAP(pv) == pmap) { 4297 rv = TRUE; 4298 break; 4299 } 4300 loops++; 4301 if (loops >= 16) 4302 break; 4303 } 4304 } 4305 rw_runlock(lock); 4306 rw_runlock(&pvh_global_lock); 4307 return (rv); 4308} 4309 4310/* 4311 * pmap_page_wired_mappings: 4312 * 4313 * Return the number of managed mappings to the given physical page 4314 * that are wired. 4315 */ 4316int 4317pmap_page_wired_mappings(vm_page_t m) 4318{ 4319 int count; 4320 4321 count = 0; 4322 if ((m->oflags & VPO_UNMANAGED) != 0) 4323 return (count); 4324 rw_wlock(&pvh_global_lock); 4325 count = pmap_pvh_wired_mappings(&m->md, count); 4326 if ((m->flags & PG_FICTITIOUS) == 0) { 4327 count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), 4328 count); 4329 } 4330 rw_wunlock(&pvh_global_lock); 4331 return (count); 4332} 4333 4334/* 4335 * pmap_pvh_wired_mappings: 4336 * 4337 * Return the updated number "count" of managed mappings that are wired. 4338 */ 4339static int 4340pmap_pvh_wired_mappings(struct md_page *pvh, int count) 4341{ 4342 pmap_t pmap; 4343 pt_entry_t *pte; 4344 pv_entry_t pv; 4345 4346 rw_assert(&pvh_global_lock, RA_WLOCKED); 4347 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4348 pmap = PV_PMAP(pv); 4349 PMAP_LOCK(pmap); 4350 pte = pmap_pte(pmap, pv->pv_va); 4351 if ((*pte & PG_W) != 0) 4352 count++; 4353 PMAP_UNLOCK(pmap); 4354 } 4355 return (count); 4356} 4357 4358/* 4359 * Returns TRUE if the given page is mapped individually or as part of 4360 * a 2mpage. Otherwise, returns FALSE. 4361 */ 4362boolean_t 4363pmap_page_is_mapped(vm_page_t m) 4364{ 4365 struct rwlock *lock; 4366 boolean_t rv; 4367 4368 if ((m->oflags & VPO_UNMANAGED) != 0) 4369 return (FALSE); 4370 rw_rlock(&pvh_global_lock); 4371 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 4372 rw_rlock(lock); 4373 rv = !TAILQ_EMPTY(&m->md.pv_list) || 4374 ((m->flags & PG_FICTITIOUS) == 0 && 4375 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 4376 rw_runlock(lock); 4377 rw_runlock(&pvh_global_lock); 4378 return (rv); 4379} 4380 4381/* 4382 * Remove all pages from specified address space 4383 * this aids process exit speeds. Also, this code 4384 * is special cased for current process only, but 4385 * can have the more generic (and slightly slower) 4386 * mode enabled. This is much faster than pmap_remove 4387 * in the case of running down an entire address space. 
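 *
 * Rather than walking the page tables, this routine walks the pmap's pv
 * chunks, so only managed mappings are removed; wired mappings are skipped,
 * and the routine refuses to operate on anything other than the current
 * pmap.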
4388 */ 4389void 4390pmap_remove_pages(pmap_t pmap) 4391{ 4392 pd_entry_t ptepde; 4393 pt_entry_t *pte, tpte; 4394 vm_page_t free = NULL; 4395 vm_page_t m, mpte, mt; 4396 pv_entry_t pv; 4397 struct md_page *pvh; 4398 struct pv_chunk *pc, *npc; 4399 struct rwlock *lock; 4400 int64_t bit; 4401 uint64_t inuse, bitmask; 4402 int allfree, field, freed, idx; 4403 vm_paddr_t pa; 4404 4405 if (pmap != PCPU_GET(curpmap)) { 4406 printf("warning: pmap_remove_pages called with non-current pmap\n"); 4407 return; 4408 } 4409 lock = NULL; 4410 rw_rlock(&pvh_global_lock); 4411 PMAP_LOCK(pmap); 4412 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 4413 allfree = 1; 4414 freed = 0; 4415 for (field = 0; field < _NPCM; field++) { 4416 inuse = ~pc->pc_map[field] & pc_freemask[field]; 4417 while (inuse != 0) { 4418 bit = bsfq(inuse); 4419 bitmask = 1UL << bit; 4420 idx = field * 64 + bit; 4421 pv = &pc->pc_pventry[idx]; 4422 inuse &= ~bitmask; 4423 4424 pte = pmap_pdpe(pmap, pv->pv_va); 4425 ptepde = *pte; 4426 pte = pmap_pdpe_to_pde(pte, pv->pv_va); 4427 tpte = *pte; 4428 if ((tpte & (PG_PS | PG_V)) == PG_V) { 4429 ptepde = tpte; 4430 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & 4431 PG_FRAME); 4432 pte = &pte[pmap_pte_index(pv->pv_va)]; 4433 tpte = *pte; 4434 } 4435 if ((tpte & PG_V) == 0) { 4436 panic("bad pte va %lx pte %lx", 4437 pv->pv_va, tpte); 4438 } 4439 4440/* 4441 * We cannot remove wired pages from a process' mapping at this time 4442 */ 4443 if (tpte & PG_W) { 4444 allfree = 0; 4445 continue; 4446 } 4447 4448 if (tpte & PG_PS) 4449 pa = tpte & PG_PS_FRAME; 4450 else 4451 pa = tpte & PG_FRAME; 4452 4453 m = PHYS_TO_VM_PAGE(pa); 4454 KASSERT(m->phys_addr == pa, 4455 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 4456 m, (uintmax_t)m->phys_addr, 4457 (uintmax_t)tpte)); 4458 4459 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4460 m < &vm_page_array[vm_page_array_size], 4461 ("pmap_remove_pages: bad tpte %#jx", 4462 (uintmax_t)tpte)); 4463 4464 pte_clear(pte); 4465 4466 /* 4467 * Update the vm_page_t clean/reference bits. 
4468 */ 4469 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 4470 if ((tpte & PG_PS) != 0) { 4471 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4472 vm_page_dirty(mt); 4473 } else 4474 vm_page_dirty(m); 4475 } 4476 4477 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 4478 4479 /* Mark free */ 4480 pc->pc_map[field] |= bitmask; 4481 if ((tpte & PG_PS) != 0) { 4482 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); 4483 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 4484 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4485 if (TAILQ_EMPTY(&pvh->pv_list)) { 4486 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4487 if ((mt->aflags & PGA_WRITEABLE) != 0 && 4488 TAILQ_EMPTY(&mt->md.pv_list)) 4489 vm_page_aflag_clear(mt, PGA_WRITEABLE); 4490 } 4491 mpte = pmap_lookup_pt_page(pmap, pv->pv_va); 4492 if (mpte != NULL) { 4493 pmap_remove_pt_page(pmap, mpte); 4494 pmap_resident_count_dec(pmap, 1); 4495 KASSERT(mpte->wire_count == NPTEPG, 4496 ("pmap_remove_pages: pte page wire count error")); 4497 mpte->wire_count = 0; 4498 pmap_add_delayed_free_list(mpte, &free, FALSE); 4499 atomic_subtract_int(&cnt.v_wire_count, 1); 4500 } 4501 } else { 4502 pmap_resident_count_dec(pmap, 1); 4503 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4504 if ((m->aflags & PGA_WRITEABLE) != 0 && 4505 TAILQ_EMPTY(&m->md.pv_list) && 4506 (m->flags & PG_FICTITIOUS) == 0) { 4507 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4508 if (TAILQ_EMPTY(&pvh->pv_list)) 4509 vm_page_aflag_clear(m, PGA_WRITEABLE); 4510 } 4511 } 4512 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 4513 freed++; 4514 } 4515 } 4516 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 4517 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 4518 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 4519 if (allfree) { 4520 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4521 free_pv_chunk(pc); 4522 } 4523 } 4524 if (lock != NULL) 4525 rw_wunlock(lock); 4526 pmap_invalidate_all(pmap); 4527 rw_runlock(&pvh_global_lock); 4528 PMAP_UNLOCK(pmap); 4529 pmap_free_zero_pages(free); 4530} 4531 4532/* 4533 * pmap_is_modified: 4534 * 4535 * Return whether or not the specified physical page was modified 4536 * in any physical maps. 4537 */ 4538boolean_t 4539pmap_is_modified(vm_page_t m) 4540{ 4541 boolean_t rv; 4542 4543 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4544 ("pmap_is_modified: page %p is not managed", m)); 4545 4546 /* 4547 * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be 4548 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE 4549 * is clear, no PTEs can have PG_M set. 4550 */ 4551 VM_OBJECT_ASSERT_WLOCKED(m->object); 4552 if ((m->oflags & VPO_BUSY) == 0 && 4553 (m->aflags & PGA_WRITEABLE) == 0) 4554 return (FALSE); 4555 rw_wlock(&pvh_global_lock); 4556 rv = pmap_is_modified_pvh(&m->md) || 4557 ((m->flags & PG_FICTITIOUS) == 0 && 4558 pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 4559 rw_wunlock(&pvh_global_lock); 4560 return (rv); 4561} 4562 4563/* 4564 * Returns TRUE if any of the given mappings were used to modify 4565 * physical memory. Otherwise, returns FALSE. Both page and 2mpage 4566 * mappings are supported. 
4567 */ 4568static boolean_t 4569pmap_is_modified_pvh(struct md_page *pvh) 4570{ 4571 pv_entry_t pv; 4572 pt_entry_t *pte; 4573 pmap_t pmap; 4574 boolean_t rv; 4575 4576 rw_assert(&pvh_global_lock, RA_WLOCKED); 4577 rv = FALSE; 4578 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4579 pmap = PV_PMAP(pv); 4580 PMAP_LOCK(pmap); 4581 pte = pmap_pte(pmap, pv->pv_va); 4582 rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); 4583 PMAP_UNLOCK(pmap); 4584 if (rv) 4585 break; 4586 } 4587 return (rv); 4588} 4589 4590/* 4591 * pmap_is_prefaultable: 4592 * 4593 * Return whether or not the specified virtual address is elgible 4594 * for prefault. 4595 */ 4596boolean_t 4597pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 4598{ 4599 pd_entry_t *pde; 4600 pt_entry_t *pte; 4601 boolean_t rv; 4602 4603 rv = FALSE; 4604 PMAP_LOCK(pmap); 4605 pde = pmap_pde(pmap, addr); 4606 if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { 4607 pte = pmap_pde_to_pte(pde, addr); 4608 rv = (*pte & PG_V) == 0; 4609 } 4610 PMAP_UNLOCK(pmap); 4611 return (rv); 4612} 4613 4614/* 4615 * pmap_is_referenced: 4616 * 4617 * Return whether or not the specified physical page was referenced 4618 * in any physical maps. 4619 */ 4620boolean_t 4621pmap_is_referenced(vm_page_t m) 4622{ 4623 boolean_t rv; 4624 4625 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4626 ("pmap_is_referenced: page %p is not managed", m)); 4627 rw_wlock(&pvh_global_lock); 4628 rv = pmap_is_referenced_pvh(&m->md) || 4629 ((m->flags & PG_FICTITIOUS) == 0 && 4630 pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 4631 rw_wunlock(&pvh_global_lock); 4632 return (rv); 4633} 4634 4635/* 4636 * Returns TRUE if any of the given mappings were referenced and FALSE 4637 * otherwise. Both page and 2mpage mappings are supported. 4638 */ 4639static boolean_t 4640pmap_is_referenced_pvh(struct md_page *pvh) 4641{ 4642 pv_entry_t pv; 4643 pt_entry_t *pte; 4644 pmap_t pmap; 4645 boolean_t rv; 4646 4647 rw_assert(&pvh_global_lock, RA_WLOCKED); 4648 rv = FALSE; 4649 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4650 pmap = PV_PMAP(pv); 4651 PMAP_LOCK(pmap); 4652 pte = pmap_pte(pmap, pv->pv_va); 4653 rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); 4654 PMAP_UNLOCK(pmap); 4655 if (rv) 4656 break; 4657 } 4658 return (rv); 4659} 4660 4661/* 4662 * Clear the write and modified bits in each of the given page's mappings. 4663 */ 4664void 4665pmap_remove_write(vm_page_t m) 4666{ 4667 struct md_page *pvh; 4668 pmap_t pmap; 4669 pv_entry_t next_pv, pv; 4670 pd_entry_t *pde; 4671 pt_entry_t oldpte, *pte; 4672 vm_offset_t va; 4673 4674 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4675 ("pmap_remove_write: page %p is not managed", m)); 4676 4677 /* 4678 * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be set by 4679 * another thread while the object is locked. Thus, if PGA_WRITEABLE 4680 * is clear, no page table entries need updating. 
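 *
 * Any writeable 2MB mapping of the page is first demoted; each remaining
 * 4KB PTE then has PG_RW and PG_M cleared with an atomic compare-and-set,
 * the page is dirtied if PG_M was set, and the TLB entry is invalidated,
 * after which PGA_WRITEABLE is cleared on the page.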
4681 */ 4682 VM_OBJECT_ASSERT_WLOCKED(m->object); 4683 if ((m->oflags & VPO_BUSY) == 0 && 4684 (m->aflags & PGA_WRITEABLE) == 0) 4685 return; 4686 rw_wlock(&pvh_global_lock); 4687 if ((m->flags & PG_FICTITIOUS) != 0) 4688 goto small_mappings; 4689 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4690 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4691 pmap = PV_PMAP(pv); 4692 PMAP_LOCK(pmap); 4693 va = pv->pv_va; 4694 pde = pmap_pde(pmap, va); 4695 if ((*pde & PG_RW) != 0) 4696 (void)pmap_demote_pde(pmap, pde, va); 4697 PMAP_UNLOCK(pmap); 4698 } 4699small_mappings: 4700 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4701 pmap = PV_PMAP(pv); 4702 PMAP_LOCK(pmap); 4703 pde = pmap_pde(pmap, pv->pv_va); 4704 KASSERT((*pde & PG_PS) == 0, 4705 ("pmap_remove_write: found a 2mpage in page %p's pv list", 4706 m)); 4707 pte = pmap_pde_to_pte(pde, pv->pv_va); 4708retry: 4709 oldpte = *pte; 4710 if (oldpte & PG_RW) { 4711 if (!atomic_cmpset_long(pte, oldpte, oldpte & 4712 ~(PG_RW | PG_M))) 4713 goto retry; 4714 if ((oldpte & PG_M) != 0) 4715 vm_page_dirty(m); 4716 pmap_invalidate_page(pmap, pv->pv_va); 4717 } 4718 PMAP_UNLOCK(pmap); 4719 } 4720 vm_page_aflag_clear(m, PGA_WRITEABLE); 4721 rw_wunlock(&pvh_global_lock); 4722} 4723 4724/* 4725 * pmap_ts_referenced: 4726 * 4727 * Return a count of reference bits for a page, clearing those bits. 4728 * It is not necessary for every reference bit to be cleared, but it 4729 * is necessary that 0 only be returned when there are truly no 4730 * reference bits set. 4731 * 4732 * XXX: The exact number of bits to check and clear is a matter that 4733 * should be tested and standardized at some point in the future for 4734 * optimal aging of shared pages. 4735 */ 4736int 4737pmap_ts_referenced(vm_page_t m) 4738{ 4739 struct md_page *pvh; 4740 pv_entry_t pv, pvf, pvn; 4741 pmap_t pmap; 4742 pd_entry_t oldpde, *pde; 4743 pt_entry_t *pte; 4744 vm_offset_t va; 4745 int rtval = 0; 4746 4747 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4748 ("pmap_ts_referenced: page %p is not managed", m)); 4749 rw_wlock(&pvh_global_lock); 4750 if ((m->flags & PG_FICTITIOUS) != 0) 4751 goto small_mappings; 4752 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4753 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, pvn) { 4754 pmap = PV_PMAP(pv); 4755 PMAP_LOCK(pmap); 4756 va = pv->pv_va; 4757 pde = pmap_pde(pmap, va); 4758 oldpde = *pde; 4759 if ((oldpde & PG_A) != 0) { 4760 if (pmap_demote_pde(pmap, pde, va)) { 4761 if ((oldpde & PG_W) == 0) { 4762 /* 4763 * Remove the mapping to a single page 4764 * so that a subsequent access may 4765 * repromote. Since the underlying 4766 * page table page is fully populated, 4767 * this removal never frees a page 4768 * table page. 
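 * Note that the scan of this page's mappings stops early once more than
 * four referenced mappings have been counted, which is sufficient for page
 * aging.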
4769 */ 4770 va += VM_PAGE_TO_PHYS(m) - (oldpde & 4771 PG_PS_FRAME); 4772 pmap_remove_page(pmap, va, pde, NULL); 4773 rtval++; 4774 if (rtval > 4) { 4775 PMAP_UNLOCK(pmap); 4776 goto out; 4777 } 4778 } 4779 } 4780 } 4781 PMAP_UNLOCK(pmap); 4782 } 4783small_mappings: 4784 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 4785 pvf = pv; 4786 do { 4787 pvn = TAILQ_NEXT(pv, pv_next); 4788 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4789 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4790 pmap = PV_PMAP(pv); 4791 PMAP_LOCK(pmap); 4792 pde = pmap_pde(pmap, pv->pv_va); 4793 KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:" 4794 " found a 2mpage in page %p's pv list", m)); 4795 pte = pmap_pde_to_pte(pde, pv->pv_va); 4796 if ((*pte & PG_A) != 0) { 4797 atomic_clear_long(pte, PG_A); 4798 pmap_invalidate_page(pmap, pv->pv_va); 4799 rtval++; 4800 if (rtval > 4) 4801 pvn = NULL; 4802 } 4803 PMAP_UNLOCK(pmap); 4804 } while ((pv = pvn) != NULL && pv != pvf); 4805 } 4806out: 4807 rw_wunlock(&pvh_global_lock); 4808 return (rtval); 4809} 4810 4811/* 4812 * Clear the modify bits on the specified physical page. 4813 */ 4814void 4815pmap_clear_modify(vm_page_t m) 4816{ 4817 struct md_page *pvh; 4818 pmap_t pmap; 4819 pv_entry_t next_pv, pv; 4820 pd_entry_t oldpde, *pde; 4821 pt_entry_t oldpte, *pte; 4822 vm_offset_t va; 4823 4824 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4825 ("pmap_clear_modify: page %p is not managed", m)); 4826 VM_OBJECT_ASSERT_WLOCKED(m->object); 4827 KASSERT((m->oflags & VPO_BUSY) == 0, 4828 ("pmap_clear_modify: page %p is busy", m)); 4829 4830 /* 4831 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 4832 * If the object containing the page is locked and the page is not 4833 * VPO_BUSY, then PGA_WRITEABLE cannot be concurrently set. 4834 */ 4835 if ((m->aflags & PGA_WRITEABLE) == 0) 4836 return; 4837 rw_wlock(&pvh_global_lock); 4838 if ((m->flags & PG_FICTITIOUS) != 0) 4839 goto small_mappings; 4840 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4841 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4842 pmap = PV_PMAP(pv); 4843 PMAP_LOCK(pmap); 4844 va = pv->pv_va; 4845 pde = pmap_pde(pmap, va); 4846 oldpde = *pde; 4847 if ((oldpde & PG_RW) != 0) { 4848 if (pmap_demote_pde(pmap, pde, va)) { 4849 if ((oldpde & PG_W) == 0) { 4850 /* 4851 * Write protect the mapping to a 4852 * single page so that a subsequent 4853 * write access may repromote. 4854 */ 4855 va += VM_PAGE_TO_PHYS(m) - (oldpde & 4856 PG_PS_FRAME); 4857 pte = pmap_pde_to_pte(pde, va); 4858 oldpte = *pte; 4859 if ((oldpte & PG_V) != 0) { 4860 while (!atomic_cmpset_long(pte, 4861 oldpte, 4862 oldpte & ~(PG_M | PG_RW))) 4863 oldpte = *pte; 4864 vm_page_dirty(m); 4865 pmap_invalidate_page(pmap, va); 4866 } 4867 } 4868 } 4869 } 4870 PMAP_UNLOCK(pmap); 4871 } 4872small_mappings: 4873 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4874 pmap = PV_PMAP(pv); 4875 PMAP_LOCK(pmap); 4876 pde = pmap_pde(pmap, pv->pv_va); 4877 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 4878 " a 2mpage in page %p's pv list", m)); 4879 pte = pmap_pde_to_pte(pde, pv->pv_va); 4880 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 4881 atomic_clear_long(pte, PG_M); 4882 pmap_invalidate_page(pmap, pv->pv_va); 4883 } 4884 PMAP_UNLOCK(pmap); 4885 } 4886 rw_wunlock(&pvh_global_lock); 4887} 4888 4889/* 4890 * pmap_clear_reference: 4891 * 4892 * Clear the reference bit on the specified physical page. 
4893 */ 4894void 4895pmap_clear_reference(vm_page_t m) 4896{ 4897 struct md_page *pvh; 4898 pmap_t pmap; 4899 pv_entry_t next_pv, pv; 4900 pd_entry_t oldpde, *pde; 4901 pt_entry_t *pte; 4902 vm_offset_t va; 4903 4904 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4905 ("pmap_clear_reference: page %p is not managed", m)); 4906 rw_wlock(&pvh_global_lock); 4907 if ((m->flags & PG_FICTITIOUS) != 0) 4908 goto small_mappings; 4909 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4910 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4911 pmap = PV_PMAP(pv); 4912 PMAP_LOCK(pmap); 4913 va = pv->pv_va; 4914 pde = pmap_pde(pmap, va); 4915 oldpde = *pde; 4916 if ((oldpde & PG_A) != 0) { 4917 if (pmap_demote_pde(pmap, pde, va)) { 4918 /* 4919 * Remove the mapping to a single page so 4920 * that a subsequent access may repromote. 4921 * Since the underlying page table page is 4922 * fully populated, this removal never frees 4923 * a page table page. 4924 */ 4925 va += VM_PAGE_TO_PHYS(m) - (oldpde & 4926 PG_PS_FRAME); 4927 pmap_remove_page(pmap, va, pde, NULL); 4928 } 4929 } 4930 PMAP_UNLOCK(pmap); 4931 } 4932small_mappings: 4933 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4934 pmap = PV_PMAP(pv); 4935 PMAP_LOCK(pmap); 4936 pde = pmap_pde(pmap, pv->pv_va); 4937 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found" 4938 " a 2mpage in page %p's pv list", m)); 4939 pte = pmap_pde_to_pte(pde, pv->pv_va); 4940 if (*pte & PG_A) { 4941 atomic_clear_long(pte, PG_A); 4942 pmap_invalidate_page(pmap, pv->pv_va); 4943 } 4944 PMAP_UNLOCK(pmap); 4945 } 4946 rw_wunlock(&pvh_global_lock); 4947} 4948 4949/* 4950 * Miscellaneous support routines follow 4951 */ 4952 4953/* Adjust the cache mode for a 4KB page mapped via a PTE. */ 4954static __inline void 4955pmap_pte_attr(pt_entry_t *pte, int cache_bits) 4956{ 4957 u_int opte, npte; 4958 4959 /* 4960 * The cache mode bits are all in the low 32-bits of the 4961 * PTE, so we can just spin on updating the low 32-bits. 4962 */ 4963 do { 4964 opte = *(u_int *)pte; 4965 npte = opte & ~PG_PTE_CACHE; 4966 npte |= cache_bits; 4967 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); 4968} 4969 4970/* Adjust the cache mode for a 2MB page mapped via a PDE. */ 4971static __inline void 4972pmap_pde_attr(pd_entry_t *pde, int cache_bits) 4973{ 4974 u_int opde, npde; 4975 4976 /* 4977 * The cache mode bits are all in the low 32-bits of the 4978 * PDE, so we can just spin on updating the low 32-bits. 4979 */ 4980 do { 4981 opde = *(u_int *)pde; 4982 npde = opde & ~PG_PDE_CACHE; 4983 npde |= cache_bits; 4984 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); 4985} 4986 4987/* 4988 * Map a set of physical memory pages into the kernel virtual 4989 * address space. Return a pointer to where it is mapped. This 4990 * routine is intended to be used for mapping device memory, 4991 * NOT real memory. 4992 */ 4993void * 4994pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 4995{ 4996 vm_offset_t va, offset; 4997 vm_size_t tmpsize; 4998 4999 /* 5000 * If the specified range of physical addresses fits within the direct 5001 * map window, use the direct map. 
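 *
 * Illustrative use (editorial sketch; "bar_pa" and "bar_len" are
 * hypothetical values, e.g. taken from a PCI BAR):
 *
 *	void *regs = pmap_mapdev_attr(bar_pa, bar_len, PAT_WRITE_COMBINING);
 *	...
 *	pmap_unmapdev((vm_offset_t)regs, bar_len);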
5002 */ 5003 if (pa < dmaplimit && pa + size < dmaplimit) { 5004 va = PHYS_TO_DMAP(pa); 5005 if (!pmap_change_attr(va, size, mode)) 5006 return ((void *)va); 5007 } 5008 offset = pa & PAGE_MASK; 5009 size = round_page(offset + size); 5010 va = kmem_alloc_nofault(kernel_map, size); 5011 if (!va) 5012 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 5013 pa = trunc_page(pa); 5014 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 5015 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 5016 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 5017 pmap_invalidate_cache_range(va, va + tmpsize); 5018 return ((void *)(va + offset)); 5019} 5020 5021void * 5022pmap_mapdev(vm_paddr_t pa, vm_size_t size) 5023{ 5024 5025 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 5026} 5027 5028void * 5029pmap_mapbios(vm_paddr_t pa, vm_size_t size) 5030{ 5031 5032 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 5033} 5034 5035void 5036pmap_unmapdev(vm_offset_t va, vm_size_t size) 5037{ 5038 vm_offset_t base, offset; 5039 5040 /* If we gave a direct map region in pmap_mapdev, do nothing */ 5041 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 5042 return; 5043 base = trunc_page(va); 5044 offset = va & PAGE_MASK; 5045 size = round_page(offset + size); 5046 kmem_free(kernel_map, base, size); 5047} 5048 5049/* 5050 * Tries to demote a 1GB page mapping. 5051 */ 5052static boolean_t 5053pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) 5054{ 5055 pdp_entry_t newpdpe, oldpdpe; 5056 pd_entry_t *firstpde, newpde, *pde; 5057 vm_paddr_t mpdepa; 5058 vm_page_t mpde; 5059 5060 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 5061 oldpdpe = *pdpe; 5062 KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), 5063 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); 5064 if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT | 5065 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 5066 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" 5067 " in pmap %p", va, pmap); 5068 return (FALSE); 5069 } 5070 mpdepa = VM_PAGE_TO_PHYS(mpde); 5071 firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa); 5072 newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; 5073 KASSERT((oldpdpe & PG_A) != 0, 5074 ("pmap_demote_pdpe: oldpdpe is missing PG_A")); 5075 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, 5076 ("pmap_demote_pdpe: oldpdpe is missing PG_M")); 5077 newpde = oldpdpe; 5078 5079 /* 5080 * Initialize the page directory page. 5081 */ 5082 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { 5083 *pde = newpde; 5084 newpde += NBPDR; 5085 } 5086 5087 /* 5088 * Demote the mapping. 5089 */ 5090 *pdpe = newpdpe; 5091 5092 /* 5093 * Invalidate a stale recursive mapping of the page directory page. 5094 */ 5095 pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); 5096 5097 pmap_pdpe_demotions++; 5098 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" 5099 " in pmap %p", va, pmap); 5100 return (TRUE); 5101} 5102 5103/* 5104 * Sets the memory attribute for the specified page. 5105 */ 5106void 5107pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5108{ 5109 5110 m->md.pat_mode = ma; 5111 5112 /* 5113 * If "m" is a normal page, update its direct mapping. This update 5114 * can be relied upon to perform any cache operations that are 5115 * required for data coherence. 
5116 */ 5117 if ((m->flags & PG_FICTITIOUS) == 0 && 5118 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 5119 m->md.pat_mode)) 5120 panic("memory attribute change on the direct map failed"); 5121} 5122 5123/* 5124 * Changes the specified virtual address range's memory type to that given by 5125 * the parameter "mode". The specified virtual address range must be 5126 * completely contained within either the direct map or the kernel map. If 5127 * the virtual address range is contained within the kernel map, then the 5128 * memory type for each of the corresponding ranges of the direct map is also 5129 * changed. (The corresponding ranges of the direct map are those ranges that 5130 * map the same physical pages as the specified virtual address range.) These 5131 * changes to the direct map are necessary because Intel describes the 5132 * behavior of their processors as "undefined" if two or more mappings to the 5133 * same physical page have different memory types. 5134 * 5135 * Returns zero if the change completed successfully, and either EINVAL or 5136 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 5137 * of the virtual address range was not mapped, and ENOMEM is returned if 5138 * there was insufficient memory available to complete the change. In the 5139 * latter case, the memory type may have been changed on some part of the 5140 * virtual address range or the direct map. 5141 */ 5142int 5143pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 5144{ 5145 int error; 5146 5147 PMAP_LOCK(kernel_pmap); 5148 error = pmap_change_attr_locked(va, size, mode); 5149 PMAP_UNLOCK(kernel_pmap); 5150 return (error); 5151} 5152 5153static int 5154pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) 5155{ 5156 vm_offset_t base, offset, tmpva; 5157 vm_paddr_t pa_start, pa_end; 5158 pdp_entry_t *pdpe; 5159 pd_entry_t *pde; 5160 pt_entry_t *pte; 5161 int cache_bits_pte, cache_bits_pde, error; 5162 boolean_t changed; 5163 5164 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 5165 base = trunc_page(va); 5166 offset = va & PAGE_MASK; 5167 size = round_page(offset + size); 5168 5169 /* 5170 * Only supported on kernel virtual addresses, including the direct 5171 * map but excluding the recursive map. 5172 */ 5173 if (base < DMAP_MIN_ADDRESS) 5174 return (EINVAL); 5175 5176 cache_bits_pde = pmap_cache_bits(mode, 1); 5177 cache_bits_pte = pmap_cache_bits(mode, 0); 5178 changed = FALSE; 5179 5180 /* 5181 * Pages that aren't mapped aren't supported. Also break down 2MB pages 5182 * into 4KB pages if required. 5183 */ 5184 for (tmpva = base; tmpva < base + size; ) { 5185 pdpe = pmap_pdpe(kernel_pmap, tmpva); 5186 if (*pdpe == 0) 5187 return (EINVAL); 5188 if (*pdpe & PG_PS) { 5189 /* 5190 * If the current 1GB page already has the required 5191 * memory type, then we need not demote this page. Just 5192 * increment tmpva to the next 1GB page frame. 5193 */ 5194 if ((*pdpe & PG_PDE_CACHE) == cache_bits_pde) { 5195 tmpva = trunc_1gpage(tmpva) + NBPDP; 5196 continue; 5197 } 5198 5199 /* 5200 * If the current offset aligns with a 1GB page frame 5201 * and there is at least 1GB left within the range, then 5202 * we need not break down this page into 2MB pages. 
5203 */ 5204 if ((tmpva & PDPMASK) == 0 && 5205 tmpva + PDPMASK < base + size) { 5206 tmpva += NBPDP; 5207 continue; 5208 } 5209 if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) 5210 return (ENOMEM); 5211 } 5212 pde = pmap_pdpe_to_pde(pdpe, tmpva); 5213 if (*pde == 0) 5214 return (EINVAL); 5215 if (*pde & PG_PS) { 5216 /* 5217 * If the current 2MB page already has the required 5218 * memory type, then we need not demote this page. Just 5219 * increment tmpva to the next 2MB page frame. 5220 */ 5221 if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { 5222 tmpva = trunc_2mpage(tmpva) + NBPDR; 5223 continue; 5224 } 5225 5226 /* 5227 * If the current offset aligns with a 2MB page frame 5228 * and there is at least 2MB left within the range, then 5229 * we need not break down this page into 4KB pages. 5230 */ 5231 if ((tmpva & PDRMASK) == 0 && 5232 tmpva + PDRMASK < base + size) { 5233 tmpva += NBPDR; 5234 continue; 5235 } 5236 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) 5237 return (ENOMEM); 5238 } 5239 pte = pmap_pde_to_pte(pde, tmpva); 5240 if (*pte == 0) 5241 return (EINVAL); 5242 tmpva += PAGE_SIZE; 5243 } 5244 error = 0; 5245 5246 /* 5247 * Ok, all the pages exist, so run through them updating their 5248 * cache mode if required. 5249 */ 5250 pa_start = pa_end = 0; 5251 for (tmpva = base; tmpva < base + size; ) { 5252 pdpe = pmap_pdpe(kernel_pmap, tmpva); 5253 if (*pdpe & PG_PS) { 5254 if ((*pdpe & PG_PDE_CACHE) != cache_bits_pde) { 5255 pmap_pde_attr(pdpe, cache_bits_pde); 5256 changed = TRUE; 5257 } 5258 if (tmpva >= VM_MIN_KERNEL_ADDRESS) { 5259 if (pa_start == pa_end) { 5260 /* Start physical address run. */ 5261 pa_start = *pdpe & PG_PS_FRAME; 5262 pa_end = pa_start + NBPDP; 5263 } else if (pa_end == (*pdpe & PG_PS_FRAME)) 5264 pa_end += NBPDP; 5265 else { 5266 /* Run ended, update direct map. */ 5267 error = pmap_change_attr_locked( 5268 PHYS_TO_DMAP(pa_start), 5269 pa_end - pa_start, mode); 5270 if (error != 0) 5271 break; 5272 /* Start physical address run. */ 5273 pa_start = *pdpe & PG_PS_FRAME; 5274 pa_end = pa_start + NBPDP; 5275 } 5276 } 5277 tmpva = trunc_1gpage(tmpva) + NBPDP; 5278 continue; 5279 } 5280 pde = pmap_pdpe_to_pde(pdpe, tmpva); 5281 if (*pde & PG_PS) { 5282 if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { 5283 pmap_pde_attr(pde, cache_bits_pde); 5284 changed = TRUE; 5285 } 5286 if (tmpva >= VM_MIN_KERNEL_ADDRESS) { 5287 if (pa_start == pa_end) { 5288 /* Start physical address run. */ 5289 pa_start = *pde & PG_PS_FRAME; 5290 pa_end = pa_start + NBPDR; 5291 } else if (pa_end == (*pde & PG_PS_FRAME)) 5292 pa_end += NBPDR; 5293 else { 5294 /* Run ended, update direct map. */ 5295 error = pmap_change_attr_locked( 5296 PHYS_TO_DMAP(pa_start), 5297 pa_end - pa_start, mode); 5298 if (error != 0) 5299 break; 5300 /* Start physical address run. */ 5301 pa_start = *pde & PG_PS_FRAME; 5302 pa_end = pa_start + NBPDR; 5303 } 5304 } 5305 tmpva = trunc_2mpage(tmpva) + NBPDR; 5306 } else { 5307 pte = pmap_pde_to_pte(pde, tmpva); 5308 if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { 5309 pmap_pte_attr(pte, cache_bits_pte); 5310 changed = TRUE; 5311 } 5312 if (tmpva >= VM_MIN_KERNEL_ADDRESS) { 5313 if (pa_start == pa_end) { 5314 /* Start physical address run. */ 5315 pa_start = *pte & PG_FRAME; 5316 pa_end = pa_start + PAGE_SIZE; 5317 } else if (pa_end == (*pte & PG_FRAME)) 5318 pa_end += PAGE_SIZE; 5319 else { 5320 /* Run ended, update direct map. 
*/ 5321 error = pmap_change_attr_locked( 5322 PHYS_TO_DMAP(pa_start), 5323 pa_end - pa_start, mode); 5324 if (error != 0) 5325 break; 5326 /* Start physical address run. */ 5327 pa_start = *pte & PG_FRAME; 5328 pa_end = pa_start + PAGE_SIZE; 5329 } 5330 } 5331 tmpva += PAGE_SIZE; 5332 } 5333 } 5334 if (error == 0 && pa_start != pa_end) 5335 error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), 5336 pa_end - pa_start, mode); 5337 5338 /* 5339 * Flush CPU caches if required to make sure any data isn't cached that 5340 * shouldn't be, etc. 5341 */ 5342 if (changed) { 5343 pmap_invalidate_range(kernel_pmap, base, tmpva); 5344 pmap_invalidate_cache_range(base, tmpva); 5345 } 5346 return (error); 5347} 5348 5349/* 5350 * Demotes any mapping within the direct map region that covers more than the 5351 * specified range of physical addresses. This range's size must be a power 5352 * of two and its starting address must be a multiple of its size. Since the 5353 * demotion does not change any attributes of the mapping, a TLB invalidation 5354 * is not mandatory. The caller may, however, request a TLB invalidation. 5355 */ 5356void 5357pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) 5358{ 5359 pdp_entry_t *pdpe; 5360 pd_entry_t *pde; 5361 vm_offset_t va; 5362 boolean_t changed; 5363 5364 if (len == 0) 5365 return; 5366 KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); 5367 KASSERT((base & (len - 1)) == 0, 5368 ("pmap_demote_DMAP: base is not a multiple of len")); 5369 if (len < NBPDP && base < dmaplimit) { 5370 va = PHYS_TO_DMAP(base); 5371 changed = FALSE; 5372 PMAP_LOCK(kernel_pmap); 5373 pdpe = pmap_pdpe(kernel_pmap, va); 5374 if ((*pdpe & PG_V) == 0) 5375 panic("pmap_demote_DMAP: invalid PDPE"); 5376 if ((*pdpe & PG_PS) != 0) { 5377 if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) 5378 panic("pmap_demote_DMAP: PDPE failed"); 5379 changed = TRUE; 5380 } 5381 if (len < NBPDR) { 5382 pde = pmap_pdpe_to_pde(pdpe, va); 5383 if ((*pde & PG_V) == 0) 5384 panic("pmap_demote_DMAP: invalid PDE"); 5385 if ((*pde & PG_PS) != 0) { 5386 if (!pmap_demote_pde(kernel_pmap, pde, va)) 5387 panic("pmap_demote_DMAP: PDE failed"); 5388 changed = TRUE; 5389 } 5390 } 5391 if (changed && invalidate) 5392 pmap_invalidate_page(kernel_pmap, va); 5393 PMAP_UNLOCK(kernel_pmap); 5394 } 5395} 5396 5397/* 5398 * perform the pmap work for mincore 5399 */ 5400int 5401pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 5402{ 5403 pd_entry_t *pdep; 5404 pt_entry_t pte; 5405 vm_paddr_t pa; 5406 int val; 5407 5408 PMAP_LOCK(pmap); 5409retry: 5410 pdep = pmap_pde(pmap, addr); 5411 if (pdep != NULL && (*pdep & PG_V)) { 5412 if (*pdep & PG_PS) { 5413 pte = *pdep; 5414 /* Compute the physical address of the 4KB page. */ 5415 pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & 5416 PG_FRAME; 5417 val = MINCORE_SUPER; 5418 } else { 5419 pte = *pmap_pde_to_pte(pdep, addr); 5420 pa = pte & PG_FRAME; 5421 val = 0; 5422 } 5423 } else { 5424 pte = 0; 5425 pa = 0; 5426 val = 0; 5427 } 5428 if ((pte & PG_V) != 0) { 5429 val |= MINCORE_INCORE; 5430 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5431 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 5432 if ((pte & PG_A) != 0) 5433 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 5434 } 5435 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 5436 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 5437 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 5438 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. 
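 * vm_page_pa_tryrelock() returns nonzero when it had to drop the pmap lock
 * in order to acquire the page lock, in which case the lookup is restarted
 * at "retry".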
*/ 5439 if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) 5440 goto retry; 5441 } else 5442 PA_UNLOCK_COND(*locked_pa); 5443 PMAP_UNLOCK(pmap); 5444 return (val); 5445} 5446 5447void 5448pmap_activate(struct thread *td) 5449{ 5450 pmap_t pmap, oldpmap; 5451 u_int cpuid; 5452 u_int64_t cr3; 5453 5454 critical_enter(); 5455 pmap = vmspace_pmap(td->td_proc->p_vmspace); 5456 oldpmap = PCPU_GET(curpmap); 5457 cpuid = PCPU_GET(cpuid); 5458#ifdef SMP 5459 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 5460 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 5461#else 5462 CPU_CLR(cpuid, &oldpmap->pm_active); 5463 CPU_SET(cpuid, &pmap->pm_active); 5464#endif 5465 cr3 = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4); 5466 td->td_pcb->pcb_cr3 = cr3; 5467 load_cr3(cr3); 5468 PCPU_SET(curpmap, pmap); 5469 critical_exit(); 5470} 5471 5472void 5473pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 5474{ 5475} 5476 5477/* 5478 * Increase the starting virtual address of the given mapping if a 5479 * different alignment might result in more superpage mappings. 5480 */ 5481void 5482pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 5483 vm_offset_t *addr, vm_size_t size) 5484{ 5485 vm_offset_t superpage_offset; 5486 5487 if (size < NBPDR) 5488 return; 5489 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 5490 offset += ptoa(object->pg_color); 5491 superpage_offset = offset & PDRMASK; 5492 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 5493 (*addr & PDRMASK) == superpage_offset) 5494 return; 5495 if ((*addr & PDRMASK) < superpage_offset) 5496 *addr = (*addr & ~PDRMASK) + superpage_offset; 5497 else 5498 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 5499} 5500 5501#include "opt_ddb.h" 5502#ifdef DDB 5503#include <ddb/ddb.h> 5504 5505DB_SHOW_COMMAND(pte, pmap_print_pte) 5506{ 5507 pmap_t pmap; 5508 pml4_entry_t *pml4; 5509 pdp_entry_t *pdp; 5510 pd_entry_t *pde; 5511 pt_entry_t *pte; 5512 vm_offset_t va; 5513 5514 if (have_addr) { 5515 va = (vm_offset_t)addr; 5516 pmap = PCPU_GET(curpmap); /* XXX */ 5517 } else { 5518 db_printf("show pte addr\n"); 5519 return; 5520 } 5521 pml4 = pmap_pml4e(pmap, va); 5522 db_printf("VA %#016lx pml4e %#016lx", va, *pml4); 5523 if ((*pml4 & PG_V) == 0) { 5524 db_printf("\n"); 5525 return; 5526 } 5527 pdp = pmap_pml4e_to_pdpe(pml4, va); 5528 db_printf(" pdpe %#016lx", *pdp); 5529 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) { 5530 db_printf("\n"); 5531 return; 5532 } 5533 pde = pmap_pdpe_to_pde(pdp, va); 5534 db_printf(" pde %#016lx", *pde); 5535 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { 5536 db_printf("\n"); 5537 return; 5538 } 5539 pte = pmap_pde_to_pte(pde, va); 5540 db_printf(" pte %#016lx\n", *pte); 5541} 5542 5543DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) 5544{ 5545 vm_paddr_t a; 5546 5547 if (have_addr) { 5548 a = (vm_paddr_t)addr; 5549 db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); 5550 } else { 5551 db_printf("show phys2dmap addr\n"); 5552 } 5553} 5554#endif 5555
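/*
 * Editorial usage note (not part of the original source): with DDB compiled
 * in, the commands defined above can be invoked from the debugger prompt,
 * for example (the addresses shown are arbitrary):
 *
 *	db> show pte 0xffffffff81234000
 *	db> show phys2dmap 0x100000
 */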