pmap.c revision 269072
1/*- 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * Copyright (c) 2003 Peter Wemm 9 * All rights reserved. 10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 11 * All rights reserved. 12 * 13 * This code is derived from software contributed to Berkeley by 14 * the Systems Programming Group of the University of Utah Computer 15 * Science Department and William Jolitz of UUNET Technologies Inc. 16 * 17 * Redistribution and use in source and binary forms, with or without 18 * modification, are permitted provided that the following conditions 19 * are met: 20 * 1. Redistributions of source code must retain the above copyright 21 * notice, this list of conditions and the following disclaimer. 22 * 2. Redistributions in binary form must reproduce the above copyright 23 * notice, this list of conditions and the following disclaimer in the 24 * documentation and/or other materials provided with the distribution. 25 * 3. All advertising materials mentioning features or use of this software 26 * must display the following acknowledgement: 27 * This product includes software developed by the University of 28 * California, Berkeley and its contributors. 29 * 4. Neither the name of the University nor the names of its contributors 30 * may be used to endorse or promote products derived from this software 31 * without specific prior written permission. 32 * 33 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 34 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 35 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 36 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 37 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 38 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 39 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 40 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 41 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 42 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 43 * SUCH DAMAGE. 44 * 45 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 46 */ 47/*- 48 * Copyright (c) 2003 Networks Associates Technology, Inc. 49 * All rights reserved. 50 * 51 * This software was developed for the FreeBSD Project by Jake Burkholder, 52 * Safeport Network Services, and Network Associates Laboratories, the 53 * Security Research Division of Network Associates, Inc. under 54 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 55 * CHATS research program. 56 * 57 * Redistribution and use in source and binary forms, with or without 58 * modification, are permitted provided that the following conditions 59 * are met: 60 * 1. Redistributions of source code must retain the above copyright 61 * notice, this list of conditions and the following disclaimer. 62 * 2. Redistributions in binary form must reproduce the above copyright 63 * notice, this list of conditions and the following disclaimer in the 64 * documentation and/or other materials provided with the distribution. 65 * 66 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 67 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 68 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 69 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 70 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 71 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 72 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 73 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 74 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 75 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 76 * SUCH DAMAGE. 77 */ 78 79#define AMD64_NPT_AWARE 80 81#include <sys/cdefs.h> 82__FBSDID("$FreeBSD: stable/10/sys/amd64/amd64/pmap.c 269072 2014-07-24 16:29:44Z kib $"); 83 84/* 85 * Manages physical address maps. 86 * 87 * Since the information managed by this module is 88 * also stored by the logical address mapping module, 89 * this module may throw away valid virtual-to-physical 90 * mappings at almost any time. However, invalidations 91 * of virtual-to-physical mappings must be done as 92 * requested. 93 * 94 * In order to cope with hardware architectures which 95 * make virtual-to-physical map invalidates expensive, 96 * this module may delay invalidate or reduced protection 97 * operations until such time as they are actually 98 * necessary. This module is given full information as 99 * to which processors are currently using which maps, 100 * and to when physical maps must be made correct. 101 */ 102 103#include "opt_pmap.h" 104#include "opt_vm.h" 105 106#include <sys/param.h> 107#include <sys/bus.h> 108#include <sys/systm.h> 109#include <sys/kernel.h> 110#include <sys/ktr.h> 111#include <sys/lock.h> 112#include <sys/malloc.h> 113#include <sys/mman.h> 114#include <sys/mutex.h> 115#include <sys/proc.h> 116#include <sys/rwlock.h> 117#include <sys/sx.h> 118#include <sys/vmmeter.h> 119#include <sys/sched.h> 120#include <sys/sysctl.h> 121#include <sys/_unrhdr.h> 122#include <sys/smp.h> 123 124#include <vm/vm.h> 125#include <vm/vm_param.h> 126#include <vm/vm_kern.h> 127#include <vm/vm_page.h> 128#include <vm/vm_map.h> 129#include <vm/vm_object.h> 130#include <vm/vm_extern.h> 131#include <vm/vm_pageout.h> 132#include <vm/vm_pager.h> 133#include <vm/vm_radix.h> 134#include <vm/vm_reserv.h> 135#include <vm/uma.h> 136 137#include <machine/intr_machdep.h> 138#include <machine/apicvar.h> 139#include <machine/cpu.h> 140#include <machine/cputypes.h> 141#include <machine/md_var.h> 142#include <machine/pcb.h> 143#include <machine/specialreg.h> 144#ifdef SMP 145#include <machine/smp.h> 146#endif 147 148static __inline boolean_t 149pmap_emulate_ad_bits(pmap_t pmap) 150{ 151 152 return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0); 153} 154 155static __inline pt_entry_t 156pmap_valid_bit(pmap_t pmap) 157{ 158 pt_entry_t mask; 159 160 switch (pmap->pm_type) { 161 case PT_X86: 162 mask = X86_PG_V; 163 break; 164 case PT_EPT: 165 if (pmap_emulate_ad_bits(pmap)) 166 mask = EPT_PG_EMUL_V; 167 else 168 mask = EPT_PG_READ; 169 break; 170 default: 171 panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type); 172 } 173 174 return (mask); 175} 176 177static __inline pt_entry_t 178pmap_rw_bit(pmap_t pmap) 179{ 180 pt_entry_t mask; 181 182 switch (pmap->pm_type) { 183 case PT_X86: 184 mask = X86_PG_RW; 185 break; 186 case PT_EPT: 187 if (pmap_emulate_ad_bits(pmap)) 188 mask = EPT_PG_EMUL_RW; 189 else 190 mask = EPT_PG_WRITE; 191 break; 192 default: 193 panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type); 194 } 195 196 return (mask); 197} 198 199static __inline pt_entry_t 200pmap_global_bit(pmap_t pmap) 201{ 202 pt_entry_t mask; 203 204 switch (pmap->pm_type) { 205 case PT_X86: 206 mask = X86_PG_G; 207 break; 208 case PT_EPT: 209 mask = 0; 210 break; 211 default: 212 panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type); 213 } 214 215 return (mask); 216} 217 218static __inline pt_entry_t 219pmap_accessed_bit(pmap_t pmap) 220{ 221 pt_entry_t mask; 222 223 switch (pmap->pm_type) { 224 case PT_X86: 225 mask = X86_PG_A; 226 break; 227 case PT_EPT: 228 if (pmap_emulate_ad_bits(pmap)) 229 mask = EPT_PG_READ; 230 else 231 mask = EPT_PG_A; 232 break; 233 default: 234 panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type); 235 } 236 237 return (mask); 238} 239 240static __inline pt_entry_t 241pmap_modified_bit(pmap_t pmap) 242{ 243 pt_entry_t mask; 244 245 switch (pmap->pm_type) { 246 case PT_X86: 247 mask = X86_PG_M; 248 break; 249 case PT_EPT: 250 if (pmap_emulate_ad_bits(pmap)) 251 mask = EPT_PG_WRITE; 252 else 253 mask = EPT_PG_M; 254 break; 255 default: 256 panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type); 257 } 258 259 return (mask); 260} 261 262#if !defined(DIAGNOSTIC) 263#ifdef __GNUC_GNU_INLINE__ 264#define PMAP_INLINE __attribute__((__gnu_inline__)) inline 265#else 266#define PMAP_INLINE extern inline 267#endif 268#else 269#define PMAP_INLINE 270#endif 271 272#ifdef PV_STATS 273#define PV_STAT(x) do { x ; } while (0) 274#else 275#define PV_STAT(x) do { } while (0) 276#endif 277 278#define pa_index(pa) ((pa) >> PDRSHIFT) 279#define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 280 281#define NPV_LIST_LOCKS MAXCPU 282 283#define PHYS_TO_PV_LIST_LOCK(pa) \ 284 (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) 285 286#define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ 287 struct rwlock **_lockp = (lockp); \ 288 struct rwlock *_new_lock; \ 289 \ 290 _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ 291 if (_new_lock != *_lockp) { \ 292 if (*_lockp != NULL) \ 293 rw_wunlock(*_lockp); \ 294 *_lockp = _new_lock; \ 295 rw_wlock(*_lockp); \ 296 } \ 297} while (0) 298 299#define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ 300 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) 301 302#define RELEASE_PV_LIST_LOCK(lockp) do { \ 303 struct rwlock **_lockp = (lockp); \ 304 \ 305 if (*_lockp != NULL) { \ 306 rw_wunlock(*_lockp); \ 307 *_lockp = NULL; \ 308 } \ 309} while (0) 310 311#define VM_PAGE_TO_PV_LIST_LOCK(m) \ 312 PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) 313 314struct pmap kernel_pmap_store; 315 316vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 317vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 318 319int nkpt; 320SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, 321 "Number of kernel page table pages allocated on bootup"); 322 323static int ndmpdp; 324vm_paddr_t dmaplimit; 325vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; 326pt_entry_t pg_nx; 327 328static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); 329 330static int pat_works = 1; 331SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1, 332 "Is page attribute table fully functional?"); 333 334static int pg_ps_enabled = 1; 335SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0, 336 "Are large page mappings enabled?"); 337 338#define PAT_INDEX_SIZE 8 339static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ 340 341static u_int64_t KPTphys; /* phys addr of kernel level 1 */ 342static u_int64_t KPDphys; /* phys addr of kernel level 2 */ 343u_int64_t KPDPphys; /* phys addr of kernel level 3 */ 344u_int64_t KPML4phys; /* phys addr of kernel level 4 */ 345 346static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ 347static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ 348static int ndmpdpphys; /* number of DMPDPphys pages */ 349 350static struct rwlock_padalign pvh_global_lock; 351 352/* 353 * Data for the pv entry allocation mechanism 354 */ 355static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 356static struct mtx pv_chunks_mutex; 357static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; 358static struct md_page *pv_table; 359 360/* 361 * All those kernel PT submaps that BSD is so fond of 362 */ 363pt_entry_t *CMAP1 = 0; 364caddr_t CADDR1 = 0; 365 366static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ 367 368static struct unrhdr pcid_unr; 369static struct mtx pcid_mtx; 370int pmap_pcid_enabled = 0; 371SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN, &pmap_pcid_enabled, 372 0, "Is TLB Context ID enabled ?"); 373int invpcid_works = 0; 374SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, 375 "Is the invpcid instruction available ?"); 376 377static int 378pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS) 379{ 380 int i; 381 uint64_t res; 382 383 res = 0; 384 CPU_FOREACH(i) { 385 res += cpuid_to_pcpu[i]->pc_pm_save_cnt; 386 } 387 return (sysctl_handle_64(oidp, &res, 0, req)); 388} 389SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW | 390 CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU", 391 "Count of saved TLB context on switch"); 392 393/* pmap_copy_pages() over non-DMAP */ 394static struct mtx cpage_lock; 395static vm_offset_t cpage_a; 396static vm_offset_t cpage_b; 397 398/* 399 * Crashdump maps. 400 */ 401static caddr_t crashdumpmap; 402 403static void free_pv_chunk(struct pv_chunk *pc); 404static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 405static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); 406static int popcnt_pc_map_elem(uint64_t elem); 407static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); 408static void reserve_pv_entries(pmap_t pmap, int needed, 409 struct rwlock **lockp); 410static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 411 struct rwlock **lockp); 412static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 413 struct rwlock **lockp); 414static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 415 struct rwlock **lockp); 416static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 417static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 418 vm_offset_t va); 419 420static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode); 421static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 422static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, 423 vm_offset_t va, struct rwlock **lockp); 424static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, 425 vm_offset_t va); 426static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, 427 vm_prot_t prot, struct rwlock **lockp); 428static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 429 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); 430static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); 431static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); 432static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); 433static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va); 434static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask); 435static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 436 struct rwlock **lockp); 437static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, 438 vm_prot_t prot); 439static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask); 440static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 441 struct spglist *free, struct rwlock **lockp); 442static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, 443 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); 444static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte); 445static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 446 struct spglist *free); 447static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 448 vm_page_t m, struct rwlock **lockp); 449static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 450 pd_entry_t newpde); 451static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); 452 453static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, 454 struct rwlock **lockp); 455static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, 456 struct rwlock **lockp); 457static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, 458 struct rwlock **lockp); 459 460static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, 461 struct spglist *free); 462static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); 463static vm_offset_t pmap_kmem_choose(vm_offset_t addr); 464 465/* 466 * Move the kernel virtual free pointer to the next 467 * 2MB. This is used to help improve performance 468 * by using a large (2MB) page for much of the kernel 469 * (.text, .data, .bss) 470 */ 471static vm_offset_t 472pmap_kmem_choose(vm_offset_t addr) 473{ 474 vm_offset_t newaddr = addr; 475 476 newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); 477 return (newaddr); 478} 479 480/********************/ 481/* Inline functions */ 482/********************/ 483 484/* Return a non-clipped PD index for a given VA */ 485static __inline vm_pindex_t 486pmap_pde_pindex(vm_offset_t va) 487{ 488 return (va >> PDRSHIFT); 489} 490 491 492/* Return various clipped indexes for a given VA */ 493static __inline vm_pindex_t 494pmap_pte_index(vm_offset_t va) 495{ 496 497 return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); 498} 499 500static __inline vm_pindex_t 501pmap_pde_index(vm_offset_t va) 502{ 503 504 return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); 505} 506 507static __inline vm_pindex_t 508pmap_pdpe_index(vm_offset_t va) 509{ 510 511 return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); 512} 513 514static __inline vm_pindex_t 515pmap_pml4e_index(vm_offset_t va) 516{ 517 518 return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); 519} 520 521/* Return a pointer to the PML4 slot that corresponds to a VA */ 522static __inline pml4_entry_t * 523pmap_pml4e(pmap_t pmap, vm_offset_t va) 524{ 525 526 return (&pmap->pm_pml4[pmap_pml4e_index(va)]); 527} 528 529/* Return a pointer to the PDP slot that corresponds to a VA */ 530static __inline pdp_entry_t * 531pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) 532{ 533 pdp_entry_t *pdpe; 534 535 pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); 536 return (&pdpe[pmap_pdpe_index(va)]); 537} 538 539/* Return a pointer to the PDP slot that corresponds to a VA */ 540static __inline pdp_entry_t * 541pmap_pdpe(pmap_t pmap, vm_offset_t va) 542{ 543 pml4_entry_t *pml4e; 544 pt_entry_t PG_V; 545 546 PG_V = pmap_valid_bit(pmap); 547 pml4e = pmap_pml4e(pmap, va); 548 if ((*pml4e & PG_V) == 0) 549 return (NULL); 550 return (pmap_pml4e_to_pdpe(pml4e, va)); 551} 552 553/* Return a pointer to the PD slot that corresponds to a VA */ 554static __inline pd_entry_t * 555pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) 556{ 557 pd_entry_t *pde; 558 559 pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); 560 return (&pde[pmap_pde_index(va)]); 561} 562 563/* Return a pointer to the PD slot that corresponds to a VA */ 564static __inline pd_entry_t * 565pmap_pde(pmap_t pmap, vm_offset_t va) 566{ 567 pdp_entry_t *pdpe; 568 pt_entry_t PG_V; 569 570 PG_V = pmap_valid_bit(pmap); 571 pdpe = pmap_pdpe(pmap, va); 572 if (pdpe == NULL || (*pdpe & PG_V) == 0) 573 return (NULL); 574 return (pmap_pdpe_to_pde(pdpe, va)); 575} 576 577/* Return a pointer to the PT slot that corresponds to a VA */ 578static __inline pt_entry_t * 579pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) 580{ 581 pt_entry_t *pte; 582 583 pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 584 return (&pte[pmap_pte_index(va)]); 585} 586 587/* Return a pointer to the PT slot that corresponds to a VA */ 588static __inline pt_entry_t * 589pmap_pte(pmap_t pmap, vm_offset_t va) 590{ 591 pd_entry_t *pde; 592 pt_entry_t PG_V; 593 594 PG_V = pmap_valid_bit(pmap); 595 pde = pmap_pde(pmap, va); 596 if (pde == NULL || (*pde & PG_V) == 0) 597 return (NULL); 598 if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ 599 return ((pt_entry_t *)pde); 600 return (pmap_pde_to_pte(pde, va)); 601} 602 603static __inline void 604pmap_resident_count_inc(pmap_t pmap, int count) 605{ 606 607 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 608 pmap->pm_stats.resident_count += count; 609} 610 611static __inline void 612pmap_resident_count_dec(pmap_t pmap, int count) 613{ 614 615 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 616 KASSERT(pmap->pm_stats.resident_count >= count, 617 ("pmap %p resident count underflow %ld %d", pmap, 618 pmap->pm_stats.resident_count, count)); 619 pmap->pm_stats.resident_count -= count; 620} 621 622PMAP_INLINE pt_entry_t * 623vtopte(vm_offset_t va) 624{ 625 u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 626 627 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); 628 629 return (PTmap + ((va >> PAGE_SHIFT) & mask)); 630} 631 632static __inline pd_entry_t * 633vtopde(vm_offset_t va) 634{ 635 u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); 636 637 KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); 638 639 return (PDmap + ((va >> PDRSHIFT) & mask)); 640} 641 642static u_int64_t 643allocpages(vm_paddr_t *firstaddr, int n) 644{ 645 u_int64_t ret; 646 647 ret = *firstaddr; 648 bzero((void *)ret, n * PAGE_SIZE); 649 *firstaddr += n * PAGE_SIZE; 650 return (ret); 651} 652 653CTASSERT(powerof2(NDMPML4E)); 654 655/* number of kernel PDP slots */ 656#define NKPDPE(ptpgs) howmany((ptpgs), NPDEPG) 657 658static void 659nkpt_init(vm_paddr_t addr) 660{ 661 int pt_pages; 662 663#ifdef NKPT 664 pt_pages = NKPT; 665#else 666 pt_pages = howmany(addr, 1 << PDRSHIFT); 667 pt_pages += NKPDPE(pt_pages); 668 669 /* 670 * Add some slop beyond the bare minimum required for bootstrapping 671 * the kernel. 672 * 673 * This is quite important when allocating KVA for kernel modules. 674 * The modules are required to be linked in the negative 2GB of 675 * the address space. If we run out of KVA in this region then 676 * pmap_growkernel() will need to allocate page table pages to map 677 * the entire 512GB of KVA space which is an unnecessary tax on 678 * physical memory. 679 */ 680 pt_pages += 8; /* 16MB additional slop for kernel modules */ 681#endif 682 nkpt = pt_pages; 683} 684 685static void 686create_pagetables(vm_paddr_t *firstaddr) 687{ 688 int i, j, ndm1g, nkpdpe; 689 pt_entry_t *pt_p; 690 pd_entry_t *pd_p; 691 pdp_entry_t *pdp_p; 692 pml4_entry_t *p4_p; 693 694 /* Allocate page table pages for the direct map */ 695 ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT; 696 if (ndmpdp < 4) /* Minimum 4GB of dirmap */ 697 ndmpdp = 4; 698 ndmpdpphys = howmany(ndmpdp, NPDPEPG); 699 if (ndmpdpphys > NDMPML4E) { 700 /* 701 * Each NDMPML4E allows 512 GB, so limit to that, 702 * and then readjust ndmpdp and ndmpdpphys. 703 */ 704 printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); 705 Maxmem = atop(NDMPML4E * NBPML4); 706 ndmpdpphys = NDMPML4E; 707 ndmpdp = NDMPML4E * NPDEPG; 708 } 709 DMPDPphys = allocpages(firstaddr, ndmpdpphys); 710 ndm1g = 0; 711 if ((amd_feature & AMDID_PAGE1GB) != 0) 712 ndm1g = ptoa(Maxmem) >> PDPSHIFT; 713 if (ndm1g < ndmpdp) 714 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); 715 dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; 716 717 /* Allocate pages */ 718 KPML4phys = allocpages(firstaddr, 1); 719 KPDPphys = allocpages(firstaddr, NKPML4E); 720 721 /* 722 * Allocate the initial number of kernel page table pages required to 723 * bootstrap. We defer this until after all memory-size dependent 724 * allocations are done (e.g. direct map), so that we don't have to 725 * build in too much slop in our estimate. 726 * 727 * Note that when NKPML4E > 1, we have an empty page underneath 728 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed) 729 * pages. (pmap_enter requires a PD page to exist for each KPML4E.) 730 */ 731 nkpt_init(*firstaddr); 732 nkpdpe = NKPDPE(nkpt); 733 734 KPTphys = allocpages(firstaddr, nkpt); 735 KPDphys = allocpages(firstaddr, nkpdpe); 736 737 /* Fill in the underlying page table pages */ 738 /* Nominally read-only (but really R/W) from zero to physfree */ 739 /* XXX not fully used, underneath 2M pages */ 740 pt_p = (pt_entry_t *)KPTphys; 741 for (i = 0; ptoa(i) < *firstaddr; i++) 742 pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | X86_PG_G; 743 744 /* Now map the page tables at their location within PTmap */ 745 pd_p = (pd_entry_t *)KPDphys; 746 for (i = 0; i < nkpt; i++) 747 pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; 748 749 /* Map from zero to end of allocations under 2M pages */ 750 /* This replaces some of the KPTphys entries above */ 751 for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) 752 pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS | 753 X86_PG_G; 754 755 /* And connect up the PD to the PDP (leaving room for L4 pages) */ 756 pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); 757 for (i = 0; i < nkpdpe; i++) 758 pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V | 759 PG_U; 760 761 /* 762 * Now, set up the direct map region using 2MB and/or 1GB pages. If 763 * the end of physical memory is not aligned to a 1GB page boundary, 764 * then the residual physical memory is mapped with 2MB pages. Later, 765 * if pmap_mapdev{_attr}() uses the direct map for non-write-back 766 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings 767 * that are partially used. 768 */ 769 pd_p = (pd_entry_t *)DMPDphys; 770 for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { 771 pd_p[j] = (vm_paddr_t)i << PDRSHIFT; 772 /* Preset PG_M and PG_A because demotion expects it. */ 773 pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G | 774 X86_PG_M | X86_PG_A; 775 } 776 pdp_p = (pdp_entry_t *)DMPDPphys; 777 for (i = 0; i < ndm1g; i++) { 778 pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; 779 /* Preset PG_M and PG_A because demotion expects it. */ 780 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | X86_PG_G | 781 X86_PG_M | X86_PG_A; 782 } 783 for (j = 0; i < ndmpdp; i++, j++) { 784 pdp_p[i] = DMPDphys + ptoa(j); 785 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U; 786 } 787 788 /* And recursively map PML4 to itself in order to get PTmap */ 789 p4_p = (pml4_entry_t *)KPML4phys; 790 p4_p[PML4PML4I] = KPML4phys; 791 p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U; 792 793 /* Connect the Direct Map slot(s) up to the PML4. */ 794 for (i = 0; i < ndmpdpphys; i++) { 795 p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); 796 p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V | PG_U; 797 } 798 799 /* Connect the KVA slots up to the PML4 */ 800 for (i = 0; i < NKPML4E; i++) { 801 p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); 802 p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V | PG_U; 803 } 804} 805 806/* 807 * Bootstrap the system enough to run with virtual memory. 808 * 809 * On amd64 this is called after mapping has already been enabled 810 * and just syncs the pmap module with what has already been done. 811 * [We can't call it easily with mapping off since the kernel is not 812 * mapped with PA == VA, hence we would have to relocate every address 813 * from the linked base (virtual) address "KERNBASE" to the actual 814 * (physical) address starting relative to 0] 815 */ 816void 817pmap_bootstrap(vm_paddr_t *firstaddr) 818{ 819 vm_offset_t va; 820 pt_entry_t *pte; 821 822 /* 823 * Create an initial set of page tables to run the kernel in. 824 */ 825 create_pagetables(firstaddr); 826 827 virtual_avail = (vm_offset_t) KERNBASE + *firstaddr; 828 virtual_avail = pmap_kmem_choose(virtual_avail); 829 830 virtual_end = VM_MAX_KERNEL_ADDRESS; 831 832 833 /* XXX do %cr0 as well */ 834 load_cr4(rcr4() | CR4_PGE | CR4_PSE); 835 load_cr3(KPML4phys); 836 if (cpu_stdext_feature & CPUID_STDEXT_SMEP) 837 load_cr4(rcr4() | CR4_SMEP); 838 839 /* 840 * Initialize the kernel pmap (which is statically allocated). 841 */ 842 PMAP_LOCK_INIT(kernel_pmap); 843 kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys); 844 kernel_pmap->pm_cr3 = KPML4phys; 845 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ 846 CPU_FILL(&kernel_pmap->pm_save); /* always superset of pm_active */ 847 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 848 kernel_pmap->pm_flags = pmap_flags; 849 850 /* 851 * Initialize the global pv list lock. 852 */ 853 rw_init(&pvh_global_lock, "pmap pv global"); 854 855 /* 856 * Reserve some special page table entries/VA space for temporary 857 * mapping of pages. 858 */ 859#define SYSMAP(c, p, v, n) \ 860 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 861 862 va = virtual_avail; 863 pte = vtopte(va); 864 865 /* 866 * Crashdump maps. The first page is reused as CMAP1 for the 867 * memory test. 868 */ 869 SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) 870 CADDR1 = crashdumpmap; 871 872 virtual_avail = va; 873 874 /* Initialize the PAT MSR. */ 875 pmap_init_pat(); 876 877 /* Initialize TLB Context Id. */ 878 TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled); 879 if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) { 880 load_cr4(rcr4() | CR4_PCIDE); 881 mtx_init(&pcid_mtx, "pcid", NULL, MTX_DEF); 882 init_unrhdr(&pcid_unr, 1, (1 << 12) - 1, &pcid_mtx); 883 /* Check for INVPCID support */ 884 invpcid_works = (cpu_stdext_feature & CPUID_STDEXT_INVPCID) 885 != 0; 886 kernel_pmap->pm_pcid = 0; 887#ifndef SMP 888 pmap_pcid_enabled = 0; 889#endif 890 } else 891 pmap_pcid_enabled = 0; 892} 893 894/* 895 * Setup the PAT MSR. 896 */ 897void 898pmap_init_pat(void) 899{ 900 int pat_table[PAT_INDEX_SIZE]; 901 uint64_t pat_msr; 902 u_long cr0, cr4; 903 int i; 904 905 /* Bail if this CPU doesn't implement PAT. */ 906 if ((cpu_feature & CPUID_PAT) == 0) 907 panic("no PAT??"); 908 909 /* Set default PAT index table. */ 910 for (i = 0; i < PAT_INDEX_SIZE; i++) 911 pat_table[i] = -1; 912 pat_table[PAT_WRITE_BACK] = 0; 913 pat_table[PAT_WRITE_THROUGH] = 1; 914 pat_table[PAT_UNCACHEABLE] = 3; 915 pat_table[PAT_WRITE_COMBINING] = 3; 916 pat_table[PAT_WRITE_PROTECTED] = 3; 917 pat_table[PAT_UNCACHED] = 3; 918 919 /* Initialize default PAT entries. */ 920 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | 921 PAT_VALUE(1, PAT_WRITE_THROUGH) | 922 PAT_VALUE(2, PAT_UNCACHED) | 923 PAT_VALUE(3, PAT_UNCACHEABLE) | 924 PAT_VALUE(4, PAT_WRITE_BACK) | 925 PAT_VALUE(5, PAT_WRITE_THROUGH) | 926 PAT_VALUE(6, PAT_UNCACHED) | 927 PAT_VALUE(7, PAT_UNCACHEABLE); 928 929 if (pat_works) { 930 /* 931 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. 932 * Program 5 and 6 as WP and WC. 933 * Leave 4 and 7 as WB and UC. 934 */ 935 pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); 936 pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | 937 PAT_VALUE(6, PAT_WRITE_COMBINING); 938 pat_table[PAT_UNCACHED] = 2; 939 pat_table[PAT_WRITE_PROTECTED] = 5; 940 pat_table[PAT_WRITE_COMBINING] = 6; 941 } else { 942 /* 943 * Just replace PAT Index 2 with WC instead of UC-. 944 */ 945 pat_msr &= ~PAT_MASK(2); 946 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); 947 pat_table[PAT_WRITE_COMBINING] = 2; 948 } 949 950 /* Disable PGE. */ 951 cr4 = rcr4(); 952 load_cr4(cr4 & ~CR4_PGE); 953 954 /* Disable caches (CD = 1, NW = 0). */ 955 cr0 = rcr0(); 956 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 957 958 /* Flushes caches and TLBs. */ 959 wbinvd(); 960 invltlb(); 961 962 /* Update PAT and index table. */ 963 wrmsr(MSR_PAT, pat_msr); 964 for (i = 0; i < PAT_INDEX_SIZE; i++) 965 pat_index[i] = pat_table[i]; 966 967 /* Flush caches and TLBs again. */ 968 wbinvd(); 969 invltlb(); 970 971 /* Restore caches and PGE. */ 972 load_cr0(cr0); 973 load_cr4(cr4); 974} 975 976/* 977 * Initialize a vm_page's machine-dependent fields. 978 */ 979void 980pmap_page_init(vm_page_t m) 981{ 982 983 TAILQ_INIT(&m->md.pv_list); 984 m->md.pat_mode = PAT_WRITE_BACK; 985} 986 987/* 988 * Initialize the pmap module. 989 * Called by vm_init, to initialize any structures that the pmap 990 * system needs to map virtual memory. 991 */ 992void 993pmap_init(void) 994{ 995 vm_page_t mpte; 996 vm_size_t s; 997 int i, pv_npg; 998 999 /* 1000 * Initialize the vm page array entries for the kernel pmap's 1001 * page table pages. 1002 */ 1003 for (i = 0; i < nkpt; i++) { 1004 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); 1005 KASSERT(mpte >= vm_page_array && 1006 mpte < &vm_page_array[vm_page_array_size], 1007 ("pmap_init: page table page is out of range")); 1008 mpte->pindex = pmap_pde_pindex(KERNBASE) + i; 1009 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); 1010 } 1011 1012 /* 1013 * If the kernel is running on a virtual machine, then it must assume 1014 * that MCA is enabled by the hypervisor. Moreover, the kernel must 1015 * be prepared for the hypervisor changing the vendor and family that 1016 * are reported by CPUID. Consequently, the workaround for AMD Family 1017 * 10h Erratum 383 is enabled if the processor's feature set does not 1018 * include at least one feature that is only supported by older Intel 1019 * or newer AMD processors. 1020 */ 1021 if (vm_guest == VM_GUEST_VM && (cpu_feature & CPUID_SS) == 0 && 1022 (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | 1023 CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | 1024 AMDID2_FMA4)) == 0) 1025 workaround_erratum383 = 1; 1026 1027 /* 1028 * Are large page mappings enabled? 1029 */ 1030 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); 1031 if (pg_ps_enabled) { 1032 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 1033 ("pmap_init: can't assign to pagesizes[1]")); 1034 pagesizes[1] = NBPDR; 1035 } 1036 1037 /* 1038 * Initialize the pv chunk list mutex. 1039 */ 1040 mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); 1041 1042 /* 1043 * Initialize the pool of pv list locks. 1044 */ 1045 for (i = 0; i < NPV_LIST_LOCKS; i++) 1046 rw_init(&pv_list_locks[i], "pmap pv list"); 1047 1048 /* 1049 * Calculate the size of the pv head table for superpages. 1050 */ 1051 for (i = 0; phys_avail[i + 1]; i += 2); 1052 pv_npg = round_2mpage(phys_avail[(i - 2) + 1]) / NBPDR; 1053 1054 /* 1055 * Allocate memory for the pv head table for superpages. 1056 */ 1057 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 1058 s = round_page(s); 1059 pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, 1060 M_WAITOK | M_ZERO); 1061 for (i = 0; i < pv_npg; i++) 1062 TAILQ_INIT(&pv_table[i].pv_list); 1063 1064 mtx_init(&cpage_lock, "cpage", NULL, MTX_DEF); 1065 cpage_a = kva_alloc(PAGE_SIZE); 1066 cpage_b = kva_alloc(PAGE_SIZE); 1067} 1068 1069static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, 1070 "2MB page mapping counters"); 1071 1072static u_long pmap_pde_demotions; 1073SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, 1074 &pmap_pde_demotions, 0, "2MB page demotions"); 1075 1076static u_long pmap_pde_mappings; 1077SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, 1078 &pmap_pde_mappings, 0, "2MB page mappings"); 1079 1080static u_long pmap_pde_p_failures; 1081SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, 1082 &pmap_pde_p_failures, 0, "2MB page promotion failures"); 1083 1084static u_long pmap_pde_promotions; 1085SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, 1086 &pmap_pde_promotions, 0, "2MB page promotions"); 1087 1088static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0, 1089 "1GB page mapping counters"); 1090 1091static u_long pmap_pdpe_demotions; 1092SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, 1093 &pmap_pdpe_demotions, 0, "1GB page demotions"); 1094 1095/*************************************************** 1096 * Low level helper routines..... 1097 ***************************************************/ 1098 1099static pt_entry_t 1100pmap_swap_pat(pmap_t pmap, pt_entry_t entry) 1101{ 1102 int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; 1103 1104 switch (pmap->pm_type) { 1105 case PT_X86: 1106 /* Verify that both PAT bits are not set at the same time */ 1107 KASSERT((entry & x86_pat_bits) != x86_pat_bits, 1108 ("Invalid PAT bits in entry %#lx", entry)); 1109 1110 /* Swap the PAT bits if one of them is set */ 1111 if ((entry & x86_pat_bits) != 0) 1112 entry ^= x86_pat_bits; 1113 break; 1114 case PT_EPT: 1115 /* 1116 * Nothing to do - the memory attributes are represented 1117 * the same way for regular pages and superpages. 1118 */ 1119 break; 1120 default: 1121 panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); 1122 } 1123 1124 return (entry); 1125} 1126 1127/* 1128 * Determine the appropriate bits to set in a PTE or PDE for a specified 1129 * caching mode. 1130 */ 1131static int 1132pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) 1133{ 1134 int cache_bits, pat_flag, pat_idx; 1135 1136 if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) 1137 panic("Unknown caching mode %d\n", mode); 1138 1139 switch (pmap->pm_type) { 1140 case PT_X86: 1141 /* The PAT bit is different for PTE's and PDE's. */ 1142 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; 1143 1144 /* Map the caching mode to a PAT index. */ 1145 pat_idx = pat_index[mode]; 1146 1147 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ 1148 cache_bits = 0; 1149 if (pat_idx & 0x4) 1150 cache_bits |= pat_flag; 1151 if (pat_idx & 0x2) 1152 cache_bits |= PG_NC_PCD; 1153 if (pat_idx & 0x1) 1154 cache_bits |= PG_NC_PWT; 1155 break; 1156 1157 case PT_EPT: 1158 cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); 1159 break; 1160 1161 default: 1162 panic("unsupported pmap type %d", pmap->pm_type); 1163 } 1164 1165 return (cache_bits); 1166} 1167 1168static int 1169pmap_cache_mask(pmap_t pmap, boolean_t is_pde) 1170{ 1171 int mask; 1172 1173 switch (pmap->pm_type) { 1174 case PT_X86: 1175 mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; 1176 break; 1177 case PT_EPT: 1178 mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); 1179 break; 1180 default: 1181 panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); 1182 } 1183 1184 return (mask); 1185} 1186 1187static __inline boolean_t 1188pmap_ps_enabled(pmap_t pmap) 1189{ 1190 1191 return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); 1192} 1193 1194static void 1195pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) 1196{ 1197 1198 switch (pmap->pm_type) { 1199 case PT_X86: 1200 break; 1201 case PT_EPT: 1202 /* 1203 * XXX 1204 * This is a little bogus since the generation number is 1205 * supposed to be bumped up when a region of the address 1206 * space is invalidated in the page tables. 1207 * 1208 * In this case the old PDE entry is valid but yet we want 1209 * to make sure that any mappings using the old entry are 1210 * invalidated in the TLB. 1211 * 1212 * The reason this works as expected is because we rendezvous 1213 * "all" host cpus and force any vcpu context to exit as a 1214 * side-effect. 1215 */ 1216 atomic_add_acq_long(&pmap->pm_eptgen, 1); 1217 break; 1218 default: 1219 panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); 1220 } 1221 pde_store(pde, newpde); 1222} 1223 1224/* 1225 * After changing the page size for the specified virtual address in the page 1226 * table, flush the corresponding entries from the processor's TLB. Only the 1227 * calling processor's TLB is affected. 1228 * 1229 * The calling thread must be pinned to a processor. 1230 */ 1231static void 1232pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) 1233{ 1234 pt_entry_t PG_G; 1235 1236 if (pmap->pm_type == PT_EPT) 1237 return; 1238 1239 KASSERT(pmap->pm_type == PT_X86, 1240 ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); 1241 1242 PG_G = pmap_global_bit(pmap); 1243 1244 if ((newpde & PG_PS) == 0) 1245 /* Demotion: flush a specific 2MB page mapping. */ 1246 invlpg(va); 1247 else if ((newpde & PG_G) == 0) 1248 /* 1249 * Promotion: flush every 4KB page mapping from the TLB 1250 * because there are too many to flush individually. 1251 */ 1252 invltlb(); 1253 else { 1254 /* 1255 * Promotion: flush every 4KB page mapping from the TLB, 1256 * including any global (PG_G) mappings. 1257 */ 1258 invltlb_globpcid(); 1259 } 1260} 1261#ifdef SMP 1262 1263static void 1264pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va) 1265{ 1266 struct invpcid_descr d; 1267 uint64_t cr3; 1268 1269 if (invpcid_works) { 1270 d.pcid = pmap->pm_pcid; 1271 d.pad = 0; 1272 d.addr = va; 1273 invpcid(&d, INVPCID_ADDR); 1274 return; 1275 } 1276 1277 cr3 = rcr3(); 1278 critical_enter(); 1279 load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE); 1280 invlpg(va); 1281 load_cr3(cr3 | CR3_PCID_SAVE); 1282 critical_exit(); 1283} 1284 1285/* 1286 * For SMP, these functions have to use the IPI mechanism for coherence. 1287 * 1288 * N.B.: Before calling any of the following TLB invalidation functions, 1289 * the calling processor must ensure that all stores updating a non- 1290 * kernel page table are globally performed. Otherwise, another 1291 * processor could cache an old, pre-update entry without being 1292 * invalidated. This can happen one of two ways: (1) The pmap becomes 1293 * active on another processor after its pm_active field is checked by 1294 * one of the following functions but before a store updating the page 1295 * table is globally performed. (2) The pmap becomes active on another 1296 * processor before its pm_active field is checked but due to 1297 * speculative loads one of the following functions stills reads the 1298 * pmap as inactive on the other processor. 1299 * 1300 * The kernel page table is exempt because its pm_active field is 1301 * immutable. The kernel page table is always active on every 1302 * processor. 1303 */ 1304 1305/* 1306 * Interrupt the cpus that are executing in the guest context. 1307 * This will force the vcpu to exit and the cached EPT mappings 1308 * will be invalidated by the host before the next vmresume. 1309 */ 1310static __inline void 1311pmap_invalidate_ept(pmap_t pmap) 1312{ 1313 int ipinum; 1314 1315 sched_pin(); 1316 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), 1317 ("pmap_invalidate_ept: absurd pm_active")); 1318 1319 /* 1320 * The TLB mappings associated with a vcpu context are not 1321 * flushed each time a different vcpu is chosen to execute. 1322 * 1323 * This is in contrast with a process's vtop mappings that 1324 * are flushed from the TLB on each context switch. 1325 * 1326 * Therefore we need to do more than just a TLB shootdown on 1327 * the active cpus in 'pmap->pm_active'. To do this we keep 1328 * track of the number of invalidations performed on this pmap. 1329 * 1330 * Each vcpu keeps a cache of this counter and compares it 1331 * just before a vmresume. If the counter is out-of-date an 1332 * invept will be done to flush stale mappings from the TLB. 1333 */ 1334 atomic_add_acq_long(&pmap->pm_eptgen, 1); 1335 1336 /* 1337 * Force the vcpu to exit and trap back into the hypervisor. 1338 */ 1339 ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; 1340 ipi_selected(pmap->pm_active, ipinum); 1341 sched_unpin(); 1342} 1343 1344void 1345pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1346{ 1347 cpuset_t other_cpus; 1348 u_int cpuid; 1349 1350 if (pmap->pm_type == PT_EPT) { 1351 pmap_invalidate_ept(pmap); 1352 return; 1353 } 1354 1355 KASSERT(pmap->pm_type == PT_X86, 1356 ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); 1357 1358 sched_pin(); 1359 if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { 1360 if (!pmap_pcid_enabled) { 1361 invlpg(va); 1362 } else { 1363 if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) { 1364 if (pmap == PCPU_GET(curpmap)) 1365 invlpg(va); 1366 else 1367 pmap_invalidate_page_pcid(pmap, va); 1368 } else { 1369 invltlb_globpcid(); 1370 } 1371 } 1372 smp_invlpg(pmap, va); 1373 } else { 1374 cpuid = PCPU_GET(cpuid); 1375 other_cpus = all_cpus; 1376 CPU_CLR(cpuid, &other_cpus); 1377 if (CPU_ISSET(cpuid, &pmap->pm_active)) 1378 invlpg(va); 1379 else if (pmap_pcid_enabled) { 1380 if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) 1381 pmap_invalidate_page_pcid(pmap, va); 1382 else 1383 invltlb_globpcid(); 1384 } 1385 if (pmap_pcid_enabled) 1386 CPU_AND(&other_cpus, &pmap->pm_save); 1387 else 1388 CPU_AND(&other_cpus, &pmap->pm_active); 1389 if (!CPU_EMPTY(&other_cpus)) 1390 smp_masked_invlpg(other_cpus, pmap, va); 1391 } 1392 sched_unpin(); 1393} 1394 1395static void 1396pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1397{ 1398 struct invpcid_descr d; 1399 uint64_t cr3; 1400 vm_offset_t addr; 1401 1402 if (invpcid_works) { 1403 d.pcid = pmap->pm_pcid; 1404 d.pad = 0; 1405 for (addr = sva; addr < eva; addr += PAGE_SIZE) { 1406 d.addr = addr; 1407 invpcid(&d, INVPCID_ADDR); 1408 } 1409 return; 1410 } 1411 1412 cr3 = rcr3(); 1413 critical_enter(); 1414 load_cr3(pmap->pm_cr3 | CR3_PCID_SAVE); 1415 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1416 invlpg(addr); 1417 load_cr3(cr3 | CR3_PCID_SAVE); 1418 critical_exit(); 1419} 1420 1421void 1422pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1423{ 1424 cpuset_t other_cpus; 1425 vm_offset_t addr; 1426 u_int cpuid; 1427 1428 if (pmap->pm_type == PT_EPT) { 1429 pmap_invalidate_ept(pmap); 1430 return; 1431 } 1432 1433 KASSERT(pmap->pm_type == PT_X86, 1434 ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); 1435 1436 sched_pin(); 1437 if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { 1438 if (!pmap_pcid_enabled) { 1439 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1440 invlpg(addr); 1441 } else { 1442 if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) { 1443 if (pmap == PCPU_GET(curpmap)) { 1444 for (addr = sva; addr < eva; 1445 addr += PAGE_SIZE) 1446 invlpg(addr); 1447 } else { 1448 pmap_invalidate_range_pcid(pmap, 1449 sva, eva); 1450 } 1451 } else { 1452 invltlb_globpcid(); 1453 } 1454 } 1455 smp_invlpg_range(pmap, sva, eva); 1456 } else { 1457 cpuid = PCPU_GET(cpuid); 1458 other_cpus = all_cpus; 1459 CPU_CLR(cpuid, &other_cpus); 1460 if (CPU_ISSET(cpuid, &pmap->pm_active)) { 1461 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1462 invlpg(addr); 1463 } else if (pmap_pcid_enabled) { 1464 if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) 1465 pmap_invalidate_range_pcid(pmap, sva, eva); 1466 else 1467 invltlb_globpcid(); 1468 } 1469 if (pmap_pcid_enabled) 1470 CPU_AND(&other_cpus, &pmap->pm_save); 1471 else 1472 CPU_AND(&other_cpus, &pmap->pm_active); 1473 if (!CPU_EMPTY(&other_cpus)) 1474 smp_masked_invlpg_range(other_cpus, pmap, sva, eva); 1475 } 1476 sched_unpin(); 1477} 1478 1479void 1480pmap_invalidate_all(pmap_t pmap) 1481{ 1482 cpuset_t other_cpus; 1483 struct invpcid_descr d; 1484 uint64_t cr3; 1485 u_int cpuid; 1486 1487 if (pmap->pm_type == PT_EPT) { 1488 pmap_invalidate_ept(pmap); 1489 return; 1490 } 1491 1492 KASSERT(pmap->pm_type == PT_X86, 1493 ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); 1494 1495 sched_pin(); 1496 cpuid = PCPU_GET(cpuid); 1497 if (pmap == kernel_pmap || 1498 (pmap_pcid_enabled && !CPU_CMP(&pmap->pm_save, &all_cpus)) || 1499 !CPU_CMP(&pmap->pm_active, &all_cpus)) { 1500 if (invpcid_works) { 1501 bzero(&d, sizeof(d)); 1502 invpcid(&d, INVPCID_CTXGLOB); 1503 } else { 1504 invltlb_globpcid(); 1505 } 1506 if (!CPU_ISSET(cpuid, &pmap->pm_active)) 1507 CPU_CLR_ATOMIC(cpuid, &pmap->pm_save); 1508 smp_invltlb(pmap); 1509 } else { 1510 other_cpus = all_cpus; 1511 CPU_CLR(cpuid, &other_cpus); 1512 1513 /* 1514 * This logic is duplicated in the Xinvltlb shootdown 1515 * IPI handler. 1516 */ 1517 if (pmap_pcid_enabled) { 1518 if (pmap->pm_pcid != -1 && pmap->pm_pcid != 0) { 1519 if (invpcid_works) { 1520 d.pcid = pmap->pm_pcid; 1521 d.pad = 0; 1522 d.addr = 0; 1523 invpcid(&d, INVPCID_CTX); 1524 } else { 1525 cr3 = rcr3(); 1526 critical_enter(); 1527 1528 /* 1529 * Bit 63 is clear, pcid TLB 1530 * entries are invalidated. 1531 */ 1532 load_cr3(pmap->pm_cr3); 1533 load_cr3(cr3 | CR3_PCID_SAVE); 1534 critical_exit(); 1535 } 1536 } else { 1537 invltlb_globpcid(); 1538 } 1539 } else if (CPU_ISSET(cpuid, &pmap->pm_active)) 1540 invltlb(); 1541 if (!CPU_ISSET(cpuid, &pmap->pm_active)) 1542 CPU_CLR_ATOMIC(cpuid, &pmap->pm_save); 1543 if (pmap_pcid_enabled) 1544 CPU_AND(&other_cpus, &pmap->pm_save); 1545 else 1546 CPU_AND(&other_cpus, &pmap->pm_active); 1547 if (!CPU_EMPTY(&other_cpus)) 1548 smp_masked_invltlb(other_cpus, pmap); 1549 } 1550 sched_unpin(); 1551} 1552 1553void 1554pmap_invalidate_cache(void) 1555{ 1556 1557 sched_pin(); 1558 wbinvd(); 1559 smp_cache_flush(); 1560 sched_unpin(); 1561} 1562 1563struct pde_action { 1564 cpuset_t invalidate; /* processors that invalidate their TLB */ 1565 pmap_t pmap; 1566 vm_offset_t va; 1567 pd_entry_t *pde; 1568 pd_entry_t newpde; 1569 u_int store; /* processor that updates the PDE */ 1570}; 1571 1572static void 1573pmap_update_pde_action(void *arg) 1574{ 1575 struct pde_action *act = arg; 1576 1577 if (act->store == PCPU_GET(cpuid)) 1578 pmap_update_pde_store(act->pmap, act->pde, act->newpde); 1579} 1580 1581static void 1582pmap_update_pde_teardown(void *arg) 1583{ 1584 struct pde_action *act = arg; 1585 1586 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) 1587 pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); 1588} 1589 1590/* 1591 * Change the page size for the specified virtual address in a way that 1592 * prevents any possibility of the TLB ever having two entries that map the 1593 * same virtual address using different page sizes. This is the recommended 1594 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a 1595 * machine check exception for a TLB state that is improperly diagnosed as a 1596 * hardware error. 1597 */ 1598static void 1599pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1600{ 1601 struct pde_action act; 1602 cpuset_t active, other_cpus; 1603 u_int cpuid; 1604 1605 sched_pin(); 1606 cpuid = PCPU_GET(cpuid); 1607 other_cpus = all_cpus; 1608 CPU_CLR(cpuid, &other_cpus); 1609 if (pmap == kernel_pmap || pmap->pm_type == PT_EPT) 1610 active = all_cpus; 1611 else { 1612 active = pmap->pm_active; 1613 CPU_AND_ATOMIC(&pmap->pm_save, &active); 1614 } 1615 if (CPU_OVERLAP(&active, &other_cpus)) { 1616 act.store = cpuid; 1617 act.invalidate = active; 1618 act.va = va; 1619 act.pmap = pmap; 1620 act.pde = pde; 1621 act.newpde = newpde; 1622 CPU_SET(cpuid, &active); 1623 smp_rendezvous_cpus(active, 1624 smp_no_rendevous_barrier, pmap_update_pde_action, 1625 pmap_update_pde_teardown, &act); 1626 } else { 1627 pmap_update_pde_store(pmap, pde, newpde); 1628 if (CPU_ISSET(cpuid, &active)) 1629 pmap_update_pde_invalidate(pmap, va, newpde); 1630 } 1631 sched_unpin(); 1632} 1633#else /* !SMP */ 1634/* 1635 * Normal, non-SMP, invalidation functions. 1636 * We inline these within pmap.c for speed. 1637 */ 1638PMAP_INLINE void 1639pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1640{ 1641 1642 switch (pmap->pm_type) { 1643 case PT_X86: 1644 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1645 invlpg(va); 1646 break; 1647 case PT_EPT: 1648 pmap->pm_eptgen++; 1649 break; 1650 default: 1651 panic("pmap_invalidate_page: unknown type: %d", pmap->pm_type); 1652 } 1653} 1654 1655PMAP_INLINE void 1656pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1657{ 1658 vm_offset_t addr; 1659 1660 switch (pmap->pm_type) { 1661 case PT_X86: 1662 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1663 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1664 invlpg(addr); 1665 break; 1666 case PT_EPT: 1667 pmap->pm_eptgen++; 1668 break; 1669 default: 1670 panic("pmap_invalidate_range: unknown type: %d", pmap->pm_type); 1671 } 1672} 1673 1674PMAP_INLINE void 1675pmap_invalidate_all(pmap_t pmap) 1676{ 1677 1678 switch (pmap->pm_type) { 1679 case PT_X86: 1680 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1681 invltlb(); 1682 break; 1683 case PT_EPT: 1684 pmap->pm_eptgen++; 1685 break; 1686 default: 1687 panic("pmap_invalidate_all: unknown type %d", pmap->pm_type); 1688 } 1689} 1690 1691PMAP_INLINE void 1692pmap_invalidate_cache(void) 1693{ 1694 1695 wbinvd(); 1696} 1697 1698static void 1699pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1700{ 1701 1702 pmap_update_pde_store(pmap, pde, newpde); 1703 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1704 pmap_update_pde_invalidate(pmap, va, newpde); 1705 else 1706 CPU_ZERO(&pmap->pm_save); 1707} 1708#endif /* !SMP */ 1709 1710#define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 1711 1712void 1713pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 1714{ 1715 1716 KASSERT((sva & PAGE_MASK) == 0, 1717 ("pmap_invalidate_cache_range: sva not page-aligned")); 1718 KASSERT((eva & PAGE_MASK) == 0, 1719 ("pmap_invalidate_cache_range: eva not page-aligned")); 1720 1721 if (cpu_feature & CPUID_SS) 1722 ; /* If "Self Snoop" is supported, do nothing. */ 1723 else if ((cpu_feature & CPUID_CLFSH) != 0 && 1724 eva - sva < PMAP_CLFLUSH_THRESHOLD) { 1725 1726 /* 1727 * XXX: Some CPUs fault, hang, or trash the local APIC 1728 * registers if we use CLFLUSH on the local APIC 1729 * range. The local APIC is always uncached, so we 1730 * don't need to flush for that range anyway. 1731 */ 1732 if (pmap_kextract(sva) == lapic_paddr) 1733 return; 1734 1735 /* 1736 * Otherwise, do per-cache line flush. Use the mfence 1737 * instruction to insure that previous stores are 1738 * included in the write-back. The processor 1739 * propagates flush to other processors in the cache 1740 * coherence domain. 1741 */ 1742 mfence(); 1743 for (; sva < eva; sva += cpu_clflush_line_size) 1744 clflush(sva); 1745 mfence(); 1746 } else { 1747 1748 /* 1749 * No targeted cache flush methods are supported by CPU, 1750 * or the supplied range is bigger than 2MB. 1751 * Globally invalidate cache. 1752 */ 1753 pmap_invalidate_cache(); 1754 } 1755} 1756 1757/* 1758 * Remove the specified set of pages from the data and instruction caches. 1759 * 1760 * In contrast to pmap_invalidate_cache_range(), this function does not 1761 * rely on the CPU's self-snoop feature, because it is intended for use 1762 * when moving pages into a different cache domain. 1763 */ 1764void 1765pmap_invalidate_cache_pages(vm_page_t *pages, int count) 1766{ 1767 vm_offset_t daddr, eva; 1768 int i; 1769 1770 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 1771 (cpu_feature & CPUID_CLFSH) == 0) 1772 pmap_invalidate_cache(); 1773 else { 1774 mfence(); 1775 for (i = 0; i < count; i++) { 1776 daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); 1777 eva = daddr + PAGE_SIZE; 1778 for (; daddr < eva; daddr += cpu_clflush_line_size) 1779 clflush(daddr); 1780 } 1781 mfence(); 1782 } 1783} 1784 1785/* 1786 * Routine: pmap_extract 1787 * Function: 1788 * Extract the physical page address associated 1789 * with the given map/virtual_address pair. 1790 */ 1791vm_paddr_t 1792pmap_extract(pmap_t pmap, vm_offset_t va) 1793{ 1794 pdp_entry_t *pdpe; 1795 pd_entry_t *pde; 1796 pt_entry_t *pte, PG_V; 1797 vm_paddr_t pa; 1798 1799 pa = 0; 1800 PG_V = pmap_valid_bit(pmap); 1801 PMAP_LOCK(pmap); 1802 pdpe = pmap_pdpe(pmap, va); 1803 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 1804 if ((*pdpe & PG_PS) != 0) 1805 pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); 1806 else { 1807 pde = pmap_pdpe_to_pde(pdpe, va); 1808 if ((*pde & PG_V) != 0) { 1809 if ((*pde & PG_PS) != 0) { 1810 pa = (*pde & PG_PS_FRAME) | 1811 (va & PDRMASK); 1812 } else { 1813 pte = pmap_pde_to_pte(pde, va); 1814 pa = (*pte & PG_FRAME) | 1815 (va & PAGE_MASK); 1816 } 1817 } 1818 } 1819 } 1820 PMAP_UNLOCK(pmap); 1821 return (pa); 1822} 1823 1824/* 1825 * Routine: pmap_extract_and_hold 1826 * Function: 1827 * Atomically extract and hold the physical page 1828 * with the given pmap and virtual address pair 1829 * if that mapping permits the given protection. 1830 */ 1831vm_page_t 1832pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1833{ 1834 pd_entry_t pde, *pdep; 1835 pt_entry_t pte, PG_RW, PG_V; 1836 vm_paddr_t pa; 1837 vm_page_t m; 1838 1839 pa = 0; 1840 m = NULL; 1841 PG_RW = pmap_rw_bit(pmap); 1842 PG_V = pmap_valid_bit(pmap); 1843 PMAP_LOCK(pmap); 1844retry: 1845 pdep = pmap_pde(pmap, va); 1846 if (pdep != NULL && (pde = *pdep)) { 1847 if (pde & PG_PS) { 1848 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { 1849 if (vm_page_pa_tryrelock(pmap, (pde & 1850 PG_PS_FRAME) | (va & PDRMASK), &pa)) 1851 goto retry; 1852 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | 1853 (va & PDRMASK)); 1854 vm_page_hold(m); 1855 } 1856 } else { 1857 pte = *pmap_pde_to_pte(pdep, va); 1858 if ((pte & PG_V) && 1859 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { 1860 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, 1861 &pa)) 1862 goto retry; 1863 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 1864 vm_page_hold(m); 1865 } 1866 } 1867 } 1868 PA_UNLOCK_COND(pa); 1869 PMAP_UNLOCK(pmap); 1870 return (m); 1871} 1872 1873vm_paddr_t 1874pmap_kextract(vm_offset_t va) 1875{ 1876 pd_entry_t pde; 1877 vm_paddr_t pa; 1878 1879 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { 1880 pa = DMAP_TO_PHYS(va); 1881 } else { 1882 pde = *vtopde(va); 1883 if (pde & PG_PS) { 1884 pa = (pde & PG_PS_FRAME) | (va & PDRMASK); 1885 } else { 1886 /* 1887 * Beware of a concurrent promotion that changes the 1888 * PDE at this point! For example, vtopte() must not 1889 * be used to access the PTE because it would use the 1890 * new PDE. It is, however, safe to use the old PDE 1891 * because the page table page is preserved by the 1892 * promotion. 1893 */ 1894 pa = *pmap_pde_to_pte(&pde, va); 1895 pa = (pa & PG_FRAME) | (va & PAGE_MASK); 1896 } 1897 } 1898 return (pa); 1899} 1900 1901/*************************************************** 1902 * Low level mapping routines..... 1903 ***************************************************/ 1904 1905/* 1906 * Add a wired page to the kva. 1907 * Note: not SMP coherent. 1908 */ 1909PMAP_INLINE void 1910pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1911{ 1912 pt_entry_t *pte; 1913 1914 pte = vtopte(va); 1915 pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G); 1916} 1917 1918static __inline void 1919pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 1920{ 1921 pt_entry_t *pte; 1922 int cache_bits; 1923 1924 pte = vtopte(va); 1925 cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); 1926 pte_store(pte, pa | X86_PG_RW | X86_PG_V | X86_PG_G | cache_bits); 1927} 1928 1929/* 1930 * Remove a page from the kernel pagetables. 1931 * Note: not SMP coherent. 1932 */ 1933PMAP_INLINE void 1934pmap_kremove(vm_offset_t va) 1935{ 1936 pt_entry_t *pte; 1937 1938 pte = vtopte(va); 1939 pte_clear(pte); 1940} 1941 1942/* 1943 * Used to map a range of physical addresses into kernel 1944 * virtual address space. 1945 * 1946 * The value passed in '*virt' is a suggested virtual address for 1947 * the mapping. Architectures which can support a direct-mapped 1948 * physical to virtual region can return the appropriate address 1949 * within that region, leaving '*virt' unchanged. Other 1950 * architectures should map the pages starting at '*virt' and 1951 * update '*virt' with the first usable address after the mapped 1952 * region. 1953 */ 1954vm_offset_t 1955pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1956{ 1957 return PHYS_TO_DMAP(start); 1958} 1959 1960 1961/* 1962 * Add a list of wired pages to the kva 1963 * this routine is only used for temporary 1964 * kernel mappings that do not need to have 1965 * page modification or references recorded. 1966 * Note that old mappings are simply written 1967 * over. The page *must* be wired. 1968 * Note: SMP coherent. Uses a ranged shootdown IPI. 1969 */ 1970void 1971pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1972{ 1973 pt_entry_t *endpte, oldpte, pa, *pte; 1974 vm_page_t m; 1975 int cache_bits; 1976 1977 oldpte = 0; 1978 pte = vtopte(sva); 1979 endpte = pte + count; 1980 while (pte < endpte) { 1981 m = *ma++; 1982 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); 1983 pa = VM_PAGE_TO_PHYS(m) | cache_bits; 1984 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { 1985 oldpte |= *pte; 1986 pte_store(pte, pa | X86_PG_G | X86_PG_RW | X86_PG_V); 1987 } 1988 pte++; 1989 } 1990 if (__predict_false((oldpte & X86_PG_V) != 0)) 1991 pmap_invalidate_range(kernel_pmap, sva, sva + count * 1992 PAGE_SIZE); 1993} 1994 1995/* 1996 * This routine tears out page mappings from the 1997 * kernel -- it is meant only for temporary mappings. 1998 * Note: SMP coherent. Uses a ranged shootdown IPI. 1999 */ 2000void 2001pmap_qremove(vm_offset_t sva, int count) 2002{ 2003 vm_offset_t va; 2004 2005 va = sva; 2006 while (count-- > 0) { 2007 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); 2008 pmap_kremove(va); 2009 va += PAGE_SIZE; 2010 } 2011 pmap_invalidate_range(kernel_pmap, sva, va); 2012} 2013 2014/*************************************************** 2015 * Page table page management routines..... 2016 ***************************************************/ 2017static __inline void 2018pmap_free_zero_pages(struct spglist *free) 2019{ 2020 vm_page_t m; 2021 2022 while ((m = SLIST_FIRST(free)) != NULL) { 2023 SLIST_REMOVE_HEAD(free, plinks.s.ss); 2024 /* Preserve the page's PG_ZERO setting. */ 2025 vm_page_free_toq(m); 2026 } 2027} 2028 2029/* 2030 * Schedule the specified unused page table page to be freed. Specifically, 2031 * add the page to the specified list of pages that will be released to the 2032 * physical memory manager after the TLB has been updated. 2033 */ 2034static __inline void 2035pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 2036 boolean_t set_PG_ZERO) 2037{ 2038 2039 if (set_PG_ZERO) 2040 m->flags |= PG_ZERO; 2041 else 2042 m->flags &= ~PG_ZERO; 2043 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 2044} 2045 2046/* 2047 * Inserts the specified page table page into the specified pmap's collection 2048 * of idle page table pages. Each of a pmap's page table pages is responsible 2049 * for mapping a distinct range of virtual addresses. The pmap's collection is 2050 * ordered by this virtual address range. 2051 */ 2052static __inline int 2053pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) 2054{ 2055 2056 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2057 return (vm_radix_insert(&pmap->pm_root, mpte)); 2058} 2059 2060/* 2061 * Looks for a page table page mapping the specified virtual address in the 2062 * specified pmap's collection of idle page table pages. Returns NULL if there 2063 * is no page table page corresponding to the specified virtual address. 2064 */ 2065static __inline vm_page_t 2066pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) 2067{ 2068 2069 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2070 return (vm_radix_lookup(&pmap->pm_root, pmap_pde_pindex(va))); 2071} 2072 2073/* 2074 * Removes the specified page table page from the specified pmap's collection 2075 * of idle page table pages. The specified page table page must be a member of 2076 * the pmap's collection. 2077 */ 2078static __inline void 2079pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) 2080{ 2081 2082 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2083 vm_radix_remove(&pmap->pm_root, mpte->pindex); 2084} 2085 2086/* 2087 * Decrements a page table page's wire count, which is used to record the 2088 * number of valid page table entries within the page. If the wire count 2089 * drops to zero, then the page table page is unmapped. Returns TRUE if the 2090 * page table page was unmapped and FALSE otherwise. 2091 */ 2092static inline boolean_t 2093pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2094{ 2095 2096 --m->wire_count; 2097 if (m->wire_count == 0) { 2098 _pmap_unwire_ptp(pmap, va, m, free); 2099 return (TRUE); 2100 } else 2101 return (FALSE); 2102} 2103 2104static void 2105_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) 2106{ 2107 2108 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2109 /* 2110 * unmap the page table page 2111 */ 2112 if (m->pindex >= (NUPDE + NUPDPE)) { 2113 /* PDP page */ 2114 pml4_entry_t *pml4; 2115 pml4 = pmap_pml4e(pmap, va); 2116 *pml4 = 0; 2117 } else if (m->pindex >= NUPDE) { 2118 /* PD page */ 2119 pdp_entry_t *pdp; 2120 pdp = pmap_pdpe(pmap, va); 2121 *pdp = 0; 2122 } else { 2123 /* PTE page */ 2124 pd_entry_t *pd; 2125 pd = pmap_pde(pmap, va); 2126 *pd = 0; 2127 } 2128 pmap_resident_count_dec(pmap, 1); 2129 if (m->pindex < NUPDE) { 2130 /* We just released a PT, unhold the matching PD */ 2131 vm_page_t pdpg; 2132 2133 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); 2134 pmap_unwire_ptp(pmap, va, pdpg, free); 2135 } 2136 if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { 2137 /* We just released a PD, unhold the matching PDP */ 2138 vm_page_t pdppg; 2139 2140 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); 2141 pmap_unwire_ptp(pmap, va, pdppg, free); 2142 } 2143 2144 /* 2145 * This is a release store so that the ordinary store unmapping 2146 * the page table page is globally performed before TLB shoot- 2147 * down is begun. 2148 */ 2149 atomic_subtract_rel_int(&cnt.v_wire_count, 1); 2150 2151 /* 2152 * Put page on a list so that it is released after 2153 * *ALL* TLB shootdown is done 2154 */ 2155 pmap_add_delayed_free_list(m, free, TRUE); 2156} 2157 2158/* 2159 * After removing a page table entry, this routine is used to 2160 * conditionally free the page, and manage the hold/wire counts. 2161 */ 2162static int 2163pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, 2164 struct spglist *free) 2165{ 2166 vm_page_t mpte; 2167 2168 if (va >= VM_MAXUSER_ADDRESS) 2169 return (0); 2170 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); 2171 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 2172 return (pmap_unwire_ptp(pmap, va, mpte, free)); 2173} 2174 2175void 2176pmap_pinit0(pmap_t pmap) 2177{ 2178 2179 PMAP_LOCK_INIT(pmap); 2180 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); 2181 pmap->pm_cr3 = KPML4phys; 2182 pmap->pm_root.rt_root = 0; 2183 CPU_ZERO(&pmap->pm_active); 2184 CPU_ZERO(&pmap->pm_save); 2185 PCPU_SET(curpmap, pmap); 2186 TAILQ_INIT(&pmap->pm_pvchunk); 2187 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2188 pmap->pm_pcid = pmap_pcid_enabled ? 0 : -1; 2189 pmap->pm_flags = pmap_flags; 2190} 2191 2192/* 2193 * Initialize a preallocated and zeroed pmap structure, 2194 * such as one in a vmspace structure. 2195 */ 2196int 2197pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) 2198{ 2199 vm_page_t pml4pg; 2200 vm_paddr_t pml4phys; 2201 int i; 2202 2203 /* 2204 * allocate the page directory page 2205 */ 2206 while ((pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2207 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) 2208 VM_WAIT; 2209 2210 pml4phys = VM_PAGE_TO_PHYS(pml4pg); 2211 pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys); 2212 pmap->pm_pcid = -1; 2213 pmap->pm_cr3 = ~0; /* initialize to an invalid value */ 2214 2215 if ((pml4pg->flags & PG_ZERO) == 0) 2216 pagezero(pmap->pm_pml4); 2217 2218 /* 2219 * Do not install the host kernel mappings in the nested page 2220 * tables. These mappings are meaningless in the guest physical 2221 * address space. 2222 */ 2223 if ((pmap->pm_type = pm_type) == PT_X86) { 2224 pmap->pm_cr3 = pml4phys; 2225 2226 /* Wire in kernel global address entries. */ 2227 for (i = 0; i < NKPML4E; i++) { 2228 pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | 2229 X86_PG_RW | X86_PG_V | PG_U; 2230 } 2231 for (i = 0; i < ndmpdpphys; i++) { 2232 pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | 2233 X86_PG_RW | X86_PG_V | PG_U; 2234 } 2235 2236 /* install self-referential address mapping entry(s) */ 2237 pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | 2238 X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; 2239 2240 if (pmap_pcid_enabled) { 2241 pmap->pm_pcid = alloc_unr(&pcid_unr); 2242 if (pmap->pm_pcid != -1) 2243 pmap->pm_cr3 |= pmap->pm_pcid; 2244 } 2245 } 2246 2247 pmap->pm_root.rt_root = 0; 2248 CPU_ZERO(&pmap->pm_active); 2249 TAILQ_INIT(&pmap->pm_pvchunk); 2250 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 2251 pmap->pm_flags = flags; 2252 pmap->pm_eptgen = 0; 2253 CPU_ZERO(&pmap->pm_save); 2254 2255 return (1); 2256} 2257 2258int 2259pmap_pinit(pmap_t pmap) 2260{ 2261 2262 return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); 2263} 2264 2265/* 2266 * This routine is called if the desired page table page does not exist. 2267 * 2268 * If page table page allocation fails, this routine may sleep before 2269 * returning NULL. It sleeps only if a lock pointer was given. 2270 * 2271 * Note: If a page allocation fails at page table level two or three, 2272 * one or two pages may be held during the wait, only to be released 2273 * afterwards. This conservative approach is easily argued to avoid 2274 * race conditions. 2275 */ 2276static vm_page_t 2277_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) 2278{ 2279 vm_page_t m, pdppg, pdpg; 2280 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 2281 2282 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2283 2284 PG_A = pmap_accessed_bit(pmap); 2285 PG_M = pmap_modified_bit(pmap); 2286 PG_V = pmap_valid_bit(pmap); 2287 PG_RW = pmap_rw_bit(pmap); 2288 2289 /* 2290 * Allocate a page table page. 2291 */ 2292 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 2293 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 2294 if (lockp != NULL) { 2295 RELEASE_PV_LIST_LOCK(lockp); 2296 PMAP_UNLOCK(pmap); 2297 rw_runlock(&pvh_global_lock); 2298 VM_WAIT; 2299 rw_rlock(&pvh_global_lock); 2300 PMAP_LOCK(pmap); 2301 } 2302 2303 /* 2304 * Indicate the need to retry. While waiting, the page table 2305 * page may have been allocated. 2306 */ 2307 return (NULL); 2308 } 2309 if ((m->flags & PG_ZERO) == 0) 2310 pmap_zero_page(m); 2311 2312 /* 2313 * Map the pagetable page into the process address space, if 2314 * it isn't already there. 2315 */ 2316 2317 if (ptepindex >= (NUPDE + NUPDPE)) { 2318 pml4_entry_t *pml4; 2319 vm_pindex_t pml4index; 2320 2321 /* Wire up a new PDPE page */ 2322 pml4index = ptepindex - (NUPDE + NUPDPE); 2323 pml4 = &pmap->pm_pml4[pml4index]; 2324 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 2325 2326 } else if (ptepindex >= NUPDE) { 2327 vm_pindex_t pml4index; 2328 vm_pindex_t pdpindex; 2329 pml4_entry_t *pml4; 2330 pdp_entry_t *pdp; 2331 2332 /* Wire up a new PDE page */ 2333 pdpindex = ptepindex - NUPDE; 2334 pml4index = pdpindex >> NPML4EPGSHIFT; 2335 2336 pml4 = &pmap->pm_pml4[pml4index]; 2337 if ((*pml4 & PG_V) == 0) { 2338 /* Have to allocate a new pdp, recurse */ 2339 if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, 2340 lockp) == NULL) { 2341 --m->wire_count; 2342 atomic_subtract_int(&cnt.v_wire_count, 1); 2343 vm_page_free_zero(m); 2344 return (NULL); 2345 } 2346 } else { 2347 /* Add reference to pdp page */ 2348 pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); 2349 pdppg->wire_count++; 2350 } 2351 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 2352 2353 /* Now find the pdp page */ 2354 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 2355 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 2356 2357 } else { 2358 vm_pindex_t pml4index; 2359 vm_pindex_t pdpindex; 2360 pml4_entry_t *pml4; 2361 pdp_entry_t *pdp; 2362 pd_entry_t *pd; 2363 2364 /* Wire up a new PTE page */ 2365 pdpindex = ptepindex >> NPDPEPGSHIFT; 2366 pml4index = pdpindex >> NPML4EPGSHIFT; 2367 2368 /* First, find the pdp and check that its valid. */ 2369 pml4 = &pmap->pm_pml4[pml4index]; 2370 if ((*pml4 & PG_V) == 0) { 2371 /* Have to allocate a new pd, recurse */ 2372 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 2373 lockp) == NULL) { 2374 --m->wire_count; 2375 atomic_subtract_int(&cnt.v_wire_count, 1); 2376 vm_page_free_zero(m); 2377 return (NULL); 2378 } 2379 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 2380 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 2381 } else { 2382 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); 2383 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; 2384 if ((*pdp & PG_V) == 0) { 2385 /* Have to allocate a new pd, recurse */ 2386 if (_pmap_allocpte(pmap, NUPDE + pdpindex, 2387 lockp) == NULL) { 2388 --m->wire_count; 2389 atomic_subtract_int(&cnt.v_wire_count, 2390 1); 2391 vm_page_free_zero(m); 2392 return (NULL); 2393 } 2394 } else { 2395 /* Add reference to the pd page */ 2396 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); 2397 pdpg->wire_count++; 2398 } 2399 } 2400 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); 2401 2402 /* Now we know where the page directory page is */ 2403 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; 2404 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; 2405 } 2406 2407 pmap_resident_count_inc(pmap, 1); 2408 2409 return (m); 2410} 2411 2412static vm_page_t 2413pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 2414{ 2415 vm_pindex_t pdpindex, ptepindex; 2416 pdp_entry_t *pdpe, PG_V; 2417 vm_page_t pdpg; 2418 2419 PG_V = pmap_valid_bit(pmap); 2420 2421retry: 2422 pdpe = pmap_pdpe(pmap, va); 2423 if (pdpe != NULL && (*pdpe & PG_V) != 0) { 2424 /* Add a reference to the pd page. */ 2425 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); 2426 pdpg->wire_count++; 2427 } else { 2428 /* Allocate a pd page. */ 2429 ptepindex = pmap_pde_pindex(va); 2430 pdpindex = ptepindex >> NPDPEPGSHIFT; 2431 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); 2432 if (pdpg == NULL && lockp != NULL) 2433 goto retry; 2434 } 2435 return (pdpg); 2436} 2437 2438static vm_page_t 2439pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) 2440{ 2441 vm_pindex_t ptepindex; 2442 pd_entry_t *pd, PG_V; 2443 vm_page_t m; 2444 2445 PG_V = pmap_valid_bit(pmap); 2446 2447 /* 2448 * Calculate pagetable page index 2449 */ 2450 ptepindex = pmap_pde_pindex(va); 2451retry: 2452 /* 2453 * Get the page directory entry 2454 */ 2455 pd = pmap_pde(pmap, va); 2456 2457 /* 2458 * This supports switching from a 2MB page to a 2459 * normal 4K page. 2460 */ 2461 if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { 2462 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { 2463 /* 2464 * Invalidation of the 2MB page mapping may have caused 2465 * the deallocation of the underlying PD page. 2466 */ 2467 pd = NULL; 2468 } 2469 } 2470 2471 /* 2472 * If the page table page is mapped, we just increment the 2473 * hold count, and activate it. 2474 */ 2475 if (pd != NULL && (*pd & PG_V) != 0) { 2476 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); 2477 m->wire_count++; 2478 } else { 2479 /* 2480 * Here if the pte page isn't mapped, or if it has been 2481 * deallocated. 2482 */ 2483 m = _pmap_allocpte(pmap, ptepindex, lockp); 2484 if (m == NULL && lockp != NULL) 2485 goto retry; 2486 } 2487 return (m); 2488} 2489 2490 2491/*************************************************** 2492 * Pmap allocation/deallocation routines. 2493 ***************************************************/ 2494 2495/* 2496 * Release any resources held by the given physical map. 2497 * Called when a pmap initialized by pmap_pinit is being released. 2498 * Should only be called if the map contains no valid mappings. 2499 */ 2500void 2501pmap_release(pmap_t pmap) 2502{ 2503 vm_page_t m; 2504 int i; 2505 2506 KASSERT(pmap->pm_stats.resident_count == 0, 2507 ("pmap_release: pmap resident count %ld != 0", 2508 pmap->pm_stats.resident_count)); 2509 KASSERT(vm_radix_is_empty(&pmap->pm_root), 2510 ("pmap_release: pmap has reserved page table page(s)")); 2511 2512 if (pmap_pcid_enabled) { 2513 /* 2514 * Invalidate any left TLB entries, to allow the reuse 2515 * of the pcid. 2516 */ 2517 pmap_invalidate_all(pmap); 2518 } 2519 2520 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4)); 2521 2522 for (i = 0; i < NKPML4E; i++) /* KVA */ 2523 pmap->pm_pml4[KPML4BASE + i] = 0; 2524 for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ 2525 pmap->pm_pml4[DMPML4I + i] = 0; 2526 pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ 2527 2528 m->wire_count--; 2529 atomic_subtract_int(&cnt.v_wire_count, 1); 2530 vm_page_free_zero(m); 2531 if (pmap->pm_pcid != -1) 2532 free_unr(&pcid_unr, pmap->pm_pcid); 2533} 2534 2535static int 2536kvm_size(SYSCTL_HANDLER_ARGS) 2537{ 2538 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; 2539 2540 return sysctl_handle_long(oidp, &ksize, 0, req); 2541} 2542SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 2543 0, 0, kvm_size, "LU", "Size of KVM"); 2544 2545static int 2546kvm_free(SYSCTL_HANDLER_ARGS) 2547{ 2548 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 2549 2550 return sysctl_handle_long(oidp, &kfree, 0, req); 2551} 2552SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 2553 0, 0, kvm_free, "LU", "Amount of KVM free"); 2554 2555/* 2556 * grow the number of kernel page table entries, if needed 2557 */ 2558void 2559pmap_growkernel(vm_offset_t addr) 2560{ 2561 vm_paddr_t paddr; 2562 vm_page_t nkpg; 2563 pd_entry_t *pde, newpdir; 2564 pdp_entry_t *pdpe; 2565 2566 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2567 2568 /* 2569 * Return if "addr" is within the range of kernel page table pages 2570 * that were preallocated during pmap bootstrap. Moreover, leave 2571 * "kernel_vm_end" and the kernel page table as they were. 2572 * 2573 * The correctness of this action is based on the following 2574 * argument: vm_map_findspace() allocates contiguous ranges of the 2575 * kernel virtual address space. It calls this function if a range 2576 * ends after "kernel_vm_end". If the kernel is mapped between 2577 * "kernel_vm_end" and "addr", then the range cannot begin at 2578 * "kernel_vm_end". In fact, its beginning address cannot be less 2579 * than the kernel. Thus, there is no immediate need to allocate 2580 * any new kernel page table pages between "kernel_vm_end" and 2581 * "KERNBASE". 2582 */ 2583 if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR) 2584 return; 2585 2586 addr = roundup2(addr, NBPDR); 2587 if (addr - 1 >= kernel_map->max_offset) 2588 addr = kernel_map->max_offset; 2589 while (kernel_vm_end < addr) { 2590 pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); 2591 if ((*pdpe & X86_PG_V) == 0) { 2592 /* We need a new PDP entry */ 2593 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT, 2594 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | 2595 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 2596 if (nkpg == NULL) 2597 panic("pmap_growkernel: no memory to grow kernel"); 2598 if ((nkpg->flags & PG_ZERO) == 0) 2599 pmap_zero_page(nkpg); 2600 paddr = VM_PAGE_TO_PHYS(nkpg); 2601 *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | 2602 X86_PG_A | X86_PG_M); 2603 continue; /* try again */ 2604 } 2605 pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); 2606 if ((*pde & X86_PG_V) != 0) { 2607 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2608 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2609 kernel_vm_end = kernel_map->max_offset; 2610 break; 2611 } 2612 continue; 2613 } 2614 2615 nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end), 2616 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 2617 VM_ALLOC_ZERO); 2618 if (nkpg == NULL) 2619 panic("pmap_growkernel: no memory to grow kernel"); 2620 if ((nkpg->flags & PG_ZERO) == 0) 2621 pmap_zero_page(nkpg); 2622 paddr = VM_PAGE_TO_PHYS(nkpg); 2623 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; 2624 pde_store(pde, newpdir); 2625 2626 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2627 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2628 kernel_vm_end = kernel_map->max_offset; 2629 break; 2630 } 2631 } 2632} 2633 2634 2635/*************************************************** 2636 * page management routines. 2637 ***************************************************/ 2638 2639CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 2640CTASSERT(_NPCM == 3); 2641CTASSERT(_NPCPV == 168); 2642 2643static __inline struct pv_chunk * 2644pv_to_chunk(pv_entry_t pv) 2645{ 2646 2647 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 2648} 2649 2650#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2651 2652#define PC_FREE0 0xfffffffffffffffful 2653#define PC_FREE1 0xfffffffffffffffful 2654#define PC_FREE2 0x000000fffffffffful 2655 2656static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; 2657 2658#ifdef PV_STATS 2659static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2660 2661SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2662 "Current number of pv entry chunks"); 2663SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2664 "Current number of pv entry chunks allocated"); 2665SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2666 "Current number of pv entry chunks frees"); 2667SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2668 "Number of times tried to get a chunk page but failed."); 2669 2670static long pv_entry_frees, pv_entry_allocs, pv_entry_count; 2671static int pv_entry_spare; 2672 2673SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2674 "Current number of pv entry frees"); 2675SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2676 "Current number of pv entry allocs"); 2677SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2678 "Current number of pv entries"); 2679SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2680 "Current number of spare pv entries"); 2681#endif 2682 2683/* 2684 * We are in a serious low memory condition. Resort to 2685 * drastic measures to free some pages so we can allocate 2686 * another pv entry chunk. 2687 * 2688 * Returns NULL if PV entries were reclaimed from the specified pmap. 2689 * 2690 * We do not, however, unmap 2mpages because subsequent accesses will 2691 * allocate per-page pv entries until repromotion occurs, thereby 2692 * exacerbating the shortage of free pv entries. 2693 */ 2694static vm_page_t 2695reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) 2696{ 2697 struct pch new_tail; 2698 struct pv_chunk *pc; 2699 struct md_page *pvh; 2700 pd_entry_t *pde; 2701 pmap_t pmap; 2702 pt_entry_t *pte, tpte; 2703 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 2704 pv_entry_t pv; 2705 vm_offset_t va; 2706 vm_page_t m, m_pc; 2707 struct spglist free; 2708 uint64_t inuse; 2709 int bit, field, freed; 2710 2711 rw_assert(&pvh_global_lock, RA_LOCKED); 2712 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2713 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); 2714 pmap = NULL; 2715 m_pc = NULL; 2716 PG_G = PG_A = PG_M = PG_RW = 0; 2717 SLIST_INIT(&free); 2718 TAILQ_INIT(&new_tail); 2719 mtx_lock(&pv_chunks_mutex); 2720 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && SLIST_EMPTY(&free)) { 2721 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2722 mtx_unlock(&pv_chunks_mutex); 2723 if (pmap != pc->pc_pmap) { 2724 if (pmap != NULL) { 2725 pmap_invalidate_all(pmap); 2726 if (pmap != locked_pmap) 2727 PMAP_UNLOCK(pmap); 2728 } 2729 pmap = pc->pc_pmap; 2730 /* Avoid deadlock and lock recursion. */ 2731 if (pmap > locked_pmap) { 2732 RELEASE_PV_LIST_LOCK(lockp); 2733 PMAP_LOCK(pmap); 2734 } else if (pmap != locked_pmap && 2735 !PMAP_TRYLOCK(pmap)) { 2736 pmap = NULL; 2737 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2738 mtx_lock(&pv_chunks_mutex); 2739 continue; 2740 } 2741 PG_G = pmap_global_bit(pmap); 2742 PG_A = pmap_accessed_bit(pmap); 2743 PG_M = pmap_modified_bit(pmap); 2744 PG_RW = pmap_rw_bit(pmap); 2745 } 2746 2747 /* 2748 * Destroy every non-wired, 4 KB page mapping in the chunk. 2749 */ 2750 freed = 0; 2751 for (field = 0; field < _NPCM; field++) { 2752 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2753 inuse != 0; inuse &= ~(1UL << bit)) { 2754 bit = bsfq(inuse); 2755 pv = &pc->pc_pventry[field * 64 + bit]; 2756 va = pv->pv_va; 2757 pde = pmap_pde(pmap, va); 2758 if ((*pde & PG_PS) != 0) 2759 continue; 2760 pte = pmap_pde_to_pte(pde, va); 2761 if ((*pte & PG_W) != 0) 2762 continue; 2763 tpte = pte_load_clear(pte); 2764 if ((tpte & PG_G) != 0) 2765 pmap_invalidate_page(pmap, va); 2766 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 2767 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2768 vm_page_dirty(m); 2769 if ((tpte & PG_A) != 0) 2770 vm_page_aflag_set(m, PGA_REFERENCED); 2771 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 2772 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2773 m->md.pv_gen++; 2774 if (TAILQ_EMPTY(&m->md.pv_list) && 2775 (m->flags & PG_FICTITIOUS) == 0) { 2776 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2777 if (TAILQ_EMPTY(&pvh->pv_list)) { 2778 vm_page_aflag_clear(m, 2779 PGA_WRITEABLE); 2780 } 2781 } 2782 pc->pc_map[field] |= 1UL << bit; 2783 pmap_unuse_pt(pmap, va, *pde, &free); 2784 freed++; 2785 } 2786 } 2787 if (freed == 0) { 2788 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2789 mtx_lock(&pv_chunks_mutex); 2790 continue; 2791 } 2792 /* Every freed mapping is for a 4 KB page. */ 2793 pmap_resident_count_dec(pmap, freed); 2794 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 2795 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 2796 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 2797 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2798 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && 2799 pc->pc_map[2] == PC_FREE2) { 2800 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 2801 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 2802 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 2803 /* Entire chunk is free; return it. */ 2804 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2805 dump_drop_page(m_pc->phys_addr); 2806 mtx_lock(&pv_chunks_mutex); 2807 break; 2808 } 2809 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2810 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 2811 mtx_lock(&pv_chunks_mutex); 2812 /* One freed pv entry in locked_pmap is sufficient. */ 2813 if (pmap == locked_pmap) 2814 break; 2815 } 2816 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 2817 mtx_unlock(&pv_chunks_mutex); 2818 if (pmap != NULL) { 2819 pmap_invalidate_all(pmap); 2820 if (pmap != locked_pmap) 2821 PMAP_UNLOCK(pmap); 2822 } 2823 if (m_pc == NULL && !SLIST_EMPTY(&free)) { 2824 m_pc = SLIST_FIRST(&free); 2825 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2826 /* Recycle a freed page table page. */ 2827 m_pc->wire_count = 1; 2828 atomic_add_int(&cnt.v_wire_count, 1); 2829 } 2830 pmap_free_zero_pages(&free); 2831 return (m_pc); 2832} 2833 2834/* 2835 * free the pv_entry back to the free list 2836 */ 2837static void 2838free_pv_entry(pmap_t pmap, pv_entry_t pv) 2839{ 2840 struct pv_chunk *pc; 2841 int idx, field, bit; 2842 2843 rw_assert(&pvh_global_lock, RA_LOCKED); 2844 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2845 PV_STAT(atomic_add_long(&pv_entry_frees, 1)); 2846 PV_STAT(atomic_add_int(&pv_entry_spare, 1)); 2847 PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); 2848 pc = pv_to_chunk(pv); 2849 idx = pv - &pc->pc_pventry[0]; 2850 field = idx / 64; 2851 bit = idx % 64; 2852 pc->pc_map[field] |= 1ul << bit; 2853 if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || 2854 pc->pc_map[2] != PC_FREE2) { 2855 /* 98% of the time, pc is already at the head of the list. */ 2856 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { 2857 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2858 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2859 } 2860 return; 2861 } 2862 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2863 free_pv_chunk(pc); 2864} 2865 2866static void 2867free_pv_chunk(struct pv_chunk *pc) 2868{ 2869 vm_page_t m; 2870 2871 mtx_lock(&pv_chunks_mutex); 2872 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2873 mtx_unlock(&pv_chunks_mutex); 2874 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); 2875 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); 2876 PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); 2877 /* entire chunk is free, return it */ 2878 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); 2879 dump_drop_page(m->phys_addr); 2880 vm_page_unwire(m, 0); 2881 vm_page_free(m); 2882} 2883 2884/* 2885 * Returns a new PV entry, allocating a new PV chunk from the system when 2886 * needed. If this PV chunk allocation fails and a PV list lock pointer was 2887 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is 2888 * returned. 2889 * 2890 * The given PV list lock may be released. 2891 */ 2892static pv_entry_t 2893get_pv_entry(pmap_t pmap, struct rwlock **lockp) 2894{ 2895 int bit, field; 2896 pv_entry_t pv; 2897 struct pv_chunk *pc; 2898 vm_page_t m; 2899 2900 rw_assert(&pvh_global_lock, RA_LOCKED); 2901 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2902 PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); 2903retry: 2904 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2905 if (pc != NULL) { 2906 for (field = 0; field < _NPCM; field++) { 2907 if (pc->pc_map[field]) { 2908 bit = bsfq(pc->pc_map[field]); 2909 break; 2910 } 2911 } 2912 if (field < _NPCM) { 2913 pv = &pc->pc_pventry[field * 64 + bit]; 2914 pc->pc_map[field] &= ~(1ul << bit); 2915 /* If this was the last item, move it to tail */ 2916 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && 2917 pc->pc_map[2] == 0) { 2918 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2919 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, 2920 pc_list); 2921 } 2922 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 2923 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); 2924 return (pv); 2925 } 2926 } 2927 /* No free items, allocate another chunk */ 2928 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 2929 VM_ALLOC_WIRED); 2930 if (m == NULL) { 2931 if (lockp == NULL) { 2932 PV_STAT(pc_chunk_tryfail++); 2933 return (NULL); 2934 } 2935 m = reclaim_pv_chunk(pmap, lockp); 2936 if (m == NULL) 2937 goto retry; 2938 } 2939 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 2940 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 2941 dump_add_page(m->phys_addr); 2942 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 2943 pc->pc_pmap = pmap; 2944 pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ 2945 pc->pc_map[1] = PC_FREE1; 2946 pc->pc_map[2] = PC_FREE2; 2947 mtx_lock(&pv_chunks_mutex); 2948 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 2949 mtx_unlock(&pv_chunks_mutex); 2950 pv = &pc->pc_pventry[0]; 2951 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2952 PV_STAT(atomic_add_long(&pv_entry_count, 1)); 2953 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); 2954 return (pv); 2955} 2956 2957/* 2958 * Returns the number of one bits within the given PV chunk map element. 2959 */ 2960static int 2961popcnt_pc_map_elem(uint64_t elem) 2962{ 2963 int count; 2964 2965 /* 2966 * This simple method of counting the one bits performs well because 2967 * the given element typically contains more zero bits than one bits. 2968 */ 2969 count = 0; 2970 for (; elem != 0; elem &= elem - 1) 2971 count++; 2972 return (count); 2973} 2974 2975/* 2976 * Ensure that the number of spare PV entries in the specified pmap meets or 2977 * exceeds the given count, "needed". 2978 * 2979 * The given PV list lock may be released. 2980 */ 2981static void 2982reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) 2983{ 2984 struct pch new_tail; 2985 struct pv_chunk *pc; 2986 int avail, free; 2987 vm_page_t m; 2988 2989 rw_assert(&pvh_global_lock, RA_LOCKED); 2990 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2991 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); 2992 2993 /* 2994 * Newly allocated PV chunks must be stored in a private list until 2995 * the required number of PV chunks have been allocated. Otherwise, 2996 * reclaim_pv_chunk() could recycle one of these chunks. In 2997 * contrast, these chunks must be added to the pmap upon allocation. 2998 */ 2999 TAILQ_INIT(&new_tail); 3000retry: 3001 avail = 0; 3002 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { 3003 if ((cpu_feature2 & CPUID2_POPCNT) == 0) { 3004 free = popcnt_pc_map_elem(pc->pc_map[0]); 3005 free += popcnt_pc_map_elem(pc->pc_map[1]); 3006 free += popcnt_pc_map_elem(pc->pc_map[2]); 3007 } else { 3008 free = popcntq(pc->pc_map[0]); 3009 free += popcntq(pc->pc_map[1]); 3010 free += popcntq(pc->pc_map[2]); 3011 } 3012 if (free == 0) 3013 break; 3014 avail += free; 3015 if (avail >= needed) 3016 break; 3017 } 3018 for (; avail < needed; avail += _NPCPV) { 3019 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 3020 VM_ALLOC_WIRED); 3021 if (m == NULL) { 3022 m = reclaim_pv_chunk(pmap, lockp); 3023 if (m == NULL) 3024 goto retry; 3025 } 3026 PV_STAT(atomic_add_int(&pc_chunk_count, 1)); 3027 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); 3028 dump_add_page(m->phys_addr); 3029 pc = (void *)PHYS_TO_DMAP(m->phys_addr); 3030 pc->pc_pmap = pmap; 3031 pc->pc_map[0] = PC_FREE0; 3032 pc->pc_map[1] = PC_FREE1; 3033 pc->pc_map[2] = PC_FREE2; 3034 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 3035 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); 3036 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); 3037 } 3038 if (!TAILQ_EMPTY(&new_tail)) { 3039 mtx_lock(&pv_chunks_mutex); 3040 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); 3041 mtx_unlock(&pv_chunks_mutex); 3042 } 3043} 3044 3045/* 3046 * First find and then remove the pv entry for the specified pmap and virtual 3047 * address from the specified pv list. Returns the pv entry if found and NULL 3048 * otherwise. This operation can be performed on pv lists for either 4KB or 3049 * 2MB page mappings. 3050 */ 3051static __inline pv_entry_t 3052pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3053{ 3054 pv_entry_t pv; 3055 3056 rw_assert(&pvh_global_lock, RA_LOCKED); 3057 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 3058 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 3059 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 3060 pvh->pv_gen++; 3061 break; 3062 } 3063 } 3064 return (pv); 3065} 3066 3067/* 3068 * After demotion from a 2MB page mapping to 512 4KB page mappings, 3069 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv 3070 * entries for each of the 4KB page mappings. 3071 */ 3072static void 3073pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3074 struct rwlock **lockp) 3075{ 3076 struct md_page *pvh; 3077 struct pv_chunk *pc; 3078 pv_entry_t pv; 3079 vm_offset_t va_last; 3080 vm_page_t m; 3081 int bit, field; 3082 3083 rw_assert(&pvh_global_lock, RA_LOCKED); 3084 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3085 KASSERT((pa & PDRMASK) == 0, 3086 ("pmap_pv_demote_pde: pa is not 2mpage aligned")); 3087 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3088 3089 /* 3090 * Transfer the 2mpage's pv entry for this mapping to the first 3091 * page's pv list. Once this transfer begins, the pv list lock 3092 * must not be released until the last pv entry is reinstantiated. 3093 */ 3094 pvh = pa_to_pvh(pa); 3095 va = trunc_2mpage(va); 3096 pv = pmap_pvh_remove(pvh, pmap, va); 3097 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 3098 m = PHYS_TO_VM_PAGE(pa); 3099 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3100 m->md.pv_gen++; 3101 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 3102 PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); 3103 va_last = va + NBPDR - PAGE_SIZE; 3104 for (;;) { 3105 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 3106 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || 3107 pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); 3108 for (field = 0; field < _NPCM; field++) { 3109 while (pc->pc_map[field]) { 3110 bit = bsfq(pc->pc_map[field]); 3111 pc->pc_map[field] &= ~(1ul << bit); 3112 pv = &pc->pc_pventry[field * 64 + bit]; 3113 va += PAGE_SIZE; 3114 pv->pv_va = va; 3115 m++; 3116 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3117 ("pmap_pv_demote_pde: page %p is not managed", m)); 3118 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3119 m->md.pv_gen++; 3120 if (va == va_last) 3121 goto out; 3122 } 3123 } 3124 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3125 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3126 } 3127out: 3128 if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { 3129 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3130 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 3131 } 3132 PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); 3133 PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); 3134} 3135 3136/* 3137 * After promotion from 512 4KB page mappings to a single 2MB page mapping, 3138 * replace the many pv entries for the 4KB page mappings by a single pv entry 3139 * for the 2MB page mapping. 3140 */ 3141static void 3142pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3143 struct rwlock **lockp) 3144{ 3145 struct md_page *pvh; 3146 pv_entry_t pv; 3147 vm_offset_t va_last; 3148 vm_page_t m; 3149 3150 rw_assert(&pvh_global_lock, RA_LOCKED); 3151 KASSERT((pa & PDRMASK) == 0, 3152 ("pmap_pv_promote_pde: pa is not 2mpage aligned")); 3153 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3154 3155 /* 3156 * Transfer the first page's pv entry for this mapping to the 2mpage's 3157 * pv list. Aside from avoiding the cost of a call to get_pv_entry(), 3158 * a transfer avoids the possibility that get_pv_entry() calls 3159 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the 3160 * mappings that is being promoted. 3161 */ 3162 m = PHYS_TO_VM_PAGE(pa); 3163 va = trunc_2mpage(va); 3164 pv = pmap_pvh_remove(&m->md, pmap, va); 3165 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 3166 pvh = pa_to_pvh(pa); 3167 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3168 pvh->pv_gen++; 3169 /* Free the remaining NPTEPG - 1 pv entries. */ 3170 va_last = va + NBPDR - PAGE_SIZE; 3171 do { 3172 m++; 3173 va += PAGE_SIZE; 3174 pmap_pvh_free(&m->md, pmap, va); 3175 } while (va < va_last); 3176} 3177 3178/* 3179 * First find and then destroy the pv entry for the specified pmap and virtual 3180 * address. This operation can be performed on pv lists for either 4KB or 2MB 3181 * page mappings. 3182 */ 3183static void 3184pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 3185{ 3186 pv_entry_t pv; 3187 3188 pv = pmap_pvh_remove(pvh, pmap, va); 3189 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 3190 free_pv_entry(pmap, pv); 3191} 3192 3193/* 3194 * Conditionally create the PV entry for a 4KB page mapping if the required 3195 * memory can be allocated without resorting to reclamation. 3196 */ 3197static boolean_t 3198pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, 3199 struct rwlock **lockp) 3200{ 3201 pv_entry_t pv; 3202 3203 rw_assert(&pvh_global_lock, RA_LOCKED); 3204 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3205 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3206 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 3207 pv->pv_va = va; 3208 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3209 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3210 m->md.pv_gen++; 3211 return (TRUE); 3212 } else 3213 return (FALSE); 3214} 3215 3216/* 3217 * Conditionally create the PV entry for a 2MB page mapping if the required 3218 * memory can be allocated without resorting to reclamation. 3219 */ 3220static boolean_t 3221pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, 3222 struct rwlock **lockp) 3223{ 3224 struct md_page *pvh; 3225 pv_entry_t pv; 3226 3227 rw_assert(&pvh_global_lock, RA_LOCKED); 3228 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3229 /* Pass NULL instead of the lock pointer to disable reclamation. */ 3230 if ((pv = get_pv_entry(pmap, NULL)) != NULL) { 3231 pv->pv_va = va; 3232 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); 3233 pvh = pa_to_pvh(pa); 3234 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 3235 pvh->pv_gen++; 3236 return (TRUE); 3237 } else 3238 return (FALSE); 3239} 3240 3241/* 3242 * Fills a page table page with mappings to consecutive physical pages. 3243 */ 3244static void 3245pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 3246{ 3247 pt_entry_t *pte; 3248 3249 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 3250 *pte = newpte; 3251 newpte += PAGE_SIZE; 3252 } 3253} 3254 3255/* 3256 * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page 3257 * mapping is invalidated. 3258 */ 3259static boolean_t 3260pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 3261{ 3262 struct rwlock *lock; 3263 boolean_t rv; 3264 3265 lock = NULL; 3266 rv = pmap_demote_pde_locked(pmap, pde, va, &lock); 3267 if (lock != NULL) 3268 rw_wunlock(lock); 3269 return (rv); 3270} 3271 3272static boolean_t 3273pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 3274 struct rwlock **lockp) 3275{ 3276 pd_entry_t newpde, oldpde; 3277 pt_entry_t *firstpte, newpte; 3278 pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V; 3279 vm_paddr_t mptepa; 3280 vm_page_t mpte; 3281 struct spglist free; 3282 int PG_PTE_CACHE; 3283 3284 PG_G = pmap_global_bit(pmap); 3285 PG_A = pmap_accessed_bit(pmap); 3286 PG_M = pmap_modified_bit(pmap); 3287 PG_RW = pmap_rw_bit(pmap); 3288 PG_V = pmap_valid_bit(pmap); 3289 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 3290 3291 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3292 oldpde = *pde; 3293 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 3294 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 3295 if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) != 3296 NULL) 3297 pmap_remove_pt_page(pmap, mpte); 3298 else { 3299 KASSERT((oldpde & PG_W) == 0, 3300 ("pmap_demote_pde: page table page for a wired mapping" 3301 " is missing")); 3302 3303 /* 3304 * Invalidate the 2MB page mapping and return "failure" if the 3305 * mapping was never accessed or the allocation of the new 3306 * page table page fails. If the 2MB page mapping belongs to 3307 * the direct map region of the kernel's address space, then 3308 * the page allocation request specifies the highest possible 3309 * priority (VM_ALLOC_INTERRUPT). Otherwise, the priority is 3310 * normal. Page table pages are preallocated for every other 3311 * part of the kernel address space, so the direct map region 3312 * is the only part of the kernel address space that must be 3313 * handled here. 3314 */ 3315 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 3316 pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va < 3317 DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | 3318 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 3319 SLIST_INIT(&free); 3320 pmap_remove_pde(pmap, pde, trunc_2mpage(va), &free, 3321 lockp); 3322 pmap_invalidate_page(pmap, trunc_2mpage(va)); 3323 pmap_free_zero_pages(&free); 3324 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx" 3325 " in pmap %p", va, pmap); 3326 return (FALSE); 3327 } 3328 if (va < VM_MAXUSER_ADDRESS) 3329 pmap_resident_count_inc(pmap, 1); 3330 } 3331 mptepa = VM_PAGE_TO_PHYS(mpte); 3332 firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); 3333 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 3334 KASSERT((oldpde & PG_A) != 0, 3335 ("pmap_demote_pde: oldpde is missing PG_A")); 3336 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 3337 ("pmap_demote_pde: oldpde is missing PG_M")); 3338 newpte = oldpde & ~PG_PS; 3339 newpte = pmap_swap_pat(pmap, newpte); 3340 3341 /* 3342 * If the page table page is new, initialize it. 3343 */ 3344 if (mpte->wire_count == 1) { 3345 mpte->wire_count = NPTEPG; 3346 pmap_fill_ptp(firstpte, newpte); 3347 } 3348 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 3349 ("pmap_demote_pde: firstpte and newpte map different physical" 3350 " addresses")); 3351 3352 /* 3353 * If the mapping has changed attributes, update the page table 3354 * entries. 3355 */ 3356 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 3357 pmap_fill_ptp(firstpte, newpte); 3358 3359 /* 3360 * The spare PV entries must be reserved prior to demoting the 3361 * mapping, that is, prior to changing the PDE. Otherwise, the state 3362 * of the PDE and the PV lists will be inconsistent, which can result 3363 * in reclaim_pv_chunk() attempting to remove a PV entry from the 3364 * wrong PV list and pmap_pv_demote_pde() failing to find the expected 3365 * PV entry for the 2MB page mapping that is being demoted. 3366 */ 3367 if ((oldpde & PG_MANAGED) != 0) 3368 reserve_pv_entries(pmap, NPTEPG - 1, lockp); 3369 3370 /* 3371 * Demote the mapping. This pmap is locked. The old PDE has 3372 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 3373 * set. Thus, there is no danger of a race with another 3374 * processor changing the setting of PG_A and/or PG_M between 3375 * the read above and the store below. 3376 */ 3377 if (workaround_erratum383) 3378 pmap_update_pde(pmap, va, pde, newpde); 3379 else 3380 pde_store(pde, newpde); 3381 3382 /* 3383 * Invalidate a stale recursive mapping of the page table page. 3384 */ 3385 if (va >= VM_MAXUSER_ADDRESS) 3386 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 3387 3388 /* 3389 * Demote the PV entry. 3390 */ 3391 if ((oldpde & PG_MANAGED) != 0) 3392 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); 3393 3394 atomic_add_long(&pmap_pde_demotions, 1); 3395 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx" 3396 " in pmap %p", va, pmap); 3397 return (TRUE); 3398} 3399 3400/* 3401 * pmap_remove_kernel_pde: Remove a kernel superpage mapping. 3402 */ 3403static void 3404pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 3405{ 3406 pd_entry_t newpde; 3407 vm_paddr_t mptepa; 3408 vm_page_t mpte; 3409 3410 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); 3411 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3412 mpte = pmap_lookup_pt_page(pmap, va); 3413 if (mpte == NULL) 3414 panic("pmap_remove_kernel_pde: Missing pt page."); 3415 3416 pmap_remove_pt_page(pmap, mpte); 3417 mptepa = VM_PAGE_TO_PHYS(mpte); 3418 newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; 3419 3420 /* 3421 * Initialize the page table page. 3422 */ 3423 pagezero((void *)PHYS_TO_DMAP(mptepa)); 3424 3425 /* 3426 * Demote the mapping. 3427 */ 3428 if (workaround_erratum383) 3429 pmap_update_pde(pmap, va, pde, newpde); 3430 else 3431 pde_store(pde, newpde); 3432 3433 /* 3434 * Invalidate a stale recursive mapping of the page table page. 3435 */ 3436 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 3437} 3438 3439/* 3440 * pmap_remove_pde: do the things to unmap a superpage in a process 3441 */ 3442static int 3443pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 3444 struct spglist *free, struct rwlock **lockp) 3445{ 3446 struct md_page *pvh; 3447 pd_entry_t oldpde; 3448 vm_offset_t eva, va; 3449 vm_page_t m, mpte; 3450 pt_entry_t PG_G, PG_A, PG_M, PG_RW; 3451 3452 PG_G = pmap_global_bit(pmap); 3453 PG_A = pmap_accessed_bit(pmap); 3454 PG_M = pmap_modified_bit(pmap); 3455 PG_RW = pmap_rw_bit(pmap); 3456 3457 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3458 KASSERT((sva & PDRMASK) == 0, 3459 ("pmap_remove_pde: sva is not 2mpage aligned")); 3460 oldpde = pte_load_clear(pdq); 3461 if (oldpde & PG_W) 3462 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 3463 3464 /* 3465 * Machines that don't support invlpg, also don't support 3466 * PG_G. 3467 */ 3468 if (oldpde & PG_G) 3469 pmap_invalidate_page(kernel_pmap, sva); 3470 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); 3471 if (oldpde & PG_MANAGED) { 3472 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); 3473 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 3474 pmap_pvh_free(pvh, pmap, sva); 3475 eva = sva + NBPDR; 3476 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 3477 va < eva; va += PAGE_SIZE, m++) { 3478 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3479 vm_page_dirty(m); 3480 if (oldpde & PG_A) 3481 vm_page_aflag_set(m, PGA_REFERENCED); 3482 if (TAILQ_EMPTY(&m->md.pv_list) && 3483 TAILQ_EMPTY(&pvh->pv_list)) 3484 vm_page_aflag_clear(m, PGA_WRITEABLE); 3485 } 3486 } 3487 if (pmap == kernel_pmap) { 3488 pmap_remove_kernel_pde(pmap, pdq, sva); 3489 } else { 3490 mpte = pmap_lookup_pt_page(pmap, sva); 3491 if (mpte != NULL) { 3492 pmap_remove_pt_page(pmap, mpte); 3493 pmap_resident_count_dec(pmap, 1); 3494 KASSERT(mpte->wire_count == NPTEPG, 3495 ("pmap_remove_pde: pte page wire count error")); 3496 mpte->wire_count = 0; 3497 pmap_add_delayed_free_list(mpte, free, FALSE); 3498 atomic_subtract_int(&cnt.v_wire_count, 1); 3499 } 3500 } 3501 return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); 3502} 3503 3504/* 3505 * pmap_remove_pte: do the things to unmap a page in a process 3506 */ 3507static int 3508pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 3509 pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) 3510{ 3511 struct md_page *pvh; 3512 pt_entry_t oldpte, PG_A, PG_M, PG_RW; 3513 vm_page_t m; 3514 3515 PG_A = pmap_accessed_bit(pmap); 3516 PG_M = pmap_modified_bit(pmap); 3517 PG_RW = pmap_rw_bit(pmap); 3518 3519 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3520 oldpte = pte_load_clear(ptq); 3521 if (oldpte & PG_W) 3522 pmap->pm_stats.wired_count -= 1; 3523 pmap_resident_count_dec(pmap, 1); 3524 if (oldpte & PG_MANAGED) { 3525 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 3526 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3527 vm_page_dirty(m); 3528 if (oldpte & PG_A) 3529 vm_page_aflag_set(m, PGA_REFERENCED); 3530 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); 3531 pmap_pvh_free(&m->md, pmap, va); 3532 if (TAILQ_EMPTY(&m->md.pv_list) && 3533 (m->flags & PG_FICTITIOUS) == 0) { 3534 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3535 if (TAILQ_EMPTY(&pvh->pv_list)) 3536 vm_page_aflag_clear(m, PGA_WRITEABLE); 3537 } 3538 } 3539 return (pmap_unuse_pt(pmap, va, ptepde, free)); 3540} 3541 3542/* 3543 * Remove a single page from a process address space 3544 */ 3545static void 3546pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 3547 struct spglist *free) 3548{ 3549 struct rwlock *lock; 3550 pt_entry_t *pte, PG_V; 3551 3552 PG_V = pmap_valid_bit(pmap); 3553 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3554 if ((*pde & PG_V) == 0) 3555 return; 3556 pte = pmap_pde_to_pte(pde, va); 3557 if ((*pte & PG_V) == 0) 3558 return; 3559 lock = NULL; 3560 pmap_remove_pte(pmap, pte, va, *pde, free, &lock); 3561 if (lock != NULL) 3562 rw_wunlock(lock); 3563 pmap_invalidate_page(pmap, va); 3564} 3565 3566/* 3567 * Remove the given range of addresses from the specified map. 3568 * 3569 * It is assumed that the start and end are properly 3570 * rounded to the page size. 3571 */ 3572void 3573pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 3574{ 3575 struct rwlock *lock; 3576 vm_offset_t va, va_next; 3577 pml4_entry_t *pml4e; 3578 pdp_entry_t *pdpe; 3579 pd_entry_t ptpaddr, *pde; 3580 pt_entry_t *pte, PG_G, PG_V; 3581 struct spglist free; 3582 int anyvalid; 3583 3584 PG_G = pmap_global_bit(pmap); 3585 PG_V = pmap_valid_bit(pmap); 3586 3587 /* 3588 * Perform an unsynchronized read. This is, however, safe. 3589 */ 3590 if (pmap->pm_stats.resident_count == 0) 3591 return; 3592 3593 anyvalid = 0; 3594 SLIST_INIT(&free); 3595 3596 rw_rlock(&pvh_global_lock); 3597 PMAP_LOCK(pmap); 3598 3599 /* 3600 * special handling of removing one page. a very 3601 * common operation and easy to short circuit some 3602 * code. 3603 */ 3604 if (sva + PAGE_SIZE == eva) { 3605 pde = pmap_pde(pmap, sva); 3606 if (pde && (*pde & PG_PS) == 0) { 3607 pmap_remove_page(pmap, sva, pde, &free); 3608 goto out; 3609 } 3610 } 3611 3612 lock = NULL; 3613 for (; sva < eva; sva = va_next) { 3614 3615 if (pmap->pm_stats.resident_count == 0) 3616 break; 3617 3618 pml4e = pmap_pml4e(pmap, sva); 3619 if ((*pml4e & PG_V) == 0) { 3620 va_next = (sva + NBPML4) & ~PML4MASK; 3621 if (va_next < sva) 3622 va_next = eva; 3623 continue; 3624 } 3625 3626 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 3627 if ((*pdpe & PG_V) == 0) { 3628 va_next = (sva + NBPDP) & ~PDPMASK; 3629 if (va_next < sva) 3630 va_next = eva; 3631 continue; 3632 } 3633 3634 /* 3635 * Calculate index for next page table. 3636 */ 3637 va_next = (sva + NBPDR) & ~PDRMASK; 3638 if (va_next < sva) 3639 va_next = eva; 3640 3641 pde = pmap_pdpe_to_pde(pdpe, sva); 3642 ptpaddr = *pde; 3643 3644 /* 3645 * Weed out invalid mappings. 3646 */ 3647 if (ptpaddr == 0) 3648 continue; 3649 3650 /* 3651 * Check for large page. 3652 */ 3653 if ((ptpaddr & PG_PS) != 0) { 3654 /* 3655 * Are we removing the entire large page? If not, 3656 * demote the mapping and fall through. 3657 */ 3658 if (sva + NBPDR == va_next && eva >= va_next) { 3659 /* 3660 * The TLB entry for a PG_G mapping is 3661 * invalidated by pmap_remove_pde(). 3662 */ 3663 if ((ptpaddr & PG_G) == 0) 3664 anyvalid = 1; 3665 pmap_remove_pde(pmap, pde, sva, &free, &lock); 3666 continue; 3667 } else if (!pmap_demote_pde_locked(pmap, pde, sva, 3668 &lock)) { 3669 /* The large page mapping was destroyed. */ 3670 continue; 3671 } else 3672 ptpaddr = *pde; 3673 } 3674 3675 /* 3676 * Limit our scan to either the end of the va represented 3677 * by the current page table page, or to the end of the 3678 * range being removed. 3679 */ 3680 if (va_next > eva) 3681 va_next = eva; 3682 3683 va = va_next; 3684 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 3685 sva += PAGE_SIZE) { 3686 if (*pte == 0) { 3687 if (va != va_next) { 3688 pmap_invalidate_range(pmap, va, sva); 3689 va = va_next; 3690 } 3691 continue; 3692 } 3693 if ((*pte & PG_G) == 0) 3694 anyvalid = 1; 3695 else if (va == va_next) 3696 va = sva; 3697 if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free, 3698 &lock)) { 3699 sva += PAGE_SIZE; 3700 break; 3701 } 3702 } 3703 if (va != va_next) 3704 pmap_invalidate_range(pmap, va, sva); 3705 } 3706 if (lock != NULL) 3707 rw_wunlock(lock); 3708out: 3709 if (anyvalid) 3710 pmap_invalidate_all(pmap); 3711 rw_runlock(&pvh_global_lock); 3712 PMAP_UNLOCK(pmap); 3713 pmap_free_zero_pages(&free); 3714} 3715 3716/* 3717 * Routine: pmap_remove_all 3718 * Function: 3719 * Removes this physical page from 3720 * all physical maps in which it resides. 3721 * Reflects back modify bits to the pager. 3722 * 3723 * Notes: 3724 * Original versions of this routine were very 3725 * inefficient because they iteratively called 3726 * pmap_remove (slow...) 3727 */ 3728 3729void 3730pmap_remove_all(vm_page_t m) 3731{ 3732 struct md_page *pvh; 3733 pv_entry_t pv; 3734 pmap_t pmap; 3735 pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; 3736 pd_entry_t *pde; 3737 vm_offset_t va; 3738 struct spglist free; 3739 3740 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3741 ("pmap_remove_all: page %p is not managed", m)); 3742 SLIST_INIT(&free); 3743 rw_wlock(&pvh_global_lock); 3744 if ((m->flags & PG_FICTITIOUS) != 0) 3745 goto small_mappings; 3746 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3747 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 3748 pmap = PV_PMAP(pv); 3749 PMAP_LOCK(pmap); 3750 va = pv->pv_va; 3751 pde = pmap_pde(pmap, va); 3752 (void)pmap_demote_pde(pmap, pde, va); 3753 PMAP_UNLOCK(pmap); 3754 } 3755small_mappings: 3756 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3757 pmap = PV_PMAP(pv); 3758 PMAP_LOCK(pmap); 3759 PG_A = pmap_accessed_bit(pmap); 3760 PG_M = pmap_modified_bit(pmap); 3761 PG_RW = pmap_rw_bit(pmap); 3762 pmap_resident_count_dec(pmap, 1); 3763 pde = pmap_pde(pmap, pv->pv_va); 3764 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 3765 " a 2mpage in page %p's pv list", m)); 3766 pte = pmap_pde_to_pte(pde, pv->pv_va); 3767 tpte = pte_load_clear(pte); 3768 if (tpte & PG_W) 3769 pmap->pm_stats.wired_count--; 3770 if (tpte & PG_A) 3771 vm_page_aflag_set(m, PGA_REFERENCED); 3772 3773 /* 3774 * Update the vm_page_t clean and reference bits. 3775 */ 3776 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3777 vm_page_dirty(m); 3778 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); 3779 pmap_invalidate_page(pmap, pv->pv_va); 3780 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3781 m->md.pv_gen++; 3782 free_pv_entry(pmap, pv); 3783 PMAP_UNLOCK(pmap); 3784 } 3785 vm_page_aflag_clear(m, PGA_WRITEABLE); 3786 rw_wunlock(&pvh_global_lock); 3787 pmap_free_zero_pages(&free); 3788} 3789 3790/* 3791 * pmap_protect_pde: do the things to protect a 2mpage in a process 3792 */ 3793static boolean_t 3794pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 3795{ 3796 pd_entry_t newpde, oldpde; 3797 vm_offset_t eva, va; 3798 vm_page_t m; 3799 boolean_t anychanged; 3800 pt_entry_t PG_G, PG_M, PG_RW; 3801 3802 PG_G = pmap_global_bit(pmap); 3803 PG_M = pmap_modified_bit(pmap); 3804 PG_RW = pmap_rw_bit(pmap); 3805 3806 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3807 KASSERT((sva & PDRMASK) == 0, 3808 ("pmap_protect_pde: sva is not 2mpage aligned")); 3809 anychanged = FALSE; 3810retry: 3811 oldpde = newpde = *pde; 3812 if (oldpde & PG_MANAGED) { 3813 eva = sva + NBPDR; 3814 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 3815 va < eva; va += PAGE_SIZE, m++) 3816 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3817 vm_page_dirty(m); 3818 } 3819 if ((prot & VM_PROT_WRITE) == 0) 3820 newpde &= ~(PG_RW | PG_M); 3821 if ((prot & VM_PROT_EXECUTE) == 0) 3822 newpde |= pg_nx; 3823 if (newpde != oldpde) { 3824 if (!atomic_cmpset_long(pde, oldpde, newpde)) 3825 goto retry; 3826 if (oldpde & PG_G) 3827 pmap_invalidate_page(pmap, sva); 3828 else 3829 anychanged = TRUE; 3830 } 3831 return (anychanged); 3832} 3833 3834/* 3835 * Set the physical protection on the 3836 * specified range of this map as requested. 3837 */ 3838void 3839pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 3840{ 3841 vm_offset_t va_next; 3842 pml4_entry_t *pml4e; 3843 pdp_entry_t *pdpe; 3844 pd_entry_t ptpaddr, *pde; 3845 pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; 3846 boolean_t anychanged, pv_lists_locked; 3847 3848 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 3849 pmap_remove(pmap, sva, eva); 3850 return; 3851 } 3852 3853 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 3854 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 3855 return; 3856 3857 PG_G = pmap_global_bit(pmap); 3858 PG_M = pmap_modified_bit(pmap); 3859 PG_V = pmap_valid_bit(pmap); 3860 PG_RW = pmap_rw_bit(pmap); 3861 pv_lists_locked = FALSE; 3862resume: 3863 anychanged = FALSE; 3864 3865 PMAP_LOCK(pmap); 3866 for (; sva < eva; sva = va_next) { 3867 3868 pml4e = pmap_pml4e(pmap, sva); 3869 if ((*pml4e & PG_V) == 0) { 3870 va_next = (sva + NBPML4) & ~PML4MASK; 3871 if (va_next < sva) 3872 va_next = eva; 3873 continue; 3874 } 3875 3876 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 3877 if ((*pdpe & PG_V) == 0) { 3878 va_next = (sva + NBPDP) & ~PDPMASK; 3879 if (va_next < sva) 3880 va_next = eva; 3881 continue; 3882 } 3883 3884 va_next = (sva + NBPDR) & ~PDRMASK; 3885 if (va_next < sva) 3886 va_next = eva; 3887 3888 pde = pmap_pdpe_to_pde(pdpe, sva); 3889 ptpaddr = *pde; 3890 3891 /* 3892 * Weed out invalid mappings. 3893 */ 3894 if (ptpaddr == 0) 3895 continue; 3896 3897 /* 3898 * Check for large page. 3899 */ 3900 if ((ptpaddr & PG_PS) != 0) { 3901 /* 3902 * Are we protecting the entire large page? If not, 3903 * demote the mapping and fall through. 3904 */ 3905 if (sva + NBPDR == va_next && eva >= va_next) { 3906 /* 3907 * The TLB entry for a PG_G mapping is 3908 * invalidated by pmap_protect_pde(). 3909 */ 3910 if (pmap_protect_pde(pmap, pde, sva, prot)) 3911 anychanged = TRUE; 3912 continue; 3913 } else { 3914 if (!pv_lists_locked) { 3915 pv_lists_locked = TRUE; 3916 if (!rw_try_rlock(&pvh_global_lock)) { 3917 if (anychanged) 3918 pmap_invalidate_all( 3919 pmap); 3920 PMAP_UNLOCK(pmap); 3921 rw_rlock(&pvh_global_lock); 3922 goto resume; 3923 } 3924 } 3925 if (!pmap_demote_pde(pmap, pde, sva)) { 3926 /* 3927 * The large page mapping was 3928 * destroyed. 3929 */ 3930 continue; 3931 } 3932 } 3933 } 3934 3935 if (va_next > eva) 3936 va_next = eva; 3937 3938 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 3939 sva += PAGE_SIZE) { 3940 pt_entry_t obits, pbits; 3941 vm_page_t m; 3942 3943retry: 3944 obits = pbits = *pte; 3945 if ((pbits & PG_V) == 0) 3946 continue; 3947 3948 if ((prot & VM_PROT_WRITE) == 0) { 3949 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 3950 (PG_MANAGED | PG_M | PG_RW)) { 3951 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 3952 vm_page_dirty(m); 3953 } 3954 pbits &= ~(PG_RW | PG_M); 3955 } 3956 if ((prot & VM_PROT_EXECUTE) == 0) 3957 pbits |= pg_nx; 3958 3959 if (pbits != obits) { 3960 if (!atomic_cmpset_long(pte, obits, pbits)) 3961 goto retry; 3962 if (obits & PG_G) 3963 pmap_invalidate_page(pmap, sva); 3964 else 3965 anychanged = TRUE; 3966 } 3967 } 3968 } 3969 if (anychanged) 3970 pmap_invalidate_all(pmap); 3971 if (pv_lists_locked) 3972 rw_runlock(&pvh_global_lock); 3973 PMAP_UNLOCK(pmap); 3974} 3975 3976/* 3977 * Tries to promote the 512, contiguous 4KB page mappings that are within a 3978 * single page table page (PTP) to a single 2MB page mapping. For promotion 3979 * to occur, two conditions must be met: (1) the 4KB page mappings must map 3980 * aligned, contiguous physical memory and (2) the 4KB page mappings must have 3981 * identical characteristics. 3982 */ 3983static void 3984pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, 3985 struct rwlock **lockp) 3986{ 3987 pd_entry_t newpde; 3988 pt_entry_t *firstpte, oldpte, pa, *pte; 3989 pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V; 3990 vm_offset_t oldpteva; 3991 vm_page_t mpte; 3992 int PG_PTE_CACHE; 3993 3994 PG_A = pmap_accessed_bit(pmap); 3995 PG_G = pmap_global_bit(pmap); 3996 PG_M = pmap_modified_bit(pmap); 3997 PG_V = pmap_valid_bit(pmap); 3998 PG_RW = pmap_rw_bit(pmap); 3999 PG_PTE_CACHE = pmap_cache_mask(pmap, 0); 4000 4001 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4002 4003 /* 4004 * Examine the first PTE in the specified PTP. Abort if this PTE is 4005 * either invalid, unused, or does not map the first 4KB physical page 4006 * within a 2MB page. 4007 */ 4008 firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); 4009setpde: 4010 newpde = *firstpte; 4011 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 4012 atomic_add_long(&pmap_pde_p_failures, 1); 4013 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 4014 " in pmap %p", va, pmap); 4015 return; 4016 } 4017 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 4018 /* 4019 * When PG_M is already clear, PG_RW can be cleared without 4020 * a TLB invalidation. 4021 */ 4022 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW)) 4023 goto setpde; 4024 newpde &= ~PG_RW; 4025 } 4026 4027 /* 4028 * Examine each of the other PTEs in the specified PTP. Abort if this 4029 * PTE maps an unexpected 4KB physical page or does not have identical 4030 * characteristics to the first PTE. 4031 */ 4032 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; 4033 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 4034setpte: 4035 oldpte = *pte; 4036 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 4037 atomic_add_long(&pmap_pde_p_failures, 1); 4038 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 4039 " in pmap %p", va, pmap); 4040 return; 4041 } 4042 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 4043 /* 4044 * When PG_M is already clear, PG_RW can be cleared 4045 * without a TLB invalidation. 4046 */ 4047 if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW)) 4048 goto setpte; 4049 oldpte &= ~PG_RW; 4050 oldpteva = (oldpte & PG_FRAME & PDRMASK) | 4051 (va & ~PDRMASK); 4052 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" 4053 " in pmap %p", oldpteva, pmap); 4054 } 4055 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 4056 atomic_add_long(&pmap_pde_p_failures, 1); 4057 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" 4058 " in pmap %p", va, pmap); 4059 return; 4060 } 4061 pa -= PAGE_SIZE; 4062 } 4063 4064 /* 4065 * Save the page table page in its current state until the PDE 4066 * mapping the superpage is demoted by pmap_demote_pde() or 4067 * destroyed by pmap_remove_pde(). 4068 */ 4069 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 4070 KASSERT(mpte >= vm_page_array && 4071 mpte < &vm_page_array[vm_page_array_size], 4072 ("pmap_promote_pde: page table page is out of range")); 4073 KASSERT(mpte->pindex == pmap_pde_pindex(va), 4074 ("pmap_promote_pde: page table page's pindex is wrong")); 4075 if (pmap_insert_pt_page(pmap, mpte)) { 4076 atomic_add_long(&pmap_pde_p_failures, 1); 4077 CTR2(KTR_PMAP, 4078 "pmap_promote_pde: failure for va %#lx in pmap %p", va, 4079 pmap); 4080 return; 4081 } 4082 4083 /* 4084 * Promote the pv entries. 4085 */ 4086 if ((newpde & PG_MANAGED) != 0) 4087 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); 4088 4089 /* 4090 * Propagate the PAT index to its proper position. 4091 */ 4092 newpde = pmap_swap_pat(pmap, newpde); 4093 4094 /* 4095 * Map the superpage. 4096 */ 4097 if (workaround_erratum383) 4098 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 4099 else 4100 pde_store(pde, PG_PS | newpde); 4101 4102 atomic_add_long(&pmap_pde_promotions, 1); 4103 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" 4104 " in pmap %p", va, pmap); 4105} 4106 4107/* 4108 * Insert the given physical page (p) at 4109 * the specified virtual address (v) in the 4110 * target physical map with the protection requested. 4111 * 4112 * If specified, the page will be wired down, meaning 4113 * that the related pte can not be reclaimed. 4114 * 4115 * NB: This is the only routine which MAY NOT lazy-evaluate 4116 * or lose information. That is, this routine must actually 4117 * insert this page into the given map NOW. 4118 */ 4119void 4120pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m, 4121 vm_prot_t prot, boolean_t wired) 4122{ 4123 struct rwlock *lock; 4124 pd_entry_t *pde; 4125 pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; 4126 pt_entry_t newpte, origpte; 4127 pv_entry_t pv; 4128 vm_paddr_t opa, pa; 4129 vm_page_t mpte, om; 4130 4131 PG_A = pmap_accessed_bit(pmap); 4132 PG_G = pmap_global_bit(pmap); 4133 PG_M = pmap_modified_bit(pmap); 4134 PG_V = pmap_valid_bit(pmap); 4135 PG_RW = pmap_rw_bit(pmap); 4136 4137 va = trunc_page(va); 4138 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 4139 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 4140 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", 4141 va)); 4142 KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || 4143 va >= kmi.clean_eva, 4144 ("pmap_enter: managed mapping within the clean submap")); 4145 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) 4146 VM_OBJECT_ASSERT_WLOCKED(m->object); 4147 pa = VM_PAGE_TO_PHYS(m); 4148 newpte = (pt_entry_t)(pa | PG_A | PG_V); 4149 if ((access & VM_PROT_WRITE) != 0) 4150 newpte |= PG_M; 4151 if ((prot & VM_PROT_WRITE) != 0) 4152 newpte |= PG_RW; 4153 KASSERT((newpte & (PG_M | PG_RW)) != PG_M, 4154 ("pmap_enter: access includes VM_PROT_WRITE but prot doesn't")); 4155 if ((prot & VM_PROT_EXECUTE) == 0) 4156 newpte |= pg_nx; 4157 if (wired) 4158 newpte |= PG_W; 4159 if (va < VM_MAXUSER_ADDRESS) 4160 newpte |= PG_U; 4161 if (pmap == kernel_pmap) 4162 newpte |= PG_G; 4163 newpte |= pmap_cache_bits(pmap, m->md.pat_mode, 0); 4164 4165 /* 4166 * Set modified bit gratuitously for writeable mappings if 4167 * the page is unmanaged. We do not want to take a fault 4168 * to do the dirty bit accounting for these mappings. 4169 */ 4170 if ((m->oflags & VPO_UNMANAGED) != 0) { 4171 if ((newpte & PG_RW) != 0) 4172 newpte |= PG_M; 4173 } 4174 4175 mpte = NULL; 4176 4177 lock = NULL; 4178 rw_rlock(&pvh_global_lock); 4179 PMAP_LOCK(pmap); 4180 4181 /* 4182 * In the case that a page table page is not 4183 * resident, we are creating it here. 4184 */ 4185retry: 4186 pde = pmap_pde(pmap, va); 4187 if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || 4188 pmap_demote_pde_locked(pmap, pde, va, &lock))) { 4189 pte = pmap_pde_to_pte(pde, va); 4190 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { 4191 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 4192 mpte->wire_count++; 4193 } 4194 } else if (va < VM_MAXUSER_ADDRESS) { 4195 /* 4196 * Here if the pte page isn't mapped, or if it has been 4197 * deallocated. 4198 */ 4199 mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), &lock); 4200 goto retry; 4201 } else 4202 panic("pmap_enter: invalid page directory va=%#lx", va); 4203 4204 origpte = *pte; 4205 4206 /* 4207 * Is the specified virtual address already mapped? 4208 */ 4209 if ((origpte & PG_V) != 0) { 4210 /* 4211 * Wiring change, just update stats. We don't worry about 4212 * wiring PT pages as they remain resident as long as there 4213 * are valid mappings in them. Hence, if a user page is wired, 4214 * the PT page will be also. 4215 */ 4216 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) 4217 pmap->pm_stats.wired_count++; 4218 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) 4219 pmap->pm_stats.wired_count--; 4220 4221 /* 4222 * Remove the extra PT page reference. 4223 */ 4224 if (mpte != NULL) { 4225 mpte->wire_count--; 4226 KASSERT(mpte->wire_count > 0, 4227 ("pmap_enter: missing reference to page table page," 4228 " va: 0x%lx", va)); 4229 } 4230 4231 /* 4232 * Has the physical page changed? 4233 */ 4234 opa = origpte & PG_FRAME; 4235 if (opa == pa) { 4236 /* 4237 * No, might be a protection or wiring change. 4238 */ 4239 if ((origpte & PG_MANAGED) != 0) { 4240 newpte |= PG_MANAGED; 4241 if ((newpte & PG_RW) != 0) 4242 vm_page_aflag_set(m, PGA_WRITEABLE); 4243 } 4244 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) 4245 goto unchanged; 4246 goto validate; 4247 } 4248 } else { 4249 /* 4250 * Increment the counters. 4251 */ 4252 if ((newpte & PG_W) != 0) 4253 pmap->pm_stats.wired_count++; 4254 pmap_resident_count_inc(pmap, 1); 4255 } 4256 4257 /* 4258 * Enter on the PV list if part of our managed memory. 4259 */ 4260 if ((m->oflags & VPO_UNMANAGED) == 0) { 4261 newpte |= PG_MANAGED; 4262 pv = get_pv_entry(pmap, &lock); 4263 pv->pv_va = va; 4264 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); 4265 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4266 m->md.pv_gen++; 4267 if ((newpte & PG_RW) != 0) 4268 vm_page_aflag_set(m, PGA_WRITEABLE); 4269 } 4270 4271 /* 4272 * Update the PTE. 4273 */ 4274 if ((origpte & PG_V) != 0) { 4275validate: 4276 origpte = pte_load_store(pte, newpte); 4277 opa = origpte & PG_FRAME; 4278 if (opa != pa) { 4279 if ((origpte & PG_MANAGED) != 0) { 4280 om = PHYS_TO_VM_PAGE(opa); 4281 if ((origpte & (PG_M | PG_RW)) == (PG_M | 4282 PG_RW)) 4283 vm_page_dirty(om); 4284 if ((origpte & PG_A) != 0) 4285 vm_page_aflag_set(om, PGA_REFERENCED); 4286 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); 4287 pmap_pvh_free(&om->md, pmap, va); 4288 if ((om->aflags & PGA_WRITEABLE) != 0 && 4289 TAILQ_EMPTY(&om->md.pv_list) && 4290 ((om->flags & PG_FICTITIOUS) != 0 || 4291 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 4292 vm_page_aflag_clear(om, PGA_WRITEABLE); 4293 } 4294 } else if ((newpte & PG_M) == 0 && (origpte & (PG_M | 4295 PG_RW)) == (PG_M | PG_RW)) { 4296 if ((origpte & PG_MANAGED) != 0) 4297 vm_page_dirty(m); 4298 4299 /* 4300 * Although the PTE may still have PG_RW set, TLB 4301 * invalidation may nonetheless be required because 4302 * the PTE no longer has PG_M set. 4303 */ 4304 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { 4305 /* 4306 * This PTE change does not require TLB invalidation. 4307 */ 4308 goto unchanged; 4309 } 4310 if ((origpte & PG_A) != 0) 4311 pmap_invalidate_page(pmap, va); 4312 } else 4313 pte_store(pte, newpte); 4314 4315unchanged: 4316 4317 /* 4318 * If both the page table page and the reservation are fully 4319 * populated, then attempt promotion. 4320 */ 4321 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 4322 pmap_ps_enabled(pmap) && 4323 (m->flags & PG_FICTITIOUS) == 0 && 4324 vm_reserv_level_iffullpop(m) == 0) 4325 pmap_promote_pde(pmap, pde, va, &lock); 4326 4327 if (lock != NULL) 4328 rw_wunlock(lock); 4329 rw_runlock(&pvh_global_lock); 4330 PMAP_UNLOCK(pmap); 4331} 4332 4333/* 4334 * Tries to create a 2MB page mapping. Returns TRUE if successful and FALSE 4335 * otherwise. Fails if (1) a page table page cannot be allocated without 4336 * blocking, (2) a mapping already exists at the specified virtual address, or 4337 * (3) a pv entry cannot be allocated without reclaiming another pv entry. 4338 */ 4339static boolean_t 4340pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 4341 struct rwlock **lockp) 4342{ 4343 pd_entry_t *pde, newpde; 4344 pt_entry_t PG_V; 4345 vm_page_t mpde; 4346 struct spglist free; 4347 4348 PG_V = pmap_valid_bit(pmap); 4349 rw_assert(&pvh_global_lock, RA_LOCKED); 4350 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4351 4352 if ((mpde = pmap_allocpde(pmap, va, NULL)) == NULL) { 4353 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 4354 " in pmap %p", va, pmap); 4355 return (FALSE); 4356 } 4357 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpde)); 4358 pde = &pde[pmap_pde_index(va)]; 4359 if ((*pde & PG_V) != 0) { 4360 KASSERT(mpde->wire_count > 1, 4361 ("pmap_enter_pde: mpde's wire count is too low")); 4362 mpde->wire_count--; 4363 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 4364 " in pmap %p", va, pmap); 4365 return (FALSE); 4366 } 4367 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | 4368 PG_PS | PG_V; 4369 if ((m->oflags & VPO_UNMANAGED) == 0) { 4370 newpde |= PG_MANAGED; 4371 4372 /* 4373 * Abort this mapping if its PV entry could not be created. 4374 */ 4375 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m), 4376 lockp)) { 4377 SLIST_INIT(&free); 4378 if (pmap_unwire_ptp(pmap, va, mpde, &free)) { 4379 pmap_invalidate_page(pmap, va); 4380 pmap_free_zero_pages(&free); 4381 } 4382 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 4383 " in pmap %p", va, pmap); 4384 return (FALSE); 4385 } 4386 } 4387 if ((prot & VM_PROT_EXECUTE) == 0) 4388 newpde |= pg_nx; 4389 if (va < VM_MAXUSER_ADDRESS) 4390 newpde |= PG_U; 4391 4392 /* 4393 * Increment counters. 4394 */ 4395 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); 4396 4397 /* 4398 * Map the superpage. 4399 */ 4400 pde_store(pde, newpde); 4401 4402 atomic_add_long(&pmap_pde_mappings, 1); 4403 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 4404 " in pmap %p", va, pmap); 4405 return (TRUE); 4406} 4407 4408/* 4409 * Maps a sequence of resident pages belonging to the same object. 4410 * The sequence begins with the given page m_start. This page is 4411 * mapped at the given virtual address start. Each subsequent page is 4412 * mapped at a virtual address that is offset from start by the same 4413 * amount as the page is offset from m_start within the object. The 4414 * last page in the sequence is the page with the largest offset from 4415 * m_start that can be mapped at a virtual address less than the given 4416 * virtual address end. Not every virtual page between start and end 4417 * is mapped; only those for which a resident page exists with the 4418 * corresponding offset from m_start are mapped. 4419 */ 4420void 4421pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 4422 vm_page_t m_start, vm_prot_t prot) 4423{ 4424 struct rwlock *lock; 4425 vm_offset_t va; 4426 vm_page_t m, mpte; 4427 vm_pindex_t diff, psize; 4428 4429 VM_OBJECT_ASSERT_LOCKED(m_start->object); 4430 4431 psize = atop(end - start); 4432 mpte = NULL; 4433 m = m_start; 4434 lock = NULL; 4435 rw_rlock(&pvh_global_lock); 4436 PMAP_LOCK(pmap); 4437 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 4438 va = start + ptoa(diff); 4439 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 4440 m->psind == 1 && pmap_ps_enabled(pmap) && 4441 pmap_enter_pde(pmap, va, m, prot, &lock)) 4442 m = &m[NBPDR / PAGE_SIZE - 1]; 4443 else 4444 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 4445 mpte, &lock); 4446 m = TAILQ_NEXT(m, listq); 4447 } 4448 if (lock != NULL) 4449 rw_wunlock(lock); 4450 rw_runlock(&pvh_global_lock); 4451 PMAP_UNLOCK(pmap); 4452} 4453 4454/* 4455 * this code makes some *MAJOR* assumptions: 4456 * 1. Current pmap & pmap exists. 4457 * 2. Not wired. 4458 * 3. Read access. 4459 * 4. No page table pages. 4460 * but is *MUCH* faster than pmap_enter... 4461 */ 4462 4463void 4464pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 4465{ 4466 struct rwlock *lock; 4467 4468 lock = NULL; 4469 rw_rlock(&pvh_global_lock); 4470 PMAP_LOCK(pmap); 4471 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); 4472 if (lock != NULL) 4473 rw_wunlock(lock); 4474 rw_runlock(&pvh_global_lock); 4475 PMAP_UNLOCK(pmap); 4476} 4477 4478static vm_page_t 4479pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 4480 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) 4481{ 4482 struct spglist free; 4483 pt_entry_t *pte, PG_V; 4484 vm_paddr_t pa; 4485 4486 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 4487 (m->oflags & VPO_UNMANAGED) != 0, 4488 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 4489 PG_V = pmap_valid_bit(pmap); 4490 rw_assert(&pvh_global_lock, RA_LOCKED); 4491 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 4492 4493 /* 4494 * In the case that a page table page is not 4495 * resident, we are creating it here. 4496 */ 4497 if (va < VM_MAXUSER_ADDRESS) { 4498 vm_pindex_t ptepindex; 4499 pd_entry_t *ptepa; 4500 4501 /* 4502 * Calculate pagetable page index 4503 */ 4504 ptepindex = pmap_pde_pindex(va); 4505 if (mpte && (mpte->pindex == ptepindex)) { 4506 mpte->wire_count++; 4507 } else { 4508 /* 4509 * Get the page directory entry 4510 */ 4511 ptepa = pmap_pde(pmap, va); 4512 4513 /* 4514 * If the page table page is mapped, we just increment 4515 * the hold count, and activate it. Otherwise, we 4516 * attempt to allocate a page table page. If this 4517 * attempt fails, we don't retry. Instead, we give up. 4518 */ 4519 if (ptepa && (*ptepa & PG_V) != 0) { 4520 if (*ptepa & PG_PS) 4521 return (NULL); 4522 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); 4523 mpte->wire_count++; 4524 } else { 4525 /* 4526 * Pass NULL instead of the PV list lock 4527 * pointer, because we don't intend to sleep. 4528 */ 4529 mpte = _pmap_allocpte(pmap, ptepindex, NULL); 4530 if (mpte == NULL) 4531 return (mpte); 4532 } 4533 } 4534 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); 4535 pte = &pte[pmap_pte_index(va)]; 4536 } else { 4537 mpte = NULL; 4538 pte = vtopte(va); 4539 } 4540 if (*pte) { 4541 if (mpte != NULL) { 4542 mpte->wire_count--; 4543 mpte = NULL; 4544 } 4545 return (mpte); 4546 } 4547 4548 /* 4549 * Enter on the PV list if part of our managed memory. 4550 */ 4551 if ((m->oflags & VPO_UNMANAGED) == 0 && 4552 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { 4553 if (mpte != NULL) { 4554 SLIST_INIT(&free); 4555 if (pmap_unwire_ptp(pmap, va, mpte, &free)) { 4556 pmap_invalidate_page(pmap, va); 4557 pmap_free_zero_pages(&free); 4558 } 4559 mpte = NULL; 4560 } 4561 return (mpte); 4562 } 4563 4564 /* 4565 * Increment counters 4566 */ 4567 pmap_resident_count_inc(pmap, 1); 4568 4569 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0); 4570 if ((prot & VM_PROT_EXECUTE) == 0) 4571 pa |= pg_nx; 4572 4573 /* 4574 * Now validate mapping with RO protection 4575 */ 4576 if ((m->oflags & VPO_UNMANAGED) != 0) 4577 pte_store(pte, pa | PG_V | PG_U); 4578 else 4579 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); 4580 return (mpte); 4581} 4582 4583/* 4584 * Make a temporary mapping for a physical address. This is only intended 4585 * to be used for panic dumps. 4586 */ 4587void * 4588pmap_kenter_temporary(vm_paddr_t pa, int i) 4589{ 4590 vm_offset_t va; 4591 4592 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 4593 pmap_kenter(va, pa); 4594 invlpg(va); 4595 return ((void *)crashdumpmap); 4596} 4597 4598/* 4599 * This code maps large physical mmap regions into the 4600 * processor address space. Note that some shortcuts 4601 * are taken, but the code works. 4602 */ 4603void 4604pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 4605 vm_pindex_t pindex, vm_size_t size) 4606{ 4607 pd_entry_t *pde; 4608 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 4609 vm_paddr_t pa, ptepa; 4610 vm_page_t p, pdpg; 4611 int pat_mode; 4612 4613 PG_A = pmap_accessed_bit(pmap); 4614 PG_M = pmap_modified_bit(pmap); 4615 PG_V = pmap_valid_bit(pmap); 4616 PG_RW = pmap_rw_bit(pmap); 4617 4618 VM_OBJECT_ASSERT_WLOCKED(object); 4619 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 4620 ("pmap_object_init_pt: non-device object")); 4621 if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 4622 if (!pmap_ps_enabled(pmap)) 4623 return; 4624 if (!vm_object_populate(object, pindex, pindex + atop(size))) 4625 return; 4626 p = vm_page_lookup(object, pindex); 4627 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4628 ("pmap_object_init_pt: invalid page %p", p)); 4629 pat_mode = p->md.pat_mode; 4630 4631 /* 4632 * Abort the mapping if the first page is not physically 4633 * aligned to a 2MB page boundary. 4634 */ 4635 ptepa = VM_PAGE_TO_PHYS(p); 4636 if (ptepa & (NBPDR - 1)) 4637 return; 4638 4639 /* 4640 * Skip the first page. Abort the mapping if the rest of 4641 * the pages are not physically contiguous or have differing 4642 * memory attributes. 4643 */ 4644 p = TAILQ_NEXT(p, listq); 4645 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 4646 pa += PAGE_SIZE) { 4647 KASSERT(p->valid == VM_PAGE_BITS_ALL, 4648 ("pmap_object_init_pt: invalid page %p", p)); 4649 if (pa != VM_PAGE_TO_PHYS(p) || 4650 pat_mode != p->md.pat_mode) 4651 return; 4652 p = TAILQ_NEXT(p, listq); 4653 } 4654 4655 /* 4656 * Map using 2MB pages. Since "ptepa" is 2M aligned and 4657 * "size" is a multiple of 2M, adding the PAT setting to "pa" 4658 * will not affect the termination of this loop. 4659 */ 4660 PMAP_LOCK(pmap); 4661 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); 4662 pa < ptepa + size; pa += NBPDR) { 4663 pdpg = pmap_allocpde(pmap, addr, NULL); 4664 if (pdpg == NULL) { 4665 /* 4666 * The creation of mappings below is only an 4667 * optimization. If a page directory page 4668 * cannot be allocated without blocking, 4669 * continue on to the next mapping rather than 4670 * blocking. 4671 */ 4672 addr += NBPDR; 4673 continue; 4674 } 4675 pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); 4676 pde = &pde[pmap_pde_index(addr)]; 4677 if ((*pde & PG_V) == 0) { 4678 pde_store(pde, pa | PG_PS | PG_M | PG_A | 4679 PG_U | PG_RW | PG_V); 4680 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); 4681 atomic_add_long(&pmap_pde_mappings, 1); 4682 } else { 4683 /* Continue on if the PDE is already valid. */ 4684 pdpg->wire_count--; 4685 KASSERT(pdpg->wire_count > 0, 4686 ("pmap_object_init_pt: missing reference " 4687 "to page directory page, va: 0x%lx", addr)); 4688 } 4689 addr += NBPDR; 4690 } 4691 PMAP_UNLOCK(pmap); 4692 } 4693} 4694 4695/* 4696 * Routine: pmap_change_wiring 4697 * Function: Change the wiring attribute for a map/virtual-address 4698 * pair. 4699 * In/out conditions: 4700 * The mapping must already exist in the pmap. 4701 */ 4702void 4703pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) 4704{ 4705 pd_entry_t *pde; 4706 pt_entry_t *pte; 4707 boolean_t pv_lists_locked; 4708 4709 pv_lists_locked = FALSE; 4710 4711 /* 4712 * Wiring is not a hardware characteristic so there is no need to 4713 * invalidate TLB. 4714 */ 4715retry: 4716 PMAP_LOCK(pmap); 4717 pde = pmap_pde(pmap, va); 4718 if ((*pde & PG_PS) != 0) { 4719 if (!wired != ((*pde & PG_W) == 0)) { 4720 if (!pv_lists_locked) { 4721 pv_lists_locked = TRUE; 4722 if (!rw_try_rlock(&pvh_global_lock)) { 4723 PMAP_UNLOCK(pmap); 4724 rw_rlock(&pvh_global_lock); 4725 goto retry; 4726 } 4727 } 4728 if (!pmap_demote_pde(pmap, pde, va)) 4729 panic("pmap_change_wiring: demotion failed"); 4730 } else 4731 goto out; 4732 } 4733 pte = pmap_pde_to_pte(pde, va); 4734 if (wired && (*pte & PG_W) == 0) { 4735 pmap->pm_stats.wired_count++; 4736 atomic_set_long(pte, PG_W); 4737 } else if (!wired && (*pte & PG_W) != 0) { 4738 pmap->pm_stats.wired_count--; 4739 atomic_clear_long(pte, PG_W); 4740 } 4741out: 4742 if (pv_lists_locked) 4743 rw_runlock(&pvh_global_lock); 4744 PMAP_UNLOCK(pmap); 4745} 4746 4747/* 4748 * Copy the range specified by src_addr/len 4749 * from the source map to the range dst_addr/len 4750 * in the destination map. 4751 * 4752 * This routine is only advisory and need not do anything. 4753 */ 4754 4755void 4756pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 4757 vm_offset_t src_addr) 4758{ 4759 struct rwlock *lock; 4760 struct spglist free; 4761 vm_offset_t addr; 4762 vm_offset_t end_addr = src_addr + len; 4763 vm_offset_t va_next; 4764 pt_entry_t PG_A, PG_M, PG_V; 4765 4766 if (dst_addr != src_addr) 4767 return; 4768 4769 if (dst_pmap->pm_type != src_pmap->pm_type) 4770 return; 4771 4772 /* 4773 * EPT page table entries that require emulation of A/D bits are 4774 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although 4775 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit 4776 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT 4777 * implementations flag an EPT misconfiguration for exec-only 4778 * mappings we skip this function entirely for emulated pmaps. 4779 */ 4780 if (pmap_emulate_ad_bits(dst_pmap)) 4781 return; 4782 4783 lock = NULL; 4784 rw_rlock(&pvh_global_lock); 4785 if (dst_pmap < src_pmap) { 4786 PMAP_LOCK(dst_pmap); 4787 PMAP_LOCK(src_pmap); 4788 } else { 4789 PMAP_LOCK(src_pmap); 4790 PMAP_LOCK(dst_pmap); 4791 } 4792 4793 PG_A = pmap_accessed_bit(dst_pmap); 4794 PG_M = pmap_modified_bit(dst_pmap); 4795 PG_V = pmap_valid_bit(dst_pmap); 4796 4797 for (addr = src_addr; addr < end_addr; addr = va_next) { 4798 pt_entry_t *src_pte, *dst_pte; 4799 vm_page_t dstmpde, dstmpte, srcmpte; 4800 pml4_entry_t *pml4e; 4801 pdp_entry_t *pdpe; 4802 pd_entry_t srcptepaddr, *pde; 4803 4804 KASSERT(addr < UPT_MIN_ADDRESS, 4805 ("pmap_copy: invalid to pmap_copy page tables")); 4806 4807 pml4e = pmap_pml4e(src_pmap, addr); 4808 if ((*pml4e & PG_V) == 0) { 4809 va_next = (addr + NBPML4) & ~PML4MASK; 4810 if (va_next < addr) 4811 va_next = end_addr; 4812 continue; 4813 } 4814 4815 pdpe = pmap_pml4e_to_pdpe(pml4e, addr); 4816 if ((*pdpe & PG_V) == 0) { 4817 va_next = (addr + NBPDP) & ~PDPMASK; 4818 if (va_next < addr) 4819 va_next = end_addr; 4820 continue; 4821 } 4822 4823 va_next = (addr + NBPDR) & ~PDRMASK; 4824 if (va_next < addr) 4825 va_next = end_addr; 4826 4827 pde = pmap_pdpe_to_pde(pdpe, addr); 4828 srcptepaddr = *pde; 4829 if (srcptepaddr == 0) 4830 continue; 4831 4832 if (srcptepaddr & PG_PS) { 4833 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 4834 continue; 4835 dstmpde = pmap_allocpde(dst_pmap, addr, NULL); 4836 if (dstmpde == NULL) 4837 break; 4838 pde = (pd_entry_t *) 4839 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde)); 4840 pde = &pde[pmap_pde_index(addr)]; 4841 if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || 4842 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & 4843 PG_PS_FRAME, &lock))) { 4844 *pde = srcptepaddr & ~PG_W; 4845 pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE); 4846 } else 4847 dstmpde->wire_count--; 4848 continue; 4849 } 4850 4851 srcptepaddr &= PG_FRAME; 4852 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 4853 KASSERT(srcmpte->wire_count > 0, 4854 ("pmap_copy: source page table page is unused")); 4855 4856 if (va_next > end_addr) 4857 va_next = end_addr; 4858 4859 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); 4860 src_pte = &src_pte[pmap_pte_index(addr)]; 4861 dstmpte = NULL; 4862 while (addr < va_next) { 4863 pt_entry_t ptetemp; 4864 ptetemp = *src_pte; 4865 /* 4866 * we only virtual copy managed pages 4867 */ 4868 if ((ptetemp & PG_MANAGED) != 0) { 4869 if (dstmpte != NULL && 4870 dstmpte->pindex == pmap_pde_pindex(addr)) 4871 dstmpte->wire_count++; 4872 else if ((dstmpte = pmap_allocpte(dst_pmap, 4873 addr, NULL)) == NULL) 4874 goto out; 4875 dst_pte = (pt_entry_t *) 4876 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); 4877 dst_pte = &dst_pte[pmap_pte_index(addr)]; 4878 if (*dst_pte == 0 && 4879 pmap_try_insert_pv_entry(dst_pmap, addr, 4880 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), 4881 &lock)) { 4882 /* 4883 * Clear the wired, modified, and 4884 * accessed (referenced) bits 4885 * during the copy. 4886 */ 4887 *dst_pte = ptetemp & ~(PG_W | PG_M | 4888 PG_A); 4889 pmap_resident_count_inc(dst_pmap, 1); 4890 } else { 4891 SLIST_INIT(&free); 4892 if (pmap_unwire_ptp(dst_pmap, addr, 4893 dstmpte, &free)) { 4894 pmap_invalidate_page(dst_pmap, 4895 addr); 4896 pmap_free_zero_pages(&free); 4897 } 4898 goto out; 4899 } 4900 if (dstmpte->wire_count >= srcmpte->wire_count) 4901 break; 4902 } 4903 addr += PAGE_SIZE; 4904 src_pte++; 4905 } 4906 } 4907out: 4908 if (lock != NULL) 4909 rw_wunlock(lock); 4910 rw_runlock(&pvh_global_lock); 4911 PMAP_UNLOCK(src_pmap); 4912 PMAP_UNLOCK(dst_pmap); 4913} 4914 4915/* 4916 * pmap_zero_page zeros the specified hardware page by mapping 4917 * the page into KVM and using bzero to clear its contents. 4918 */ 4919void 4920pmap_zero_page(vm_page_t m) 4921{ 4922 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 4923 4924 pagezero((void *)va); 4925} 4926 4927/* 4928 * pmap_zero_page_area zeros the specified hardware page by mapping 4929 * the page into KVM and using bzero to clear its contents. 4930 * 4931 * off and size may not cover an area beyond a single hardware page. 4932 */ 4933void 4934pmap_zero_page_area(vm_page_t m, int off, int size) 4935{ 4936 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 4937 4938 if (off == 0 && size == PAGE_SIZE) 4939 pagezero((void *)va); 4940 else 4941 bzero((char *)va + off, size); 4942} 4943 4944/* 4945 * pmap_zero_page_idle zeros the specified hardware page by mapping 4946 * the page into KVM and using bzero to clear its contents. This 4947 * is intended to be called from the vm_pagezero process only and 4948 * outside of Giant. 4949 */ 4950void 4951pmap_zero_page_idle(vm_page_t m) 4952{ 4953 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); 4954 4955 pagezero((void *)va); 4956} 4957 4958/* 4959 * pmap_copy_page copies the specified (machine independent) 4960 * page by mapping the page into virtual memory and using 4961 * bcopy to copy the page, one machine dependent page at a 4962 * time. 4963 */ 4964void 4965pmap_copy_page(vm_page_t msrc, vm_page_t mdst) 4966{ 4967 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); 4968 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); 4969 4970 pagecopy((void *)src, (void *)dst); 4971} 4972 4973int unmapped_buf_allowed = 1; 4974 4975void 4976pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 4977 vm_offset_t b_offset, int xfersize) 4978{ 4979 void *a_cp, *b_cp; 4980 vm_page_t m_a, m_b; 4981 vm_paddr_t p_a, p_b; 4982 pt_entry_t *pte; 4983 vm_offset_t a_pg_offset, b_pg_offset; 4984 int cnt; 4985 boolean_t pinned; 4986 4987 pinned = FALSE; 4988 while (xfersize > 0) { 4989 a_pg_offset = a_offset & PAGE_MASK; 4990 m_a = ma[a_offset >> PAGE_SHIFT]; 4991 p_a = m_a->phys_addr; 4992 b_pg_offset = b_offset & PAGE_MASK; 4993 m_b = mb[b_offset >> PAGE_SHIFT]; 4994 p_b = m_b->phys_addr; 4995 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 4996 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 4997 if (__predict_false(p_a < DMAP_MIN_ADDRESS || 4998 p_a > DMAP_MIN_ADDRESS + dmaplimit)) { 4999 mtx_lock(&cpage_lock); 5000 sched_pin(); 5001 pinned = TRUE; 5002 pte = vtopte(cpage_a); 5003 *pte = p_a | X86_PG_A | X86_PG_V | 5004 pmap_cache_bits(kernel_pmap, m_a->md.pat_mode, 0); 5005 invlpg(cpage_a); 5006 a_cp = (char *)cpage_a + a_pg_offset; 5007 } else { 5008 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; 5009 } 5010 if (__predict_false(p_b < DMAP_MIN_ADDRESS || 5011 p_b > DMAP_MIN_ADDRESS + dmaplimit)) { 5012 if (!pinned) { 5013 mtx_lock(&cpage_lock); 5014 sched_pin(); 5015 pinned = TRUE; 5016 } 5017 pte = vtopte(cpage_b); 5018 *pte = p_b | X86_PG_A | X86_PG_M | X86_PG_RW | 5019 X86_PG_V | pmap_cache_bits(kernel_pmap, 5020 m_b->md.pat_mode, 0); 5021 invlpg(cpage_b); 5022 b_cp = (char *)cpage_b + b_pg_offset; 5023 } else { 5024 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; 5025 } 5026 bcopy(a_cp, b_cp, cnt); 5027 if (__predict_false(pinned)) { 5028 sched_unpin(); 5029 mtx_unlock(&cpage_lock); 5030 pinned = FALSE; 5031 } 5032 a_offset += cnt; 5033 b_offset += cnt; 5034 xfersize -= cnt; 5035 } 5036} 5037 5038/* 5039 * Returns true if the pmap's pv is one of the first 5040 * 16 pvs linked to from this page. This count may 5041 * be changed upwards or downwards in the future; it 5042 * is only necessary that true be returned for a small 5043 * subset of pmaps for proper page aging. 5044 */ 5045boolean_t 5046pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 5047{ 5048 struct md_page *pvh; 5049 struct rwlock *lock; 5050 pv_entry_t pv; 5051 int loops = 0; 5052 boolean_t rv; 5053 5054 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5055 ("pmap_page_exists_quick: page %p is not managed", m)); 5056 rv = FALSE; 5057 rw_rlock(&pvh_global_lock); 5058 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5059 rw_rlock(lock); 5060 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5061 if (PV_PMAP(pv) == pmap) { 5062 rv = TRUE; 5063 break; 5064 } 5065 loops++; 5066 if (loops >= 16) 5067 break; 5068 } 5069 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 5070 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5071 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5072 if (PV_PMAP(pv) == pmap) { 5073 rv = TRUE; 5074 break; 5075 } 5076 loops++; 5077 if (loops >= 16) 5078 break; 5079 } 5080 } 5081 rw_runlock(lock); 5082 rw_runlock(&pvh_global_lock); 5083 return (rv); 5084} 5085 5086/* 5087 * pmap_page_wired_mappings: 5088 * 5089 * Return the number of managed mappings to the given physical page 5090 * that are wired. 5091 */ 5092int 5093pmap_page_wired_mappings(vm_page_t m) 5094{ 5095 struct rwlock *lock; 5096 struct md_page *pvh; 5097 pmap_t pmap; 5098 pt_entry_t *pte; 5099 pv_entry_t pv; 5100 int count, md_gen, pvh_gen; 5101 5102 if ((m->oflags & VPO_UNMANAGED) != 0) 5103 return (0); 5104 rw_rlock(&pvh_global_lock); 5105 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5106 rw_rlock(lock); 5107restart: 5108 count = 0; 5109 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5110 pmap = PV_PMAP(pv); 5111 if (!PMAP_TRYLOCK(pmap)) { 5112 md_gen = m->md.pv_gen; 5113 rw_runlock(lock); 5114 PMAP_LOCK(pmap); 5115 rw_rlock(lock); 5116 if (md_gen != m->md.pv_gen) { 5117 PMAP_UNLOCK(pmap); 5118 goto restart; 5119 } 5120 } 5121 pte = pmap_pte(pmap, pv->pv_va); 5122 if ((*pte & PG_W) != 0) 5123 count++; 5124 PMAP_UNLOCK(pmap); 5125 } 5126 if ((m->flags & PG_FICTITIOUS) == 0) { 5127 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5128 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5129 pmap = PV_PMAP(pv); 5130 if (!PMAP_TRYLOCK(pmap)) { 5131 md_gen = m->md.pv_gen; 5132 pvh_gen = pvh->pv_gen; 5133 rw_runlock(lock); 5134 PMAP_LOCK(pmap); 5135 rw_rlock(lock); 5136 if (md_gen != m->md.pv_gen || 5137 pvh_gen != pvh->pv_gen) { 5138 PMAP_UNLOCK(pmap); 5139 goto restart; 5140 } 5141 } 5142 pte = pmap_pde(pmap, pv->pv_va); 5143 if ((*pte & PG_W) != 0) 5144 count++; 5145 PMAP_UNLOCK(pmap); 5146 } 5147 } 5148 rw_runlock(lock); 5149 rw_runlock(&pvh_global_lock); 5150 return (count); 5151} 5152 5153/* 5154 * Returns TRUE if the given page is mapped individually or as part of 5155 * a 2mpage. Otherwise, returns FALSE. 5156 */ 5157boolean_t 5158pmap_page_is_mapped(vm_page_t m) 5159{ 5160 struct rwlock *lock; 5161 boolean_t rv; 5162 5163 if ((m->oflags & VPO_UNMANAGED) != 0) 5164 return (FALSE); 5165 rw_rlock(&pvh_global_lock); 5166 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5167 rw_rlock(lock); 5168 rv = !TAILQ_EMPTY(&m->md.pv_list) || 5169 ((m->flags & PG_FICTITIOUS) == 0 && 5170 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 5171 rw_runlock(lock); 5172 rw_runlock(&pvh_global_lock); 5173 return (rv); 5174} 5175 5176/* 5177 * Destroy all managed, non-wired mappings in the given user-space 5178 * pmap. This pmap cannot be active on any processor besides the 5179 * caller. 5180 * 5181 * This function cannot be applied to the kernel pmap. Moreover, it 5182 * is not intended for general use. It is only to be used during 5183 * process termination. Consequently, it can be implemented in ways 5184 * that make it faster than pmap_remove(). First, it can more quickly 5185 * destroy mappings by iterating over the pmap's collection of PV 5186 * entries, rather than searching the page table. Second, it doesn't 5187 * have to test and clear the page table entries atomically, because 5188 * no processor is currently accessing the user address space. In 5189 * particular, a page table entry's dirty bit won't change state once 5190 * this function starts. 5191 */ 5192void 5193pmap_remove_pages(pmap_t pmap) 5194{ 5195 pd_entry_t ptepde; 5196 pt_entry_t *pte, tpte; 5197 pt_entry_t PG_M, PG_RW, PG_V; 5198 struct spglist free; 5199 vm_page_t m, mpte, mt; 5200 pv_entry_t pv; 5201 struct md_page *pvh; 5202 struct pv_chunk *pc, *npc; 5203 struct rwlock *lock; 5204 int64_t bit; 5205 uint64_t inuse, bitmask; 5206 int allfree, field, freed, idx; 5207 boolean_t superpage; 5208 vm_paddr_t pa; 5209 5210 /* 5211 * Assert that the given pmap is only active on the current 5212 * CPU. Unfortunately, we cannot block another CPU from 5213 * activating the pmap while this function is executing. 5214 */ 5215 KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); 5216#ifdef INVARIANTS 5217 { 5218 cpuset_t other_cpus; 5219 5220 other_cpus = all_cpus; 5221 critical_enter(); 5222 CPU_CLR(PCPU_GET(cpuid), &other_cpus); 5223 CPU_AND(&other_cpus, &pmap->pm_active); 5224 critical_exit(); 5225 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); 5226 } 5227#endif 5228 5229 lock = NULL; 5230 PG_M = pmap_modified_bit(pmap); 5231 PG_V = pmap_valid_bit(pmap); 5232 PG_RW = pmap_rw_bit(pmap); 5233 5234 SLIST_INIT(&free); 5235 rw_rlock(&pvh_global_lock); 5236 PMAP_LOCK(pmap); 5237 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 5238 allfree = 1; 5239 freed = 0; 5240 for (field = 0; field < _NPCM; field++) { 5241 inuse = ~pc->pc_map[field] & pc_freemask[field]; 5242 while (inuse != 0) { 5243 bit = bsfq(inuse); 5244 bitmask = 1UL << bit; 5245 idx = field * 64 + bit; 5246 pv = &pc->pc_pventry[idx]; 5247 inuse &= ~bitmask; 5248 5249 pte = pmap_pdpe(pmap, pv->pv_va); 5250 ptepde = *pte; 5251 pte = pmap_pdpe_to_pde(pte, pv->pv_va); 5252 tpte = *pte; 5253 if ((tpte & (PG_PS | PG_V)) == PG_V) { 5254 superpage = FALSE; 5255 ptepde = tpte; 5256 pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & 5257 PG_FRAME); 5258 pte = &pte[pmap_pte_index(pv->pv_va)]; 5259 tpte = *pte; 5260 } else { 5261 /* 5262 * Keep track whether 'tpte' is a 5263 * superpage explicitly instead of 5264 * relying on PG_PS being set. 5265 * 5266 * This is because PG_PS is numerically 5267 * identical to PG_PTE_PAT and thus a 5268 * regular page could be mistaken for 5269 * a superpage. 5270 */ 5271 superpage = TRUE; 5272 } 5273 5274 if ((tpte & PG_V) == 0) { 5275 panic("bad pte va %lx pte %lx", 5276 pv->pv_va, tpte); 5277 } 5278 5279/* 5280 * We cannot remove wired pages from a process' mapping at this time 5281 */ 5282 if (tpte & PG_W) { 5283 allfree = 0; 5284 continue; 5285 } 5286 5287 if (superpage) 5288 pa = tpte & PG_PS_FRAME; 5289 else 5290 pa = tpte & PG_FRAME; 5291 5292 m = PHYS_TO_VM_PAGE(pa); 5293 KASSERT(m->phys_addr == pa, 5294 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 5295 m, (uintmax_t)m->phys_addr, 5296 (uintmax_t)tpte)); 5297 5298 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 5299 m < &vm_page_array[vm_page_array_size], 5300 ("pmap_remove_pages: bad tpte %#jx", 5301 (uintmax_t)tpte)); 5302 5303 pte_clear(pte); 5304 5305 /* 5306 * Update the vm_page_t clean/reference bits. 5307 */ 5308 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5309 if (superpage) { 5310 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 5311 vm_page_dirty(mt); 5312 } else 5313 vm_page_dirty(m); 5314 } 5315 5316 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); 5317 5318 /* Mark free */ 5319 pc->pc_map[field] |= bitmask; 5320 if (superpage) { 5321 pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); 5322 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 5323 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5324 pvh->pv_gen++; 5325 if (TAILQ_EMPTY(&pvh->pv_list)) { 5326 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 5327 if ((mt->aflags & PGA_WRITEABLE) != 0 && 5328 TAILQ_EMPTY(&mt->md.pv_list)) 5329 vm_page_aflag_clear(mt, PGA_WRITEABLE); 5330 } 5331 mpte = pmap_lookup_pt_page(pmap, pv->pv_va); 5332 if (mpte != NULL) { 5333 pmap_remove_pt_page(pmap, mpte); 5334 pmap_resident_count_dec(pmap, 1); 5335 KASSERT(mpte->wire_count == NPTEPG, 5336 ("pmap_remove_pages: pte page wire count error")); 5337 mpte->wire_count = 0; 5338 pmap_add_delayed_free_list(mpte, &free, FALSE); 5339 atomic_subtract_int(&cnt.v_wire_count, 1); 5340 } 5341 } else { 5342 pmap_resident_count_dec(pmap, 1); 5343 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5344 m->md.pv_gen++; 5345 if ((m->aflags & PGA_WRITEABLE) != 0 && 5346 TAILQ_EMPTY(&m->md.pv_list) && 5347 (m->flags & PG_FICTITIOUS) == 0) { 5348 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5349 if (TAILQ_EMPTY(&pvh->pv_list)) 5350 vm_page_aflag_clear(m, PGA_WRITEABLE); 5351 } 5352 } 5353 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); 5354 freed++; 5355 } 5356 } 5357 PV_STAT(atomic_add_long(&pv_entry_frees, freed)); 5358 PV_STAT(atomic_add_int(&pv_entry_spare, freed)); 5359 PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); 5360 if (allfree) { 5361 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 5362 free_pv_chunk(pc); 5363 } 5364 } 5365 if (lock != NULL) 5366 rw_wunlock(lock); 5367 pmap_invalidate_all(pmap); 5368 rw_runlock(&pvh_global_lock); 5369 PMAP_UNLOCK(pmap); 5370 pmap_free_zero_pages(&free); 5371} 5372 5373static boolean_t 5374pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) 5375{ 5376 struct rwlock *lock; 5377 pv_entry_t pv; 5378 struct md_page *pvh; 5379 pt_entry_t *pte, mask; 5380 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 5381 pmap_t pmap; 5382 int md_gen, pvh_gen; 5383 boolean_t rv; 5384 5385 rv = FALSE; 5386 rw_rlock(&pvh_global_lock); 5387 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5388 rw_rlock(lock); 5389restart: 5390 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5391 pmap = PV_PMAP(pv); 5392 if (!PMAP_TRYLOCK(pmap)) { 5393 md_gen = m->md.pv_gen; 5394 rw_runlock(lock); 5395 PMAP_LOCK(pmap); 5396 rw_rlock(lock); 5397 if (md_gen != m->md.pv_gen) { 5398 PMAP_UNLOCK(pmap); 5399 goto restart; 5400 } 5401 } 5402 pte = pmap_pte(pmap, pv->pv_va); 5403 mask = 0; 5404 if (modified) { 5405 PG_M = pmap_modified_bit(pmap); 5406 PG_RW = pmap_rw_bit(pmap); 5407 mask |= PG_RW | PG_M; 5408 } 5409 if (accessed) { 5410 PG_A = pmap_accessed_bit(pmap); 5411 PG_V = pmap_valid_bit(pmap); 5412 mask |= PG_V | PG_A; 5413 } 5414 rv = (*pte & mask) == mask; 5415 PMAP_UNLOCK(pmap); 5416 if (rv) 5417 goto out; 5418 } 5419 if ((m->flags & PG_FICTITIOUS) == 0) { 5420 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5421 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 5422 pmap = PV_PMAP(pv); 5423 if (!PMAP_TRYLOCK(pmap)) { 5424 md_gen = m->md.pv_gen; 5425 pvh_gen = pvh->pv_gen; 5426 rw_runlock(lock); 5427 PMAP_LOCK(pmap); 5428 rw_rlock(lock); 5429 if (md_gen != m->md.pv_gen || 5430 pvh_gen != pvh->pv_gen) { 5431 PMAP_UNLOCK(pmap); 5432 goto restart; 5433 } 5434 } 5435 pte = pmap_pde(pmap, pv->pv_va); 5436 mask = 0; 5437 if (modified) { 5438 PG_M = pmap_modified_bit(pmap); 5439 PG_RW = pmap_rw_bit(pmap); 5440 mask |= PG_RW | PG_M; 5441 } 5442 if (accessed) { 5443 PG_A = pmap_accessed_bit(pmap); 5444 PG_V = pmap_valid_bit(pmap); 5445 mask |= PG_V | PG_A; 5446 } 5447 rv = (*pte & mask) == mask; 5448 PMAP_UNLOCK(pmap); 5449 if (rv) 5450 goto out; 5451 } 5452 } 5453out: 5454 rw_runlock(lock); 5455 rw_runlock(&pvh_global_lock); 5456 return (rv); 5457} 5458 5459/* 5460 * pmap_is_modified: 5461 * 5462 * Return whether or not the specified physical page was modified 5463 * in any physical maps. 5464 */ 5465boolean_t 5466pmap_is_modified(vm_page_t m) 5467{ 5468 5469 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5470 ("pmap_is_modified: page %p is not managed", m)); 5471 5472 /* 5473 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 5474 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE 5475 * is clear, no PTEs can have PG_M set. 5476 */ 5477 VM_OBJECT_ASSERT_WLOCKED(m->object); 5478 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 5479 return (FALSE); 5480 return (pmap_page_test_mappings(m, FALSE, TRUE)); 5481} 5482 5483/* 5484 * pmap_is_prefaultable: 5485 * 5486 * Return whether or not the specified virtual address is eligible 5487 * for prefault. 5488 */ 5489boolean_t 5490pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 5491{ 5492 pd_entry_t *pde; 5493 pt_entry_t *pte, PG_V; 5494 boolean_t rv; 5495 5496 PG_V = pmap_valid_bit(pmap); 5497 rv = FALSE; 5498 PMAP_LOCK(pmap); 5499 pde = pmap_pde(pmap, addr); 5500 if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { 5501 pte = pmap_pde_to_pte(pde, addr); 5502 rv = (*pte & PG_V) == 0; 5503 } 5504 PMAP_UNLOCK(pmap); 5505 return (rv); 5506} 5507 5508/* 5509 * pmap_is_referenced: 5510 * 5511 * Return whether or not the specified physical page was referenced 5512 * in any physical maps. 5513 */ 5514boolean_t 5515pmap_is_referenced(vm_page_t m) 5516{ 5517 5518 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5519 ("pmap_is_referenced: page %p is not managed", m)); 5520 return (pmap_page_test_mappings(m, TRUE, FALSE)); 5521} 5522 5523/* 5524 * Clear the write and modified bits in each of the given page's mappings. 5525 */ 5526void 5527pmap_remove_write(vm_page_t m) 5528{ 5529 struct md_page *pvh; 5530 pmap_t pmap; 5531 struct rwlock *lock; 5532 pv_entry_t next_pv, pv; 5533 pd_entry_t *pde; 5534 pt_entry_t oldpte, *pte, PG_M, PG_RW; 5535 vm_offset_t va; 5536 int pvh_gen, md_gen; 5537 5538 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5539 ("pmap_remove_write: page %p is not managed", m)); 5540 5541 /* 5542 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 5543 * set by another thread while the object is locked. Thus, 5544 * if PGA_WRITEABLE is clear, no page table entries need updating. 5545 */ 5546 VM_OBJECT_ASSERT_WLOCKED(m->object); 5547 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 5548 return; 5549 rw_rlock(&pvh_global_lock); 5550 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 5551 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5552retry_pv_loop: 5553 rw_wlock(lock); 5554 if ((m->flags & PG_FICTITIOUS) != 0) 5555 goto small_mappings; 5556 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5557 pmap = PV_PMAP(pv); 5558 if (!PMAP_TRYLOCK(pmap)) { 5559 pvh_gen = pvh->pv_gen; 5560 rw_wunlock(lock); 5561 PMAP_LOCK(pmap); 5562 rw_wlock(lock); 5563 if (pvh_gen != pvh->pv_gen) { 5564 PMAP_UNLOCK(pmap); 5565 rw_wunlock(lock); 5566 goto retry_pv_loop; 5567 } 5568 } 5569 PG_RW = pmap_rw_bit(pmap); 5570 va = pv->pv_va; 5571 pde = pmap_pde(pmap, va); 5572 if ((*pde & PG_RW) != 0) 5573 (void)pmap_demote_pde_locked(pmap, pde, va, &lock); 5574 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 5575 ("inconsistent pv lock %p %p for page %p", 5576 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 5577 PMAP_UNLOCK(pmap); 5578 } 5579small_mappings: 5580 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5581 pmap = PV_PMAP(pv); 5582 if (!PMAP_TRYLOCK(pmap)) { 5583 pvh_gen = pvh->pv_gen; 5584 md_gen = m->md.pv_gen; 5585 rw_wunlock(lock); 5586 PMAP_LOCK(pmap); 5587 rw_wlock(lock); 5588 if (pvh_gen != pvh->pv_gen || 5589 md_gen != m->md.pv_gen) { 5590 PMAP_UNLOCK(pmap); 5591 rw_wunlock(lock); 5592 goto retry_pv_loop; 5593 } 5594 } 5595 PG_M = pmap_modified_bit(pmap); 5596 PG_RW = pmap_rw_bit(pmap); 5597 pde = pmap_pde(pmap, pv->pv_va); 5598 KASSERT((*pde & PG_PS) == 0, 5599 ("pmap_remove_write: found a 2mpage in page %p's pv list", 5600 m)); 5601 pte = pmap_pde_to_pte(pde, pv->pv_va); 5602retry: 5603 oldpte = *pte; 5604 if (oldpte & PG_RW) { 5605 if (!atomic_cmpset_long(pte, oldpte, oldpte & 5606 ~(PG_RW | PG_M))) 5607 goto retry; 5608 if ((oldpte & PG_M) != 0) 5609 vm_page_dirty(m); 5610 pmap_invalidate_page(pmap, pv->pv_va); 5611 } 5612 PMAP_UNLOCK(pmap); 5613 } 5614 rw_wunlock(lock); 5615 vm_page_aflag_clear(m, PGA_WRITEABLE); 5616 rw_runlock(&pvh_global_lock); 5617} 5618 5619static __inline boolean_t 5620safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) 5621{ 5622 5623 if (!pmap_emulate_ad_bits(pmap)) 5624 return (TRUE); 5625 5626 KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); 5627 5628 /* 5629 * RWX = 010 or 110 will cause an unconditional EPT misconfiguration 5630 * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared 5631 * if the EPT_PG_WRITE bit is set. 5632 */ 5633 if ((pte & EPT_PG_WRITE) != 0) 5634 return (FALSE); 5635 5636 /* 5637 * RWX = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. 5638 */ 5639 if ((pte & EPT_PG_EXECUTE) == 0 || 5640 ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) 5641 return (TRUE); 5642 else 5643 return (FALSE); 5644} 5645 5646#define PMAP_TS_REFERENCED_MAX 5 5647 5648/* 5649 * pmap_ts_referenced: 5650 * 5651 * Return a count of reference bits for a page, clearing those bits. 5652 * It is not necessary for every reference bit to be cleared, but it 5653 * is necessary that 0 only be returned when there are truly no 5654 * reference bits set. 5655 * 5656 * XXX: The exact number of bits to check and clear is a matter that 5657 * should be tested and standardized at some point in the future for 5658 * optimal aging of shared pages. 5659 */ 5660int 5661pmap_ts_referenced(vm_page_t m) 5662{ 5663 struct md_page *pvh; 5664 pv_entry_t pv, pvf; 5665 pmap_t pmap; 5666 struct rwlock *lock; 5667 pd_entry_t oldpde, *pde; 5668 pt_entry_t *pte, PG_A; 5669 vm_offset_t va; 5670 vm_paddr_t pa; 5671 int cleared, md_gen, not_cleared, pvh_gen; 5672 struct spglist free; 5673 boolean_t demoted; 5674 5675 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5676 ("pmap_ts_referenced: page %p is not managed", m)); 5677 SLIST_INIT(&free); 5678 cleared = 0; 5679 pa = VM_PAGE_TO_PHYS(m); 5680 lock = PHYS_TO_PV_LIST_LOCK(pa); 5681 pvh = pa_to_pvh(pa); 5682 rw_rlock(&pvh_global_lock); 5683 rw_wlock(lock); 5684retry: 5685 not_cleared = 0; 5686 if ((m->flags & PG_FICTITIOUS) != 0 || 5687 (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 5688 goto small_mappings; 5689 pv = pvf; 5690 do { 5691 if (pvf == NULL) 5692 pvf = pv; 5693 pmap = PV_PMAP(pv); 5694 if (!PMAP_TRYLOCK(pmap)) { 5695 pvh_gen = pvh->pv_gen; 5696 rw_wunlock(lock); 5697 PMAP_LOCK(pmap); 5698 rw_wlock(lock); 5699 if (pvh_gen != pvh->pv_gen) { 5700 PMAP_UNLOCK(pmap); 5701 goto retry; 5702 } 5703 } 5704 PG_A = pmap_accessed_bit(pmap); 5705 va = pv->pv_va; 5706 pde = pmap_pde(pmap, pv->pv_va); 5707 oldpde = *pde; 5708 if ((*pde & PG_A) != 0) { 5709 /* 5710 * Since this reference bit is shared by 512 4KB 5711 * pages, it should not be cleared every time it is 5712 * tested. Apply a simple "hash" function on the 5713 * physical page number, the virtual superpage number, 5714 * and the pmap address to select one 4KB page out of 5715 * the 512 on which testing the reference bit will 5716 * result in clearing that reference bit. This 5717 * function is designed to avoid the selection of the 5718 * same 4KB page for every 2MB page mapping. 5719 * 5720 * On demotion, a mapping that hasn't been referenced 5721 * is simply destroyed. To avoid the possibility of a 5722 * subsequent page fault on a demoted wired mapping, 5723 * always leave its reference bit set. Moreover, 5724 * since the superpage is wired, the current state of 5725 * its reference bit won't affect page replacement. 5726 */ 5727 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 5728 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 5729 (*pde & PG_W) == 0) { 5730 if (safe_to_clear_referenced(pmap, oldpde)) { 5731 atomic_clear_long(pde, PG_A); 5732 pmap_invalidate_page(pmap, pv->pv_va); 5733 demoted = FALSE; 5734 } else if (pmap_demote_pde_locked(pmap, pde, 5735 pv->pv_va, &lock)) { 5736 /* 5737 * Remove the mapping to a single page 5738 * so that a subsequent access may 5739 * repromote. Since the underlying 5740 * page table page is fully populated, 5741 * this removal never frees a page 5742 * table page. 5743 */ 5744 demoted = TRUE; 5745 va += VM_PAGE_TO_PHYS(m) - (oldpde & 5746 PG_PS_FRAME); 5747 pte = pmap_pde_to_pte(pde, va); 5748 pmap_remove_pte(pmap, pte, va, *pde, 5749 NULL, &lock); 5750 pmap_invalidate_page(pmap, va); 5751 } else 5752 demoted = TRUE; 5753 5754 if (demoted) { 5755 /* 5756 * The superpage mapping was removed 5757 * entirely and therefore 'pv' is no 5758 * longer valid. 5759 */ 5760 if (pvf == pv) 5761 pvf = NULL; 5762 pv = NULL; 5763 } 5764 cleared++; 5765 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 5766 ("inconsistent pv lock %p %p for page %p", 5767 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 5768 } else 5769 not_cleared++; 5770 } 5771 PMAP_UNLOCK(pmap); 5772 /* Rotate the PV list if it has more than one entry. */ 5773 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 5774 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 5775 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 5776 pvh->pv_gen++; 5777 } 5778 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) 5779 goto out; 5780 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 5781small_mappings: 5782 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 5783 goto out; 5784 pv = pvf; 5785 do { 5786 if (pvf == NULL) 5787 pvf = pv; 5788 pmap = PV_PMAP(pv); 5789 if (!PMAP_TRYLOCK(pmap)) { 5790 pvh_gen = pvh->pv_gen; 5791 md_gen = m->md.pv_gen; 5792 rw_wunlock(lock); 5793 PMAP_LOCK(pmap); 5794 rw_wlock(lock); 5795 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 5796 PMAP_UNLOCK(pmap); 5797 goto retry; 5798 } 5799 } 5800 PG_A = pmap_accessed_bit(pmap); 5801 pde = pmap_pde(pmap, pv->pv_va); 5802 KASSERT((*pde & PG_PS) == 0, 5803 ("pmap_ts_referenced: found a 2mpage in page %p's pv list", 5804 m)); 5805 pte = pmap_pde_to_pte(pde, pv->pv_va); 5806 if ((*pte & PG_A) != 0) { 5807 if (safe_to_clear_referenced(pmap, *pte)) { 5808 atomic_clear_long(pte, PG_A); 5809 pmap_invalidate_page(pmap, pv->pv_va); 5810 cleared++; 5811 } else if ((*pte & PG_W) == 0) { 5812 /* 5813 * Wired pages cannot be paged out so 5814 * doing accessed bit emulation for 5815 * them is wasted effort. We do the 5816 * hard work for unwired pages only. 5817 */ 5818 pmap_remove_pte(pmap, pte, pv->pv_va, 5819 *pde, &free, &lock); 5820 pmap_invalidate_page(pmap, pv->pv_va); 5821 cleared++; 5822 if (pvf == pv) 5823 pvf = NULL; 5824 pv = NULL; 5825 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), 5826 ("inconsistent pv lock %p %p for page %p", 5827 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); 5828 } else 5829 not_cleared++; 5830 } 5831 PMAP_UNLOCK(pmap); 5832 /* Rotate the PV list if it has more than one entry. */ 5833 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { 5834 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 5835 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 5836 m->md.pv_gen++; 5837 } 5838 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + 5839 not_cleared < PMAP_TS_REFERENCED_MAX); 5840out: 5841 rw_wunlock(lock); 5842 rw_runlock(&pvh_global_lock); 5843 pmap_free_zero_pages(&free); 5844 return (cleared + not_cleared); 5845} 5846 5847/* 5848 * Apply the given advice to the specified range of addresses within the 5849 * given pmap. Depending on the advice, clear the referenced and/or 5850 * modified flags in each mapping and set the mapped page's dirty field. 5851 */ 5852void 5853pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 5854{ 5855 struct rwlock *lock; 5856 pml4_entry_t *pml4e; 5857 pdp_entry_t *pdpe; 5858 pd_entry_t oldpde, *pde; 5859 pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; 5860 vm_offset_t va_next; 5861 vm_page_t m; 5862 boolean_t anychanged, pv_lists_locked; 5863 5864 if (advice != MADV_DONTNEED && advice != MADV_FREE) 5865 return; 5866 5867 /* 5868 * A/D bit emulation requires an alternate code path when clearing 5869 * the modified and accessed bits below. Since this function is 5870 * advisory in nature we skip it entirely for pmaps that require 5871 * A/D bit emulation. 5872 */ 5873 if (pmap_emulate_ad_bits(pmap)) 5874 return; 5875 5876 PG_A = pmap_accessed_bit(pmap); 5877 PG_G = pmap_global_bit(pmap); 5878 PG_M = pmap_modified_bit(pmap); 5879 PG_V = pmap_valid_bit(pmap); 5880 PG_RW = pmap_rw_bit(pmap); 5881 5882 pv_lists_locked = FALSE; 5883resume: 5884 anychanged = FALSE; 5885 PMAP_LOCK(pmap); 5886 for (; sva < eva; sva = va_next) { 5887 pml4e = pmap_pml4e(pmap, sva); 5888 if ((*pml4e & PG_V) == 0) { 5889 va_next = (sva + NBPML4) & ~PML4MASK; 5890 if (va_next < sva) 5891 va_next = eva; 5892 continue; 5893 } 5894 pdpe = pmap_pml4e_to_pdpe(pml4e, sva); 5895 if ((*pdpe & PG_V) == 0) { 5896 va_next = (sva + NBPDP) & ~PDPMASK; 5897 if (va_next < sva) 5898 va_next = eva; 5899 continue; 5900 } 5901 va_next = (sva + NBPDR) & ~PDRMASK; 5902 if (va_next < sva) 5903 va_next = eva; 5904 pde = pmap_pdpe_to_pde(pdpe, sva); 5905 oldpde = *pde; 5906 if ((oldpde & PG_V) == 0) 5907 continue; 5908 else if ((oldpde & PG_PS) != 0) { 5909 if ((oldpde & PG_MANAGED) == 0) 5910 continue; 5911 if (!pv_lists_locked) { 5912 pv_lists_locked = TRUE; 5913 if (!rw_try_rlock(&pvh_global_lock)) { 5914 if (anychanged) 5915 pmap_invalidate_all(pmap); 5916 PMAP_UNLOCK(pmap); 5917 rw_rlock(&pvh_global_lock); 5918 goto resume; 5919 } 5920 } 5921 lock = NULL; 5922 if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { 5923 if (lock != NULL) 5924 rw_wunlock(lock); 5925 5926 /* 5927 * The large page mapping was destroyed. 5928 */ 5929 continue; 5930 } 5931 5932 /* 5933 * Unless the page mappings are wired, remove the 5934 * mapping to a single page so that a subsequent 5935 * access may repromote. Since the underlying page 5936 * table page is fully populated, this removal never 5937 * frees a page table page. 5938 */ 5939 if ((oldpde & PG_W) == 0) { 5940 pte = pmap_pde_to_pte(pde, sva); 5941 KASSERT((*pte & PG_V) != 0, 5942 ("pmap_advise: invalid PTE")); 5943 pmap_remove_pte(pmap, pte, sva, *pde, NULL, 5944 &lock); 5945 anychanged = TRUE; 5946 } 5947 if (lock != NULL) 5948 rw_wunlock(lock); 5949 } 5950 if (va_next > eva) 5951 va_next = eva; 5952 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, 5953 sva += PAGE_SIZE) { 5954 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | 5955 PG_V)) 5956 continue; 5957 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5958 if (advice == MADV_DONTNEED) { 5959 /* 5960 * Future calls to pmap_is_modified() 5961 * can be avoided by making the page 5962 * dirty now. 5963 */ 5964 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 5965 vm_page_dirty(m); 5966 } 5967 atomic_clear_long(pte, PG_M | PG_A); 5968 } else if ((*pte & PG_A) != 0) 5969 atomic_clear_long(pte, PG_A); 5970 else 5971 continue; 5972 if ((*pte & PG_G) != 0) 5973 pmap_invalidate_page(pmap, sva); 5974 else 5975 anychanged = TRUE; 5976 } 5977 } 5978 if (anychanged) 5979 pmap_invalidate_all(pmap); 5980 if (pv_lists_locked) 5981 rw_runlock(&pvh_global_lock); 5982 PMAP_UNLOCK(pmap); 5983} 5984 5985/* 5986 * Clear the modify bits on the specified physical page. 5987 */ 5988void 5989pmap_clear_modify(vm_page_t m) 5990{ 5991 struct md_page *pvh; 5992 pmap_t pmap; 5993 pv_entry_t next_pv, pv; 5994 pd_entry_t oldpde, *pde; 5995 pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V; 5996 struct rwlock *lock; 5997 vm_offset_t va; 5998 int md_gen, pvh_gen; 5999 6000 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 6001 ("pmap_clear_modify: page %p is not managed", m)); 6002 VM_OBJECT_ASSERT_WLOCKED(m->object); 6003 KASSERT(!vm_page_xbusied(m), 6004 ("pmap_clear_modify: page %p is exclusive busied", m)); 6005 6006 /* 6007 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 6008 * If the object containing the page is locked and the page is not 6009 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 6010 */ 6011 if ((m->aflags & PGA_WRITEABLE) == 0) 6012 return; 6013 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 6014 rw_rlock(&pvh_global_lock); 6015 lock = VM_PAGE_TO_PV_LIST_LOCK(m); 6016 rw_wlock(lock); 6017restart: 6018 if ((m->flags & PG_FICTITIOUS) != 0) 6019 goto small_mappings; 6020 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 6021 pmap = PV_PMAP(pv); 6022 if (!PMAP_TRYLOCK(pmap)) { 6023 pvh_gen = pvh->pv_gen; 6024 rw_wunlock(lock); 6025 PMAP_LOCK(pmap); 6026 rw_wlock(lock); 6027 if (pvh_gen != pvh->pv_gen) { 6028 PMAP_UNLOCK(pmap); 6029 goto restart; 6030 } 6031 } 6032 PG_M = pmap_modified_bit(pmap); 6033 PG_V = pmap_valid_bit(pmap); 6034 PG_RW = pmap_rw_bit(pmap); 6035 va = pv->pv_va; 6036 pde = pmap_pde(pmap, va); 6037 oldpde = *pde; 6038 if ((oldpde & PG_RW) != 0) { 6039 if (pmap_demote_pde_locked(pmap, pde, va, &lock)) { 6040 if ((oldpde & PG_W) == 0) { 6041 /* 6042 * Write protect the mapping to a 6043 * single page so that a subsequent 6044 * write access may repromote. 6045 */ 6046 va += VM_PAGE_TO_PHYS(m) - (oldpde & 6047 PG_PS_FRAME); 6048 pte = pmap_pde_to_pte(pde, va); 6049 oldpte = *pte; 6050 if ((oldpte & PG_V) != 0) { 6051 while (!atomic_cmpset_long(pte, 6052 oldpte, 6053 oldpte & ~(PG_M | PG_RW))) 6054 oldpte = *pte; 6055 vm_page_dirty(m); 6056 pmap_invalidate_page(pmap, va); 6057 } 6058 } 6059 } 6060 } 6061 PMAP_UNLOCK(pmap); 6062 } 6063small_mappings: 6064 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 6065 pmap = PV_PMAP(pv); 6066 if (!PMAP_TRYLOCK(pmap)) { 6067 md_gen = m->md.pv_gen; 6068 pvh_gen = pvh->pv_gen; 6069 rw_wunlock(lock); 6070 PMAP_LOCK(pmap); 6071 rw_wlock(lock); 6072 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { 6073 PMAP_UNLOCK(pmap); 6074 goto restart; 6075 } 6076 } 6077 PG_M = pmap_modified_bit(pmap); 6078 PG_RW = pmap_rw_bit(pmap); 6079 pde = pmap_pde(pmap, pv->pv_va); 6080 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 6081 " a 2mpage in page %p's pv list", m)); 6082 pte = pmap_pde_to_pte(pde, pv->pv_va); 6083 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 6084 atomic_clear_long(pte, PG_M); 6085 pmap_invalidate_page(pmap, pv->pv_va); 6086 } 6087 PMAP_UNLOCK(pmap); 6088 } 6089 rw_wunlock(lock); 6090 rw_runlock(&pvh_global_lock); 6091} 6092 6093/* 6094 * Miscellaneous support routines follow 6095 */ 6096 6097/* Adjust the cache mode for a 4KB page mapped via a PTE. */ 6098static __inline void 6099pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask) 6100{ 6101 u_int opte, npte; 6102 6103 /* 6104 * The cache mode bits are all in the low 32-bits of the 6105 * PTE, so we can just spin on updating the low 32-bits. 6106 */ 6107 do { 6108 opte = *(u_int *)pte; 6109 npte = opte & ~mask; 6110 npte |= cache_bits; 6111 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); 6112} 6113 6114/* Adjust the cache mode for a 2MB page mapped via a PDE. */ 6115static __inline void 6116pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask) 6117{ 6118 u_int opde, npde; 6119 6120 /* 6121 * The cache mode bits are all in the low 32-bits of the 6122 * PDE, so we can just spin on updating the low 32-bits. 6123 */ 6124 do { 6125 opde = *(u_int *)pde; 6126 npde = opde & ~mask; 6127 npde |= cache_bits; 6128 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); 6129} 6130 6131/* 6132 * Map a set of physical memory pages into the kernel virtual 6133 * address space. Return a pointer to where it is mapped. This 6134 * routine is intended to be used for mapping device memory, 6135 * NOT real memory. 6136 */ 6137void * 6138pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 6139{ 6140 vm_offset_t va, offset; 6141 vm_size_t tmpsize; 6142 6143 /* 6144 * If the specified range of physical addresses fits within the direct 6145 * map window, use the direct map. 6146 */ 6147 if (pa < dmaplimit && pa + size < dmaplimit) { 6148 va = PHYS_TO_DMAP(pa); 6149 if (!pmap_change_attr(va, size, mode)) 6150 return ((void *)va); 6151 } 6152 offset = pa & PAGE_MASK; 6153 size = round_page(offset + size); 6154 va = kva_alloc(size); 6155 if (!va) 6156 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 6157 pa = trunc_page(pa); 6158 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 6159 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 6160 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 6161 pmap_invalidate_cache_range(va, va + tmpsize); 6162 return ((void *)(va + offset)); 6163} 6164 6165void * 6166pmap_mapdev(vm_paddr_t pa, vm_size_t size) 6167{ 6168 6169 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 6170} 6171 6172void * 6173pmap_mapbios(vm_paddr_t pa, vm_size_t size) 6174{ 6175 6176 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 6177} 6178 6179void 6180pmap_unmapdev(vm_offset_t va, vm_size_t size) 6181{ 6182 vm_offset_t base, offset; 6183 6184 /* If we gave a direct map region in pmap_mapdev, do nothing */ 6185 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) 6186 return; 6187 base = trunc_page(va); 6188 offset = va & PAGE_MASK; 6189 size = round_page(offset + size); 6190 kva_free(base, size); 6191} 6192 6193/* 6194 * Tries to demote a 1GB page mapping. 6195 */ 6196static boolean_t 6197pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) 6198{ 6199 pdp_entry_t newpdpe, oldpdpe; 6200 pd_entry_t *firstpde, newpde, *pde; 6201 pt_entry_t PG_A, PG_M, PG_RW, PG_V; 6202 vm_paddr_t mpdepa; 6203 vm_page_t mpde; 6204 6205 PG_A = pmap_accessed_bit(pmap); 6206 PG_M = pmap_modified_bit(pmap); 6207 PG_V = pmap_valid_bit(pmap); 6208 PG_RW = pmap_rw_bit(pmap); 6209 6210 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 6211 oldpdpe = *pdpe; 6212 KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), 6213 ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); 6214 if ((mpde = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT | 6215 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 6216 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" 6217 " in pmap %p", va, pmap); 6218 return (FALSE); 6219 } 6220 mpdepa = VM_PAGE_TO_PHYS(mpde); 6221 firstpde = (pd_entry_t *)PHYS_TO_DMAP(mpdepa); 6222 newpdpe = mpdepa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; 6223 KASSERT((oldpdpe & PG_A) != 0, 6224 ("pmap_demote_pdpe: oldpdpe is missing PG_A")); 6225 KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, 6226 ("pmap_demote_pdpe: oldpdpe is missing PG_M")); 6227 newpde = oldpdpe; 6228 6229 /* 6230 * Initialize the page directory page. 6231 */ 6232 for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { 6233 *pde = newpde; 6234 newpde += NBPDR; 6235 } 6236 6237 /* 6238 * Demote the mapping. 6239 */ 6240 *pdpe = newpdpe; 6241 6242 /* 6243 * Invalidate a stale recursive mapping of the page directory page. 6244 */ 6245 pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); 6246 6247 pmap_pdpe_demotions++; 6248 CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" 6249 " in pmap %p", va, pmap); 6250 return (TRUE); 6251} 6252 6253/* 6254 * Sets the memory attribute for the specified page. 6255 */ 6256void 6257pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 6258{ 6259 6260 m->md.pat_mode = ma; 6261 6262 /* 6263 * If "m" is a normal page, update its direct mapping. This update 6264 * can be relied upon to perform any cache operations that are 6265 * required for data coherence. 6266 */ 6267 if ((m->flags & PG_FICTITIOUS) == 0 && 6268 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, 6269 m->md.pat_mode)) 6270 panic("memory attribute change on the direct map failed"); 6271} 6272 6273/* 6274 * Changes the specified virtual address range's memory type to that given by 6275 * the parameter "mode". The specified virtual address range must be 6276 * completely contained within either the direct map or the kernel map. If 6277 * the virtual address range is contained within the kernel map, then the 6278 * memory type for each of the corresponding ranges of the direct map is also 6279 * changed. (The corresponding ranges of the direct map are those ranges that 6280 * map the same physical pages as the specified virtual address range.) These 6281 * changes to the direct map are necessary because Intel describes the 6282 * behavior of their processors as "undefined" if two or more mappings to the 6283 * same physical page have different memory types. 6284 * 6285 * Returns zero if the change completed successfully, and either EINVAL or 6286 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 6287 * of the virtual address range was not mapped, and ENOMEM is returned if 6288 * there was insufficient memory available to complete the change. In the 6289 * latter case, the memory type may have been changed on some part of the 6290 * virtual address range or the direct map. 6291 */ 6292int 6293pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 6294{ 6295 int error; 6296 6297 PMAP_LOCK(kernel_pmap); 6298 error = pmap_change_attr_locked(va, size, mode); 6299 PMAP_UNLOCK(kernel_pmap); 6300 return (error); 6301} 6302 6303static int 6304pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) 6305{ 6306 vm_offset_t base, offset, tmpva; 6307 vm_paddr_t pa_start, pa_end; 6308 pdp_entry_t *pdpe; 6309 pd_entry_t *pde; 6310 pt_entry_t *pte; 6311 int cache_bits_pte, cache_bits_pde, error; 6312 boolean_t changed; 6313 6314 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); 6315 base = trunc_page(va); 6316 offset = va & PAGE_MASK; 6317 size = round_page(offset + size); 6318 6319 /* 6320 * Only supported on kernel virtual addresses, including the direct 6321 * map but excluding the recursive map. 6322 */ 6323 if (base < DMAP_MIN_ADDRESS) 6324 return (EINVAL); 6325 6326 cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1); 6327 cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0); 6328 changed = FALSE; 6329 6330 /* 6331 * Pages that aren't mapped aren't supported. Also break down 2MB pages 6332 * into 4KB pages if required. 6333 */ 6334 for (tmpva = base; tmpva < base + size; ) { 6335 pdpe = pmap_pdpe(kernel_pmap, tmpva); 6336 if (*pdpe == 0) 6337 return (EINVAL); 6338 if (*pdpe & PG_PS) { 6339 /* 6340 * If the current 1GB page already has the required 6341 * memory type, then we need not demote this page. Just 6342 * increment tmpva to the next 1GB page frame. 6343 */ 6344 if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) { 6345 tmpva = trunc_1gpage(tmpva) + NBPDP; 6346 continue; 6347 } 6348 6349 /* 6350 * If the current offset aligns with a 1GB page frame 6351 * and there is at least 1GB left within the range, then 6352 * we need not break down this page into 2MB pages. 6353 */ 6354 if ((tmpva & PDPMASK) == 0 && 6355 tmpva + PDPMASK < base + size) { 6356 tmpva += NBPDP; 6357 continue; 6358 } 6359 if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) 6360 return (ENOMEM); 6361 } 6362 pde = pmap_pdpe_to_pde(pdpe, tmpva); 6363 if (*pde == 0) 6364 return (EINVAL); 6365 if (*pde & PG_PS) { 6366 /* 6367 * If the current 2MB page already has the required 6368 * memory type, then we need not demote this page. Just 6369 * increment tmpva to the next 2MB page frame. 6370 */ 6371 if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) { 6372 tmpva = trunc_2mpage(tmpva) + NBPDR; 6373 continue; 6374 } 6375 6376 /* 6377 * If the current offset aligns with a 2MB page frame 6378 * and there is at least 2MB left within the range, then 6379 * we need not break down this page into 4KB pages. 6380 */ 6381 if ((tmpva & PDRMASK) == 0 && 6382 tmpva + PDRMASK < base + size) { 6383 tmpva += NBPDR; 6384 continue; 6385 } 6386 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) 6387 return (ENOMEM); 6388 } 6389 pte = pmap_pde_to_pte(pde, tmpva); 6390 if (*pte == 0) 6391 return (EINVAL); 6392 tmpva += PAGE_SIZE; 6393 } 6394 error = 0; 6395 6396 /* 6397 * Ok, all the pages exist, so run through them updating their 6398 * cache mode if required. 6399 */ 6400 pa_start = pa_end = 0; 6401 for (tmpva = base; tmpva < base + size; ) { 6402 pdpe = pmap_pdpe(kernel_pmap, tmpva); 6403 if (*pdpe & PG_PS) { 6404 if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) { 6405 pmap_pde_attr(pdpe, cache_bits_pde, 6406 X86_PG_PDE_CACHE); 6407 changed = TRUE; 6408 } 6409 if (tmpva >= VM_MIN_KERNEL_ADDRESS) { 6410 if (pa_start == pa_end) { 6411 /* Start physical address run. */ 6412 pa_start = *pdpe & PG_PS_FRAME; 6413 pa_end = pa_start + NBPDP; 6414 } else if (pa_end == (*pdpe & PG_PS_FRAME)) 6415 pa_end += NBPDP; 6416 else { 6417 /* Run ended, update direct map. */ 6418 error = pmap_change_attr_locked( 6419 PHYS_TO_DMAP(pa_start), 6420 pa_end - pa_start, mode); 6421 if (error != 0) 6422 break; 6423 /* Start physical address run. */ 6424 pa_start = *pdpe & PG_PS_FRAME; 6425 pa_end = pa_start + NBPDP; 6426 } 6427 } 6428 tmpva = trunc_1gpage(tmpva) + NBPDP; 6429 continue; 6430 } 6431 pde = pmap_pdpe_to_pde(pdpe, tmpva); 6432 if (*pde & PG_PS) { 6433 if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) { 6434 pmap_pde_attr(pde, cache_bits_pde, 6435 X86_PG_PDE_CACHE); 6436 changed = TRUE; 6437 } 6438 if (tmpva >= VM_MIN_KERNEL_ADDRESS) { 6439 if (pa_start == pa_end) { 6440 /* Start physical address run. */ 6441 pa_start = *pde & PG_PS_FRAME; 6442 pa_end = pa_start + NBPDR; 6443 } else if (pa_end == (*pde & PG_PS_FRAME)) 6444 pa_end += NBPDR; 6445 else { 6446 /* Run ended, update direct map. */ 6447 error = pmap_change_attr_locked( 6448 PHYS_TO_DMAP(pa_start), 6449 pa_end - pa_start, mode); 6450 if (error != 0) 6451 break; 6452 /* Start physical address run. */ 6453 pa_start = *pde & PG_PS_FRAME; 6454 pa_end = pa_start + NBPDR; 6455 } 6456 } 6457 tmpva = trunc_2mpage(tmpva) + NBPDR; 6458 } else { 6459 pte = pmap_pde_to_pte(pde, tmpva); 6460 if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) { 6461 pmap_pte_attr(pte, cache_bits_pte, 6462 X86_PG_PTE_CACHE); 6463 changed = TRUE; 6464 } 6465 if (tmpva >= VM_MIN_KERNEL_ADDRESS) { 6466 if (pa_start == pa_end) { 6467 /* Start physical address run. */ 6468 pa_start = *pte & PG_FRAME; 6469 pa_end = pa_start + PAGE_SIZE; 6470 } else if (pa_end == (*pte & PG_FRAME)) 6471 pa_end += PAGE_SIZE; 6472 else { 6473 /* Run ended, update direct map. */ 6474 error = pmap_change_attr_locked( 6475 PHYS_TO_DMAP(pa_start), 6476 pa_end - pa_start, mode); 6477 if (error != 0) 6478 break; 6479 /* Start physical address run. */ 6480 pa_start = *pte & PG_FRAME; 6481 pa_end = pa_start + PAGE_SIZE; 6482 } 6483 } 6484 tmpva += PAGE_SIZE; 6485 } 6486 } 6487 if (error == 0 && pa_start != pa_end) 6488 error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), 6489 pa_end - pa_start, mode); 6490 6491 /* 6492 * Flush CPU caches if required to make sure any data isn't cached that 6493 * shouldn't be, etc. 6494 */ 6495 if (changed) { 6496 pmap_invalidate_range(kernel_pmap, base, tmpva); 6497 pmap_invalidate_cache_range(base, tmpva); 6498 } 6499 return (error); 6500} 6501 6502/* 6503 * Demotes any mapping within the direct map region that covers more than the 6504 * specified range of physical addresses. This range's size must be a power 6505 * of two and its starting address must be a multiple of its size. Since the 6506 * demotion does not change any attributes of the mapping, a TLB invalidation 6507 * is not mandatory. The caller may, however, request a TLB invalidation. 6508 */ 6509void 6510pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) 6511{ 6512 pdp_entry_t *pdpe; 6513 pd_entry_t *pde; 6514 vm_offset_t va; 6515 boolean_t changed; 6516 6517 if (len == 0) 6518 return; 6519 KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); 6520 KASSERT((base & (len - 1)) == 0, 6521 ("pmap_demote_DMAP: base is not a multiple of len")); 6522 if (len < NBPDP && base < dmaplimit) { 6523 va = PHYS_TO_DMAP(base); 6524 changed = FALSE; 6525 PMAP_LOCK(kernel_pmap); 6526 pdpe = pmap_pdpe(kernel_pmap, va); 6527 if ((*pdpe & X86_PG_V) == 0) 6528 panic("pmap_demote_DMAP: invalid PDPE"); 6529 if ((*pdpe & PG_PS) != 0) { 6530 if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) 6531 panic("pmap_demote_DMAP: PDPE failed"); 6532 changed = TRUE; 6533 } 6534 if (len < NBPDR) { 6535 pde = pmap_pdpe_to_pde(pdpe, va); 6536 if ((*pde & X86_PG_V) == 0) 6537 panic("pmap_demote_DMAP: invalid PDE"); 6538 if ((*pde & PG_PS) != 0) { 6539 if (!pmap_demote_pde(kernel_pmap, pde, va)) 6540 panic("pmap_demote_DMAP: PDE failed"); 6541 changed = TRUE; 6542 } 6543 } 6544 if (changed && invalidate) 6545 pmap_invalidate_page(kernel_pmap, va); 6546 PMAP_UNLOCK(kernel_pmap); 6547 } 6548} 6549 6550/* 6551 * perform the pmap work for mincore 6552 */ 6553int 6554pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 6555{ 6556 pd_entry_t *pdep; 6557 pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; 6558 vm_paddr_t pa; 6559 int val; 6560 6561 PG_A = pmap_accessed_bit(pmap); 6562 PG_M = pmap_modified_bit(pmap); 6563 PG_V = pmap_valid_bit(pmap); 6564 PG_RW = pmap_rw_bit(pmap); 6565 6566 PMAP_LOCK(pmap); 6567retry: 6568 pdep = pmap_pde(pmap, addr); 6569 if (pdep != NULL && (*pdep & PG_V)) { 6570 if (*pdep & PG_PS) { 6571 pte = *pdep; 6572 /* Compute the physical address of the 4KB page. */ 6573 pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & 6574 PG_FRAME; 6575 val = MINCORE_SUPER; 6576 } else { 6577 pte = *pmap_pde_to_pte(pdep, addr); 6578 pa = pte & PG_FRAME; 6579 val = 0; 6580 } 6581 } else { 6582 pte = 0; 6583 pa = 0; 6584 val = 0; 6585 } 6586 if ((pte & PG_V) != 0) { 6587 val |= MINCORE_INCORE; 6588 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 6589 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 6590 if ((pte & PG_A) != 0) 6591 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 6592 } 6593 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 6594 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 6595 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 6596 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ 6597 if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) 6598 goto retry; 6599 } else 6600 PA_UNLOCK_COND(*locked_pa); 6601 PMAP_UNLOCK(pmap); 6602 return (val); 6603} 6604 6605void 6606pmap_activate(struct thread *td) 6607{ 6608 pmap_t pmap, oldpmap; 6609 u_int cpuid; 6610 6611 critical_enter(); 6612 pmap = vmspace_pmap(td->td_proc->p_vmspace); 6613 oldpmap = PCPU_GET(curpmap); 6614 cpuid = PCPU_GET(cpuid); 6615#ifdef SMP 6616 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 6617 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 6618 CPU_SET_ATOMIC(cpuid, &pmap->pm_save); 6619#else 6620 CPU_CLR(cpuid, &oldpmap->pm_active); 6621 CPU_SET(cpuid, &pmap->pm_active); 6622 CPU_SET(cpuid, &pmap->pm_save); 6623#endif 6624 td->td_pcb->pcb_cr3 = pmap->pm_cr3; 6625 load_cr3(pmap->pm_cr3); 6626 PCPU_SET(curpmap, pmap); 6627 critical_exit(); 6628} 6629 6630void 6631pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 6632{ 6633} 6634 6635/* 6636 * Increase the starting virtual address of the given mapping if a 6637 * different alignment might result in more superpage mappings. 6638 */ 6639void 6640pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 6641 vm_offset_t *addr, vm_size_t size) 6642{ 6643 vm_offset_t superpage_offset; 6644 6645 if (size < NBPDR) 6646 return; 6647 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 6648 offset += ptoa(object->pg_color); 6649 superpage_offset = offset & PDRMASK; 6650 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 6651 (*addr & PDRMASK) == superpage_offset) 6652 return; 6653 if ((*addr & PDRMASK) < superpage_offset) 6654 *addr = (*addr & ~PDRMASK) + superpage_offset; 6655 else 6656 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 6657} 6658 6659#ifdef INVARIANTS 6660static unsigned long num_dirty_emulations; 6661SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, 6662 &num_dirty_emulations, 0, NULL); 6663 6664static unsigned long num_accessed_emulations; 6665SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, 6666 &num_accessed_emulations, 0, NULL); 6667 6668static unsigned long num_superpage_accessed_emulations; 6669SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, 6670 &num_superpage_accessed_emulations, 0, NULL); 6671 6672static unsigned long ad_emulation_superpage_promotions; 6673SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, 6674 &ad_emulation_superpage_promotions, 0, NULL); 6675#endif /* INVARIANTS */ 6676 6677int 6678pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) 6679{ 6680 int rv; 6681 struct rwlock *lock; 6682 vm_page_t m, mpte; 6683 pd_entry_t *pde; 6684 pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; 6685 boolean_t pv_lists_locked; 6686 6687 KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, 6688 ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); 6689 6690 if (!pmap_emulate_ad_bits(pmap)) 6691 return (-1); 6692 6693 PG_A = pmap_accessed_bit(pmap); 6694 PG_M = pmap_modified_bit(pmap); 6695 PG_V = pmap_valid_bit(pmap); 6696 PG_RW = pmap_rw_bit(pmap); 6697 6698 rv = -1; 6699 lock = NULL; 6700 pv_lists_locked = FALSE; 6701retry: 6702 PMAP_LOCK(pmap); 6703 6704 pde = pmap_pde(pmap, va); 6705 if (pde == NULL || (*pde & PG_V) == 0) 6706 goto done; 6707 6708 if ((*pde & PG_PS) != 0) { 6709 if (ftype == VM_PROT_READ) { 6710#ifdef INVARIANTS 6711 atomic_add_long(&num_superpage_accessed_emulations, 1); 6712#endif 6713 *pde |= PG_A; 6714 rv = 0; 6715 } 6716 goto done; 6717 } 6718 6719 pte = pmap_pde_to_pte(pde, va); 6720 if ((*pte & PG_V) == 0) 6721 goto done; 6722 6723 if (ftype == VM_PROT_WRITE) { 6724 if ((*pte & PG_RW) == 0) 6725 goto done; 6726 *pte |= PG_M; 6727 } 6728 *pte |= PG_A; 6729 6730 /* try to promote the mapping */ 6731 if (va < VM_MAXUSER_ADDRESS) 6732 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 6733 else 6734 mpte = NULL; 6735 6736 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 6737 6738 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 6739 pmap_ps_enabled(pmap) && 6740 (m->flags & PG_FICTITIOUS) == 0 && 6741 vm_reserv_level_iffullpop(m) == 0) { 6742 if (!pv_lists_locked) { 6743 pv_lists_locked = TRUE; 6744 if (!rw_try_rlock(&pvh_global_lock)) { 6745 PMAP_UNLOCK(pmap); 6746 rw_rlock(&pvh_global_lock); 6747 goto retry; 6748 } 6749 } 6750 pmap_promote_pde(pmap, pde, va, &lock); 6751#ifdef INVARIANTS 6752 atomic_add_long(&ad_emulation_superpage_promotions, 1); 6753#endif 6754 } 6755#ifdef INVARIANTS 6756 if (ftype == VM_PROT_WRITE) 6757 atomic_add_long(&num_dirty_emulations, 1); 6758 else 6759 atomic_add_long(&num_accessed_emulations, 1); 6760#endif 6761 rv = 0; /* success */ 6762done: 6763 if (lock != NULL) 6764 rw_wunlock(lock); 6765 if (pv_lists_locked) 6766 rw_runlock(&pvh_global_lock); 6767 PMAP_UNLOCK(pmap); 6768 return (rv); 6769} 6770 6771void 6772pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) 6773{ 6774 pml4_entry_t *pml4; 6775 pdp_entry_t *pdp; 6776 pd_entry_t *pde; 6777 pt_entry_t *pte, PG_V; 6778 int idx; 6779 6780 idx = 0; 6781 PG_V = pmap_valid_bit(pmap); 6782 PMAP_LOCK(pmap); 6783 6784 pml4 = pmap_pml4e(pmap, va); 6785 ptr[idx++] = *pml4; 6786 if ((*pml4 & PG_V) == 0) 6787 goto done; 6788 6789 pdp = pmap_pml4e_to_pdpe(pml4, va); 6790 ptr[idx++] = *pdp; 6791 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) 6792 goto done; 6793 6794 pde = pmap_pdpe_to_pde(pdp, va); 6795 ptr[idx++] = *pde; 6796 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) 6797 goto done; 6798 6799 pte = pmap_pde_to_pte(pde, va); 6800 ptr[idx++] = *pte; 6801 6802done: 6803 PMAP_UNLOCK(pmap); 6804 *num = idx; 6805} 6806 6807#include "opt_ddb.h" 6808#ifdef DDB 6809#include <ddb/ddb.h> 6810 6811DB_SHOW_COMMAND(pte, pmap_print_pte) 6812{ 6813 pmap_t pmap; 6814 pml4_entry_t *pml4; 6815 pdp_entry_t *pdp; 6816 pd_entry_t *pde; 6817 pt_entry_t *pte, PG_V; 6818 vm_offset_t va; 6819 6820 if (have_addr) { 6821 va = (vm_offset_t)addr; 6822 pmap = PCPU_GET(curpmap); /* XXX */ 6823 } else { 6824 db_printf("show pte addr\n"); 6825 return; 6826 } 6827 PG_V = pmap_valid_bit(pmap); 6828 pml4 = pmap_pml4e(pmap, va); 6829 db_printf("VA %#016lx pml4e %#016lx", va, *pml4); 6830 if ((*pml4 & PG_V) == 0) { 6831 db_printf("\n"); 6832 return; 6833 } 6834 pdp = pmap_pml4e_to_pdpe(pml4, va); 6835 db_printf(" pdpe %#016lx", *pdp); 6836 if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) { 6837 db_printf("\n"); 6838 return; 6839 } 6840 pde = pmap_pdpe_to_pde(pdp, va); 6841 db_printf(" pde %#016lx", *pde); 6842 if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { 6843 db_printf("\n"); 6844 return; 6845 } 6846 pte = pmap_pde_to_pte(pde, va); 6847 db_printf(" pte %#016lx\n", *pte); 6848} 6849 6850DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) 6851{ 6852 vm_paddr_t a; 6853 6854 if (have_addr) { 6855 a = (vm_paddr_t)addr; 6856 db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); 6857 } else { 6858 db_printf("show phys2dmap addr\n"); 6859 } 6860} 6861#endif 6862