pmap.c revision 314210 (FreeBSD stable/11, sys/i386/i386/pmap.c)
1/*- 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 9 * All rights reserved. 10 * 11 * This code is derived from software contributed to Berkeley by 12 * the Systems Programming Group of the University of Utah Computer 13 * Science Department and William Jolitz of UUNET Technologies Inc. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 3. All advertising materials mentioning features or use of this software 24 * must display the following acknowledgement: 25 * This product includes software developed by the University of 26 * California, Berkeley and its contributors. 27 * 4. Neither the name of the University nor the names of its contributors 28 * may be used to endorse or promote products derived from this software 29 * without specific prior written permission. 30 * 31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 41 * SUCH DAMAGE. 42 * 43 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 44 */ 45/*- 46 * Copyright (c) 2003 Networks Associates Technology, Inc. 47 * All rights reserved. 48 * 49 * This software was developed for the FreeBSD Project by Jake Burkholder, 50 * Safeport Network Services, and Network Associates Laboratories, the 51 * Security Research Division of Network Associates, Inc. under 52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 53 * CHATS research program. 54 * 55 * Redistribution and use in source and binary forms, with or without 56 * modification, are permitted provided that the following conditions 57 * are met: 58 * 1. Redistributions of source code must retain the above copyright 59 * notice, this list of conditions and the following disclaimer. 60 * 2. Redistributions in binary form must reproduce the above copyright 61 * notice, this list of conditions and the following disclaimer in the 62 * documentation and/or other materials provided with the distribution. 63 * 64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 67 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 74 * SUCH DAMAGE. 75 */ 76 77#include <sys/cdefs.h> 78__FBSDID("$FreeBSD: stable/11/sys/i386/i386/pmap.c 314210 2017-02-24 16:02:01Z kib $"); 79 80/* 81 * Manages physical address maps. 82 * 83 * Since the information managed by this module is 84 * also stored by the logical address mapping module, 85 * this module may throw away valid virtual-to-physical 86 * mappings at almost any time. However, invalidations 87 * of virtual-to-physical mappings must be done as 88 * requested. 89 * 90 * In order to cope with hardware architectures which 91 * make virtual-to-physical map invalidates expensive, 92 * this module may delay invalidate or reduced protection 93 * operations until such time as they are actually 94 * necessary. This module is given full information as 95 * to which processors are currently using which maps, 96 * and to when physical maps must be made correct. 97 */ 98 99#include "opt_apic.h" 100#include "opt_cpu.h" 101#include "opt_pmap.h" 102#include "opt_smp.h" 103#include "opt_xbox.h" 104 105#include <sys/param.h> 106#include <sys/systm.h> 107#include <sys/kernel.h> 108#include <sys/ktr.h> 109#include <sys/lock.h> 110#include <sys/malloc.h> 111#include <sys/mman.h> 112#include <sys/msgbuf.h> 113#include <sys/mutex.h> 114#include <sys/proc.h> 115#include <sys/rwlock.h> 116#include <sys/sf_buf.h> 117#include <sys/sx.h> 118#include <sys/vmmeter.h> 119#include <sys/sched.h> 120#include <sys/sysctl.h> 121#include <sys/smp.h> 122 123#include <vm/vm.h> 124#include <vm/vm_param.h> 125#include <vm/vm_kern.h> 126#include <vm/vm_page.h> 127#include <vm/vm_map.h> 128#include <vm/vm_object.h> 129#include <vm/vm_extern.h> 130#include <vm/vm_pageout.h> 131#include <vm/vm_pager.h> 132#include <vm/vm_phys.h> 133#include <vm/vm_radix.h> 134#include <vm/vm_reserv.h> 135#include <vm/uma.h> 136 137#ifdef DEV_APIC 138#include <sys/bus.h> 139#include <machine/intr_machdep.h> 140#include <x86/apicvar.h> 141#endif 142#include <machine/cpu.h> 143#include <machine/cputypes.h> 144#include <machine/md_var.h> 145#include <machine/pcb.h> 146#include <machine/specialreg.h> 147#ifdef SMP 148#include <machine/smp.h> 149#endif 150 151#ifdef XBOX 152#include <machine/xbox.h> 153#endif 154 155#ifndef PMAP_SHPGPERPROC 156#define PMAP_SHPGPERPROC 200 157#endif 158 159#if !defined(DIAGNOSTIC) 160#ifdef __GNUC_GNU_INLINE__ 161#define PMAP_INLINE __attribute__((__gnu_inline__)) inline 162#else 163#define PMAP_INLINE extern inline 164#endif 165#else 166#define PMAP_INLINE 167#endif 168 169#ifdef PV_STATS 170#define PV_STAT(x) do { x ; } while (0) 171#else 172#define PV_STAT(x) do { } while (0) 173#endif 174 175#define pa_index(pa) ((pa) >> PDRSHIFT) 176#define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 177 178/* 179 * Get PDEs and PTEs for user/kernel address space 180 */ 181#define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) 182#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) 183 184#define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) 185#define pmap_pte_w(pte) ((*(int *)pte & PG_W) 
!= 0) 186#define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) 187#define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) 188#define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) 189 190#define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ 191 atomic_clear_int((u_int *)(pte), PG_W)) 192#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) 193 194struct pmap kernel_pmap_store; 195LIST_HEAD(pmaplist, pmap); 196static struct pmaplist allpmaps; 197static struct mtx allpmaps_lock; 198 199vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 200vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 201int pgeflag = 0; /* PG_G or-in */ 202int pseflag = 0; /* PG_PS or-in */ 203 204static int nkpt = NKPT; 205vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR; 206extern u_int32_t KERNend; 207extern u_int32_t KPTphys; 208 209#if defined(PAE) || defined(PAE_TABLES) 210pt_entry_t pg_nx; 211static uma_zone_t pdptzone; 212#endif 213 214static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); 215 216static int pat_works = 1; 217SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1, 218 "Is page attribute table fully functional?"); 219 220static int pg_ps_enabled = 1; 221SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 222 &pg_ps_enabled, 0, "Are large page mappings enabled?"); 223 224#define PAT_INDEX_SIZE 8 225static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ 226 227/* 228 * pmap_mapdev support pre initialization (i.e. console) 229 */ 230#define PMAP_PREINIT_MAPPING_COUNT 8 231static struct pmap_preinit_mapping { 232 vm_paddr_t pa; 233 vm_offset_t va; 234 vm_size_t sz; 235 int mode; 236} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; 237static int pmap_initialized; 238 239static struct rwlock_padalign pvh_global_lock; 240 241/* 242 * Data for the pv entry allocation mechanism 243 */ 244static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 245static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; 246static struct md_page *pv_table; 247static int shpgperproc = PMAP_SHPGPERPROC; 248 249struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ 250int pv_maxchunks; /* How many chunks we have KVA for */ 251vm_offset_t pv_vafree; /* freelist stored in the PTE */ 252 253/* 254 * All those kernel PT submaps that BSD is so fond of 255 */ 256pt_entry_t *CMAP3; 257static pd_entry_t *KPTD; 258caddr_t ptvmmap = 0; 259caddr_t CADDR3; 260struct msgbuf *msgbufp = NULL; 261 262/* 263 * Crashdump maps. 
264 */ 265static caddr_t crashdumpmap; 266 267static pt_entry_t *PMAP1 = NULL, *PMAP2; 268static pt_entry_t *PADDR1 = NULL, *PADDR2; 269#ifdef SMP 270static int PMAP1cpu; 271static int PMAP1changedcpu; 272SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, 273 &PMAP1changedcpu, 0, 274 "Number of times pmap_pte_quick changed CPU with same PMAP1"); 275#endif 276static int PMAP1changed; 277SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, 278 &PMAP1changed, 0, 279 "Number of times pmap_pte_quick changed PMAP1"); 280static int PMAP1unchanged; 281SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, 282 &PMAP1unchanged, 0, 283 "Number of times pmap_pte_quick didn't change PMAP1"); 284static struct mtx PMAP2mutex; 285 286static void free_pv_chunk(struct pv_chunk *pc); 287static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 288static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); 289static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); 290static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); 291static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); 292static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 293static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 294 vm_offset_t va); 295static int pmap_pvh_wired_mappings(struct md_page *pvh, int count); 296 297static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 298static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, 299 vm_prot_t prot); 300static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 301 vm_page_t m, vm_prot_t prot, vm_page_t mpte); 302static void pmap_flush_page(vm_page_t m); 303static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); 304static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); 305static boolean_t pmap_is_modified_pvh(struct md_page *pvh); 306static boolean_t pmap_is_referenced_pvh(struct md_page *pvh); 307static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); 308static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde); 309static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va); 310static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); 311static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 312static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, 313 vm_prot_t prot); 314static void pmap_pte_attr(pt_entry_t *pte, int cache_bits); 315static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 316 struct spglist *free); 317static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, 318 struct spglist *free); 319static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte); 320static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, 321 struct spglist *free); 322static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, 323 vm_offset_t va); 324static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); 325static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 326 vm_page_t m); 327static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 328 pd_entry_t newpde); 329static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); 330 331static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); 332 333static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags); 334static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, 
struct spglist *free); 335static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); 336static void pmap_pte_release(pt_entry_t *pte); 337static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *); 338#if defined(PAE) || defined(PAE_TABLES) 339static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, 340 int wait); 341#endif 342static void pmap_set_pg(void); 343 344static __inline void pagezero(void *page); 345 346CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); 347CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); 348 349/* 350 * If you get an error here, then you set KVA_PAGES wrong! See the 351 * description of KVA_PAGES in sys/i386/include/pmap.h. It must be 352 * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE. 353 */ 354CTASSERT(KERNBASE % (1 << 24) == 0); 355 356/* 357 * Bootstrap the system enough to run with virtual memory. 358 * 359 * On the i386 this is called after mapping has already been enabled 360 * and just syncs the pmap module with what has already been done. 361 * [We can't call it easily with mapping off since the kernel is not 362 * mapped with PA == VA, hence we would have to relocate every address 363 * from the linked base (virtual) address "KERNBASE" to the actual 364 * (physical) address starting relative to 0] 365 */ 366void 367pmap_bootstrap(vm_paddr_t firstaddr) 368{ 369 vm_offset_t va; 370 pt_entry_t *pte, *unused; 371 struct pcpu *pc; 372 int i; 373 374 /* 375 * Add a physical memory segment (vm_phys_seg) corresponding to the 376 * preallocated kernel page table pages so that vm_page structures 377 * representing these pages will be created. The vm_page structures 378 * are required for promotion of the corresponding kernel virtual 379 * addresses to superpage mappings. 380 */ 381 vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); 382 383 /* 384 * Initialize the first available kernel virtual address. However, 385 * using "firstaddr" may waste a few pages of the kernel virtual 386 * address space, because locore may not have mapped every physical 387 * page that it allocated. Preferably, locore would provide a first 388 * unused virtual address in addition to "firstaddr". 389 */ 390 virtual_avail = (vm_offset_t) KERNBASE + firstaddr; 391 392 virtual_end = VM_MAX_KERNEL_ADDRESS; 393 394 /* 395 * Initialize the kernel pmap (which is statically allocated). 396 */ 397 PMAP_LOCK_INIT(kernel_pmap); 398 kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); 399#if defined(PAE) || defined(PAE_TABLES) 400 kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); 401#endif 402 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ 403 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 404 405 /* 406 * Initialize the global pv list lock. 407 */ 408 rw_init(&pvh_global_lock, "pmap pv global"); 409 410 LIST_INIT(&allpmaps); 411 412 /* 413 * Request a spin mutex so that changes to allpmaps cannot be 414 * preempted by smp_rendezvous_cpus(). Otherwise, 415 * pmap_update_pde_kernel() could access allpmaps while it is 416 * being changed. 417 */ 418 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); 419 mtx_lock_spin(&allpmaps_lock); 420 LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); 421 mtx_unlock_spin(&allpmaps_lock); 422 423 /* 424 * Reserve some special page table entries/VA space for temporary 425 * mapping of pages. 
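 *
 * For illustration: the SYSMAP() macro defined below carves "n" pages out
 * of the boot-time KVA cursor "va" and the matching PTE cursor "pte".  It
 * stores the current VA, cast to type "c", in "v", saves the current PTE
 * pointer in "p", and then advances both cursors.  The invocation
 * SYSMAP(caddr_t, CMAP3, CADDR3, 1) used below expands to roughly:
 *
 *	CADDR3 = (caddr_t)va; va += 1 * PAGE_SIZE; CMAP3 = pte; pte += 1;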
426 */ 427#define SYSMAP(c, p, v, n) \ 428 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 429 430 va = virtual_avail; 431 pte = vtopte(va); 432 433 434 /* 435 * Initialize temporary map objects on the current CPU for use 436 * during early boot. 437 * CMAP1/CMAP2 are used for zeroing and copying pages. 438 * CMAP3 is used for the idle process page zeroing. 439 */ 440 pc = get_pcpu(); 441 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 442 SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1) 443 SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1) 444 SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1) 445 446 SYSMAP(caddr_t, CMAP3, CADDR3, 1) 447 448 /* 449 * Crashdump maps. 450 */ 451 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) 452 453 /* 454 * ptvmmap is used for reading arbitrary physical pages via /dev/mem. 455 */ 456 SYSMAP(caddr_t, unused, ptvmmap, 1) 457 458 /* 459 * msgbufp is used to map the system message buffer. 460 */ 461 SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize))) 462 463 /* 464 * KPTmap is used by pmap_kextract(). 465 * 466 * KPTmap is first initialized by locore. However, that initial 467 * KPTmap can only support NKPT page table pages. Here, a larger 468 * KPTmap is created that can support KVA_PAGES page table pages. 469 */ 470 SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES) 471 472 for (i = 0; i < NKPT; i++) 473 KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V; 474 475 /* 476 * Adjust the start of the KPTD and KPTmap so that the implementation 477 * of pmap_kextract() and pmap_growkernel() can be made simpler. 478 */ 479 KPTD -= KPTDI; 480 KPTmap -= i386_btop(KPTDI << PDRSHIFT); 481 482 /* 483 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(), 484 * respectively. 485 */ 486 SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1) 487 SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1) 488 489 mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); 490 491 virtual_avail = va; 492 493 /* 494 * Leave in place an identity mapping (virt == phys) for the low 1 MB 495 * physical memory region that is used by the ACPI wakeup code. This 496 * mapping must not have PG_G set. 497 */ 498#ifdef XBOX 499 /* FIXME: This is gross, but needed for the XBOX. Since we are in such 500 * an early stadium, we cannot yet neatly map video memory ... :-( 501 * Better fixes are very welcome! */ 502 if (!arch_i386_is_xbox) 503#endif 504 for (i = 1; i < NKPT; i++) 505 PTD[i] = 0; 506 507 /* 508 * Initialize the PAT MSR if present. 509 * pmap_init_pat() clears and sets CR4_PGE, which, as a 510 * side-effect, invalidates stale PG_G TLB entries that might 511 * have been created in our pre-boot environment. We assume 512 * that PAT support implies PGE and in reverse, PGE presence 513 * comes with PAT. Both features were added for Pentium Pro. 514 */ 515 pmap_init_pat(); 516 517 /* Turn on PG_G on kernel page(s) */ 518 pmap_set_pg(); 519} 520 521static void 522pmap_init_reserved_pages(void) 523{ 524 struct pcpu *pc; 525 vm_offset_t pages; 526 int i; 527 528 CPU_FOREACH(i) { 529 pc = pcpu_find(i); 530 /* 531 * Skip if the mapping has already been initialized, 532 * i.e. this is the BSP. 
533 */ 534 if (pc->pc_cmap_addr1 != 0) 535 continue; 536 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 537 pages = kva_alloc(PAGE_SIZE * 3); 538 if (pages == 0) 539 panic("%s: unable to allocate KVA", __func__); 540 pc->pc_cmap_pte1 = vtopte(pages); 541 pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE); 542 pc->pc_cmap_addr1 = (caddr_t)pages; 543 pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE); 544 pc->pc_qmap_addr = pages + (PAGE_SIZE * 2); 545 } 546} 547 548SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); 549 550/* 551 * Setup the PAT MSR. 552 */ 553void 554pmap_init_pat(void) 555{ 556 int pat_table[PAT_INDEX_SIZE]; 557 uint64_t pat_msr; 558 u_long cr0, cr4; 559 int i; 560 561 /* Set default PAT index table. */ 562 for (i = 0; i < PAT_INDEX_SIZE; i++) 563 pat_table[i] = -1; 564 pat_table[PAT_WRITE_BACK] = 0; 565 pat_table[PAT_WRITE_THROUGH] = 1; 566 pat_table[PAT_UNCACHEABLE] = 3; 567 pat_table[PAT_WRITE_COMBINING] = 3; 568 pat_table[PAT_WRITE_PROTECTED] = 3; 569 pat_table[PAT_UNCACHED] = 3; 570 571 /* 572 * Bail if this CPU doesn't implement PAT. 573 * We assume that PAT support implies PGE. 574 */ 575 if ((cpu_feature & CPUID_PAT) == 0) { 576 for (i = 0; i < PAT_INDEX_SIZE; i++) 577 pat_index[i] = pat_table[i]; 578 pat_works = 0; 579 return; 580 } 581 582 /* 583 * Due to some Intel errata, we can only safely use the lower 4 584 * PAT entries. 585 * 586 * Intel Pentium III Processor Specification Update 587 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B 588 * or Mode C Paging) 589 * 590 * Intel Pentium IV Processor Specification Update 591 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) 592 */ 593 if (cpu_vendor_id == CPU_VENDOR_INTEL && 594 !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) 595 pat_works = 0; 596 597 /* Initialize default PAT entries. */ 598 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | 599 PAT_VALUE(1, PAT_WRITE_THROUGH) | 600 PAT_VALUE(2, PAT_UNCACHED) | 601 PAT_VALUE(3, PAT_UNCACHEABLE) | 602 PAT_VALUE(4, PAT_WRITE_BACK) | 603 PAT_VALUE(5, PAT_WRITE_THROUGH) | 604 PAT_VALUE(6, PAT_UNCACHED) | 605 PAT_VALUE(7, PAT_UNCACHEABLE); 606 607 if (pat_works) { 608 /* 609 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. 610 * Program 5 and 6 as WP and WC. 611 * Leave 4 and 7 as WB and UC. 612 */ 613 pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); 614 pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | 615 PAT_VALUE(6, PAT_WRITE_COMBINING); 616 pat_table[PAT_UNCACHED] = 2; 617 pat_table[PAT_WRITE_PROTECTED] = 5; 618 pat_table[PAT_WRITE_COMBINING] = 6; 619 } else { 620 /* 621 * Just replace PAT Index 2 with WC instead of UC-. 622 */ 623 pat_msr &= ~PAT_MASK(2); 624 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); 625 pat_table[PAT_WRITE_COMBINING] = 2; 626 } 627 628 /* Disable PGE. */ 629 cr4 = rcr4(); 630 load_cr4(cr4 & ~CR4_PGE); 631 632 /* Disable caches (CD = 1, NW = 0). */ 633 cr0 = rcr0(); 634 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 635 636 /* Flushes caches and TLBs. */ 637 wbinvd(); 638 invltlb(); 639 640 /* Update PAT and index table. */ 641 wrmsr(MSR_PAT, pat_msr); 642 for (i = 0; i < PAT_INDEX_SIZE; i++) 643 pat_index[i] = pat_table[i]; 644 645 /* Flush caches and TLBs again. */ 646 wbinvd(); 647 invltlb(); 648 649 /* Restore caches and PGE. */ 650 load_cr0(cr0); 651 load_cr4(cr4); 652} 653 654/* 655 * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. 
656 */ 657static void 658pmap_set_pg(void) 659{ 660 pt_entry_t *pte; 661 vm_offset_t va, endva; 662 663 if (pgeflag == 0) 664 return; 665 666 endva = KERNBASE + KERNend; 667 668 if (pseflag) { 669 va = KERNBASE + KERNLOAD; 670 while (va < endva) { 671 pdir_pde(PTD, va) |= pgeflag; 672 invltlb(); /* Flush non-PG_G entries. */ 673 va += NBPDR; 674 } 675 } else { 676 va = (vm_offset_t)btext; 677 while (va < endva) { 678 pte = vtopte(va); 679 if (*pte) 680 *pte |= pgeflag; 681 invltlb(); /* Flush non-PG_G entries. */ 682 va += PAGE_SIZE; 683 } 684 } 685} 686 687/* 688 * Initialize a vm_page's machine-dependent fields. 689 */ 690void 691pmap_page_init(vm_page_t m) 692{ 693 694 TAILQ_INIT(&m->md.pv_list); 695 m->md.pat_mode = PAT_WRITE_BACK; 696} 697 698#if defined(PAE) || defined(PAE_TABLES) 699static void * 700pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) 701{ 702 703 /* Inform UMA that this allocator uses kernel_map/object. */ 704 *flags = UMA_SLAB_KERNEL; 705 return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL, 706 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); 707} 708#endif 709 710/* 711 * Abuse the pte nodes for unmapped kva to thread a kva freelist through. 712 * Requirements: 713 * - Must deal with pages in order to ensure that none of the PG_* bits 714 * are ever set, PG_V in particular. 715 * - Assumes we can write to ptes without pte_store() atomic ops, even 716 * on PAE systems. This should be ok. 717 * - Assumes nothing will ever test these addresses for 0 to indicate 718 * no mapping instead of correctly checking PG_V. 719 * - Assumes a vm_offset_t will fit in a pte (true for i386). 720 * Because PG_V is never set, there can be no mappings to invalidate. 721 */ 722static vm_offset_t 723pmap_ptelist_alloc(vm_offset_t *head) 724{ 725 pt_entry_t *pte; 726 vm_offset_t va; 727 728 va = *head; 729 if (va == 0) 730 panic("pmap_ptelist_alloc: exhausted ptelist KVA"); 731 pte = vtopte(va); 732 *head = *pte; 733 if (*head & PG_V) 734 panic("pmap_ptelist_alloc: va with PG_V set!"); 735 *pte = 0; 736 return (va); 737} 738 739static void 740pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) 741{ 742 pt_entry_t *pte; 743 744 if (va & PG_V) 745 panic("pmap_ptelist_free: freeing va with PG_V set!"); 746 pte = vtopte(va); 747 *pte = *head; /* virtual! PG_V is 0 though */ 748 *head = va; 749} 750 751static void 752pmap_ptelist_init(vm_offset_t *head, void *base, int npages) 753{ 754 int i; 755 vm_offset_t va; 756 757 *head = 0; 758 for (i = npages - 1; i >= 0; i--) { 759 va = (vm_offset_t)base + i * PAGE_SIZE; 760 pmap_ptelist_free(head, va); 761 } 762} 763 764 765/* 766 * Initialize the pmap module. 767 * Called by vm_init, to initialize any structures that the pmap 768 * system needs to map virtual memory. 769 */ 770void 771pmap_init(void) 772{ 773 struct pmap_preinit_mapping *ppim; 774 vm_page_t mpte; 775 vm_size_t s; 776 int i, pv_npg; 777 778 /* 779 * Initialize the vm page array entries for the kernel pmap's 780 * page table pages. 781 */ 782 for (i = 0; i < NKPT; i++) { 783 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); 784 KASSERT(mpte >= vm_page_array && 785 mpte < &vm_page_array[vm_page_array_size], 786 ("pmap_init: page table page is out of range")); 787 mpte->pindex = i + KPTDI; 788 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); 789 } 790 791 /* 792 * Initialize the address space (zone) for the pv entries. Set a 793 * high water mark so that the system can recover from excessive 794 * numbers of pv entries. 
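 *
 * The limit defaults to shpgperproc ("vm.pmap.shpgperproc") times maxproc
 * plus one entry per physical page, may be overridden by the
 * "vm.pmap.pv_entries" tunable, and is rounded up to whole pv chunks
 * (_NPCPV entries); the high-water mark is set to 90% of the limit.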
795 */ 796 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 797 pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; 798 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 799 pv_entry_max = roundup(pv_entry_max, _NPCPV); 800 pv_entry_high_water = 9 * (pv_entry_max / 10); 801 802 /* 803 * If the kernel is running on a virtual machine, then it must assume 804 * that MCA is enabled by the hypervisor. Moreover, the kernel must 805 * be prepared for the hypervisor changing the vendor and family that 806 * are reported by CPUID. Consequently, the workaround for AMD Family 807 * 10h Erratum 383 is enabled if the processor's feature set does not 808 * include at least one feature that is only supported by older Intel 809 * or newer AMD processors. 810 */ 811 if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && 812 (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | 813 CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | 814 AMDID2_FMA4)) == 0) 815 workaround_erratum383 = 1; 816 817 /* 818 * Are large page mappings supported and enabled? 819 */ 820 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); 821 if (pseflag == 0) 822 pg_ps_enabled = 0; 823 else if (pg_ps_enabled) { 824 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 825 ("pmap_init: can't assign to pagesizes[1]")); 826 pagesizes[1] = NBPDR; 827 } 828 829 /* 830 * Calculate the size of the pv head table for superpages. 831 * Handle the possibility that "vm_phys_segs[...].end" is zero. 832 */ 833 pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end - 834 PAGE_SIZE) / NBPDR + 1; 835 836 /* 837 * Allocate memory for the pv head table for superpages. 838 */ 839 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 840 s = round_page(s); 841 pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, 842 M_WAITOK | M_ZERO); 843 for (i = 0; i < pv_npg; i++) 844 TAILQ_INIT(&pv_table[i].pv_list); 845 846 pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); 847 pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); 848 if (pv_chunkbase == NULL) 849 panic("pmap_init: not enough kvm for pv chunks"); 850 pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); 851#if defined(PAE) || defined(PAE_TABLES) 852 pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, 853 NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, 854 UMA_ZONE_VM | UMA_ZONE_NOFREE); 855 uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); 856#endif 857 858 pmap_initialized = 1; 859 if (!bootverbose) 860 return; 861 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 862 ppim = pmap_preinit_mapping + i; 863 if (ppim->va == 0) 864 continue; 865 printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i, 866 (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode); 867 } 868} 869 870 871SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, 872 "Max number of PV entries"); 873SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, 874 "Page share factor per proc"); 875 876static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, 877 "2/4MB page mapping counters"); 878 879static u_long pmap_pde_demotions; 880SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, 881 &pmap_pde_demotions, 0, "2/4MB page demotions"); 882 883static u_long pmap_pde_mappings; 884SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, 885 &pmap_pde_mappings, 0, "2/4MB page mappings"); 886 887static u_long pmap_pde_p_failures; 888SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, 
CTLFLAG_RD, 889 &pmap_pde_p_failures, 0, "2/4MB page promotion failures"); 890 891static u_long pmap_pde_promotions; 892SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, 893 &pmap_pde_promotions, 0, "2/4MB page promotions"); 894 895/*************************************************** 896 * Low level helper routines..... 897 ***************************************************/ 898 899/* 900 * Determine the appropriate bits to set in a PTE or PDE for a specified 901 * caching mode. 902 */ 903int 904pmap_cache_bits(int mode, boolean_t is_pde) 905{ 906 int cache_bits, pat_flag, pat_idx; 907 908 if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) 909 panic("Unknown caching mode %d\n", mode); 910 911 /* The PAT bit is different for PTE's and PDE's. */ 912 pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; 913 914 /* Map the caching mode to a PAT index. */ 915 pat_idx = pat_index[mode]; 916 917 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ 918 cache_bits = 0; 919 if (pat_idx & 0x4) 920 cache_bits |= pat_flag; 921 if (pat_idx & 0x2) 922 cache_bits |= PG_NC_PCD; 923 if (pat_idx & 0x1) 924 cache_bits |= PG_NC_PWT; 925 return (cache_bits); 926} 927 928/* 929 * The caller is responsible for maintaining TLB consistency. 930 */ 931static void 932pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde) 933{ 934 pd_entry_t *pde; 935 pmap_t pmap; 936 boolean_t PTD_updated; 937 938 PTD_updated = FALSE; 939 mtx_lock_spin(&allpmaps_lock); 940 LIST_FOREACH(pmap, &allpmaps, pm_list) { 941 if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & 942 PG_FRAME)) 943 PTD_updated = TRUE; 944 pde = pmap_pde(pmap, va); 945 pde_store(pde, newpde); 946 } 947 mtx_unlock_spin(&allpmaps_lock); 948 KASSERT(PTD_updated, 949 ("pmap_kenter_pde: current page table is not in allpmaps")); 950} 951 952/* 953 * After changing the page size for the specified virtual address in the page 954 * table, flush the corresponding entries from the processor's TLB. Only the 955 * calling processor's TLB is affected. 956 * 957 * The calling thread must be pinned to a processor. 958 */ 959static void 960pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) 961{ 962 u_long cr4; 963 964 if ((newpde & PG_PS) == 0) 965 /* Demotion: flush a specific 2MB page mapping. */ 966 invlpg(va); 967 else if ((newpde & PG_G) == 0) 968 /* 969 * Promotion: flush every 4KB page mapping from the TLB 970 * because there are too many to flush individually. 971 */ 972 invltlb(); 973 else { 974 /* 975 * Promotion: flush every 4KB page mapping from the TLB, 976 * including any global (PG_G) mappings. 977 */ 978 cr4 = rcr4(); 979 load_cr4(cr4 & ~CR4_PGE); 980 /* 981 * Although preemption at this point could be detrimental to 982 * performance, it would not lead to an error. PG_G is simply 983 * ignored if CR4.PGE is clear. Moreover, in case this block 984 * is re-entered, the load_cr4() either above or below will 985 * modify CR4.PGE flushing the TLB. 986 */ 987 load_cr4(cr4 | CR4_PGE); 988 } 989} 990 991void 992invltlb_glob(void) 993{ 994 uint64_t cr4; 995 996 if (pgeflag == 0) { 997 invltlb(); 998 } else { 999 cr4 = rcr4(); 1000 load_cr4(cr4 & ~CR4_PGE); 1001 load_cr4(cr4 | CR4_PGE); 1002 } 1003} 1004 1005 1006#ifdef SMP 1007/* 1008 * For SMP, these functions have to use the IPI mechanism for coherence. 1009 * 1010 * N.B.: Before calling any of the following TLB invalidation functions, 1011 * the calling processor must ensure that all stores updating a non- 1012 * kernel page table are globally performed. 
Otherwise, another 1013 * processor could cache an old, pre-update entry without being 1014 * invalidated. This can happen one of two ways: (1) The pmap becomes 1015 * active on another processor after its pm_active field is checked by 1016 * one of the following functions but before a store updating the page 1017 * table is globally performed. (2) The pmap becomes active on another 1018 * processor before its pm_active field is checked but due to 1019 * speculative loads one of the following functions stills reads the 1020 * pmap as inactive on the other processor. 1021 * 1022 * The kernel page table is exempt because its pm_active field is 1023 * immutable. The kernel page table is always active on every 1024 * processor. 1025 */ 1026void 1027pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1028{ 1029 cpuset_t *mask, other_cpus; 1030 u_int cpuid; 1031 1032 sched_pin(); 1033 if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { 1034 invlpg(va); 1035 mask = &all_cpus; 1036 } else { 1037 cpuid = PCPU_GET(cpuid); 1038 other_cpus = all_cpus; 1039 CPU_CLR(cpuid, &other_cpus); 1040 if (CPU_ISSET(cpuid, &pmap->pm_active)) 1041 invlpg(va); 1042 CPU_AND(&other_cpus, &pmap->pm_active); 1043 mask = &other_cpus; 1044 } 1045 smp_masked_invlpg(*mask, va); 1046 sched_unpin(); 1047} 1048 1049/* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ 1050#define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) 1051 1052void 1053pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1054{ 1055 cpuset_t *mask, other_cpus; 1056 vm_offset_t addr; 1057 u_int cpuid; 1058 1059 if (eva - sva >= PMAP_INVLPG_THRESHOLD) { 1060 pmap_invalidate_all(pmap); 1061 return; 1062 } 1063 1064 sched_pin(); 1065 if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { 1066 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1067 invlpg(addr); 1068 mask = &all_cpus; 1069 } else { 1070 cpuid = PCPU_GET(cpuid); 1071 other_cpus = all_cpus; 1072 CPU_CLR(cpuid, &other_cpus); 1073 if (CPU_ISSET(cpuid, &pmap->pm_active)) 1074 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1075 invlpg(addr); 1076 CPU_AND(&other_cpus, &pmap->pm_active); 1077 mask = &other_cpus; 1078 } 1079 smp_masked_invlpg_range(*mask, sva, eva); 1080 sched_unpin(); 1081} 1082 1083void 1084pmap_invalidate_all(pmap_t pmap) 1085{ 1086 cpuset_t *mask, other_cpus; 1087 u_int cpuid; 1088 1089 sched_pin(); 1090 if (pmap == kernel_pmap) { 1091 invltlb_glob(); 1092 mask = &all_cpus; 1093 } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { 1094 invltlb(); 1095 mask = &all_cpus; 1096 } else { 1097 cpuid = PCPU_GET(cpuid); 1098 other_cpus = all_cpus; 1099 CPU_CLR(cpuid, &other_cpus); 1100 if (CPU_ISSET(cpuid, &pmap->pm_active)) 1101 invltlb(); 1102 CPU_AND(&other_cpus, &pmap->pm_active); 1103 mask = &other_cpus; 1104 } 1105 smp_masked_invltlb(*mask, pmap); 1106 sched_unpin(); 1107} 1108 1109void 1110pmap_invalidate_cache(void) 1111{ 1112 1113 sched_pin(); 1114 wbinvd(); 1115 smp_cache_flush(); 1116 sched_unpin(); 1117} 1118 1119struct pde_action { 1120 cpuset_t invalidate; /* processors that invalidate their TLB */ 1121 vm_offset_t va; 1122 pd_entry_t *pde; 1123 pd_entry_t newpde; 1124 u_int store; /* processor that updates the PDE */ 1125}; 1126 1127static void 1128pmap_update_pde_kernel(void *arg) 1129{ 1130 struct pde_action *act = arg; 1131 pd_entry_t *pde; 1132 pmap_t pmap; 1133 1134 if (act->store == PCPU_GET(cpuid)) { 1135 1136 /* 1137 * Elsewhere, this operation requires allpmaps_lock for 1138 * synchronization. 
Here, it does not because it is being 1139 * performed in the context of an all_cpus rendezvous. 1140 */ 1141 LIST_FOREACH(pmap, &allpmaps, pm_list) { 1142 pde = pmap_pde(pmap, act->va); 1143 pde_store(pde, act->newpde); 1144 } 1145 } 1146} 1147 1148static void 1149pmap_update_pde_user(void *arg) 1150{ 1151 struct pde_action *act = arg; 1152 1153 if (act->store == PCPU_GET(cpuid)) 1154 pde_store(act->pde, act->newpde); 1155} 1156 1157static void 1158pmap_update_pde_teardown(void *arg) 1159{ 1160 struct pde_action *act = arg; 1161 1162 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) 1163 pmap_update_pde_invalidate(act->va, act->newpde); 1164} 1165 1166/* 1167 * Change the page size for the specified virtual address in a way that 1168 * prevents any possibility of the TLB ever having two entries that map the 1169 * same virtual address using different page sizes. This is the recommended 1170 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a 1171 * machine check exception for a TLB state that is improperly diagnosed as a 1172 * hardware error. 1173 */ 1174static void 1175pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1176{ 1177 struct pde_action act; 1178 cpuset_t active, other_cpus; 1179 u_int cpuid; 1180 1181 sched_pin(); 1182 cpuid = PCPU_GET(cpuid); 1183 other_cpus = all_cpus; 1184 CPU_CLR(cpuid, &other_cpus); 1185 if (pmap == kernel_pmap) 1186 active = all_cpus; 1187 else 1188 active = pmap->pm_active; 1189 if (CPU_OVERLAP(&active, &other_cpus)) { 1190 act.store = cpuid; 1191 act.invalidate = active; 1192 act.va = va; 1193 act.pde = pde; 1194 act.newpde = newpde; 1195 CPU_SET(cpuid, &active); 1196 smp_rendezvous_cpus(active, 1197 smp_no_rendevous_barrier, pmap == kernel_pmap ? 1198 pmap_update_pde_kernel : pmap_update_pde_user, 1199 pmap_update_pde_teardown, &act); 1200 } else { 1201 if (pmap == kernel_pmap) 1202 pmap_kenter_pde(va, newpde); 1203 else 1204 pde_store(pde, newpde); 1205 if (CPU_ISSET(cpuid, &active)) 1206 pmap_update_pde_invalidate(va, newpde); 1207 } 1208 sched_unpin(); 1209} 1210#else /* !SMP */ 1211/* 1212 * Normal, non-SMP, 486+ invalidation functions. 1213 * We inline these within pmap.c for speed. 
1214 */ 1215PMAP_INLINE void 1216pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1217{ 1218 1219 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1220 invlpg(va); 1221} 1222 1223PMAP_INLINE void 1224pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1225{ 1226 vm_offset_t addr; 1227 1228 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1229 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1230 invlpg(addr); 1231} 1232 1233PMAP_INLINE void 1234pmap_invalidate_all(pmap_t pmap) 1235{ 1236 1237 if (pmap == kernel_pmap) 1238 invltlb_glob(); 1239 else if (!CPU_EMPTY(&pmap->pm_active)) 1240 invltlb(); 1241} 1242 1243PMAP_INLINE void 1244pmap_invalidate_cache(void) 1245{ 1246 1247 wbinvd(); 1248} 1249 1250static void 1251pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1252{ 1253 1254 if (pmap == kernel_pmap) 1255 pmap_kenter_pde(va, newpde); 1256 else 1257 pde_store(pde, newpde); 1258 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1259 pmap_update_pde_invalidate(va, newpde); 1260} 1261#endif /* !SMP */ 1262 1263#define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 1264 1265void 1266pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) 1267{ 1268 1269 if (force) { 1270 sva &= ~(vm_offset_t)cpu_clflush_line_size; 1271 } else { 1272 KASSERT((sva & PAGE_MASK) == 0, 1273 ("pmap_invalidate_cache_range: sva not page-aligned")); 1274 KASSERT((eva & PAGE_MASK) == 0, 1275 ("pmap_invalidate_cache_range: eva not page-aligned")); 1276 } 1277 1278 if ((cpu_feature & CPUID_SS) != 0 && !force) 1279 ; /* If "Self Snoop" is supported and allowed, do nothing. */ 1280 else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 && 1281 eva - sva < PMAP_CLFLUSH_THRESHOLD) { 1282#ifdef DEV_APIC 1283 /* 1284 * XXX: Some CPUs fault, hang, or trash the local APIC 1285 * registers if we use CLFLUSH on the local APIC 1286 * range. The local APIC is always uncached, so we 1287 * don't need to flush for that range anyway. 1288 */ 1289 if (pmap_kextract(sva) == lapic_paddr) 1290 return; 1291#endif 1292 /* 1293 * Otherwise, do per-cache line flush. Use the sfence 1294 * instruction to insure that previous stores are 1295 * included in the write-back. The processor 1296 * propagates flush to other processors in the cache 1297 * coherence domain. 1298 */ 1299 sfence(); 1300 for (; sva < eva; sva += cpu_clflush_line_size) 1301 clflushopt(sva); 1302 sfence(); 1303 } else if ((cpu_feature & CPUID_CLFSH) != 0 && 1304 eva - sva < PMAP_CLFLUSH_THRESHOLD) { 1305#ifdef DEV_APIC 1306 if (pmap_kextract(sva) == lapic_paddr) 1307 return; 1308#endif 1309 /* 1310 * Writes are ordered by CLFLUSH on Intel CPUs. 1311 */ 1312 if (cpu_vendor_id != CPU_VENDOR_INTEL) 1313 mfence(); 1314 for (; sva < eva; sva += cpu_clflush_line_size) 1315 clflush(sva); 1316 if (cpu_vendor_id != CPU_VENDOR_INTEL) 1317 mfence(); 1318 } else { 1319 1320 /* 1321 * No targeted cache flush methods are supported by CPU, 1322 * or the supplied range is bigger than 2MB. 1323 * Globally invalidate cache. 1324 */ 1325 pmap_invalidate_cache(); 1326 } 1327} 1328 1329void 1330pmap_invalidate_cache_pages(vm_page_t *pages, int count) 1331{ 1332 int i; 1333 1334 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 1335 (cpu_feature & CPUID_CLFSH) == 0) { 1336 pmap_invalidate_cache(); 1337 } else { 1338 for (i = 0; i < count; i++) 1339 pmap_flush_page(pages[i]); 1340 } 1341} 1342 1343/* 1344 * Are we current address space or kernel? 
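 *
 * Mappings belonging to the current (or kernel) address space can be read
 * directly through the recursive page table mapping via vtopte(); any
 * other pmap must be reached through a temporary mapping window such as
 * PMAP1/PADDR1 or PMAP2/PADDR2 below.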
1345 */ 1346static __inline int 1347pmap_is_current(pmap_t pmap) 1348{ 1349 1350 return (pmap == kernel_pmap || pmap == 1351 vmspace_pmap(curthread->td_proc->p_vmspace)); 1352} 1353 1354/* 1355 * If the given pmap is not the current or kernel pmap, the returned pte must 1356 * be released by passing it to pmap_pte_release(). 1357 */ 1358pt_entry_t * 1359pmap_pte(pmap_t pmap, vm_offset_t va) 1360{ 1361 pd_entry_t newpf; 1362 pd_entry_t *pde; 1363 1364 pde = pmap_pde(pmap, va); 1365 if (*pde & PG_PS) 1366 return (pde); 1367 if (*pde != 0) { 1368 /* are we current address space or kernel? */ 1369 if (pmap_is_current(pmap)) 1370 return (vtopte(va)); 1371 mtx_lock(&PMAP2mutex); 1372 newpf = *pde & PG_FRAME; 1373 if ((*PMAP2 & PG_FRAME) != newpf) { 1374 *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; 1375 pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); 1376 } 1377 return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); 1378 } 1379 return (NULL); 1380} 1381 1382/* 1383 * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte 1384 * being NULL. 1385 */ 1386static __inline void 1387pmap_pte_release(pt_entry_t *pte) 1388{ 1389 1390 if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) 1391 mtx_unlock(&PMAP2mutex); 1392} 1393 1394/* 1395 * NB: The sequence of updating a page table followed by accesses to the 1396 * corresponding pages is subject to the situation described in the "AMD64 1397 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23, 1398 * "7.3.1 Special Coherency Considerations". Therefore, issuing the INVLPG 1399 * right after modifying the PTE bits is crucial. 1400 */ 1401static __inline void 1402invlcaddr(void *caddr) 1403{ 1404 1405 invlpg((u_int)caddr); 1406} 1407 1408/* 1409 * Super fast pmap_pte routine best used when scanning 1410 * the pv lists. This eliminates many coarse-grained 1411 * invltlb calls. Note that many of the pv list 1412 * scans are across different pmaps. It is very wasteful 1413 * to do an entire invltlb for checking a single mapping. 1414 * 1415 * If the given pmap is not the current pmap, pvh_global_lock 1416 * must be held and curthread pinned to a CPU. 1417 */ 1418static pt_entry_t * 1419pmap_pte_quick(pmap_t pmap, vm_offset_t va) 1420{ 1421 pd_entry_t newpf; 1422 pd_entry_t *pde; 1423 1424 pde = pmap_pde(pmap, va); 1425 if (*pde & PG_PS) 1426 return (pde); 1427 if (*pde != 0) { 1428 /* are we current address space or kernel? */ 1429 if (pmap_is_current(pmap)) 1430 return (vtopte(va)); 1431 rw_assert(&pvh_global_lock, RA_WLOCKED); 1432 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 1433 newpf = *pde & PG_FRAME; 1434 if ((*PMAP1 & PG_FRAME) != newpf) { 1435 *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; 1436#ifdef SMP 1437 PMAP1cpu = PCPU_GET(cpuid); 1438#endif 1439 invlcaddr(PADDR1); 1440 PMAP1changed++; 1441 } else 1442#ifdef SMP 1443 if (PMAP1cpu != PCPU_GET(cpuid)) { 1444 PMAP1cpu = PCPU_GET(cpuid); 1445 invlcaddr(PADDR1); 1446 PMAP1changedcpu++; 1447 } else 1448#endif 1449 PMAP1unchanged++; 1450 return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); 1451 } 1452 return (0); 1453} 1454 1455/* 1456 * Routine: pmap_extract 1457 * Function: 1458 * Extract the physical page address associated 1459 * with the given map/virtual_address pair. 
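 *
 * Illustrative use (hypothetical caller; pmap_extract() takes the pmap
 * lock itself):
 *
 *	vm_paddr_t pa;
 *
 *	pa = pmap_extract(pmap, va);
 *	if (pa == 0)
 *		(no valid mapping was found at "va"; note that a mapping of
 *		 physical page 0 would return the same value)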
1460 */ 1461vm_paddr_t 1462pmap_extract(pmap_t pmap, vm_offset_t va) 1463{ 1464 vm_paddr_t rtval; 1465 pt_entry_t *pte; 1466 pd_entry_t pde; 1467 1468 rtval = 0; 1469 PMAP_LOCK(pmap); 1470 pde = pmap->pm_pdir[va >> PDRSHIFT]; 1471 if (pde != 0) { 1472 if ((pde & PG_PS) != 0) 1473 rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); 1474 else { 1475 pte = pmap_pte(pmap, va); 1476 rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); 1477 pmap_pte_release(pte); 1478 } 1479 } 1480 PMAP_UNLOCK(pmap); 1481 return (rtval); 1482} 1483 1484/* 1485 * Routine: pmap_extract_and_hold 1486 * Function: 1487 * Atomically extract and hold the physical page 1488 * with the given pmap and virtual address pair 1489 * if that mapping permits the given protection. 1490 */ 1491vm_page_t 1492pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1493{ 1494 pd_entry_t pde; 1495 pt_entry_t pte, *ptep; 1496 vm_page_t m; 1497 vm_paddr_t pa; 1498 1499 pa = 0; 1500 m = NULL; 1501 PMAP_LOCK(pmap); 1502retry: 1503 pde = *pmap_pde(pmap, va); 1504 if (pde != 0) { 1505 if (pde & PG_PS) { 1506 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { 1507 if (vm_page_pa_tryrelock(pmap, (pde & 1508 PG_PS_FRAME) | (va & PDRMASK), &pa)) 1509 goto retry; 1510 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | 1511 (va & PDRMASK)); 1512 vm_page_hold(m); 1513 } 1514 } else { 1515 ptep = pmap_pte(pmap, va); 1516 pte = *ptep; 1517 pmap_pte_release(ptep); 1518 if (pte != 0 && 1519 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { 1520 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, 1521 &pa)) 1522 goto retry; 1523 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 1524 vm_page_hold(m); 1525 } 1526 } 1527 } 1528 PA_UNLOCK_COND(pa); 1529 PMAP_UNLOCK(pmap); 1530 return (m); 1531} 1532 1533/*************************************************** 1534 * Low level mapping routines..... 1535 ***************************************************/ 1536 1537/* 1538 * Add a wired page to the kva. 1539 * Note: not SMP coherent. 1540 * 1541 * This function may be used before pmap_bootstrap() is called. 1542 */ 1543PMAP_INLINE void 1544pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1545{ 1546 pt_entry_t *pte; 1547 1548 pte = vtopte(va); 1549 pte_store(pte, pa | PG_RW | PG_V | pgeflag); 1550} 1551 1552static __inline void 1553pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 1554{ 1555 pt_entry_t *pte; 1556 1557 pte = vtopte(va); 1558 pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); 1559} 1560 1561/* 1562 * Remove a page from the kernel pagetables. 1563 * Note: not SMP coherent. 1564 * 1565 * This function may be used before pmap_bootstrap() is called. 1566 */ 1567PMAP_INLINE void 1568pmap_kremove(vm_offset_t va) 1569{ 1570 pt_entry_t *pte; 1571 1572 pte = vtopte(va); 1573 pte_clear(pte); 1574} 1575 1576/* 1577 * Used to map a range of physical addresses into kernel 1578 * virtual address space. 1579 * 1580 * The value passed in '*virt' is a suggested virtual address for 1581 * the mapping. Architectures which can support a direct-mapped 1582 * physical to virtual region can return the appropriate address 1583 * within that region, leaving '*virt' unchanged. Other 1584 * architectures should map the pages starting at '*virt' and 1585 * update '*virt' with the first usable address after the mapped 1586 * region. 
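 *
 * Illustrative (hypothetical) call during early kernel setup:
 *
 *	vm_offset_t va, sva;
 *
 *	va = virtual_avail;
 *	sva = pmap_map(&va, start_pa, end_pa, VM_PROT_READ | VM_PROT_WRITE);
 *
 * On i386 the range is mapped starting at the old "va", possibly realigned
 * so that superpage mappings can be used, "va" is advanced past the new
 * mapping, and the returned "sva" is its first virtual address.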
1587 */ 1588vm_offset_t 1589pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1590{ 1591 vm_offset_t va, sva; 1592 vm_paddr_t superpage_offset; 1593 pd_entry_t newpde; 1594 1595 va = *virt; 1596 /* 1597 * Does the physical address range's size and alignment permit at 1598 * least one superpage mapping to be created? 1599 */ 1600 superpage_offset = start & PDRMASK; 1601 if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) { 1602 /* 1603 * Increase the starting virtual address so that its alignment 1604 * does not preclude the use of superpage mappings. 1605 */ 1606 if ((va & PDRMASK) < superpage_offset) 1607 va = (va & ~PDRMASK) + superpage_offset; 1608 else if ((va & PDRMASK) > superpage_offset) 1609 va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset; 1610 } 1611 sva = va; 1612 while (start < end) { 1613 if ((start & PDRMASK) == 0 && end - start >= NBPDR && 1614 pseflag) { 1615 KASSERT((va & PDRMASK) == 0, 1616 ("pmap_map: misaligned va %#x", va)); 1617 newpde = start | PG_PS | pgeflag | PG_RW | PG_V; 1618 pmap_kenter_pde(va, newpde); 1619 va += NBPDR; 1620 start += NBPDR; 1621 } else { 1622 pmap_kenter(va, start); 1623 va += PAGE_SIZE; 1624 start += PAGE_SIZE; 1625 } 1626 } 1627 pmap_invalidate_range(kernel_pmap, sva, va); 1628 *virt = va; 1629 return (sva); 1630} 1631 1632 1633/* 1634 * Add a list of wired pages to the kva 1635 * this routine is only used for temporary 1636 * kernel mappings that do not need to have 1637 * page modification or references recorded. 1638 * Note that old mappings are simply written 1639 * over. The page *must* be wired. 1640 * Note: SMP coherent. Uses a ranged shootdown IPI. 1641 */ 1642void 1643pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1644{ 1645 pt_entry_t *endpte, oldpte, pa, *pte; 1646 vm_page_t m; 1647 1648 oldpte = 0; 1649 pte = vtopte(sva); 1650 endpte = pte + count; 1651 while (pte < endpte) { 1652 m = *ma++; 1653 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); 1654 if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { 1655 oldpte |= *pte; 1656 pte_store(pte, pa | pgeflag | PG_RW | PG_V); 1657 } 1658 pte++; 1659 } 1660 if (__predict_false((oldpte & PG_V) != 0)) 1661 pmap_invalidate_range(kernel_pmap, sva, sva + count * 1662 PAGE_SIZE); 1663} 1664 1665/* 1666 * This routine tears out page mappings from the 1667 * kernel -- it is meant only for temporary mappings. 1668 * Note: SMP coherent. Uses a ranged shootdown IPI. 1669 */ 1670void 1671pmap_qremove(vm_offset_t sva, int count) 1672{ 1673 vm_offset_t va; 1674 1675 va = sva; 1676 while (count-- > 0) { 1677 pmap_kremove(va); 1678 va += PAGE_SIZE; 1679 } 1680 pmap_invalidate_range(kernel_pmap, sva, va); 1681} 1682 1683/*************************************************** 1684 * Page table page management routines..... 1685 ***************************************************/ 1686static __inline void 1687pmap_free_zero_pages(struct spglist *free) 1688{ 1689 vm_page_t m; 1690 1691 while ((m = SLIST_FIRST(free)) != NULL) { 1692 SLIST_REMOVE_HEAD(free, plinks.s.ss); 1693 /* Preserve the page's PG_ZERO setting. */ 1694 vm_page_free_toq(m); 1695 } 1696} 1697 1698/* 1699 * Schedule the specified unused page table page to be freed. Specifically, 1700 * add the page to the specified list of pages that will be released to the 1701 * physical memory manager after the TLB has been updated. 
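 *
 * A sketch of the typical calling pattern, based on the callers in this
 * file:
 *
 *	struct spglist free;
 *
 *	SLIST_INIT(&free);
 *	(remove mappings; page table pages that become empty are queued on
 *	 "free" via pmap_add_delayed_free_list())
 *	pmap_invalidate_page(pmap, va);	(or a ranged or global invalidation)
 *	pmap_free_zero_pages(&free);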
1702 */ 1703static __inline void 1704pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 1705 boolean_t set_PG_ZERO) 1706{ 1707 1708 if (set_PG_ZERO) 1709 m->flags |= PG_ZERO; 1710 else 1711 m->flags &= ~PG_ZERO; 1712 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 1713} 1714 1715/* 1716 * Inserts the specified page table page into the specified pmap's collection 1717 * of idle page table pages. Each of a pmap's page table pages is responsible 1718 * for mapping a distinct range of virtual addresses. The pmap's collection is 1719 * ordered by this virtual address range. 1720 */ 1721static __inline int 1722pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) 1723{ 1724 1725 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1726 return (vm_radix_insert(&pmap->pm_root, mpte)); 1727} 1728 1729/* 1730 * Looks for a page table page mapping the specified virtual address in the 1731 * specified pmap's collection of idle page table pages. Returns NULL if there 1732 * is no page table page corresponding to the specified virtual address. 1733 */ 1734static __inline vm_page_t 1735pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) 1736{ 1737 1738 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1739 return (vm_radix_lookup(&pmap->pm_root, va >> PDRSHIFT)); 1740} 1741 1742/* 1743 * Removes the specified page table page from the specified pmap's collection 1744 * of idle page table pages. The specified page table page must be a member of 1745 * the pmap's collection. 1746 */ 1747static __inline void 1748pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) 1749{ 1750 1751 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1752 vm_radix_remove(&pmap->pm_root, mpte->pindex); 1753} 1754 1755/* 1756 * Decrements a page table page's wire count, which is used to record the 1757 * number of valid page table entries within the page. If the wire count 1758 * drops to zero, then the page table page is unmapped. Returns TRUE if the 1759 * page table page was unmapped and FALSE otherwise. 1760 */ 1761static inline boolean_t 1762pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) 1763{ 1764 1765 --m->wire_count; 1766 if (m->wire_count == 0) { 1767 _pmap_unwire_ptp(pmap, m, free); 1768 return (TRUE); 1769 } else 1770 return (FALSE); 1771} 1772 1773static void 1774_pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) 1775{ 1776 vm_offset_t pteva; 1777 1778 /* 1779 * unmap the page table page 1780 */ 1781 pmap->pm_pdir[m->pindex] = 0; 1782 --pmap->pm_stats.resident_count; 1783 1784 /* 1785 * This is a release store so that the ordinary store unmapping 1786 * the page table page is globally performed before TLB shoot- 1787 * down is begun. 1788 */ 1789 atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1); 1790 1791 /* 1792 * Do an invltlb to make the invalidated mapping 1793 * take effect immediately. 1794 */ 1795 pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); 1796 pmap_invalidate_page(pmap, pteva); 1797 1798 /* 1799 * Put page on a list so that it is released after 1800 * *ALL* TLB shootdown is done 1801 */ 1802 pmap_add_delayed_free_list(m, free, TRUE); 1803} 1804 1805/* 1806 * After removing a page table entry, this routine is used to 1807 * conditionally free the page, and manage the hold/wire counts. 
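 *
 * Kernel addresses (va >= VM_MAXUSER_ADDRESS) are skipped because kernel
 * page table pages are never freed here; for user addresses the page table
 * page's wire count is dropped by pmap_unwire_ptp() and, if it reaches
 * zero, the page is queued on "free".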
1808 */ 1809static int 1810pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free) 1811{ 1812 pd_entry_t ptepde; 1813 vm_page_t mpte; 1814 1815 if (va >= VM_MAXUSER_ADDRESS) 1816 return (0); 1817 ptepde = *pmap_pde(pmap, va); 1818 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 1819 return (pmap_unwire_ptp(pmap, mpte, free)); 1820} 1821 1822/* 1823 * Initialize the pmap for the swapper process. 1824 */ 1825void 1826pmap_pinit0(pmap_t pmap) 1827{ 1828 1829 PMAP_LOCK_INIT(pmap); 1830 /* 1831 * Since the page table directory is shared with the kernel pmap, 1832 * which is already included in the list "allpmaps", this pmap does 1833 * not need to be inserted into that list. 1834 */ 1835 pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); 1836#if defined(PAE) || defined(PAE_TABLES) 1837 pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); 1838#endif 1839 pmap->pm_root.rt_root = 0; 1840 CPU_ZERO(&pmap->pm_active); 1841 PCPU_SET(curpmap, pmap); 1842 TAILQ_INIT(&pmap->pm_pvchunk); 1843 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1844} 1845 1846/* 1847 * Initialize a preallocated and zeroed pmap structure, 1848 * such as one in a vmspace structure. 1849 */ 1850int 1851pmap_pinit(pmap_t pmap) 1852{ 1853 vm_page_t m, ptdpg[NPGPTD]; 1854 vm_paddr_t pa; 1855 int i; 1856 1857 /* 1858 * No need to allocate page table space yet but we do need a valid 1859 * page directory table. 1860 */ 1861 if (pmap->pm_pdir == NULL) { 1862 pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD); 1863 if (pmap->pm_pdir == NULL) 1864 return (0); 1865#if defined(PAE) || defined(PAE_TABLES) 1866 pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); 1867 KASSERT(((vm_offset_t)pmap->pm_pdpt & 1868 ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, 1869 ("pmap_pinit: pdpt misaligned")); 1870 KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), 1871 ("pmap_pinit: pdpt above 4g")); 1872#endif 1873 pmap->pm_root.rt_root = 0; 1874 } 1875 KASSERT(vm_radix_is_empty(&pmap->pm_root), 1876 ("pmap_pinit: pmap has reserved page table page(s)")); 1877 1878 /* 1879 * allocate the page directory page(s) 1880 */ 1881 for (i = 0; i < NPGPTD;) { 1882 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 1883 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 1884 if (m == NULL) 1885 VM_WAIT; 1886 else { 1887 ptdpg[i++] = m; 1888 } 1889 } 1890 1891 pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); 1892 1893 for (i = 0; i < NPGPTD; i++) 1894 if ((ptdpg[i]->flags & PG_ZERO) == 0) 1895 pagezero(pmap->pm_pdir + (i * NPDEPG)); 1896 1897 mtx_lock_spin(&allpmaps_lock); 1898 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 1899 /* Copy the kernel page table directory entries. */ 1900 bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); 1901 mtx_unlock_spin(&allpmaps_lock); 1902 1903 /* install self-referential address mapping entry(s) */ 1904 for (i = 0; i < NPGPTD; i++) { 1905 pa = VM_PAGE_TO_PHYS(ptdpg[i]); 1906 pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; 1907#if defined(PAE) || defined(PAE_TABLES) 1908 pmap->pm_pdpt[i] = pa | PG_V; 1909#endif 1910 } 1911 1912 CPU_ZERO(&pmap->pm_active); 1913 TAILQ_INIT(&pmap->pm_pvchunk); 1914 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1915 1916 return (1); 1917} 1918 1919/* 1920 * this routine is called if the page table page is not 1921 * mapped correctly. 1922 */ 1923static vm_page_t 1924_pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags) 1925{ 1926 vm_paddr_t ptepa; 1927 vm_page_t m; 1928 1929 /* 1930 * Allocate a page table page. 
1931 */ 1932 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 1933 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 1934 if ((flags & PMAP_ENTER_NOSLEEP) == 0) { 1935 PMAP_UNLOCK(pmap); 1936 rw_wunlock(&pvh_global_lock); 1937 VM_WAIT; 1938 rw_wlock(&pvh_global_lock); 1939 PMAP_LOCK(pmap); 1940 } 1941 1942 /* 1943 * Indicate the need to retry. While waiting, the page table 1944 * page may have been allocated. 1945 */ 1946 return (NULL); 1947 } 1948 if ((m->flags & PG_ZERO) == 0) 1949 pmap_zero_page(m); 1950 1951 /* 1952 * Map the pagetable page into the process address space, if 1953 * it isn't already there. 1954 */ 1955 1956 pmap->pm_stats.resident_count++; 1957 1958 ptepa = VM_PAGE_TO_PHYS(m); 1959 pmap->pm_pdir[ptepindex] = 1960 (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); 1961 1962 return (m); 1963} 1964 1965static vm_page_t 1966pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) 1967{ 1968 u_int ptepindex; 1969 pd_entry_t ptepa; 1970 vm_page_t m; 1971 1972 /* 1973 * Calculate pagetable page index 1974 */ 1975 ptepindex = va >> PDRSHIFT; 1976retry: 1977 /* 1978 * Get the page directory entry 1979 */ 1980 ptepa = pmap->pm_pdir[ptepindex]; 1981 1982 /* 1983 * This supports switching from a 4MB page to a 1984 * normal 4K page. 1985 */ 1986 if (ptepa & PG_PS) { 1987 (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); 1988 ptepa = pmap->pm_pdir[ptepindex]; 1989 } 1990 1991 /* 1992 * If the page table page is mapped, we just increment the 1993 * hold count, and activate it. 1994 */ 1995 if (ptepa) { 1996 m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); 1997 m->wire_count++; 1998 } else { 1999 /* 2000 * Here if the pte page isn't mapped, or if it has 2001 * been deallocated. 2002 */ 2003 m = _pmap_allocpte(pmap, ptepindex, flags); 2004 if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) 2005 goto retry; 2006 } 2007 return (m); 2008} 2009 2010 2011/*************************************************** 2012* Pmap allocation/deallocation routines. 2013 ***************************************************/ 2014 2015/* 2016 * Release any resources held by the given physical map. 2017 * Called when a pmap initialized by pmap_pinit is being released. 2018 * Should only be called if the map contains no valid mappings. 
2019 */ 2020void 2021pmap_release(pmap_t pmap) 2022{ 2023 vm_page_t m, ptdpg[NPGPTD]; 2024 int i; 2025 2026 KASSERT(pmap->pm_stats.resident_count == 0, 2027 ("pmap_release: pmap resident count %ld != 0", 2028 pmap->pm_stats.resident_count)); 2029 KASSERT(vm_radix_is_empty(&pmap->pm_root), 2030 ("pmap_release: pmap has reserved page table page(s)")); 2031 KASSERT(CPU_EMPTY(&pmap->pm_active), 2032 ("releasing active pmap %p", pmap)); 2033 2034 mtx_lock_spin(&allpmaps_lock); 2035 LIST_REMOVE(pmap, pm_list); 2036 mtx_unlock_spin(&allpmaps_lock); 2037 2038 for (i = 0; i < NPGPTD; i++) 2039 ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] & 2040 PG_FRAME); 2041 2042 bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * 2043 sizeof(*pmap->pm_pdir)); 2044 2045 pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); 2046 2047 for (i = 0; i < NPGPTD; i++) { 2048 m = ptdpg[i]; 2049#if defined(PAE) || defined(PAE_TABLES) 2050 KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), 2051 ("pmap_release: got wrong ptd page")); 2052#endif 2053 m->wire_count--; 2054 atomic_subtract_int(&vm_cnt.v_wire_count, 1); 2055 vm_page_free_zero(m); 2056 } 2057} 2058 2059static int 2060kvm_size(SYSCTL_HANDLER_ARGS) 2061{ 2062 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; 2063 2064 return (sysctl_handle_long(oidp, &ksize, 0, req)); 2065} 2066SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 2067 0, 0, kvm_size, "IU", "Size of KVM"); 2068 2069static int 2070kvm_free(SYSCTL_HANDLER_ARGS) 2071{ 2072 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 2073 2074 return (sysctl_handle_long(oidp, &kfree, 0, req)); 2075} 2076SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 2077 0, 0, kvm_free, "IU", "Amount of KVM free"); 2078 2079/* 2080 * grow the number of kernel page table entries, if needed 2081 */ 2082void 2083pmap_growkernel(vm_offset_t addr) 2084{ 2085 vm_paddr_t ptppaddr; 2086 vm_page_t nkpg; 2087 pd_entry_t newpdir; 2088 2089 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2090 addr = roundup2(addr, NBPDR); 2091 if (addr - 1 >= kernel_map->max_offset) 2092 addr = kernel_map->max_offset; 2093 while (kernel_vm_end < addr) { 2094 if (pdir_pde(PTD, kernel_vm_end)) { 2095 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2096 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2097 kernel_vm_end = kernel_map->max_offset; 2098 break; 2099 } 2100 continue; 2101 } 2102 2103 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, 2104 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 2105 VM_ALLOC_ZERO); 2106 if (nkpg == NULL) 2107 panic("pmap_growkernel: no memory to grow kernel"); 2108 2109 nkpt++; 2110 2111 if ((nkpg->flags & PG_ZERO) == 0) 2112 pmap_zero_page(nkpg); 2113 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 2114 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); 2115 pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir; 2116 2117 pmap_kenter_pde(kernel_vm_end, newpdir); 2118 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2119 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2120 kernel_vm_end = kernel_map->max_offset; 2121 break; 2122 } 2123 } 2124} 2125 2126 2127/*************************************************** 2128 * page management routines. 
2129 ***************************************************/ 2130 2131CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 2132CTASSERT(_NPCM == 11); 2133CTASSERT(_NPCPV == 336); 2134 2135static __inline struct pv_chunk * 2136pv_to_chunk(pv_entry_t pv) 2137{ 2138 2139 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 2140} 2141 2142#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2143 2144#define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ 2145#define PC_FREE10 0x0000fffful /* Free values for index 10 */ 2146 2147static const uint32_t pc_freemask[_NPCM] = { 2148 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2149 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2150 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2151 PC_FREE0_9, PC_FREE10 2152}; 2153 2154SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2155 "Current number of pv entries"); 2156 2157#ifdef PV_STATS 2158static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2159 2160SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2161 "Current number of pv entry chunks"); 2162SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2163 "Current number of pv entry chunks allocated"); 2164SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2165 "Current number of pv entry chunks frees"); 2166SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2167 "Number of times tried to get a chunk page but failed."); 2168 2169static long pv_entry_frees, pv_entry_allocs; 2170static int pv_entry_spare; 2171 2172SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2173 "Current number of pv entry frees"); 2174SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2175 "Current number of pv entry allocs"); 2176SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2177 "Current number of spare pv entries"); 2178#endif 2179 2180/* 2181 * We are in a serious low memory condition. Resort to 2182 * drastic measures to free some pages so we can allocate 2183 * another pv entry chunk. 2184 */ 2185static vm_page_t 2186pmap_pv_reclaim(pmap_t locked_pmap) 2187{ 2188 struct pch newtail; 2189 struct pv_chunk *pc; 2190 struct md_page *pvh; 2191 pd_entry_t *pde; 2192 pmap_t pmap; 2193 pt_entry_t *pte, tpte; 2194 pv_entry_t pv; 2195 vm_offset_t va; 2196 vm_page_t m, m_pc; 2197 struct spglist free; 2198 uint32_t inuse; 2199 int bit, field, freed; 2200 2201 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2202 pmap = NULL; 2203 m_pc = NULL; 2204 SLIST_INIT(&free); 2205 TAILQ_INIT(&newtail); 2206 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || 2207 SLIST_EMPTY(&free))) { 2208 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2209 if (pmap != pc->pc_pmap) { 2210 if (pmap != NULL) { 2211 pmap_invalidate_all(pmap); 2212 if (pmap != locked_pmap) 2213 PMAP_UNLOCK(pmap); 2214 } 2215 pmap = pc->pc_pmap; 2216 /* Avoid deadlock and lock recursion. */ 2217 if (pmap > locked_pmap) 2218 PMAP_LOCK(pmap); 2219 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { 2220 pmap = NULL; 2221 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2222 continue; 2223 } 2224 } 2225 2226 /* 2227 * Destroy every non-wired, 4 KB page mapping in the chunk. 
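	 * Each chunk tracks its _NPCPV (336) pv entries with the pc_map[]
	 * bitmap of _NPCM (11) 32-bit words, where a set bit denotes a
	 * free slot (10 * 32 fully-used bits + 16 bits in the last word =
	 * 336 entries).  Consequently, the expression
	 * ~pc->pc_map[field] & pc_freemask[field] below enumerates only
	 * the entries that are still in use.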
2228 */ 2229 freed = 0; 2230 for (field = 0; field < _NPCM; field++) { 2231 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2232 inuse != 0; inuse &= ~(1UL << bit)) { 2233 bit = bsfl(inuse); 2234 pv = &pc->pc_pventry[field * 32 + bit]; 2235 va = pv->pv_va; 2236 pde = pmap_pde(pmap, va); 2237 if ((*pde & PG_PS) != 0) 2238 continue; 2239 pte = pmap_pte(pmap, va); 2240 tpte = *pte; 2241 if ((tpte & PG_W) == 0) 2242 tpte = pte_load_clear(pte); 2243 pmap_pte_release(pte); 2244 if ((tpte & PG_W) != 0) 2245 continue; 2246 KASSERT(tpte != 0, 2247 ("pmap_pv_reclaim: pmap %p va %x zero pte", 2248 pmap, va)); 2249 if ((tpte & PG_G) != 0) 2250 pmap_invalidate_page(pmap, va); 2251 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 2252 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2253 vm_page_dirty(m); 2254 if ((tpte & PG_A) != 0) 2255 vm_page_aflag_set(m, PGA_REFERENCED); 2256 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2257 if (TAILQ_EMPTY(&m->md.pv_list) && 2258 (m->flags & PG_FICTITIOUS) == 0) { 2259 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2260 if (TAILQ_EMPTY(&pvh->pv_list)) { 2261 vm_page_aflag_clear(m, 2262 PGA_WRITEABLE); 2263 } 2264 } 2265 pc->pc_map[field] |= 1UL << bit; 2266 pmap_unuse_pt(pmap, va, &free); 2267 freed++; 2268 } 2269 } 2270 if (freed == 0) { 2271 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2272 continue; 2273 } 2274 /* Every freed mapping is for a 4 KB page. */ 2275 pmap->pm_stats.resident_count -= freed; 2276 PV_STAT(pv_entry_frees += freed); 2277 PV_STAT(pv_entry_spare += freed); 2278 pv_entry_count -= freed; 2279 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2280 for (field = 0; field < _NPCM; field++) 2281 if (pc->pc_map[field] != pc_freemask[field]) { 2282 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2283 pc_list); 2284 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2285 2286 /* 2287 * One freed pv entry in locked_pmap is 2288 * sufficient. 2289 */ 2290 if (pmap == locked_pmap) 2291 goto out; 2292 break; 2293 } 2294 if (field == _NPCM) { 2295 PV_STAT(pv_entry_spare -= _NPCPV); 2296 PV_STAT(pc_chunk_count--); 2297 PV_STAT(pc_chunk_frees++); 2298 /* Entire chunk is free; return it. */ 2299 m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2300 pmap_qremove((vm_offset_t)pc, 1); 2301 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 2302 break; 2303 } 2304 } 2305out: 2306 TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); 2307 if (pmap != NULL) { 2308 pmap_invalidate_all(pmap); 2309 if (pmap != locked_pmap) 2310 PMAP_UNLOCK(pmap); 2311 } 2312 if (m_pc == NULL && pv_vafree != 0 && !SLIST_EMPTY(&free)) { 2313 m_pc = SLIST_FIRST(&free); 2314 SLIST_REMOVE_HEAD(&free, plinks.s.ss); 2315 /* Recycle a freed page table page. */ 2316 m_pc->wire_count = 1; 2317 atomic_add_int(&vm_cnt.v_wire_count, 1); 2318 } 2319 pmap_free_zero_pages(&free); 2320 return (m_pc); 2321} 2322 2323/* 2324 * free the pv_entry back to the free list 2325 */ 2326static void 2327free_pv_entry(pmap_t pmap, pv_entry_t pv) 2328{ 2329 struct pv_chunk *pc; 2330 int idx, field, bit; 2331 2332 rw_assert(&pvh_global_lock, RA_WLOCKED); 2333 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2334 PV_STAT(pv_entry_frees++); 2335 PV_STAT(pv_entry_spare++); 2336 pv_entry_count--; 2337 pc = pv_to_chunk(pv); 2338 idx = pv - &pc->pc_pventry[0]; 2339 field = idx / 32; 2340 bit = idx % 32; 2341 pc->pc_map[field] |= 1ul << bit; 2342 for (idx = 0; idx < _NPCM; idx++) 2343 if (pc->pc_map[idx] != pc_freemask[idx]) { 2344 /* 2345 * 98% of the time, pc is already at the head of the 2346 * list. If it isn't already, move it to the head.
2347 */ 2348 if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != 2349 pc)) { 2350 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2351 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2352 pc_list); 2353 } 2354 return; 2355 } 2356 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2357 free_pv_chunk(pc); 2358} 2359 2360static void 2361free_pv_chunk(struct pv_chunk *pc) 2362{ 2363 vm_page_t m; 2364 2365 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2366 PV_STAT(pv_entry_spare -= _NPCPV); 2367 PV_STAT(pc_chunk_count--); 2368 PV_STAT(pc_chunk_frees++); 2369 /* entire chunk is free, return it */ 2370 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2371 pmap_qremove((vm_offset_t)pc, 1); 2372 vm_page_unwire(m, PQ_NONE); 2373 vm_page_free(m); 2374 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 2375} 2376 2377/* 2378 * get a new pv_entry, allocating a block from the system 2379 * when needed. 2380 */ 2381static pv_entry_t 2382get_pv_entry(pmap_t pmap, boolean_t try) 2383{ 2384 static const struct timeval printinterval = { 60, 0 }; 2385 static struct timeval lastprint; 2386 int bit, field; 2387 pv_entry_t pv; 2388 struct pv_chunk *pc; 2389 vm_page_t m; 2390 2391 rw_assert(&pvh_global_lock, RA_WLOCKED); 2392 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2393 PV_STAT(pv_entry_allocs++); 2394 pv_entry_count++; 2395 if (pv_entry_count > pv_entry_high_water) 2396 if (ratecheck(&lastprint, &printinterval)) 2397 printf("Approaching the limit on PV entries, consider " 2398 "increasing either the vm.pmap.shpgperproc or the " 2399 "vm.pmap.pv_entry_max tunable.\n"); 2400retry: 2401 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2402 if (pc != NULL) { 2403 for (field = 0; field < _NPCM; field++) { 2404 if (pc->pc_map[field]) { 2405 bit = bsfl(pc->pc_map[field]); 2406 break; 2407 } 2408 } 2409 if (field < _NPCM) { 2410 pv = &pc->pc_pventry[field * 32 + bit]; 2411 pc->pc_map[field] &= ~(1ul << bit); 2412 /* If this was the last item, move it to tail */ 2413 for (field = 0; field < _NPCM; field++) 2414 if (pc->pc_map[field] != 0) { 2415 PV_STAT(pv_entry_spare--); 2416 return (pv); /* not full, return */ 2417 } 2418 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2419 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2420 PV_STAT(pv_entry_spare--); 2421 return (pv); 2422 } 2423 } 2424 /* 2425 * Access to the ptelist "pv_vafree" is synchronized by the pvh 2426 * global lock. If "pv_vafree" is currently non-empty, it will 2427 * remain non-empty until pmap_ptelist_alloc() completes. 
2428 */ 2429 if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2430 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 2431 if (try) { 2432 pv_entry_count--; 2433 PV_STAT(pc_chunk_tryfail++); 2434 return (NULL); 2435 } 2436 m = pmap_pv_reclaim(pmap); 2437 if (m == NULL) 2438 goto retry; 2439 } 2440 PV_STAT(pc_chunk_count++); 2441 PV_STAT(pc_chunk_allocs++); 2442 pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); 2443 pmap_qenter((vm_offset_t)pc, &m, 1); 2444 pc->pc_pmap = pmap; 2445 pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ 2446 for (field = 1; field < _NPCM; field++) 2447 pc->pc_map[field] = pc_freemask[field]; 2448 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 2449 pv = &pc->pc_pventry[0]; 2450 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2451 PV_STAT(pv_entry_spare += _NPCPV - 1); 2452 return (pv); 2453} 2454 2455static __inline pv_entry_t 2456pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2457{ 2458 pv_entry_t pv; 2459 2460 rw_assert(&pvh_global_lock, RA_WLOCKED); 2461 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 2462 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 2463 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 2464 break; 2465 } 2466 } 2467 return (pv); 2468} 2469 2470static void 2471pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2472{ 2473 struct md_page *pvh; 2474 pv_entry_t pv; 2475 vm_offset_t va_last; 2476 vm_page_t m; 2477 2478 rw_assert(&pvh_global_lock, RA_WLOCKED); 2479 KASSERT((pa & PDRMASK) == 0, 2480 ("pmap_pv_demote_pde: pa is not 4mpage aligned")); 2481 2482 /* 2483 * Transfer the 4mpage's pv entry for this mapping to the first 2484 * page's pv list. 2485 */ 2486 pvh = pa_to_pvh(pa); 2487 va = trunc_4mpage(va); 2488 pv = pmap_pvh_remove(pvh, pmap, va); 2489 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 2490 m = PHYS_TO_VM_PAGE(pa); 2491 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2492 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 2493 va_last = va + NBPDR - PAGE_SIZE; 2494 do { 2495 m++; 2496 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2497 ("pmap_pv_demote_pde: page %p is not managed", m)); 2498 va += PAGE_SIZE; 2499 pmap_insert_entry(pmap, va, m); 2500 } while (va < va_last); 2501} 2502 2503static void 2504pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2505{ 2506 struct md_page *pvh; 2507 pv_entry_t pv; 2508 vm_offset_t va_last; 2509 vm_page_t m; 2510 2511 rw_assert(&pvh_global_lock, RA_WLOCKED); 2512 KASSERT((pa & PDRMASK) == 0, 2513 ("pmap_pv_promote_pde: pa is not 4mpage aligned")); 2514 2515 /* 2516 * Transfer the first page's pv entry for this mapping to the 2517 * 4mpage's pv list. Aside from avoiding the cost of a call 2518 * to get_pv_entry(), a transfer avoids the possibility that 2519 * get_pv_entry() calls pmap_collect() and that pmap_collect() 2520 * removes one of the mappings that is being promoted. 2521 */ 2522 m = PHYS_TO_VM_PAGE(pa); 2523 va = trunc_4mpage(va); 2524 pv = pmap_pvh_remove(&m->md, pmap, va); 2525 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 2526 pvh = pa_to_pvh(pa); 2527 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2528 /* Free the remaining NPTEPG - 1 pv entries. 
*/ 2529 va_last = va + NBPDR - PAGE_SIZE; 2530 do { 2531 m++; 2532 va += PAGE_SIZE; 2533 pmap_pvh_free(&m->md, pmap, va); 2534 } while (va < va_last); 2535} 2536 2537static void 2538pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2539{ 2540 pv_entry_t pv; 2541 2542 pv = pmap_pvh_remove(pvh, pmap, va); 2543 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 2544 free_pv_entry(pmap, pv); 2545} 2546 2547static void 2548pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 2549{ 2550 struct md_page *pvh; 2551 2552 rw_assert(&pvh_global_lock, RA_WLOCKED); 2553 pmap_pvh_free(&m->md, pmap, va); 2554 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { 2555 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2556 if (TAILQ_EMPTY(&pvh->pv_list)) 2557 vm_page_aflag_clear(m, PGA_WRITEABLE); 2558 } 2559} 2560 2561/* 2562 * Create a pv entry for page at pa for 2563 * (pmap, va). 2564 */ 2565static void 2566pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2567{ 2568 pv_entry_t pv; 2569 2570 rw_assert(&pvh_global_lock, RA_WLOCKED); 2571 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2572 pv = get_pv_entry(pmap, FALSE); 2573 pv->pv_va = va; 2574 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2575} 2576 2577/* 2578 * Conditionally create a pv entry. 2579 */ 2580static boolean_t 2581pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2582{ 2583 pv_entry_t pv; 2584 2585 rw_assert(&pvh_global_lock, RA_WLOCKED); 2586 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2587 if (pv_entry_count < pv_entry_high_water && 2588 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2589 pv->pv_va = va; 2590 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2591 return (TRUE); 2592 } else 2593 return (FALSE); 2594} 2595 2596/* 2597 * Create the pv entries for each of the pages within a superpage. 2598 */ 2599static boolean_t 2600pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2601{ 2602 struct md_page *pvh; 2603 pv_entry_t pv; 2604 2605 rw_assert(&pvh_global_lock, RA_WLOCKED); 2606 if (pv_entry_count < pv_entry_high_water && 2607 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2608 pv->pv_va = va; 2609 pvh = pa_to_pvh(pa); 2610 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2611 return (TRUE); 2612 } else 2613 return (FALSE); 2614} 2615 2616/* 2617 * Fills a page table page with mappings to consecutive physical pages. 2618 */ 2619static void 2620pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 2621{ 2622 pt_entry_t *pte; 2623 2624 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 2625 *pte = newpte; 2626 newpte += PAGE_SIZE; 2627 } 2628} 2629 2630/* 2631 * Tries to demote a 2- or 4MB page mapping. If demotion fails, the 2632 * 2- or 4MB page mapping is invalidated. 
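 * A successful demotion replaces the single PG_PS page directory entry
 * with a reference to a page table page whose NPTEPG 4KB entries map
 * the same physical range with equivalent attributes; pmap_fill_ptp()
 * above is used to populate that page table page.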
2633 */ 2634static boolean_t 2635pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2636{ 2637 pd_entry_t newpde, oldpde; 2638 pt_entry_t *firstpte, newpte; 2639 vm_paddr_t mptepa; 2640 vm_page_t mpte; 2641 struct spglist free; 2642 vm_offset_t sva; 2643 2644 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2645 oldpde = *pde; 2646 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 2647 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 2648 if ((oldpde & PG_A) != 0 && (mpte = pmap_lookup_pt_page(pmap, va)) != 2649 NULL) 2650 pmap_remove_pt_page(pmap, mpte); 2651 else { 2652 KASSERT((oldpde & PG_W) == 0, 2653 ("pmap_demote_pde: page table page for a wired mapping" 2654 " is missing")); 2655 2656 /* 2657 * Invalidate the 2- or 4MB page mapping and return 2658 * "failure" if the mapping was never accessed or the 2659 * allocation of the new page table page fails. 2660 */ 2661 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 2662 va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | 2663 VM_ALLOC_WIRED)) == NULL) { 2664 SLIST_INIT(&free); 2665 sva = trunc_4mpage(va); 2666 pmap_remove_pde(pmap, pde, sva, &free); 2667 pmap_invalidate_range(pmap, sva, sva + NBPDR - 1); 2668 pmap_free_zero_pages(&free); 2669 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" 2670 " in pmap %p", va, pmap); 2671 return (FALSE); 2672 } 2673 if (va < VM_MAXUSER_ADDRESS) 2674 pmap->pm_stats.resident_count++; 2675 } 2676 mptepa = VM_PAGE_TO_PHYS(mpte); 2677 2678 /* 2679 * If the page mapping is in the kernel's address space, then the 2680 * KPTmap can provide access to the page table page. Otherwise, 2681 * temporarily map the page table page (mpte) into the kernel's 2682 * address space at either PADDR1 or PADDR2. 2683 */ 2684 if (va >= KERNBASE) 2685 firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; 2686 else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { 2687 if ((*PMAP1 & PG_FRAME) != mptepa) { 2688 *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; 2689#ifdef SMP 2690 PMAP1cpu = PCPU_GET(cpuid); 2691#endif 2692 invlcaddr(PADDR1); 2693 PMAP1changed++; 2694 } else 2695#ifdef SMP 2696 if (PMAP1cpu != PCPU_GET(cpuid)) { 2697 PMAP1cpu = PCPU_GET(cpuid); 2698 invlcaddr(PADDR1); 2699 PMAP1changedcpu++; 2700 } else 2701#endif 2702 PMAP1unchanged++; 2703 firstpte = PADDR1; 2704 } else { 2705 mtx_lock(&PMAP2mutex); 2706 if ((*PMAP2 & PG_FRAME) != mptepa) { 2707 *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; 2708 pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); 2709 } 2710 firstpte = PADDR2; 2711 } 2712 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 2713 KASSERT((oldpde & PG_A) != 0, 2714 ("pmap_demote_pde: oldpde is missing PG_A")); 2715 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 2716 ("pmap_demote_pde: oldpde is missing PG_M")); 2717 newpte = oldpde & ~PG_PS; 2718 if ((newpte & PG_PDE_PAT) != 0) 2719 newpte ^= PG_PDE_PAT | PG_PTE_PAT; 2720 2721 /* 2722 * If the page table page is new, initialize it. 2723 */ 2724 if (mpte->wire_count == 1) { 2725 mpte->wire_count = NPTEPG; 2726 pmap_fill_ptp(firstpte, newpte); 2727 } 2728 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 2729 ("pmap_demote_pde: firstpte and newpte map different physical" 2730 " addresses")); 2731 2732 /* 2733 * If the mapping has changed attributes, update the page table 2734 * entries. 2735 */ 2736 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 2737 pmap_fill_ptp(firstpte, newpte); 2738 2739 /* 2740 * Demote the mapping. This pmap is locked. The old PDE has 2741 * PG_A set. 
If the old PDE has PG_RW set, it also has PG_M 2742 * set. Thus, there is no danger of a race with another 2743 * processor changing the setting of PG_A and/or PG_M between 2744 * the read above and the store below. 2745 */ 2746 if (workaround_erratum383) 2747 pmap_update_pde(pmap, va, pde, newpde); 2748 else if (pmap == kernel_pmap) 2749 pmap_kenter_pde(va, newpde); 2750 else 2751 pde_store(pde, newpde); 2752 if (firstpte == PADDR2) 2753 mtx_unlock(&PMAP2mutex); 2754 2755 /* 2756 * Invalidate the recursive mapping of the page table page. 2757 */ 2758 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 2759 2760 /* 2761 * Demote the pv entry. This depends on the earlier demotion 2762 * of the mapping. Specifically, the (re)creation of a per- 2763 * page pv entry might trigger the execution of pmap_collect(), 2764 * which might reclaim a newly (re)created per-page pv entry 2765 * and destroy the associated mapping. In order to destroy 2766 * the mapping, the PDE must have already changed from mapping 2767 * the 2mpage to referencing the page table page. 2768 */ 2769 if ((oldpde & PG_MANAGED) != 0) 2770 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); 2771 2772 pmap_pde_demotions++; 2773 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" 2774 " in pmap %p", va, pmap); 2775 return (TRUE); 2776} 2777 2778/* 2779 * Removes a 2- or 4MB page mapping from the kernel pmap. 2780 */ 2781static void 2782pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2783{ 2784 pd_entry_t newpde; 2785 vm_paddr_t mptepa; 2786 vm_page_t mpte; 2787 2788 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2789 mpte = pmap_lookup_pt_page(pmap, va); 2790 if (mpte == NULL) 2791 panic("pmap_remove_kernel_pde: Missing pt page."); 2792 2793 pmap_remove_pt_page(pmap, mpte); 2794 mptepa = VM_PAGE_TO_PHYS(mpte); 2795 newpde = mptepa | PG_M | PG_A | PG_RW | PG_V; 2796 2797 /* 2798 * Initialize the page table page. 2799 */ 2800 pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]); 2801 2802 /* 2803 * Remove the mapping. 2804 */ 2805 if (workaround_erratum383) 2806 pmap_update_pde(pmap, va, pde, newpde); 2807 else 2808 pmap_kenter_pde(va, newpde); 2809 2810 /* 2811 * Invalidate the recursive mapping of the page table page. 2812 */ 2813 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 2814} 2815 2816/* 2817 * pmap_remove_pde: do the things to unmap a superpage in a process 2818 */ 2819static void 2820pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 2821 struct spglist *free) 2822{ 2823 struct md_page *pvh; 2824 pd_entry_t oldpde; 2825 vm_offset_t eva, va; 2826 vm_page_t m, mpte; 2827 2828 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2829 KASSERT((sva & PDRMASK) == 0, 2830 ("pmap_remove_pde: sva is not 4mpage aligned")); 2831 oldpde = pte_load_clear(pdq); 2832 if (oldpde & PG_W) 2833 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 2834 2835 /* 2836 * Machines that don't support invlpg, also don't support 2837 * PG_G. 2838 * 2839 * When workaround_erratum383 is false, a promotion to a 2M/4M 2840 * page mapping does not invalidate the 512/1024 4K page mappings 2841 * from the TLB. Consequently, at this point, the TLB may 2842 * hold both 4K and 2M/4M page mappings. Therefore, the entire 2843 * range of addresses must be invalidated here. In contrast, 2844 * when workaround_erratum383 is true, a promotion does 2845 * invalidate the 512/1024 4K page mappings, and so a single INVLPG 2846 * suffices to invalidate the 2M/4M page mapping. 
2847 */ 2848 if ((oldpde & PG_G) != 0) { 2849 if (workaround_erratum383) 2850 pmap_invalidate_page(kernel_pmap, sva); 2851 else 2852 pmap_invalidate_range(kernel_pmap, sva, 2853 sva + NBPDR - 1); 2854 } 2855 2856 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2857 if (oldpde & PG_MANAGED) { 2858 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 2859 pmap_pvh_free(pvh, pmap, sva); 2860 eva = sva + NBPDR; 2861 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 2862 va < eva; va += PAGE_SIZE, m++) { 2863 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2864 vm_page_dirty(m); 2865 if (oldpde & PG_A) 2866 vm_page_aflag_set(m, PGA_REFERENCED); 2867 if (TAILQ_EMPTY(&m->md.pv_list) && 2868 TAILQ_EMPTY(&pvh->pv_list)) 2869 vm_page_aflag_clear(m, PGA_WRITEABLE); 2870 } 2871 } 2872 if (pmap == kernel_pmap) { 2873 pmap_remove_kernel_pde(pmap, pdq, sva); 2874 } else { 2875 mpte = pmap_lookup_pt_page(pmap, sva); 2876 if (mpte != NULL) { 2877 pmap_remove_pt_page(pmap, mpte); 2878 pmap->pm_stats.resident_count--; 2879 KASSERT(mpte->wire_count == NPTEPG, 2880 ("pmap_remove_pde: pte page wire count error")); 2881 mpte->wire_count = 0; 2882 pmap_add_delayed_free_list(mpte, free, FALSE); 2883 atomic_subtract_int(&vm_cnt.v_wire_count, 1); 2884 } 2885 } 2886} 2887 2888/* 2889 * pmap_remove_pte: do the things to unmap a page in a process 2890 */ 2891static int 2892pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 2893 struct spglist *free) 2894{ 2895 pt_entry_t oldpte; 2896 vm_page_t m; 2897 2898 rw_assert(&pvh_global_lock, RA_WLOCKED); 2899 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2900 oldpte = pte_load_clear(ptq); 2901 KASSERT(oldpte != 0, 2902 ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va)); 2903 if (oldpte & PG_W) 2904 pmap->pm_stats.wired_count -= 1; 2905 /* 2906 * Machines that don't support invlpg, also don't support 2907 * PG_G. 2908 */ 2909 if (oldpte & PG_G) 2910 pmap_invalidate_page(kernel_pmap, va); 2911 pmap->pm_stats.resident_count -= 1; 2912 if (oldpte & PG_MANAGED) { 2913 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 2914 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2915 vm_page_dirty(m); 2916 if (oldpte & PG_A) 2917 vm_page_aflag_set(m, PGA_REFERENCED); 2918 pmap_remove_entry(pmap, m, va); 2919 } 2920 return (pmap_unuse_pt(pmap, va, free)); 2921} 2922 2923/* 2924 * Remove a single page from a process address space 2925 */ 2926static void 2927pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) 2928{ 2929 pt_entry_t *pte; 2930 2931 rw_assert(&pvh_global_lock, RA_WLOCKED); 2932 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 2933 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2934 if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) 2935 return; 2936 pmap_remove_pte(pmap, pte, va, free); 2937 pmap_invalidate_page(pmap, va); 2938} 2939 2940/* 2941 * Remove the given range of addresses from the specified map. 2942 * 2943 * It is assumed that the start and end are properly 2944 * rounded to the page size. 2945 */ 2946void 2947pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2948{ 2949 vm_offset_t pdnxt; 2950 pd_entry_t ptpaddr; 2951 pt_entry_t *pte; 2952 struct spglist free; 2953 int anyvalid; 2954 2955 /* 2956 * Perform an unsynchronized read. This is, however, safe. 2957 */ 2958 if (pmap->pm_stats.resident_count == 0) 2959 return; 2960 2961 anyvalid = 0; 2962 SLIST_INIT(&free); 2963 2964 rw_wlock(&pvh_global_lock); 2965 sched_pin(); 2966 PMAP_LOCK(pmap); 2967 2968 /* 2969 * special handling of removing one page. 
a very 2970 * common operation and easy to short circuit some 2971 * code. 2972 */ 2973 if ((sva + PAGE_SIZE == eva) && 2974 ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { 2975 pmap_remove_page(pmap, sva, &free); 2976 goto out; 2977 } 2978 2979 for (; sva < eva; sva = pdnxt) { 2980 u_int pdirindex; 2981 2982 /* 2983 * Calculate index for next page table. 2984 */ 2985 pdnxt = (sva + NBPDR) & ~PDRMASK; 2986 if (pdnxt < sva) 2987 pdnxt = eva; 2988 if (pmap->pm_stats.resident_count == 0) 2989 break; 2990 2991 pdirindex = sva >> PDRSHIFT; 2992 ptpaddr = pmap->pm_pdir[pdirindex]; 2993 2994 /* 2995 * Weed out invalid mappings. Note: we assume that the page 2996 * directory table is always allocated, and in kernel virtual. 2997 */ 2998 if (ptpaddr == 0) 2999 continue; 3000 3001 /* 3002 * Check for large page. 3003 */ 3004 if ((ptpaddr & PG_PS) != 0) { 3005 /* 3006 * Are we removing the entire large page? If not, 3007 * demote the mapping and fall through. 3008 */ 3009 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 3010 /* 3011 * The TLB entry for a PG_G mapping is 3012 * invalidated by pmap_remove_pde(). 3013 */ 3014 if ((ptpaddr & PG_G) == 0) 3015 anyvalid = 1; 3016 pmap_remove_pde(pmap, 3017 &pmap->pm_pdir[pdirindex], sva, &free); 3018 continue; 3019 } else if (!pmap_demote_pde(pmap, 3020 &pmap->pm_pdir[pdirindex], sva)) { 3021 /* The large page mapping was destroyed. */ 3022 continue; 3023 } 3024 } 3025 3026 /* 3027 * Limit our scan to either the end of the va represented 3028 * by the current page table page, or to the end of the 3029 * range being removed. 3030 */ 3031 if (pdnxt > eva) 3032 pdnxt = eva; 3033 3034 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 3035 sva += PAGE_SIZE) { 3036 if (*pte == 0) 3037 continue; 3038 3039 /* 3040 * The TLB entry for a PG_G mapping is invalidated 3041 * by pmap_remove_pte(). 3042 */ 3043 if ((*pte & PG_G) == 0) 3044 anyvalid = 1; 3045 if (pmap_remove_pte(pmap, pte, sva, &free)) 3046 break; 3047 } 3048 } 3049out: 3050 sched_unpin(); 3051 if (anyvalid) 3052 pmap_invalidate_all(pmap); 3053 rw_wunlock(&pvh_global_lock); 3054 PMAP_UNLOCK(pmap); 3055 pmap_free_zero_pages(&free); 3056} 3057 3058/* 3059 * Routine: pmap_remove_all 3060 * Function: 3061 * Removes this physical page from 3062 * all physical maps in which it resides. 3063 * Reflects back modify bits to the pager. 3064 * 3065 * Notes: 3066 * Original versions of this routine were very 3067 * inefficient because they iteratively called 3068 * pmap_remove (slow...) 
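 *	The current implementation instead walks the page's pv lists
 *	directly: any 2/4MB mappings found are first demoted, and then
 *	each remaining 4KB mapping is removed under its owning pmap's
 *	lock.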
3069 */ 3070 3071void 3072pmap_remove_all(vm_page_t m) 3073{ 3074 struct md_page *pvh; 3075 pv_entry_t pv; 3076 pmap_t pmap; 3077 pt_entry_t *pte, tpte; 3078 pd_entry_t *pde; 3079 vm_offset_t va; 3080 struct spglist free; 3081 3082 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3083 ("pmap_remove_all: page %p is not managed", m)); 3084 SLIST_INIT(&free); 3085 rw_wlock(&pvh_global_lock); 3086 sched_pin(); 3087 if ((m->flags & PG_FICTITIOUS) != 0) 3088 goto small_mappings; 3089 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3090 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 3091 va = pv->pv_va; 3092 pmap = PV_PMAP(pv); 3093 PMAP_LOCK(pmap); 3094 pde = pmap_pde(pmap, va); 3095 (void)pmap_demote_pde(pmap, pde, va); 3096 PMAP_UNLOCK(pmap); 3097 } 3098small_mappings: 3099 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3100 pmap = PV_PMAP(pv); 3101 PMAP_LOCK(pmap); 3102 pmap->pm_stats.resident_count--; 3103 pde = pmap_pde(pmap, pv->pv_va); 3104 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 3105 " a 4mpage in page %p's pv list", m)); 3106 pte = pmap_pte_quick(pmap, pv->pv_va); 3107 tpte = pte_load_clear(pte); 3108 KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", 3109 pmap, pv->pv_va)); 3110 if (tpte & PG_W) 3111 pmap->pm_stats.wired_count--; 3112 if (tpte & PG_A) 3113 vm_page_aflag_set(m, PGA_REFERENCED); 3114 3115 /* 3116 * Update the vm_page_t clean and reference bits. 3117 */ 3118 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3119 vm_page_dirty(m); 3120 pmap_unuse_pt(pmap, pv->pv_va, &free); 3121 pmap_invalidate_page(pmap, pv->pv_va); 3122 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3123 free_pv_entry(pmap, pv); 3124 PMAP_UNLOCK(pmap); 3125 } 3126 vm_page_aflag_clear(m, PGA_WRITEABLE); 3127 sched_unpin(); 3128 rw_wunlock(&pvh_global_lock); 3129 pmap_free_zero_pages(&free); 3130} 3131 3132/* 3133 * pmap_protect_pde: do the things to protect a 4mpage in a process 3134 */ 3135static boolean_t 3136pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 3137{ 3138 pd_entry_t newpde, oldpde; 3139 vm_offset_t eva, va; 3140 vm_page_t m; 3141 boolean_t anychanged; 3142 3143 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3144 KASSERT((sva & PDRMASK) == 0, 3145 ("pmap_protect_pde: sva is not 4mpage aligned")); 3146 anychanged = FALSE; 3147retry: 3148 oldpde = newpde = *pde; 3149 if (oldpde & PG_MANAGED) { 3150 eva = sva + NBPDR; 3151 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 3152 va < eva; va += PAGE_SIZE, m++) 3153 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3154 vm_page_dirty(m); 3155 } 3156 if ((prot & VM_PROT_WRITE) == 0) 3157 newpde &= ~(PG_RW | PG_M); 3158#if defined(PAE) || defined(PAE_TABLES) 3159 if ((prot & VM_PROT_EXECUTE) == 0) 3160 newpde |= pg_nx; 3161#endif 3162 if (newpde != oldpde) { 3163 if (!pde_cmpset(pde, oldpde, newpde)) 3164 goto retry; 3165 if (oldpde & PG_G) { 3166 /* See pmap_remove_pde() for explanation. */ 3167 if (workaround_erratum383) 3168 pmap_invalidate_page(kernel_pmap, sva); 3169 else 3170 pmap_invalidate_range(kernel_pmap, sva, 3171 sva + NBPDR - 1); 3172 } else 3173 anychanged = TRUE; 3174 } 3175 return (anychanged); 3176} 3177 3178/* 3179 * Set the physical protection on the 3180 * specified range of this map as requested. 
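 * Write access is revoked by clearing PG_RW (and PG_M), and, under PAE
 * (or PAE_TABLES), execute access by setting the PG_NX bit.  A
 * protection value of VM_PROT_NONE is handled by calling pmap_remove()
 * instead.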
3181 */ 3182void 3183pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 3184{ 3185 vm_offset_t pdnxt; 3186 pd_entry_t ptpaddr; 3187 pt_entry_t *pte; 3188 boolean_t anychanged, pv_lists_locked; 3189 3190 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 3191 if (prot == VM_PROT_NONE) { 3192 pmap_remove(pmap, sva, eva); 3193 return; 3194 } 3195 3196#if defined(PAE) || defined(PAE_TABLES) 3197 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 3198 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 3199 return; 3200#else 3201 if (prot & VM_PROT_WRITE) 3202 return; 3203#endif 3204 3205 if (pmap_is_current(pmap)) 3206 pv_lists_locked = FALSE; 3207 else { 3208 pv_lists_locked = TRUE; 3209resume: 3210 rw_wlock(&pvh_global_lock); 3211 sched_pin(); 3212 } 3213 anychanged = FALSE; 3214 3215 PMAP_LOCK(pmap); 3216 for (; sva < eva; sva = pdnxt) { 3217 pt_entry_t obits, pbits; 3218 u_int pdirindex; 3219 3220 pdnxt = (sva + NBPDR) & ~PDRMASK; 3221 if (pdnxt < sva) 3222 pdnxt = eva; 3223 3224 pdirindex = sva >> PDRSHIFT; 3225 ptpaddr = pmap->pm_pdir[pdirindex]; 3226 3227 /* 3228 * Weed out invalid mappings. Note: we assume that the page 3229 * directory table is always allocated, and in kernel virtual. 3230 */ 3231 if (ptpaddr == 0) 3232 continue; 3233 3234 /* 3235 * Check for large page. 3236 */ 3237 if ((ptpaddr & PG_PS) != 0) { 3238 /* 3239 * Are we protecting the entire large page? If not, 3240 * demote the mapping and fall through. 3241 */ 3242 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 3243 /* 3244 * The TLB entry for a PG_G mapping is 3245 * invalidated by pmap_protect_pde(). 3246 */ 3247 if (pmap_protect_pde(pmap, 3248 &pmap->pm_pdir[pdirindex], sva, prot)) 3249 anychanged = TRUE; 3250 continue; 3251 } else { 3252 if (!pv_lists_locked) { 3253 pv_lists_locked = TRUE; 3254 if (!rw_try_wlock(&pvh_global_lock)) { 3255 if (anychanged) 3256 pmap_invalidate_all( 3257 pmap); 3258 PMAP_UNLOCK(pmap); 3259 goto resume; 3260 } 3261 sched_pin(); 3262 } 3263 if (!pmap_demote_pde(pmap, 3264 &pmap->pm_pdir[pdirindex], sva)) { 3265 /* 3266 * The large page mapping was 3267 * destroyed. 3268 */ 3269 continue; 3270 } 3271 } 3272 } 3273 3274 if (pdnxt > eva) 3275 pdnxt = eva; 3276 3277 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 3278 sva += PAGE_SIZE) { 3279 vm_page_t m; 3280 3281retry: 3282 /* 3283 * Regardless of whether a pte is 32 or 64 bits in 3284 * size, PG_RW, PG_A, and PG_M are among the least 3285 * significant 32 bits. 
3286 */ 3287 obits = pbits = *pte; 3288 if ((pbits & PG_V) == 0) 3289 continue; 3290 3291 if ((prot & VM_PROT_WRITE) == 0) { 3292 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 3293 (PG_MANAGED | PG_M | PG_RW)) { 3294 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 3295 vm_page_dirty(m); 3296 } 3297 pbits &= ~(PG_RW | PG_M); 3298 } 3299#if defined(PAE) || defined(PAE_TABLES) 3300 if ((prot & VM_PROT_EXECUTE) == 0) 3301 pbits |= pg_nx; 3302#endif 3303 3304 if (pbits != obits) { 3305#if defined(PAE) || defined(PAE_TABLES) 3306 if (!atomic_cmpset_64(pte, obits, pbits)) 3307 goto retry; 3308#else 3309 if (!atomic_cmpset_int((u_int *)pte, obits, 3310 pbits)) 3311 goto retry; 3312#endif 3313 if (obits & PG_G) 3314 pmap_invalidate_page(pmap, sva); 3315 else 3316 anychanged = TRUE; 3317 } 3318 } 3319 } 3320 if (anychanged) 3321 pmap_invalidate_all(pmap); 3322 if (pv_lists_locked) { 3323 sched_unpin(); 3324 rw_wunlock(&pvh_global_lock); 3325 } 3326 PMAP_UNLOCK(pmap); 3327} 3328 3329/* 3330 * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are 3331 * within a single page table page (PTP) to a single 2- or 4MB page mapping. 3332 * For promotion to occur, two conditions must be met: (1) the 4KB page 3333 * mappings must map aligned, contiguous physical memory and (2) the 4KB page 3334 * mappings must have identical characteristics. 3335 * 3336 * Managed (PG_MANAGED) mappings within the kernel address space are not 3337 * promoted. The reason is that kernel PDEs are replicated in each pmap but 3338 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel 3339 * pmap. 3340 */ 3341static void 3342pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 3343{ 3344 pd_entry_t newpde; 3345 pt_entry_t *firstpte, oldpte, pa, *pte; 3346 vm_offset_t oldpteva; 3347 vm_page_t mpte; 3348 3349 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3350 3351 /* 3352 * Examine the first PTE in the specified PTP. Abort if this PTE is 3353 * either invalid, unused, or does not map the first 4KB physical page 3354 * within a 2- or 4MB page. 3355 */ 3356 firstpte = pmap_pte_quick(pmap, trunc_4mpage(va)); 3357setpde: 3358 newpde = *firstpte; 3359 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 3360 pmap_pde_p_failures++; 3361 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3362 " in pmap %p", va, pmap); 3363 return; 3364 } 3365 if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { 3366 pmap_pde_p_failures++; 3367 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3368 " in pmap %p", va, pmap); 3369 return; 3370 } 3371 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 3372 /* 3373 * When PG_M is already clear, PG_RW can be cleared without 3374 * a TLB invalidation. 3375 */ 3376 if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & 3377 ~PG_RW)) 3378 goto setpde; 3379 newpde &= ~PG_RW; 3380 } 3381 3382 /* 3383 * Examine each of the other PTEs in the specified PTP. Abort if this 3384 * PTE maps an unexpected 4KB physical page or does not have identical 3385 * characteristics to the first PTE. 
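	 * The scan proceeds from the last PTE in the page table page back
	 * toward the first, comparing each PTE's frame and attributes
	 * against the descending expected value computed in "pa" below.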
3386 */ 3387 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; 3388 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 3389setpte: 3390 oldpte = *pte; 3391 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 3392 pmap_pde_p_failures++; 3393 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3394 " in pmap %p", va, pmap); 3395 return; 3396 } 3397 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 3398 /* 3399 * When PG_M is already clear, PG_RW can be cleared 3400 * without a TLB invalidation. 3401 */ 3402 if (!atomic_cmpset_int((u_int *)pte, oldpte, 3403 oldpte & ~PG_RW)) 3404 goto setpte; 3405 oldpte &= ~PG_RW; 3406 oldpteva = (oldpte & PG_FRAME & PDRMASK) | 3407 (va & ~PDRMASK); 3408 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" 3409 " in pmap %p", oldpteva, pmap); 3410 } 3411 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 3412 pmap_pde_p_failures++; 3413 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3414 " in pmap %p", va, pmap); 3415 return; 3416 } 3417 pa -= PAGE_SIZE; 3418 } 3419 3420 /* 3421 * Save the page table page in its current state until the PDE 3422 * mapping the superpage is demoted by pmap_demote_pde() or 3423 * destroyed by pmap_remove_pde(). 3424 */ 3425 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 3426 KASSERT(mpte >= vm_page_array && 3427 mpte < &vm_page_array[vm_page_array_size], 3428 ("pmap_promote_pde: page table page is out of range")); 3429 KASSERT(mpte->pindex == va >> PDRSHIFT, 3430 ("pmap_promote_pde: page table page's pindex is wrong")); 3431 if (pmap_insert_pt_page(pmap, mpte)) { 3432 pmap_pde_p_failures++; 3433 CTR2(KTR_PMAP, 3434 "pmap_promote_pde: failure for va %#x in pmap %p", va, 3435 pmap); 3436 return; 3437 } 3438 3439 /* 3440 * Promote the pv entries. 3441 */ 3442 if ((newpde & PG_MANAGED) != 0) 3443 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); 3444 3445 /* 3446 * Propagate the PAT index to its proper position. 3447 */ 3448 if ((newpde & PG_PTE_PAT) != 0) 3449 newpde ^= PG_PDE_PAT | PG_PTE_PAT; 3450 3451 /* 3452 * Map the superpage. 3453 */ 3454 if (workaround_erratum383) 3455 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 3456 else if (pmap == kernel_pmap) 3457 pmap_kenter_pde(va, PG_PS | newpde); 3458 else 3459 pde_store(pde, PG_PS | newpde); 3460 3461 pmap_pde_promotions++; 3462 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" 3463 " in pmap %p", va, pmap); 3464} 3465 3466/* 3467 * Insert the given physical page (p) at 3468 * the specified virtual address (v) in the 3469 * target physical map with the protection requested. 3470 * 3471 * If specified, the page will be wired down, meaning 3472 * that the related pte can not be reclaimed. 3473 * 3474 * NB: This is the only routine which MAY NOT lazy-evaluate 3475 * or lose information. That is, this routine must actually 3476 * insert this page into the given map NOW. 
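 *	Returns KERN_SUCCESS on completion, or KERN_RESOURCE_SHORTAGE if
 *	PMAP_ENTER_NOSLEEP was specified and a required page table page
 *	could not be allocated.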
3477 */ 3478int 3479pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3480 u_int flags, int8_t psind) 3481{ 3482 pd_entry_t *pde; 3483 pt_entry_t *pte; 3484 pt_entry_t newpte, origpte; 3485 pv_entry_t pv; 3486 vm_paddr_t opa, pa; 3487 vm_page_t mpte, om; 3488 boolean_t invlva, wired; 3489 3490 va = trunc_page(va); 3491 mpte = NULL; 3492 wired = (flags & PMAP_ENTER_WIRED) != 0; 3493 3494 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 3495 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 3496 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", 3497 va)); 3498 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) 3499 VM_OBJECT_ASSERT_LOCKED(m->object); 3500 3501 rw_wlock(&pvh_global_lock); 3502 PMAP_LOCK(pmap); 3503 sched_pin(); 3504 3505 pde = pmap_pde(pmap, va); 3506 if (va < VM_MAXUSER_ADDRESS) { 3507 /* 3508 * va is for UVA. 3509 * In the case that a page table page is not resident, 3510 * we are creating it here. pmap_allocpte() handles 3511 * demotion. 3512 */ 3513 mpte = pmap_allocpte(pmap, va, flags); 3514 if (mpte == NULL) { 3515 KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, 3516 ("pmap_allocpte failed with sleep allowed")); 3517 sched_unpin(); 3518 rw_wunlock(&pvh_global_lock); 3519 PMAP_UNLOCK(pmap); 3520 return (KERN_RESOURCE_SHORTAGE); 3521 } 3522 } else { 3523 /* 3524 * va is for KVA, so pmap_demote_pde() will never fail 3525 * to install a page table page. PG_V is also 3526 * asserted by pmap_demote_pde(). 3527 */ 3528 KASSERT(pde != NULL && (*pde & PG_V) != 0, 3529 ("KVA %#x invalid pde pdir %#jx", va, 3530 (uintmax_t)pmap->pm_pdir[PTDPTDI])); 3531 if ((*pde & PG_PS) != 0) 3532 pmap_demote_pde(pmap, pde, va); 3533 } 3534 pte = pmap_pte_quick(pmap, va); 3535 3536 /* 3537 * Page Directory table entry is not valid, which should not 3538 * happen. We should have either allocated the page table 3539 * page or demoted the existing mapping above. 3540 */ 3541 if (pte == NULL) { 3542 panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", 3543 (uintmax_t)pmap->pm_pdir[PTDPTDI], va); 3544 } 3545 3546 pa = VM_PAGE_TO_PHYS(m); 3547 om = NULL; 3548 origpte = *pte; 3549 opa = origpte & PG_FRAME; 3550 3551 /* 3552 * Mapping has not changed, must be protection or wiring change. 3553 */ 3554 if (origpte && (opa == pa)) { 3555 /* 3556 * Wiring change, just update stats. We don't worry about 3557 * wiring PT pages as they remain resident as long as there 3558 * are valid mappings in them. Hence, if a user page is wired, 3559 * the PT page will be also. 3560 */ 3561 if (wired && ((origpte & PG_W) == 0)) 3562 pmap->pm_stats.wired_count++; 3563 else if (!wired && (origpte & PG_W)) 3564 pmap->pm_stats.wired_count--; 3565 3566 /* 3567 * Remove extra pte reference 3568 */ 3569 if (mpte) 3570 mpte->wire_count--; 3571 3572 if (origpte & PG_MANAGED) { 3573 om = m; 3574 pa |= PG_MANAGED; 3575 } 3576 goto validate; 3577 } 3578 3579 pv = NULL; 3580 3581 /* 3582 * Mapping has changed, invalidate old range and fall through to 3583 * handle validating new mapping. 
3584 */ 3585 if (opa) { 3586 if (origpte & PG_W) 3587 pmap->pm_stats.wired_count--; 3588 if (origpte & PG_MANAGED) { 3589 om = PHYS_TO_VM_PAGE(opa); 3590 pv = pmap_pvh_remove(&om->md, pmap, va); 3591 } 3592 if (mpte != NULL) { 3593 mpte->wire_count--; 3594 KASSERT(mpte->wire_count > 0, 3595 ("pmap_enter: missing reference to page table page," 3596 " va: 0x%x", va)); 3597 } 3598 } else 3599 pmap->pm_stats.resident_count++; 3600 3601 /* 3602 * Enter on the PV list if part of our managed memory. 3603 */ 3604 if ((m->oflags & VPO_UNMANAGED) == 0) { 3605 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, 3606 ("pmap_enter: managed mapping within the clean submap")); 3607 if (pv == NULL) 3608 pv = get_pv_entry(pmap, FALSE); 3609 pv->pv_va = va; 3610 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3611 pa |= PG_MANAGED; 3612 } else if (pv != NULL) 3613 free_pv_entry(pmap, pv); 3614 3615 /* 3616 * Increment counters 3617 */ 3618 if (wired) 3619 pmap->pm_stats.wired_count++; 3620 3621validate: 3622 /* 3623 * Now validate mapping with desired protection/wiring. 3624 */ 3625 newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V); 3626 if ((prot & VM_PROT_WRITE) != 0) { 3627 newpte |= PG_RW; 3628 if ((newpte & PG_MANAGED) != 0) 3629 vm_page_aflag_set(m, PGA_WRITEABLE); 3630 } 3631#if defined(PAE) || defined(PAE_TABLES) 3632 if ((prot & VM_PROT_EXECUTE) == 0) 3633 newpte |= pg_nx; 3634#endif 3635 if (wired) 3636 newpte |= PG_W; 3637 if (va < VM_MAXUSER_ADDRESS) 3638 newpte |= PG_U; 3639 if (pmap == kernel_pmap) 3640 newpte |= pgeflag; 3641 3642 /* 3643 * if the mapping or permission bits are different, we need 3644 * to update the pte. 3645 */ 3646 if ((origpte & ~(PG_M|PG_A)) != newpte) { 3647 newpte |= PG_A; 3648 if ((flags & VM_PROT_WRITE) != 0) 3649 newpte |= PG_M; 3650 if (origpte & PG_V) { 3651 invlva = FALSE; 3652 origpte = pte_load_store(pte, newpte); 3653 if (origpte & PG_A) { 3654 if (origpte & PG_MANAGED) 3655 vm_page_aflag_set(om, PGA_REFERENCED); 3656 if (opa != VM_PAGE_TO_PHYS(m)) 3657 invlva = TRUE; 3658#if defined(PAE) || defined(PAE_TABLES) 3659 if ((origpte & PG_NX) == 0 && 3660 (newpte & PG_NX) != 0) 3661 invlva = TRUE; 3662#endif 3663 } 3664 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 3665 if ((origpte & PG_MANAGED) != 0) 3666 vm_page_dirty(om); 3667 if ((prot & VM_PROT_WRITE) == 0) 3668 invlva = TRUE; 3669 } 3670 if ((origpte & PG_MANAGED) != 0 && 3671 TAILQ_EMPTY(&om->md.pv_list) && 3672 ((om->flags & PG_FICTITIOUS) != 0 || 3673 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 3674 vm_page_aflag_clear(om, PGA_WRITEABLE); 3675 if (invlva) 3676 pmap_invalidate_page(pmap, va); 3677 } else 3678 pte_store(pte, newpte); 3679 } 3680 3681 /* 3682 * If both the page table page and the reservation are fully 3683 * populated, then attempt promotion. 3684 */ 3685 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 3686 pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && 3687 vm_reserv_level_iffullpop(m) == 0) 3688 pmap_promote_pde(pmap, pde, va); 3689 3690 sched_unpin(); 3691 rw_wunlock(&pvh_global_lock); 3692 PMAP_UNLOCK(pmap); 3693 return (KERN_SUCCESS); 3694} 3695 3696/* 3697 * Tries to create a 2- or 4MB page mapping. Returns TRUE if successful and 3698 * FALSE otherwise. Fails if (1) a page table page cannot be allocated without 3699 * blocking, (2) a mapping already exists at the specified virtual address, or 3700 * (3) a pv entry cannot be allocated without reclaiming another pv entry. 
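 * The caller must hold both the pvh global lock and the pmap lock.
 * Because the function refuses to overwrite an existing PDE, it
 * performs no TLB invalidation of its own when the new mapping is
 * installed with pde_store().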
3701 */ 3702static boolean_t 3703pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3704{ 3705 pd_entry_t *pde, newpde; 3706 3707 rw_assert(&pvh_global_lock, RA_WLOCKED); 3708 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3709 pde = pmap_pde(pmap, va); 3710 if (*pde != 0) { 3711 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3712 " in pmap %p", va, pmap); 3713 return (FALSE); 3714 } 3715 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) | 3716 PG_PS | PG_V; 3717 if ((m->oflags & VPO_UNMANAGED) == 0) { 3718 newpde |= PG_MANAGED; 3719 3720 /* 3721 * Abort this mapping if its PV entry could not be created. 3722 */ 3723 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) { 3724 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3725 " in pmap %p", va, pmap); 3726 return (FALSE); 3727 } 3728 } 3729#if defined(PAE) || defined(PAE_TABLES) 3730 if ((prot & VM_PROT_EXECUTE) == 0) 3731 newpde |= pg_nx; 3732#endif 3733 if (va < VM_MAXUSER_ADDRESS) 3734 newpde |= PG_U; 3735 3736 /* 3737 * Increment counters. 3738 */ 3739 pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; 3740 3741 /* 3742 * Map the superpage. 3743 */ 3744 pde_store(pde, newpde); 3745 3746 pmap_pde_mappings++; 3747 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 3748 " in pmap %p", va, pmap); 3749 return (TRUE); 3750} 3751 3752/* 3753 * Maps a sequence of resident pages belonging to the same object. 3754 * The sequence begins with the given page m_start. This page is 3755 * mapped at the given virtual address start. Each subsequent page is 3756 * mapped at a virtual address that is offset from start by the same 3757 * amount as the page is offset from m_start within the object. The 3758 * last page in the sequence is the page with the largest offset from 3759 * m_start that can be mapped at a virtual address less than the given 3760 * virtual address end. Not every virtual page between start and end 3761 * is mapped; only those for which a resident page exists with the 3762 * corresponding offset from m_start are mapped. 3763 */ 3764void 3765pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3766 vm_page_t m_start, vm_prot_t prot) 3767{ 3768 vm_offset_t va; 3769 vm_page_t m, mpte; 3770 vm_pindex_t diff, psize; 3771 3772 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3773 3774 psize = atop(end - start); 3775 mpte = NULL; 3776 m = m_start; 3777 rw_wlock(&pvh_global_lock); 3778 PMAP_LOCK(pmap); 3779 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3780 va = start + ptoa(diff); 3781 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 3782 m->psind == 1 && pg_ps_enabled && 3783 pmap_enter_pde(pmap, va, m, prot)) 3784 m = &m[NBPDR / PAGE_SIZE - 1]; 3785 else 3786 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 3787 mpte); 3788 m = TAILQ_NEXT(m, listq); 3789 } 3790 rw_wunlock(&pvh_global_lock); 3791 PMAP_UNLOCK(pmap); 3792} 3793 3794/* 3795 * this code makes some *MAJOR* assumptions: 3796 * 1. Current pmap & pmap exists. 3797 * 2. Not wired. 3798 * 3. Read access. 3799 * 4. No page table pages. 3800 * but is *MUCH* faster than pmap_enter... 
3801 */ 3802 3803void 3804pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3805{ 3806 3807 rw_wlock(&pvh_global_lock); 3808 PMAP_LOCK(pmap); 3809 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); 3810 rw_wunlock(&pvh_global_lock); 3811 PMAP_UNLOCK(pmap); 3812} 3813 3814static vm_page_t 3815pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3816 vm_prot_t prot, vm_page_t mpte) 3817{ 3818 pt_entry_t *pte; 3819 vm_paddr_t pa; 3820 struct spglist free; 3821 3822 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 3823 (m->oflags & VPO_UNMANAGED) != 0, 3824 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3825 rw_assert(&pvh_global_lock, RA_WLOCKED); 3826 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3827 3828 /* 3829 * In the case that a page table page is not 3830 * resident, we are creating it here. 3831 */ 3832 if (va < VM_MAXUSER_ADDRESS) { 3833 u_int ptepindex; 3834 pd_entry_t ptepa; 3835 3836 /* 3837 * Calculate pagetable page index 3838 */ 3839 ptepindex = va >> PDRSHIFT; 3840 if (mpte && (mpte->pindex == ptepindex)) { 3841 mpte->wire_count++; 3842 } else { 3843 /* 3844 * Get the page directory entry 3845 */ 3846 ptepa = pmap->pm_pdir[ptepindex]; 3847 3848 /* 3849 * If the page table page is mapped, we just increment 3850 * the hold count, and activate it. 3851 */ 3852 if (ptepa) { 3853 if (ptepa & PG_PS) 3854 return (NULL); 3855 mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); 3856 mpte->wire_count++; 3857 } else { 3858 mpte = _pmap_allocpte(pmap, ptepindex, 3859 PMAP_ENTER_NOSLEEP); 3860 if (mpte == NULL) 3861 return (mpte); 3862 } 3863 } 3864 } else { 3865 mpte = NULL; 3866 } 3867 3868 /* 3869 * This call to vtopte makes the assumption that we are 3870 * entering the page into the current pmap. In order to support 3871 * quick entry into any pmap, one would likely use pmap_pte_quick. 3872 * But that isn't as quick as vtopte. 3873 */ 3874 pte = vtopte(va); 3875 if (*pte) { 3876 if (mpte != NULL) { 3877 mpte->wire_count--; 3878 mpte = NULL; 3879 } 3880 return (mpte); 3881 } 3882 3883 /* 3884 * Enter on the PV list if part of our managed memory. 3885 */ 3886 if ((m->oflags & VPO_UNMANAGED) == 0 && 3887 !pmap_try_insert_pv_entry(pmap, va, m)) { 3888 if (mpte != NULL) { 3889 SLIST_INIT(&free); 3890 if (pmap_unwire_ptp(pmap, mpte, &free)) { 3891 pmap_invalidate_page(pmap, va); 3892 pmap_free_zero_pages(&free); 3893 } 3894 3895 mpte = NULL; 3896 } 3897 return (mpte); 3898 } 3899 3900 /* 3901 * Increment counters 3902 */ 3903 pmap->pm_stats.resident_count++; 3904 3905 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); 3906#if defined(PAE) || defined(PAE_TABLES) 3907 if ((prot & VM_PROT_EXECUTE) == 0) 3908 pa |= pg_nx; 3909#endif 3910 3911 /* 3912 * Now validate mapping with RO protection 3913 */ 3914 if ((m->oflags & VPO_UNMANAGED) != 0) 3915 pte_store(pte, pa | PG_V | PG_U); 3916 else 3917 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); 3918 return (mpte); 3919} 3920 3921/* 3922 * Make a temporary mapping for a physical address. This is only intended 3923 * to be used for panic dumps. 3924 */ 3925void * 3926pmap_kenter_temporary(vm_paddr_t pa, int i) 3927{ 3928 vm_offset_t va; 3929 3930 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 3931 pmap_kenter(va, pa); 3932 invlpg(va); 3933 return ((void *)crashdumpmap); 3934} 3935 3936/* 3937 * This code maps large physical mmap regions into the 3938 * processor address space. Note that some shortcuts 3939 * are taken, but the code works. 
3940 */ 3941void 3942pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3943 vm_pindex_t pindex, vm_size_t size) 3944{ 3945 pd_entry_t *pde; 3946 vm_paddr_t pa, ptepa; 3947 vm_page_t p; 3948 int pat_mode; 3949 3950 VM_OBJECT_ASSERT_WLOCKED(object); 3951 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3952 ("pmap_object_init_pt: non-device object")); 3953 if (pseflag && 3954 (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 3955 if (!vm_object_populate(object, pindex, pindex + atop(size))) 3956 return; 3957 p = vm_page_lookup(object, pindex); 3958 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3959 ("pmap_object_init_pt: invalid page %p", p)); 3960 pat_mode = p->md.pat_mode; 3961 3962 /* 3963 * Abort the mapping if the first page is not physically 3964 * aligned to a 2/4MB page boundary. 3965 */ 3966 ptepa = VM_PAGE_TO_PHYS(p); 3967 if (ptepa & (NBPDR - 1)) 3968 return; 3969 3970 /* 3971 * Skip the first page. Abort the mapping if the rest of 3972 * the pages are not physically contiguous or have differing 3973 * memory attributes. 3974 */ 3975 p = TAILQ_NEXT(p, listq); 3976 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 3977 pa += PAGE_SIZE) { 3978 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3979 ("pmap_object_init_pt: invalid page %p", p)); 3980 if (pa != VM_PAGE_TO_PHYS(p) || 3981 pat_mode != p->md.pat_mode) 3982 return; 3983 p = TAILQ_NEXT(p, listq); 3984 } 3985 3986 /* 3987 * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and 3988 * "size" is a multiple of 2/4M, adding the PAT setting to 3989 * "pa" will not affect the termination of this loop. 3990 */ 3991 PMAP_LOCK(pmap); 3992 for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa + 3993 size; pa += NBPDR) { 3994 pde = pmap_pde(pmap, addr); 3995 if (*pde == 0) { 3996 pde_store(pde, pa | PG_PS | PG_M | PG_A | 3997 PG_U | PG_RW | PG_V); 3998 pmap->pm_stats.resident_count += NBPDR / 3999 PAGE_SIZE; 4000 pmap_pde_mappings++; 4001 } 4002 /* Else continue on if the PDE is already valid. */ 4003 addr += NBPDR; 4004 } 4005 PMAP_UNLOCK(pmap); 4006 } 4007} 4008 4009/* 4010 * Clear the wired attribute from the mappings for the specified range of 4011 * addresses in the given pmap. Every valid mapping within that range 4012 * must have the wired attribute set. In contrast, invalid mappings 4013 * cannot have the wired attribute set, so they are ignored. 4014 * 4015 * The wired attribute of the page table entry is not a hardware feature, 4016 * so there is no need to invalidate any TLB entries. 4017 */ 4018void 4019pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4020{ 4021 vm_offset_t pdnxt; 4022 pd_entry_t *pde; 4023 pt_entry_t *pte; 4024 boolean_t pv_lists_locked; 4025 4026 if (pmap_is_current(pmap)) 4027 pv_lists_locked = FALSE; 4028 else { 4029 pv_lists_locked = TRUE; 4030resume: 4031 rw_wlock(&pvh_global_lock); 4032 sched_pin(); 4033 } 4034 PMAP_LOCK(pmap); 4035 for (; sva < eva; sva = pdnxt) { 4036 pdnxt = (sva + NBPDR) & ~PDRMASK; 4037 if (pdnxt < sva) 4038 pdnxt = eva; 4039 pde = pmap_pde(pmap, sva); 4040 if ((*pde & PG_V) == 0) 4041 continue; 4042 if ((*pde & PG_PS) != 0) { 4043 if ((*pde & PG_W) == 0) 4044 panic("pmap_unwire: pde %#jx is missing PG_W", 4045 (uintmax_t)*pde); 4046 4047 /* 4048 * Are we unwiring the entire large page? If not, 4049 * demote the mapping and fall through. 4050 */ 4051 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 4052 /* 4053 * Regardless of whether a pde (or pte) is 32 4054 * or 64 bits in size, PG_W is among the least 4055 * significant 32 bits. 
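 *
 * In other words, on a PAE kernel a pd_entry_t is 64 bits wide,
 * but on this little-endian machine the cast below aliases only
 * its low 32 bits, which is where PG_W and the other software
 * and protection bits live.  A minimal sketch of the idea:
 *
 *	pd_entry_t pde64;			// 64 bits under PAE
 *	atomic_clear_int((u_int *)&pde64, PG_W);
 *	// has the same effect as atomically doing
 *	// pde64 &= ~(pd_entry_t)PG_W
 *
 * so no 64-bit compare-and-swap is needed just to clear a
 * software-only flag.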
4056 */ 4057 atomic_clear_int((u_int *)pde, PG_W); 4058 pmap->pm_stats.wired_count -= NBPDR / 4059 PAGE_SIZE; 4060 continue; 4061 } else { 4062 if (!pv_lists_locked) { 4063 pv_lists_locked = TRUE; 4064 if (!rw_try_wlock(&pvh_global_lock)) { 4065 PMAP_UNLOCK(pmap); 4066 /* Repeat sva. */ 4067 goto resume; 4068 } 4069 sched_pin(); 4070 } 4071 if (!pmap_demote_pde(pmap, pde, sva)) 4072 panic("pmap_unwire: demotion failed"); 4073 } 4074 } 4075 if (pdnxt > eva) 4076 pdnxt = eva; 4077 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 4078 sva += PAGE_SIZE) { 4079 if ((*pte & PG_V) == 0) 4080 continue; 4081 if ((*pte & PG_W) == 0) 4082 panic("pmap_unwire: pte %#jx is missing PG_W", 4083 (uintmax_t)*pte); 4084 4085 /* 4086 * PG_W must be cleared atomically. Although the pmap 4087 * lock synchronizes access to PG_W, another processor 4088 * could be setting PG_M and/or PG_A concurrently. 4089 * 4090 * PG_W is among the least significant 32 bits. 4091 */ 4092 atomic_clear_int((u_int *)pte, PG_W); 4093 pmap->pm_stats.wired_count--; 4094 } 4095 } 4096 if (pv_lists_locked) { 4097 sched_unpin(); 4098 rw_wunlock(&pvh_global_lock); 4099 } 4100 PMAP_UNLOCK(pmap); 4101} 4102 4103 4104/* 4105 * Copy the range specified by src_addr/len 4106 * from the source map to the range dst_addr/len 4107 * in the destination map. 4108 * 4109 * This routine is only advisory and need not do anything. 4110 */ 4111 4112void 4113pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 4114 vm_offset_t src_addr) 4115{ 4116 struct spglist free; 4117 vm_offset_t addr; 4118 vm_offset_t end_addr = src_addr + len; 4119 vm_offset_t pdnxt; 4120 4121 if (dst_addr != src_addr) 4122 return; 4123 4124 if (!pmap_is_current(src_pmap)) 4125 return; 4126 4127 rw_wlock(&pvh_global_lock); 4128 if (dst_pmap < src_pmap) { 4129 PMAP_LOCK(dst_pmap); 4130 PMAP_LOCK(src_pmap); 4131 } else { 4132 PMAP_LOCK(src_pmap); 4133 PMAP_LOCK(dst_pmap); 4134 } 4135 sched_pin(); 4136 for (addr = src_addr; addr < end_addr; addr = pdnxt) { 4137 pt_entry_t *src_pte, *dst_pte; 4138 vm_page_t dstmpte, srcmpte; 4139 pd_entry_t srcptepaddr; 4140 u_int ptepindex; 4141 4142 KASSERT(addr < UPT_MIN_ADDRESS, 4143 ("pmap_copy: invalid to pmap_copy page tables")); 4144 4145 pdnxt = (addr + NBPDR) & ~PDRMASK; 4146 if (pdnxt < addr) 4147 pdnxt = end_addr; 4148 ptepindex = addr >> PDRSHIFT; 4149 4150 srcptepaddr = src_pmap->pm_pdir[ptepindex]; 4151 if (srcptepaddr == 0) 4152 continue; 4153 4154 if (srcptepaddr & PG_PS) { 4155 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 4156 continue; 4157 if (dst_pmap->pm_pdir[ptepindex] == 0 && 4158 ((srcptepaddr & PG_MANAGED) == 0 || 4159 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & 4160 PG_PS_FRAME))) { 4161 dst_pmap->pm_pdir[ptepindex] = srcptepaddr & 4162 ~PG_W; 4163 dst_pmap->pm_stats.resident_count += 4164 NBPDR / PAGE_SIZE; 4165 pmap_pde_mappings++; 4166 } 4167 continue; 4168 } 4169 4170 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); 4171 KASSERT(srcmpte->wire_count > 0, 4172 ("pmap_copy: source page table page is unused")); 4173 4174 if (pdnxt > end_addr) 4175 pdnxt = end_addr; 4176 4177 src_pte = vtopte(addr); 4178 while (addr < pdnxt) { 4179 pt_entry_t ptetemp; 4180 ptetemp = *src_pte; 4181 /* 4182 * we only virtual copy managed pages 4183 */ 4184 if ((ptetemp & PG_MANAGED) != 0) { 4185 dstmpte = pmap_allocpte(dst_pmap, addr, 4186 PMAP_ENTER_NOSLEEP); 4187 if (dstmpte == NULL) 4188 goto out; 4189 dst_pte = pmap_pte_quick(dst_pmap, addr); 4190 if (*dst_pte == 0 && 4191 
pmap_try_insert_pv_entry(dst_pmap, addr, 4192 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { 4193 /* 4194 * Clear the wired, modified, and 4195 * accessed (referenced) bits 4196 * during the copy. 4197 */ 4198 *dst_pte = ptetemp & ~(PG_W | PG_M | 4199 PG_A); 4200 dst_pmap->pm_stats.resident_count++; 4201 } else { 4202 SLIST_INIT(&free); 4203 if (pmap_unwire_ptp(dst_pmap, dstmpte, 4204 &free)) { 4205 pmap_invalidate_page(dst_pmap, 4206 addr); 4207 pmap_free_zero_pages(&free); 4208 } 4209 goto out; 4210 } 4211 if (dstmpte->wire_count >= srcmpte->wire_count) 4212 break; 4213 } 4214 addr += PAGE_SIZE; 4215 src_pte++; 4216 } 4217 } 4218out: 4219 sched_unpin(); 4220 rw_wunlock(&pvh_global_lock); 4221 PMAP_UNLOCK(src_pmap); 4222 PMAP_UNLOCK(dst_pmap); 4223} 4224 4225static __inline void 4226pagezero(void *page) 4227{ 4228#if defined(I686_CPU) 4229 if (cpu_class == CPUCLASS_686) { 4230 if (cpu_feature & CPUID_SSE2) 4231 sse2_pagezero(page); 4232 else 4233 i686_pagezero(page); 4234 } else 4235#endif 4236 bzero(page, PAGE_SIZE); 4237} 4238 4239/* 4240 * pmap_zero_page zeros the specified hardware page by mapping 4241 * the page into KVM and using bzero to clear its contents. 4242 */ 4243void 4244pmap_zero_page(vm_page_t m) 4245{ 4246 pt_entry_t *cmap_pte2; 4247 struct pcpu *pc; 4248 4249 sched_pin(); 4250 pc = get_pcpu(); 4251 cmap_pte2 = pc->pc_cmap_pte2; 4252 mtx_lock(&pc->pc_cmap_lock); 4253 if (*cmap_pte2) 4254 panic("pmap_zero_page: CMAP2 busy"); 4255 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4256 pmap_cache_bits(m->md.pat_mode, 0); 4257 invlcaddr(pc->pc_cmap_addr2); 4258 pagezero(pc->pc_cmap_addr2); 4259 *cmap_pte2 = 0; 4260 4261 /* 4262 * Unpin the thread before releasing the lock. Otherwise the thread 4263 * could be rescheduled while still bound to the current CPU, only 4264 * to unpin itself immediately upon resuming execution. 4265 */ 4266 sched_unpin(); 4267 mtx_unlock(&pc->pc_cmap_lock); 4268} 4269 4270/* 4271 * pmap_zero_page_area zeros the specified hardware page by mapping 4272 * the page into KVM and using bzero to clear its contents. 4273 * 4274 * off and size may not cover an area beyond a single hardware page. 4275 */ 4276void 4277pmap_zero_page_area(vm_page_t m, int off, int size) 4278{ 4279 pt_entry_t *cmap_pte2; 4280 struct pcpu *pc; 4281 4282 sched_pin(); 4283 pc = get_pcpu(); 4284 cmap_pte2 = pc->pc_cmap_pte2; 4285 mtx_lock(&pc->pc_cmap_lock); 4286 if (*cmap_pte2) 4287 panic("pmap_zero_page_area: CMAP2 busy"); 4288 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4289 pmap_cache_bits(m->md.pat_mode, 0); 4290 invlcaddr(pc->pc_cmap_addr2); 4291 if (off == 0 && size == PAGE_SIZE) 4292 pagezero(pc->pc_cmap_addr2); 4293 else 4294 bzero(pc->pc_cmap_addr2 + off, size); 4295 *cmap_pte2 = 0; 4296 sched_unpin(); 4297 mtx_unlock(&pc->pc_cmap_lock); 4298} 4299 4300/* 4301 * pmap_zero_page_idle zeros the specified hardware page by mapping 4302 * the page into KVM and using bzero to clear its contents. This 4303 * is intended to be called from the vm_pagezero process only and 4304 * outside of Giant. 
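 *
 * Unlike pmap_zero_page(), no cmap mutex is taken below.  CMAP3/CADDR3 are
 * assumed to have no user other than this routine, so the only
 * synchronization needed is sched_pin(), which keeps the invlcaddr() and
 * the actual zeroing on the same CPU; the panic is merely a sanity check
 * on that assumption.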
4305 */ 4306void 4307pmap_zero_page_idle(vm_page_t m) 4308{ 4309 4310 if (*CMAP3) 4311 panic("pmap_zero_page_idle: CMAP3 busy"); 4312 sched_pin(); 4313 *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4314 pmap_cache_bits(m->md.pat_mode, 0); 4315 invlcaddr(CADDR3); 4316 pagezero(CADDR3); 4317 *CMAP3 = 0; 4318 sched_unpin(); 4319} 4320 4321/* 4322 * pmap_copy_page copies the specified (machine independent) 4323 * page by mapping the page into virtual memory and using 4324 * bcopy to copy the page, one machine dependent page at a 4325 * time. 4326 */ 4327void 4328pmap_copy_page(vm_page_t src, vm_page_t dst) 4329{ 4330 pt_entry_t *cmap_pte1, *cmap_pte2; 4331 struct pcpu *pc; 4332 4333 sched_pin(); 4334 pc = get_pcpu(); 4335 cmap_pte1 = pc->pc_cmap_pte1; 4336 cmap_pte2 = pc->pc_cmap_pte2; 4337 mtx_lock(&pc->pc_cmap_lock); 4338 if (*cmap_pte1) 4339 panic("pmap_copy_page: CMAP1 busy"); 4340 if (*cmap_pte2) 4341 panic("pmap_copy_page: CMAP2 busy"); 4342 *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A | 4343 pmap_cache_bits(src->md.pat_mode, 0); 4344 invlcaddr(pc->pc_cmap_addr1); 4345 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M | 4346 pmap_cache_bits(dst->md.pat_mode, 0); 4347 invlcaddr(pc->pc_cmap_addr2); 4348 bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE); 4349 *cmap_pte1 = 0; 4350 *cmap_pte2 = 0; 4351 sched_unpin(); 4352 mtx_unlock(&pc->pc_cmap_lock); 4353} 4354 4355int unmapped_buf_allowed = 1; 4356 4357void 4358pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 4359 vm_offset_t b_offset, int xfersize) 4360{ 4361 vm_page_t a_pg, b_pg; 4362 char *a_cp, *b_cp; 4363 vm_offset_t a_pg_offset, b_pg_offset; 4364 pt_entry_t *cmap_pte1, *cmap_pte2; 4365 struct pcpu *pc; 4366 int cnt; 4367 4368 sched_pin(); 4369 pc = get_pcpu(); 4370 cmap_pte1 = pc->pc_cmap_pte1; 4371 cmap_pte2 = pc->pc_cmap_pte2; 4372 mtx_lock(&pc->pc_cmap_lock); 4373 if (*cmap_pte1 != 0) 4374 panic("pmap_copy_pages: CMAP1 busy"); 4375 if (*cmap_pte2 != 0) 4376 panic("pmap_copy_pages: CMAP2 busy"); 4377 while (xfersize > 0) { 4378 a_pg = ma[a_offset >> PAGE_SHIFT]; 4379 a_pg_offset = a_offset & PAGE_MASK; 4380 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 4381 b_pg = mb[b_offset >> PAGE_SHIFT]; 4382 b_pg_offset = b_offset & PAGE_MASK; 4383 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 4384 *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A | 4385 pmap_cache_bits(a_pg->md.pat_mode, 0); 4386 invlcaddr(pc->pc_cmap_addr1); 4387 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A | 4388 PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0); 4389 invlcaddr(pc->pc_cmap_addr2); 4390 a_cp = pc->pc_cmap_addr1 + a_pg_offset; 4391 b_cp = pc->pc_cmap_addr2 + b_pg_offset; 4392 bcopy(a_cp, b_cp, cnt); 4393 a_offset += cnt; 4394 b_offset += cnt; 4395 xfersize -= cnt; 4396 } 4397 *cmap_pte1 = 0; 4398 *cmap_pte2 = 0; 4399 sched_unpin(); 4400 mtx_unlock(&pc->pc_cmap_lock); 4401} 4402 4403/* 4404 * Returns true if the pmap's pv is one of the first 4405 * 16 pvs linked to from this page. This count may 4406 * be changed upwards or downwards in the future; it 4407 * is only necessary that true be returned for a small 4408 * subset of pmaps for proper page aging. 
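 *
 * Both the page's own 4KB pv list and, when the page is not fictitious,
 * the pv list of its containing 2/4MB page are consulted, and the 16-entry
 * budget is shared across the two lists.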
4409 */ 4410boolean_t 4411pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 4412{ 4413 struct md_page *pvh; 4414 pv_entry_t pv; 4415 int loops = 0; 4416 boolean_t rv; 4417 4418 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4419 ("pmap_page_exists_quick: page %p is not managed", m)); 4420 rv = FALSE; 4421 rw_wlock(&pvh_global_lock); 4422 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4423 if (PV_PMAP(pv) == pmap) { 4424 rv = TRUE; 4425 break; 4426 } 4427 loops++; 4428 if (loops >= 16) 4429 break; 4430 } 4431 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 4432 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4433 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4434 if (PV_PMAP(pv) == pmap) { 4435 rv = TRUE; 4436 break; 4437 } 4438 loops++; 4439 if (loops >= 16) 4440 break; 4441 } 4442 } 4443 rw_wunlock(&pvh_global_lock); 4444 return (rv); 4445} 4446 4447/* 4448 * pmap_page_wired_mappings: 4449 * 4450 * Return the number of managed mappings to the given physical page 4451 * that are wired. 4452 */ 4453int 4454pmap_page_wired_mappings(vm_page_t m) 4455{ 4456 int count; 4457 4458 count = 0; 4459 if ((m->oflags & VPO_UNMANAGED) != 0) 4460 return (count); 4461 rw_wlock(&pvh_global_lock); 4462 count = pmap_pvh_wired_mappings(&m->md, count); 4463 if ((m->flags & PG_FICTITIOUS) == 0) { 4464 count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), 4465 count); 4466 } 4467 rw_wunlock(&pvh_global_lock); 4468 return (count); 4469} 4470 4471/* 4472 * pmap_pvh_wired_mappings: 4473 * 4474 * Return the updated number "count" of managed mappings that are wired. 4475 */ 4476static int 4477pmap_pvh_wired_mappings(struct md_page *pvh, int count) 4478{ 4479 pmap_t pmap; 4480 pt_entry_t *pte; 4481 pv_entry_t pv; 4482 4483 rw_assert(&pvh_global_lock, RA_WLOCKED); 4484 sched_pin(); 4485 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4486 pmap = PV_PMAP(pv); 4487 PMAP_LOCK(pmap); 4488 pte = pmap_pte_quick(pmap, pv->pv_va); 4489 if ((*pte & PG_W) != 0) 4490 count++; 4491 PMAP_UNLOCK(pmap); 4492 } 4493 sched_unpin(); 4494 return (count); 4495} 4496 4497/* 4498 * Returns TRUE if the given page is mapped individually or as part of 4499 * a 4mpage. Otherwise, returns FALSE. 4500 */ 4501boolean_t 4502pmap_page_is_mapped(vm_page_t m) 4503{ 4504 boolean_t rv; 4505 4506 if ((m->oflags & VPO_UNMANAGED) != 0) 4507 return (FALSE); 4508 rw_wlock(&pvh_global_lock); 4509 rv = !TAILQ_EMPTY(&m->md.pv_list) || 4510 ((m->flags & PG_FICTITIOUS) == 0 && 4511 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 4512 rw_wunlock(&pvh_global_lock); 4513 return (rv); 4514} 4515 4516/* 4517 * Remove all pages from specified address space 4518 * this aids process exit speeds. Also, this code 4519 * is special cased for current process only, but 4520 * can have the more generic (and slightly slower) 4521 * mode enabled. This is much faster than pmap_remove 4522 * in the case of running down an entire address space. 
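 *
 * A hedged sketch of the intended use (the call site lives outside this
 * file): when a process exits or execs, its entire user address space is
 * being torn down anyway, so walking this pmap's pv chunks once,
 *
 *	pmap_remove_pages(vmspace_pmap(p->p_vmspace));
 *
 * is far cheaper than calling pmap_remove() per map entry.  Wired mappings
 * are deliberately skipped below and are left for pmap_remove() to handle.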
4523 */ 4524void 4525pmap_remove_pages(pmap_t pmap) 4526{ 4527 pt_entry_t *pte, tpte; 4528 vm_page_t m, mpte, mt; 4529 pv_entry_t pv; 4530 struct md_page *pvh; 4531 struct pv_chunk *pc, *npc; 4532 struct spglist free; 4533 int field, idx; 4534 int32_t bit; 4535 uint32_t inuse, bitmask; 4536 int allfree; 4537 4538 if (pmap != PCPU_GET(curpmap)) { 4539 printf("warning: pmap_remove_pages called with non-current pmap\n"); 4540 return; 4541 } 4542 SLIST_INIT(&free); 4543 rw_wlock(&pvh_global_lock); 4544 PMAP_LOCK(pmap); 4545 sched_pin(); 4546 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 4547 KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, 4548 pc->pc_pmap)); 4549 allfree = 1; 4550 for (field = 0; field < _NPCM; field++) { 4551 inuse = ~pc->pc_map[field] & pc_freemask[field]; 4552 while (inuse != 0) { 4553 bit = bsfl(inuse); 4554 bitmask = 1UL << bit; 4555 idx = field * 32 + bit; 4556 pv = &pc->pc_pventry[idx]; 4557 inuse &= ~bitmask; 4558 4559 pte = pmap_pde(pmap, pv->pv_va); 4560 tpte = *pte; 4561 if ((tpte & PG_PS) == 0) { 4562 pte = vtopte(pv->pv_va); 4563 tpte = *pte & ~PG_PTE_PAT; 4564 } 4565 4566 if (tpte == 0) { 4567 printf( 4568 "TPTE at %p IS ZERO @ VA %08x\n", 4569 pte, pv->pv_va); 4570 panic("bad pte"); 4571 } 4572 4573/* 4574 * We cannot remove wired pages from a process' mapping at this time 4575 */ 4576 if (tpte & PG_W) { 4577 allfree = 0; 4578 continue; 4579 } 4580 4581 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 4582 KASSERT(m->phys_addr == (tpte & PG_FRAME), 4583 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 4584 m, (uintmax_t)m->phys_addr, 4585 (uintmax_t)tpte)); 4586 4587 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4588 m < &vm_page_array[vm_page_array_size], 4589 ("pmap_remove_pages: bad tpte %#jx", 4590 (uintmax_t)tpte)); 4591 4592 pte_clear(pte); 4593 4594 /* 4595 * Update the vm_page_t clean/reference bits. 
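 *
 * Only a mapping that was both writeable and modified (PG_RW and
 * PG_M set together) can have dirtied the page, and for a 2/4MB
 * mapping every 4KB page under it must be dirtied, because the
 * hardware kept just one modified bit for the whole superpage.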
4596 */ 4597 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 4598 if ((tpte & PG_PS) != 0) { 4599 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4600 vm_page_dirty(mt); 4601 } else 4602 vm_page_dirty(m); 4603 } 4604 4605 /* Mark free */ 4606 PV_STAT(pv_entry_frees++); 4607 PV_STAT(pv_entry_spare++); 4608 pv_entry_count--; 4609 pc->pc_map[field] |= bitmask; 4610 if ((tpte & PG_PS) != 0) { 4611 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 4612 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 4613 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4614 if (TAILQ_EMPTY(&pvh->pv_list)) { 4615 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4616 if (TAILQ_EMPTY(&mt->md.pv_list)) 4617 vm_page_aflag_clear(mt, PGA_WRITEABLE); 4618 } 4619 mpte = pmap_lookup_pt_page(pmap, pv->pv_va); 4620 if (mpte != NULL) { 4621 pmap_remove_pt_page(pmap, mpte); 4622 pmap->pm_stats.resident_count--; 4623 KASSERT(mpte->wire_count == NPTEPG, 4624 ("pmap_remove_pages: pte page wire count error")); 4625 mpte->wire_count = 0; 4626 pmap_add_delayed_free_list(mpte, &free, FALSE); 4627 atomic_subtract_int(&vm_cnt.v_wire_count, 1); 4628 } 4629 } else { 4630 pmap->pm_stats.resident_count--; 4631 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4632 if (TAILQ_EMPTY(&m->md.pv_list) && 4633 (m->flags & PG_FICTITIOUS) == 0) { 4634 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4635 if (TAILQ_EMPTY(&pvh->pv_list)) 4636 vm_page_aflag_clear(m, PGA_WRITEABLE); 4637 } 4638 pmap_unuse_pt(pmap, pv->pv_va, &free); 4639 } 4640 } 4641 } 4642 if (allfree) { 4643 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4644 free_pv_chunk(pc); 4645 } 4646 } 4647 sched_unpin(); 4648 pmap_invalidate_all(pmap); 4649 rw_wunlock(&pvh_global_lock); 4650 PMAP_UNLOCK(pmap); 4651 pmap_free_zero_pages(&free); 4652} 4653 4654/* 4655 * pmap_is_modified: 4656 * 4657 * Return whether or not the specified physical page was modified 4658 * in any physical maps. 4659 */ 4660boolean_t 4661pmap_is_modified(vm_page_t m) 4662{ 4663 boolean_t rv; 4664 4665 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4666 ("pmap_is_modified: page %p is not managed", m)); 4667 4668 /* 4669 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 4670 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE 4671 * is clear, no PTEs can have PG_M set. 4672 */ 4673 VM_OBJECT_ASSERT_WLOCKED(m->object); 4674 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 4675 return (FALSE); 4676 rw_wlock(&pvh_global_lock); 4677 rv = pmap_is_modified_pvh(&m->md) || 4678 ((m->flags & PG_FICTITIOUS) == 0 && 4679 pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 4680 rw_wunlock(&pvh_global_lock); 4681 return (rv); 4682} 4683 4684/* 4685 * Returns TRUE if any of the given mappings were used to modify 4686 * physical memory. Otherwise, returns FALSE. Both page and 2mpage 4687 * mappings are supported. 4688 */ 4689static boolean_t 4690pmap_is_modified_pvh(struct md_page *pvh) 4691{ 4692 pv_entry_t pv; 4693 pt_entry_t *pte; 4694 pmap_t pmap; 4695 boolean_t rv; 4696 4697 rw_assert(&pvh_global_lock, RA_WLOCKED); 4698 rv = FALSE; 4699 sched_pin(); 4700 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4701 pmap = PV_PMAP(pv); 4702 PMAP_LOCK(pmap); 4703 pte = pmap_pte_quick(pmap, pv->pv_va); 4704 rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); 4705 PMAP_UNLOCK(pmap); 4706 if (rv) 4707 break; 4708 } 4709 sched_unpin(); 4710 return (rv); 4711} 4712 4713/* 4714 * pmap_is_prefaultable: 4715 * 4716 * Return whether or not the specified virtual address is elgible 4717 * for prefault. 
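 *
 * "Eligible" here means cheap to map with pmap_enter_quick(): the page
 * directory entry must already be valid (so no page table page would have
 * to be allocated), must not be a 2/4MB mapping, and the 4KB pte slot must
 * still be empty.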
4718 */ 4719boolean_t 4720pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 4721{ 4722 pd_entry_t *pde; 4723 pt_entry_t *pte; 4724 boolean_t rv; 4725 4726 rv = FALSE; 4727 PMAP_LOCK(pmap); 4728 pde = pmap_pde(pmap, addr); 4729 if (*pde != 0 && (*pde & PG_PS) == 0) { 4730 pte = vtopte(addr); 4731 rv = *pte == 0; 4732 } 4733 PMAP_UNLOCK(pmap); 4734 return (rv); 4735} 4736 4737/* 4738 * pmap_is_referenced: 4739 * 4740 * Return whether or not the specified physical page was referenced 4741 * in any physical maps. 4742 */ 4743boolean_t 4744pmap_is_referenced(vm_page_t m) 4745{ 4746 boolean_t rv; 4747 4748 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4749 ("pmap_is_referenced: page %p is not managed", m)); 4750 rw_wlock(&pvh_global_lock); 4751 rv = pmap_is_referenced_pvh(&m->md) || 4752 ((m->flags & PG_FICTITIOUS) == 0 && 4753 pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 4754 rw_wunlock(&pvh_global_lock); 4755 return (rv); 4756} 4757 4758/* 4759 * Returns TRUE if any of the given mappings were referenced and FALSE 4760 * otherwise. Both page and 4mpage mappings are supported. 4761 */ 4762static boolean_t 4763pmap_is_referenced_pvh(struct md_page *pvh) 4764{ 4765 pv_entry_t pv; 4766 pt_entry_t *pte; 4767 pmap_t pmap; 4768 boolean_t rv; 4769 4770 rw_assert(&pvh_global_lock, RA_WLOCKED); 4771 rv = FALSE; 4772 sched_pin(); 4773 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4774 pmap = PV_PMAP(pv); 4775 PMAP_LOCK(pmap); 4776 pte = pmap_pte_quick(pmap, pv->pv_va); 4777 rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); 4778 PMAP_UNLOCK(pmap); 4779 if (rv) 4780 break; 4781 } 4782 sched_unpin(); 4783 return (rv); 4784} 4785 4786/* 4787 * Clear the write and modified bits in each of the given page's mappings. 4788 */ 4789void 4790pmap_remove_write(vm_page_t m) 4791{ 4792 struct md_page *pvh; 4793 pv_entry_t next_pv, pv; 4794 pmap_t pmap; 4795 pd_entry_t *pde; 4796 pt_entry_t oldpte, *pte; 4797 vm_offset_t va; 4798 4799 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4800 ("pmap_remove_write: page %p is not managed", m)); 4801 4802 /* 4803 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 4804 * set by another thread while the object is locked. Thus, 4805 * if PGA_WRITEABLE is clear, no page table entries need updating. 4806 */ 4807 VM_OBJECT_ASSERT_WLOCKED(m->object); 4808 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 4809 return; 4810 rw_wlock(&pvh_global_lock); 4811 sched_pin(); 4812 if ((m->flags & PG_FICTITIOUS) != 0) 4813 goto small_mappings; 4814 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4815 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4816 va = pv->pv_va; 4817 pmap = PV_PMAP(pv); 4818 PMAP_LOCK(pmap); 4819 pde = pmap_pde(pmap, va); 4820 if ((*pde & PG_RW) != 0) 4821 (void)pmap_demote_pde(pmap, pde, va); 4822 PMAP_UNLOCK(pmap); 4823 } 4824small_mappings: 4825 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4826 pmap = PV_PMAP(pv); 4827 PMAP_LOCK(pmap); 4828 pde = pmap_pde(pmap, pv->pv_va); 4829 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" 4830 " a 4mpage in page %p's pv list", m)); 4831 pte = pmap_pte_quick(pmap, pv->pv_va); 4832retry: 4833 oldpte = *pte; 4834 if ((oldpte & PG_RW) != 0) { 4835 /* 4836 * Regardless of whether a pte is 32 or 64 bits 4837 * in size, PG_RW and PG_M are among the least 4838 * significant 32 bits. 
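 *
 * The compare-and-set below can fail if the CPU sets PG_A or PG_M
 * in this pte concurrently; in that case "oldpte" is re-read and
 * the clear is retried, so neither the hardware's update nor the
 * removal of PG_RW and PG_M is lost.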
4839 */ 4840 if (!atomic_cmpset_int((u_int *)pte, oldpte, 4841 oldpte & ~(PG_RW | PG_M))) 4842 goto retry; 4843 if ((oldpte & PG_M) != 0) 4844 vm_page_dirty(m); 4845 pmap_invalidate_page(pmap, pv->pv_va); 4846 } 4847 PMAP_UNLOCK(pmap); 4848 } 4849 vm_page_aflag_clear(m, PGA_WRITEABLE); 4850 sched_unpin(); 4851 rw_wunlock(&pvh_global_lock); 4852} 4853 4854#define PMAP_TS_REFERENCED_MAX 5 4855 4856/* 4857 * pmap_ts_referenced: 4858 * 4859 * Return a count of reference bits for a page, clearing those bits. 4860 * It is not necessary for every reference bit to be cleared, but it 4861 * is necessary that 0 only be returned when there are truly no 4862 * reference bits set. 4863 * 4864 * XXX: The exact number of bits to check and clear is a matter that 4865 * should be tested and standardized at some point in the future for 4866 * optimal aging of shared pages. 4867 * 4868 * As an optimization, update the page's dirty field if a modified bit is 4869 * found while counting reference bits. This opportunistic update can be 4870 * performed at low cost and can eliminate the need for some future calls 4871 * to pmap_is_modified(). However, since this function stops after 4872 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 4873 * dirty pages. Those dirty pages will only be detected by a future call 4874 * to pmap_is_modified(). 4875 */ 4876int 4877pmap_ts_referenced(vm_page_t m) 4878{ 4879 struct md_page *pvh; 4880 pv_entry_t pv, pvf; 4881 pmap_t pmap; 4882 pd_entry_t *pde; 4883 pt_entry_t *pte; 4884 vm_paddr_t pa; 4885 int rtval = 0; 4886 4887 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4888 ("pmap_ts_referenced: page %p is not managed", m)); 4889 pa = VM_PAGE_TO_PHYS(m); 4890 pvh = pa_to_pvh(pa); 4891 rw_wlock(&pvh_global_lock); 4892 sched_pin(); 4893 if ((m->flags & PG_FICTITIOUS) != 0 || 4894 (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 4895 goto small_mappings; 4896 pv = pvf; 4897 do { 4898 pmap = PV_PMAP(pv); 4899 PMAP_LOCK(pmap); 4900 pde = pmap_pde(pmap, pv->pv_va); 4901 if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 4902 /* 4903 * Although "*pde" is mapping a 2/4MB page, because 4904 * this function is called at a 4KB page granularity, 4905 * we only update the 4KB page under test. 4906 */ 4907 vm_page_dirty(m); 4908 } 4909 if ((*pde & PG_A) != 0) { 4910 /* 4911 * Since this reference bit is shared by either 1024 4912 * or 512 4KB pages, it should not be cleared every 4913 * time it is tested. Apply a simple "hash" function 4914 * on the physical page number, the virtual superpage 4915 * number, and the pmap address to select one 4KB page 4916 * out of the 1024 or 512 on which testing the 4917 * reference bit will result in clearing that bit. 4918 * This function is designed to avoid the selection of 4919 * the same 4KB page for every 2- or 4MB page mapping. 4920 * 4921 * On demotion, a mapping that hasn't been referenced 4922 * is simply destroyed. To avoid the possibility of a 4923 * subsequent page fault on a demoted wired mapping, 4924 * always leave its reference bit set. Moreover, 4925 * since the superpage is wired, the current state of 4926 * its reference bit won't affect page replacement. 4927 */ 4928 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 4929 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 4930 (*pde & PG_W) == 0) { 4931 atomic_clear_int((u_int *)pde, PG_A); 4932 pmap_invalidate_page(pmap, pv->pv_va); 4933 } 4934 rtval++; 4935 } 4936 PMAP_UNLOCK(pmap); 4937 /* Rotate the PV list if it has more than one entry. 
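 *
 * Rotating means the next call to pmap_ts_referenced() starts its
 * sampling at a different mapping, so the early return at
 * PMAP_TS_REFERENCED_MAX does not keep testing and crediting the
 * same few pmaps on every call.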
*/ 4938 if (TAILQ_NEXT(pv, pv_next) != NULL) { 4939 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4940 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 4941 } 4942 if (rtval >= PMAP_TS_REFERENCED_MAX) 4943 goto out; 4944 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 4945small_mappings: 4946 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 4947 goto out; 4948 pv = pvf; 4949 do { 4950 pmap = PV_PMAP(pv); 4951 PMAP_LOCK(pmap); 4952 pde = pmap_pde(pmap, pv->pv_va); 4953 KASSERT((*pde & PG_PS) == 0, 4954 ("pmap_ts_referenced: found a 4mpage in page %p's pv list", 4955 m)); 4956 pte = pmap_pte_quick(pmap, pv->pv_va); 4957 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 4958 vm_page_dirty(m); 4959 if ((*pte & PG_A) != 0) { 4960 atomic_clear_int((u_int *)pte, PG_A); 4961 pmap_invalidate_page(pmap, pv->pv_va); 4962 rtval++; 4963 } 4964 PMAP_UNLOCK(pmap); 4965 /* Rotate the PV list if it has more than one entry. */ 4966 if (TAILQ_NEXT(pv, pv_next) != NULL) { 4967 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4968 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4969 } 4970 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < 4971 PMAP_TS_REFERENCED_MAX); 4972out: 4973 sched_unpin(); 4974 rw_wunlock(&pvh_global_lock); 4975 return (rtval); 4976} 4977 4978/* 4979 * Apply the given advice to the specified range of addresses within the 4980 * given pmap. Depending on the advice, clear the referenced and/or 4981 * modified flags in each mapping and set the mapped page's dirty field. 4982 */ 4983void 4984pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 4985{ 4986 pd_entry_t oldpde, *pde; 4987 pt_entry_t *pte; 4988 vm_offset_t va, pdnxt; 4989 vm_page_t m; 4990 boolean_t anychanged, pv_lists_locked; 4991 4992 if (advice != MADV_DONTNEED && advice != MADV_FREE) 4993 return; 4994 if (pmap_is_current(pmap)) 4995 pv_lists_locked = FALSE; 4996 else { 4997 pv_lists_locked = TRUE; 4998resume: 4999 rw_wlock(&pvh_global_lock); 5000 sched_pin(); 5001 } 5002 anychanged = FALSE; 5003 PMAP_LOCK(pmap); 5004 for (; sva < eva; sva = pdnxt) { 5005 pdnxt = (sva + NBPDR) & ~PDRMASK; 5006 if (pdnxt < sva) 5007 pdnxt = eva; 5008 pde = pmap_pde(pmap, sva); 5009 oldpde = *pde; 5010 if ((oldpde & PG_V) == 0) 5011 continue; 5012 else if ((oldpde & PG_PS) != 0) { 5013 if ((oldpde & PG_MANAGED) == 0) 5014 continue; 5015 if (!pv_lists_locked) { 5016 pv_lists_locked = TRUE; 5017 if (!rw_try_wlock(&pvh_global_lock)) { 5018 if (anychanged) 5019 pmap_invalidate_all(pmap); 5020 PMAP_UNLOCK(pmap); 5021 goto resume; 5022 } 5023 sched_pin(); 5024 } 5025 if (!pmap_demote_pde(pmap, pde, sva)) { 5026 /* 5027 * The large page mapping was destroyed. 5028 */ 5029 continue; 5030 } 5031 5032 /* 5033 * Unless the page mappings are wired, remove the 5034 * mapping to a single page so that a subsequent 5035 * access may repromote. Since the underlying page 5036 * table page is fully populated, this removal never 5037 * frees a page table page. 
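 *
 * Only the single 4KB mapping at "sva" is removed here; the rest of
 * the freshly demoted range is handled by the 4KB loop below, which
 * clears PG_A (and PG_M, first transferring the modified state to
 * the vm_page for MADV_DONTNEED) rather than unmapping anything.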
5038 */ 5039 if ((oldpde & PG_W) == 0) { 5040 pte = pmap_pte_quick(pmap, sva); 5041 KASSERT((*pte & PG_V) != 0, 5042 ("pmap_advise: invalid PTE")); 5043 pmap_remove_pte(pmap, pte, sva, NULL); 5044 anychanged = TRUE; 5045 } 5046 } 5047 if (pdnxt > eva) 5048 pdnxt = eva; 5049 va = pdnxt; 5050 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 5051 sva += PAGE_SIZE) { 5052 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) 5053 goto maybe_invlrng; 5054 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5055 if (advice == MADV_DONTNEED) { 5056 /* 5057 * Future calls to pmap_is_modified() 5058 * can be avoided by making the page 5059 * dirty now. 5060 */ 5061 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 5062 vm_page_dirty(m); 5063 } 5064 atomic_clear_int((u_int *)pte, PG_M | PG_A); 5065 } else if ((*pte & PG_A) != 0) 5066 atomic_clear_int((u_int *)pte, PG_A); 5067 else 5068 goto maybe_invlrng; 5069 if ((*pte & PG_G) != 0) { 5070 if (va == pdnxt) 5071 va = sva; 5072 } else 5073 anychanged = TRUE; 5074 continue; 5075maybe_invlrng: 5076 if (va != pdnxt) { 5077 pmap_invalidate_range(pmap, va, sva); 5078 va = pdnxt; 5079 } 5080 } 5081 if (va != pdnxt) 5082 pmap_invalidate_range(pmap, va, sva); 5083 } 5084 if (anychanged) 5085 pmap_invalidate_all(pmap); 5086 if (pv_lists_locked) { 5087 sched_unpin(); 5088 rw_wunlock(&pvh_global_lock); 5089 } 5090 PMAP_UNLOCK(pmap); 5091} 5092 5093/* 5094 * Clear the modify bits on the specified physical page. 5095 */ 5096void 5097pmap_clear_modify(vm_page_t m) 5098{ 5099 struct md_page *pvh; 5100 pv_entry_t next_pv, pv; 5101 pmap_t pmap; 5102 pd_entry_t oldpde, *pde; 5103 pt_entry_t oldpte, *pte; 5104 vm_offset_t va; 5105 5106 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5107 ("pmap_clear_modify: page %p is not managed", m)); 5108 VM_OBJECT_ASSERT_WLOCKED(m->object); 5109 KASSERT(!vm_page_xbusied(m), 5110 ("pmap_clear_modify: page %p is exclusive busied", m)); 5111 5112 /* 5113 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 5114 * If the object containing the page is locked and the page is not 5115 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 5116 */ 5117 if ((m->aflags & PGA_WRITEABLE) == 0) 5118 return; 5119 rw_wlock(&pvh_global_lock); 5120 sched_pin(); 5121 if ((m->flags & PG_FICTITIOUS) != 0) 5122 goto small_mappings; 5123 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5124 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5125 va = pv->pv_va; 5126 pmap = PV_PMAP(pv); 5127 PMAP_LOCK(pmap); 5128 pde = pmap_pde(pmap, va); 5129 oldpde = *pde; 5130 if ((oldpde & PG_RW) != 0) { 5131 if (pmap_demote_pde(pmap, pde, va)) { 5132 if ((oldpde & PG_W) == 0) { 5133 /* 5134 * Write protect the mapping to a 5135 * single page so that a subsequent 5136 * write access may repromote. 5137 */ 5138 va += VM_PAGE_TO_PHYS(m) - (oldpde & 5139 PG_PS_FRAME); 5140 pte = pmap_pte_quick(pmap, va); 5141 oldpte = *pte; 5142 if ((oldpte & PG_V) != 0) { 5143 /* 5144 * Regardless of whether a pte is 32 or 64 bits 5145 * in size, PG_RW and PG_M are among the least 5146 * significant 32 bits. 
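 *
 * As in pmap_remove_write(), the 32-bit compare-and-set is simply
 * retried until it succeeds against any concurrent hardware
 * update of PG_A or PG_M in this pte.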
5147 */ 5148 while (!atomic_cmpset_int((u_int *)pte, 5149 oldpte, 5150 oldpte & ~(PG_M | PG_RW))) 5151 oldpte = *pte; 5152 vm_page_dirty(m); 5153 pmap_invalidate_page(pmap, va); 5154 } 5155 } 5156 } 5157 } 5158 PMAP_UNLOCK(pmap); 5159 } 5160small_mappings: 5161 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5162 pmap = PV_PMAP(pv); 5163 PMAP_LOCK(pmap); 5164 pde = pmap_pde(pmap, pv->pv_va); 5165 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 5166 " a 4mpage in page %p's pv list", m)); 5167 pte = pmap_pte_quick(pmap, pv->pv_va); 5168 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5169 /* 5170 * Regardless of whether a pte is 32 or 64 bits 5171 * in size, PG_M is among the least significant 5172 * 32 bits. 5173 */ 5174 atomic_clear_int((u_int *)pte, PG_M); 5175 pmap_invalidate_page(pmap, pv->pv_va); 5176 } 5177 PMAP_UNLOCK(pmap); 5178 } 5179 sched_unpin(); 5180 rw_wunlock(&pvh_global_lock); 5181} 5182 5183/* 5184 * Miscellaneous support routines follow 5185 */ 5186 5187/* Adjust the cache mode for a 4KB page mapped via a PTE. */ 5188static __inline void 5189pmap_pte_attr(pt_entry_t *pte, int cache_bits) 5190{ 5191 u_int opte, npte; 5192 5193 /* 5194 * The cache mode bits are all in the low 32-bits of the 5195 * PTE, so we can just spin on updating the low 32-bits. 5196 */ 5197 do { 5198 opte = *(u_int *)pte; 5199 npte = opte & ~PG_PTE_CACHE; 5200 npte |= cache_bits; 5201 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); 5202} 5203 5204/* Adjust the cache mode for a 2/4MB page mapped via a PDE. */ 5205static __inline void 5206pmap_pde_attr(pd_entry_t *pde, int cache_bits) 5207{ 5208 u_int opde, npde; 5209 5210 /* 5211 * The cache mode bits are all in the low 32-bits of the 5212 * PDE, so we can just spin on updating the low 32-bits. 5213 */ 5214 do { 5215 opde = *(u_int *)pde; 5216 npde = opde & ~PG_PDE_CACHE; 5217 npde |= cache_bits; 5218 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); 5219} 5220 5221/* 5222 * Map a set of physical memory pages into the kernel virtual 5223 * address space. Return a pointer to where it is mapped. This 5224 * routine is intended to be used for mapping device memory, 5225 * NOT real memory. 5226 */ 5227void * 5228pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 5229{ 5230 struct pmap_preinit_mapping *ppim; 5231 vm_offset_t va, offset; 5232 vm_size_t tmpsize; 5233 int i; 5234 5235 offset = pa & PAGE_MASK; 5236 size = round_page(offset + size); 5237 pa = pa & PG_FRAME; 5238 5239 if (pa < KERNLOAD && pa + size <= KERNLOAD) 5240 va = KERNBASE + pa; 5241 else if (!pmap_initialized) { 5242 va = 0; 5243 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5244 ppim = pmap_preinit_mapping + i; 5245 if (ppim->va == 0) { 5246 ppim->pa = pa; 5247 ppim->sz = size; 5248 ppim->mode = mode; 5249 ppim->va = virtual_avail; 5250 virtual_avail += size; 5251 va = ppim->va; 5252 break; 5253 } 5254 } 5255 if (va == 0) 5256 panic("%s: too many preinit mappings", __func__); 5257 } else { 5258 /* 5259 * If we have a preinit mapping, re-use it. 
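 *
 * That is, a region that was mapped before pmap_init() ran, and was
 * therefore carved directly out of virtual_avail above, is handed back
 * unchanged when the same pa/size/mode combination is requested again;
 * anything else gets fresh KVA from kva_alloc().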
5260 */ 5261 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5262 ppim = pmap_preinit_mapping + i; 5263 if (ppim->pa == pa && ppim->sz == size && 5264 ppim->mode == mode) 5265 return ((void *)(ppim->va + offset)); 5266 } 5267 va = kva_alloc(size); 5268 if (va == 0) 5269 panic("%s: Couldn't allocate KVA", __func__); 5270 } 5271 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 5272 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 5273 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 5274 pmap_invalidate_cache_range(va, va + size, FALSE); 5275 return ((void *)(va + offset)); 5276} 5277 5278void * 5279pmap_mapdev(vm_paddr_t pa, vm_size_t size) 5280{ 5281 5282 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 5283} 5284 5285void * 5286pmap_mapbios(vm_paddr_t pa, vm_size_t size) 5287{ 5288 5289 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 5290} 5291 5292void 5293pmap_unmapdev(vm_offset_t va, vm_size_t size) 5294{ 5295 struct pmap_preinit_mapping *ppim; 5296 vm_offset_t offset; 5297 int i; 5298 5299 if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) 5300 return; 5301 offset = va & PAGE_MASK; 5302 size = round_page(offset + size); 5303 va = trunc_page(va); 5304 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5305 ppim = pmap_preinit_mapping + i; 5306 if (ppim->va == va && ppim->sz == size) { 5307 if (pmap_initialized) 5308 return; 5309 ppim->pa = 0; 5310 ppim->va = 0; 5311 ppim->sz = 0; 5312 ppim->mode = 0; 5313 if (va + size == virtual_avail) 5314 virtual_avail = va; 5315 return; 5316 } 5317 } 5318 if (pmap_initialized) 5319 kva_free(va, size); 5320} 5321 5322/* 5323 * Sets the memory attribute for the specified page. 5324 */ 5325void 5326pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5327{ 5328 5329 m->md.pat_mode = ma; 5330 if ((m->flags & PG_FICTITIOUS) != 0) 5331 return; 5332 5333 /* 5334 * If "m" is a normal page, flush it from the cache. 5335 * See pmap_invalidate_cache_range(). 5336 * 5337 * First, try to find an existing mapping of the page by sf 5338 * buffer. sf_buf_invalidate_cache() modifies mapping and 5339 * flushes the cache. 5340 */ 5341 if (sf_buf_invalidate_cache(m)) 5342 return; 5343 5344 /* 5345 * If page is not mapped by sf buffer, but CPU does not 5346 * support self snoop, map the page transient and do 5347 * invalidation. In the worst case, whole cache is flushed by 5348 * pmap_invalidate_cache_range(). 5349 */ 5350 if ((cpu_feature & CPUID_SS) == 0) 5351 pmap_flush_page(m); 5352} 5353 5354static void 5355pmap_flush_page(vm_page_t m) 5356{ 5357 pt_entry_t *cmap_pte2; 5358 struct pcpu *pc; 5359 vm_offset_t sva, eva; 5360 bool useclflushopt; 5361 5362 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; 5363 if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) { 5364 sched_pin(); 5365 pc = get_pcpu(); 5366 cmap_pte2 = pc->pc_cmap_pte2; 5367 mtx_lock(&pc->pc_cmap_lock); 5368 if (*cmap_pte2) 5369 panic("pmap_flush_page: CMAP2 busy"); 5370 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | 5371 PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); 5372 invlcaddr(pc->pc_cmap_addr2); 5373 sva = (vm_offset_t)pc->pc_cmap_addr2; 5374 eva = sva + PAGE_SIZE; 5375 5376 /* 5377 * Use mfence or sfence despite the ordering implied by 5378 * mtx_{un,}lock() because clflush on non-Intel CPUs 5379 * and clflushopt are not guaranteed to be ordered by 5380 * any other instruction. 
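 *
 * The fence is issued both before and after the flush loop below, so
 * the weakly ordered cache line flushes are ordered with respect to
 * both the mapping setup above and the caller's subsequent use of the
 * page.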
5381 */ 5382 if (useclflushopt) 5383 sfence(); 5384 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 5385 mfence(); 5386 for (; sva < eva; sva += cpu_clflush_line_size) { 5387 if (useclflushopt) 5388 clflushopt(sva); 5389 else 5390 clflush(sva); 5391 } 5392 if (useclflushopt) 5393 sfence(); 5394 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 5395 mfence(); 5396 *cmap_pte2 = 0; 5397 sched_unpin(); 5398 mtx_unlock(&pc->pc_cmap_lock); 5399 } else 5400 pmap_invalidate_cache(); 5401} 5402 5403/* 5404 * Changes the specified virtual address range's memory type to that given by 5405 * the parameter "mode". The specified virtual address range must be 5406 * completely contained within either the kernel map. 5407 * 5408 * Returns zero if the change completed successfully, and either EINVAL or 5409 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 5410 * of the virtual address range was not mapped, and ENOMEM is returned if 5411 * there was insufficient memory available to complete the change. 5412 */ 5413int 5414pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 5415{ 5416 vm_offset_t base, offset, tmpva; 5417 pd_entry_t *pde; 5418 pt_entry_t *pte; 5419 int cache_bits_pte, cache_bits_pde; 5420 boolean_t changed; 5421 5422 base = trunc_page(va); 5423 offset = va & PAGE_MASK; 5424 size = round_page(offset + size); 5425 5426 /* 5427 * Only supported on kernel virtual addresses above the recursive map. 5428 */ 5429 if (base < VM_MIN_KERNEL_ADDRESS) 5430 return (EINVAL); 5431 5432 cache_bits_pde = pmap_cache_bits(mode, 1); 5433 cache_bits_pte = pmap_cache_bits(mode, 0); 5434 changed = FALSE; 5435 5436 /* 5437 * Pages that aren't mapped aren't supported. Also break down 5438 * 2/4MB pages into 4KB pages if required. 5439 */ 5440 PMAP_LOCK(kernel_pmap); 5441 for (tmpva = base; tmpva < base + size; ) { 5442 pde = pmap_pde(kernel_pmap, tmpva); 5443 if (*pde == 0) { 5444 PMAP_UNLOCK(kernel_pmap); 5445 return (EINVAL); 5446 } 5447 if (*pde & PG_PS) { 5448 /* 5449 * If the current 2/4MB page already has 5450 * the required memory type, then we need not 5451 * demote this page. Just increment tmpva to 5452 * the next 2/4MB page frame. 5453 */ 5454 if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { 5455 tmpva = trunc_4mpage(tmpva) + NBPDR; 5456 continue; 5457 } 5458 5459 /* 5460 * If the current offset aligns with a 2/4MB 5461 * page frame and there is at least 2/4MB left 5462 * within the range, then we need not break 5463 * down this page into 4KB pages. 5464 */ 5465 if ((tmpva & PDRMASK) == 0 && 5466 tmpva + PDRMASK < base + size) { 5467 tmpva += NBPDR; 5468 continue; 5469 } 5470 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) { 5471 PMAP_UNLOCK(kernel_pmap); 5472 return (ENOMEM); 5473 } 5474 } 5475 pte = vtopte(tmpva); 5476 if (*pte == 0) { 5477 PMAP_UNLOCK(kernel_pmap); 5478 return (EINVAL); 5479 } 5480 tmpva += PAGE_SIZE; 5481 } 5482 PMAP_UNLOCK(kernel_pmap); 5483 5484 /* 5485 * Ok, all the pages exist, so run through them updating their 5486 * cache mode if required. 
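 *
 * The first pass above, under the pmap lock, only verified that every
 * page in the range is mapped and demoted any 2/4MB mappings that
 * needed it; this second pass rewrites the cache bits in place with
 * pmap_pde_attr() and pmap_pte_attr(), recording in "changed" whether
 * a TLB and cache flush will be needed at the end.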
5487 */ 5488 for (tmpva = base; tmpva < base + size; ) { 5489 pde = pmap_pde(kernel_pmap, tmpva); 5490 if (*pde & PG_PS) { 5491 if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { 5492 pmap_pde_attr(pde, cache_bits_pde); 5493 changed = TRUE; 5494 } 5495 tmpva = trunc_4mpage(tmpva) + NBPDR; 5496 } else { 5497 pte = vtopte(tmpva); 5498 if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { 5499 pmap_pte_attr(pte, cache_bits_pte); 5500 changed = TRUE; 5501 } 5502 tmpva += PAGE_SIZE; 5503 } 5504 } 5505 5506 /* 5507 * Flush CPU caches to make sure any data isn't cached that 5508 * shouldn't be, etc. 5509 */ 5510 if (changed) { 5511 pmap_invalidate_range(kernel_pmap, base, tmpva); 5512 pmap_invalidate_cache_range(base, tmpva, FALSE); 5513 } 5514 return (0); 5515} 5516 5517/* 5518 * perform the pmap work for mincore 5519 */ 5520int 5521pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 5522{ 5523 pd_entry_t *pdep; 5524 pt_entry_t *ptep, pte; 5525 vm_paddr_t pa; 5526 int val; 5527 5528 PMAP_LOCK(pmap); 5529retry: 5530 pdep = pmap_pde(pmap, addr); 5531 if (*pdep != 0) { 5532 if (*pdep & PG_PS) { 5533 pte = *pdep; 5534 /* Compute the physical address of the 4KB page. */ 5535 pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & 5536 PG_FRAME; 5537 val = MINCORE_SUPER; 5538 } else { 5539 ptep = pmap_pte(pmap, addr); 5540 pte = *ptep; 5541 pmap_pte_release(ptep); 5542 pa = pte & PG_FRAME; 5543 val = 0; 5544 } 5545 } else { 5546 pte = 0; 5547 pa = 0; 5548 val = 0; 5549 } 5550 if ((pte & PG_V) != 0) { 5551 val |= MINCORE_INCORE; 5552 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5553 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 5554 if ((pte & PG_A) != 0) 5555 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 5556 } 5557 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 5558 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 5559 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 5560 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ 5561 if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) 5562 goto retry; 5563 } else 5564 PA_UNLOCK_COND(*locked_pa); 5565 PMAP_UNLOCK(pmap); 5566 return (val); 5567} 5568 5569void 5570pmap_activate(struct thread *td) 5571{ 5572 pmap_t pmap, oldpmap; 5573 u_int cpuid; 5574 u_int32_t cr3; 5575 5576 critical_enter(); 5577 pmap = vmspace_pmap(td->td_proc->p_vmspace); 5578 oldpmap = PCPU_GET(curpmap); 5579 cpuid = PCPU_GET(cpuid); 5580#if defined(SMP) 5581 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 5582 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 5583#else 5584 CPU_CLR(cpuid, &oldpmap->pm_active); 5585 CPU_SET(cpuid, &pmap->pm_active); 5586#endif 5587#if defined(PAE) || defined(PAE_TABLES) 5588 cr3 = vtophys(pmap->pm_pdpt); 5589#else 5590 cr3 = vtophys(pmap->pm_pdir); 5591#endif 5592 /* 5593 * pmap_activate is for the current thread on the current cpu 5594 */ 5595 td->td_pcb->pcb_cr3 = cr3; 5596 load_cr3(cr3); 5597 PCPU_SET(curpmap, pmap); 5598 critical_exit(); 5599} 5600 5601void 5602pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 5603{ 5604} 5605 5606/* 5607 * Increase the starting virtual address of the given mapping if a 5608 * different alignment might result in more superpage mappings. 
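 *
 * A worked example (assuming 4MB superpages, i.e. a non-PAE kernel): if
 * the qualifying object offset falls 1MB into a superpage, so that
 * superpage_offset == 0x100000, and the caller proposed *addr == 0x20000000
 * for an 8MB mapping, then (*addr & PDRMASK) == 0 is less than
 * superpage_offset and *addr is moved up to 0x20100000.  The virtual and
 * object offsets now agree modulo NBPDR, so every fully covered,
 * superpage-aligned chunk of the mapping can later be created as or
 * promoted to a 2/4MB mapping.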
5609 */ 5610void 5611pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 5612 vm_offset_t *addr, vm_size_t size) 5613{ 5614 vm_offset_t superpage_offset; 5615 5616 if (size < NBPDR) 5617 return; 5618 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 5619 offset += ptoa(object->pg_color); 5620 superpage_offset = offset & PDRMASK; 5621 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 5622 (*addr & PDRMASK) == superpage_offset) 5623 return; 5624 if ((*addr & PDRMASK) < superpage_offset) 5625 *addr = (*addr & ~PDRMASK) + superpage_offset; 5626 else 5627 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 5628} 5629 5630vm_offset_t 5631pmap_quick_enter_page(vm_page_t m) 5632{ 5633 vm_offset_t qaddr; 5634 pt_entry_t *pte; 5635 5636 critical_enter(); 5637 qaddr = PCPU_GET(qmap_addr); 5638 pte = vtopte(qaddr); 5639 5640 KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy")); 5641 *pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 5642 pmap_cache_bits(pmap_page_get_memattr(m), 0); 5643 invlpg(qaddr); 5644 5645 return (qaddr); 5646} 5647 5648void 5649pmap_quick_remove_page(vm_offset_t addr) 5650{ 5651 vm_offset_t qaddr; 5652 pt_entry_t *pte; 5653 5654 qaddr = PCPU_GET(qmap_addr); 5655 pte = vtopte(qaddr); 5656 5657 KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use")); 5658 KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address")); 5659 5660 *pte = 0; 5661 critical_exit(); 5662} 5663 5664#if defined(PMAP_DEBUG) 5665pmap_pid_dump(int pid) 5666{ 5667 pmap_t pmap; 5668 struct proc *p; 5669 int npte = 0; 5670 int index; 5671 5672 sx_slock(&allproc_lock); 5673 FOREACH_PROC_IN_SYSTEM(p) { 5674 if (p->p_pid != pid) 5675 continue; 5676 5677 if (p->p_vmspace) { 5678 int i,j; 5679 index = 0; 5680 pmap = vmspace_pmap(p->p_vmspace); 5681 for (i = 0; i < NPDEPTD; i++) { 5682 pd_entry_t *pde; 5683 pt_entry_t *pte; 5684 vm_offset_t base = i << PDRSHIFT; 5685 5686 pde = &pmap->pm_pdir[i]; 5687 if (pde && pmap_pde_v(pde)) { 5688 for (j = 0; j < NPTEPG; j++) { 5689 vm_offset_t va = base + (j << PAGE_SHIFT); 5690 if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { 5691 if (index) { 5692 index = 0; 5693 printf("\n"); 5694 } 5695 sx_sunlock(&allproc_lock); 5696 return (npte); 5697 } 5698 pte = pmap_pte(pmap, va); 5699 if (pte && pmap_pte_v(pte)) { 5700 pt_entry_t pa; 5701 vm_page_t m; 5702 pa = *pte; 5703 m = PHYS_TO_VM_PAGE(pa & PG_FRAME); 5704 printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", 5705 va, pa, m->hold_count, m->wire_count, m->flags); 5706 npte++; 5707 index++; 5708 if (index >= 2) { 5709 index = 0; 5710 printf("\n"); 5711 } else { 5712 printf(" "); 5713 } 5714 } 5715 } 5716 } 5717 } 5718 } 5719 } 5720 sx_sunlock(&allproc_lock); 5721 return (npte); 5722} 5723#endif 5724
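
/*
 * Example (a hedged sketch only, not part of the pmap itself): the
 * quick-map interface above is intended for short, non-sleeping access to a
 * single physical page, for instance to copy a small buffer into an
 * arbitrary page:
 *
 *	vm_offset_t qva;
 *
 *	qva = pmap_quick_enter_page(m);
 *	bcopy(src, (void *)(qva + off), len);	// off + len <= PAGE_SIZE
 *	pmap_quick_remove_page(qva);
 *
 * pmap_quick_enter_page() enters a critical section and uses the per-CPU
 * qmap_addr slot, so the caller must not sleep, and must not nest another
 * quick mapping, before the matching pmap_quick_remove_page().
 */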