1/*- 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 9 * All rights reserved. 10 * 11 * This code is derived from software contributed to Berkeley by 12 * the Systems Programming Group of the University of Utah Computer 13 * Science Department and William Jolitz of UUNET Technologies Inc. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 3. All advertising materials mentioning features or use of this software 24 * must display the following acknowledgement: 25 * This product includes software developed by the University of 26 * California, Berkeley and its contributors. 27 * 4. Neither the name of the University nor the names of its contributors 28 * may be used to endorse or promote products derived from this software 29 * without specific prior written permission. 30 * 31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 41 * SUCH DAMAGE. 42 * 43 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 44 */ 45/*- 46 * Copyright (c) 2003 Networks Associates Technology, Inc. 47 * All rights reserved. 48 * 49 * This software was developed for the FreeBSD Project by Jake Burkholder, 50 * Safeport Network Services, and Network Associates Laboratories, the 51 * Security Research Division of Network Associates, Inc. under 52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 53 * CHATS research program. 54 * 55 * Redistribution and use in source and binary forms, with or without 56 * modification, are permitted provided that the following conditions 57 * are met: 58 * 1. Redistributions of source code must retain the above copyright 59 * notice, this list of conditions and the following disclaimer. 60 * 2. Redistributions in binary form must reproduce the above copyright 61 * notice, this list of conditions and the following disclaimer in the 62 * documentation and/or other materials provided with the distribution. 63 * 64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 67 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 74 * SUCH DAMAGE. 75 */ 76 77#include <sys/cdefs.h> 78__FBSDID("$FreeBSD: stable/11/sys/i386/i386/pmap.c 351449 2019-08-24 00:35:59Z jhb $"); 79 80/* 81 * Manages physical address maps. 82 * 83 * Since the information managed by this module is 84 * also stored by the logical address mapping module, 85 * this module may throw away valid virtual-to-physical 86 * mappings at almost any time. However, invalidations 87 * of virtual-to-physical mappings must be done as 88 * requested. 89 * 90 * In order to cope with hardware architectures which 91 * make virtual-to-physical map invalidates expensive, 92 * this module may delay invalidate or reduced protection 93 * operations until such time as they are actually 94 * necessary. This module is given full information as 95 * to which processors are currently using which maps, 96 * and to when physical maps must be made correct. 97 */ 98 99#include "opt_apic.h" 100#include "opt_cpu.h" 101#include "opt_pmap.h" 102#include "opt_smp.h" 103#include "opt_vm.h" 104#include "opt_xbox.h" 105 106#include <sys/param.h> 107#include <sys/systm.h> 108#include <sys/kernel.h> 109#include <sys/ktr.h> 110#include <sys/lock.h> 111#include <sys/malloc.h> 112#include <sys/mman.h> 113#include <sys/msgbuf.h> 114#include <sys/mutex.h> 115#include <sys/proc.h> 116#include <sys/rwlock.h> 117#include <sys/sf_buf.h> 118#include <sys/sx.h> 119#include <sys/vmmeter.h> 120#include <sys/sched.h> 121#include <sys/sysctl.h> 122#include <sys/smp.h> 123 124#include <vm/vm.h> 125#include <vm/vm_param.h> 126#include <vm/vm_kern.h> 127#include <vm/vm_page.h> 128#include <vm/vm_map.h> 129#include <vm/vm_object.h> 130#include <vm/vm_extern.h> 131#include <vm/vm_pageout.h> 132#include <vm/vm_pager.h> 133#include <vm/vm_phys.h> 134#include <vm/vm_radix.h> 135#include <vm/vm_reserv.h> 136#include <vm/uma.h> 137 138#ifdef DEV_APIC 139#include <sys/bus.h> 140#include <machine/intr_machdep.h> 141#include <x86/apicvar.h> 142#endif 143#include <machine/cpu.h> 144#include <machine/cputypes.h> 145#include <machine/md_var.h> 146#include <machine/pcb.h> 147#include <machine/specialreg.h> 148#ifdef SMP 149#include <machine/smp.h> 150#endif 151 152#ifdef XBOX 153#include <machine/xbox.h> 154#endif 155 156#ifndef PMAP_SHPGPERPROC 157#define PMAP_SHPGPERPROC 200 158#endif 159 160#if !defined(DIAGNOSTIC) 161#ifdef __GNUC_GNU_INLINE__ 162#define PMAP_INLINE __attribute__((__gnu_inline__)) inline 163#else 164#define PMAP_INLINE extern inline 165#endif 166#else 167#define PMAP_INLINE 168#endif 169 170#ifdef PV_STATS 171#define PV_STAT(x) do { x ; } while (0) 172#else 173#define PV_STAT(x) do { } while (0) 174#endif 175 176#define pa_index(pa) ((pa) >> PDRSHIFT) 177#define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 178 179/* 180 * Get PDEs and PTEs for user/kernel address space 181 */ 182#define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) 183#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) 184 185#define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) 186#define pmap_pte_w(pte) 
((*(int *)pte & PG_W) != 0) 187#define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) 188#define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) 189#define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) 190 191#define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ 192 atomic_clear_int((u_int *)(pte), PG_W)) 193#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) 194 195struct pmap kernel_pmap_store; 196LIST_HEAD(pmaplist, pmap); 197static struct pmaplist allpmaps; 198static struct mtx allpmaps_lock; 199 200vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 201vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 202int pgeflag = 0; /* PG_G or-in */ 203int pseflag = 0; /* PG_PS or-in */ 204 205static int nkpt = NKPT; 206vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR; 207extern u_int32_t KERNend; 208extern u_int32_t KPTphys; 209 210#if defined(PAE) || defined(PAE_TABLES) 211pt_entry_t pg_nx; 212static uma_zone_t pdptzone; 213#endif 214 215static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); 216 217static int pat_works = 1; 218SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 0, 219 "Is page attribute table fully functional?"); 220 221static int pg_ps_enabled = 1; 222SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, 223 &pg_ps_enabled, 0, "Are large page mappings enabled?"); 224 225#define PAT_INDEX_SIZE 8 226static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ 227 228/* 229 * pmap_mapdev support pre initialization (i.e. console) 230 */ 231#define PMAP_PREINIT_MAPPING_COUNT 8 232static struct pmap_preinit_mapping { 233 vm_paddr_t pa; 234 vm_offset_t va; 235 vm_size_t sz; 236 int mode; 237} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; 238static int pmap_initialized; 239 240static struct rwlock_padalign pvh_global_lock; 241 242/* 243 * Data for the pv entry allocation mechanism 244 */ 245static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 246static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; 247static struct md_page *pv_table; 248static int shpgperproc = PMAP_SHPGPERPROC; 249 250struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ 251int pv_maxchunks; /* How many chunks we have KVA for */ 252vm_offset_t pv_vafree; /* freelist stored in the PTE */ 253 254/* 255 * All those kernel PT submaps that BSD is so fond of 256 */ 257pt_entry_t *CMAP3; 258static pd_entry_t *KPTD; 259caddr_t ptvmmap = 0; 260caddr_t CADDR3; 261struct msgbuf *msgbufp = NULL; 262 263/* 264 * Crashdump maps. 265 */ 266static caddr_t crashdumpmap; 267 268static pt_entry_t *PMAP1 = NULL, *PMAP2; 269static pt_entry_t *PADDR1 = NULL, *PADDR2; 270#ifdef SMP 271static int PMAP1cpu; 272static int PMAP1changedcpu; 273SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, 274 &PMAP1changedcpu, 0, 275 "Number of times pmap_pte_quick changed CPU with same PMAP1"); 276#endif 277static int PMAP1changed; 278SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, 279 &PMAP1changed, 0, 280 "Number of times pmap_pte_quick changed PMAP1"); 281static int PMAP1unchanged; 282SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, 283 &PMAP1unchanged, 0, 284 "Number of times pmap_pte_quick didn't change PMAP1"); 285static struct mtx PMAP2mutex; 286 287int pti; 288 289/* 290 * Internal flags for pmap_mapdev_internal(). 291 */ 292#define MAPDEV_SETATTR 0x0000001 /* Modify existing attrs. 
*/ 293 294static void free_pv_chunk(struct pv_chunk *pc); 295static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 296static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); 297static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); 298static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); 299#if VM_NRESERVLEVEL > 0 300static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); 301#endif 302static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 303static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 304 vm_offset_t va); 305static int pmap_pvh_wired_mappings(struct md_page *pvh, int count); 306 307static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 308static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, 309 vm_prot_t prot); 310static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 311 vm_page_t m, vm_prot_t prot, vm_page_t mpte); 312static void pmap_flush_page(vm_page_t m); 313static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); 314static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, 315 pd_entry_t pde); 316static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); 317static boolean_t pmap_is_modified_pvh(struct md_page *pvh); 318static boolean_t pmap_is_referenced_pvh(struct md_page *pvh); 319static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); 320static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde); 321static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); 322#if VM_NRESERVLEVEL > 0 323static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 324#endif 325static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, 326 vm_prot_t prot); 327static void pmap_pte_attr(pt_entry_t *pte, int cache_bits); 328static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 329 struct spglist *free); 330static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, 331 struct spglist *free); 332static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); 333static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, 334 struct spglist *free); 335static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, 336 vm_offset_t va); 337static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); 338static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 339 vm_page_t m); 340static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 341 pd_entry_t newpde); 342static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); 343 344static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); 345 346static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags); 347static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free); 348static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); 349static void pmap_pte_release(pt_entry_t *pte); 350static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *); 351#if defined(PAE) || defined(PAE_TABLES) 352static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, 353 int wait); 354#endif 355static void pmap_set_pg(void); 356 357static __inline void pagezero(void *page); 358 359CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); 360CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); 361 362/* 363 * If you get an error here, then you set KVA_PAGES wrong! 
See the 364 * description of KVA_PAGES in sys/i386/include/pmap.h. It must be 365 * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE. 366 */ 367CTASSERT(KERNBASE % (1 << 24) == 0); 368 369/* 370 * Bootstrap the system enough to run with virtual memory. 371 * 372 * On the i386 this is called after mapping has already been enabled 373 * and just syncs the pmap module with what has already been done. 374 * [We can't call it easily with mapping off since the kernel is not 375 * mapped with PA == VA, hence we would have to relocate every address 376 * from the linked base (virtual) address "KERNBASE" to the actual 377 * (physical) address starting relative to 0] 378 */ 379void 380pmap_bootstrap(vm_paddr_t firstaddr) 381{ 382 vm_offset_t va; 383 pt_entry_t *pte, *unused; 384 struct pcpu *pc; 385 u_long res; 386 int i; 387 388 res = atop(firstaddr - (vm_paddr_t)KERNLOAD); 389 390 /* 391 * Add a physical memory segment (vm_phys_seg) corresponding to the 392 * preallocated kernel page table pages so that vm_page structures 393 * representing these pages will be created. The vm_page structures 394 * are required for promotion of the corresponding kernel virtual 395 * addresses to superpage mappings. 396 */ 397 vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); 398 399 /* 400 * Initialize the first available kernel virtual address. However, 401 * using "firstaddr" may waste a few pages of the kernel virtual 402 * address space, because locore may not have mapped every physical 403 * page that it allocated. Preferably, locore would provide a first 404 * unused virtual address in addition to "firstaddr". 405 */ 406 virtual_avail = (vm_offset_t) KERNBASE + firstaddr; 407 virtual_end = VM_MAX_KERNEL_ADDRESS; 408 409 /* 410 * Initialize the kernel pmap (which is statically allocated). 411 * Count bootstrap data as being resident in case any of this data is 412 * later unmapped (using pmap_remove()) and freed. 413 */ 414 PMAP_LOCK_INIT(kernel_pmap); 415 kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); 416#if defined(PAE) || defined(PAE_TABLES) 417 kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); 418#endif 419 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ 420 kernel_pmap->pm_stats.resident_count = res; 421 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 422 423 /* 424 * Initialize the global pv list lock. 425 */ 426 rw_init(&pvh_global_lock, "pmap pv global"); 427 428 LIST_INIT(&allpmaps); 429 430 /* 431 * Request a spin mutex so that changes to allpmaps cannot be 432 * preempted by smp_rendezvous_cpus(). Otherwise, 433 * pmap_update_pde_kernel() could access allpmaps while it is 434 * being changed. 435 */ 436 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); 437 mtx_lock_spin(&allpmaps_lock); 438 LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); 439 mtx_unlock_spin(&allpmaps_lock); 440 441 /* 442 * Reserve some special page table entries/VA space for temporary 443 * mapping of pages. 444 */ 445#define SYSMAP(c, p, v, n) \ 446 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 447 448 va = virtual_avail; 449 pte = vtopte(va); 450 451 452 /* 453 * Initialize temporary map objects on the current CPU for use 454 * during early boot. 455 * CMAP1/CMAP2 are used for zeroing and copying pages. 456 * CMAP3 is used for the idle process page zeroing. 
457 */ 458 pc = get_pcpu(); 459 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 460 SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1) 461 SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1) 462 SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1) 463 464 SYSMAP(caddr_t, CMAP3, CADDR3, 1) 465 466 /* 467 * Crashdump maps. 468 */ 469 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) 470 471 /* 472 * ptvmmap is used for reading arbitrary physical pages via /dev/mem. 473 */ 474 SYSMAP(caddr_t, unused, ptvmmap, 1) 475 476 /* 477 * msgbufp is used to map the system message buffer. 478 */ 479 SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize))) 480 481 /* 482 * KPTmap is used by pmap_kextract(). 483 * 484 * KPTmap is first initialized by locore. However, that initial 485 * KPTmap can only support NKPT page table pages. Here, a larger 486 * KPTmap is created that can support KVA_PAGES page table pages. 487 */ 488 SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES) 489 490 for (i = 0; i < NKPT; i++) 491 KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V; 492 493 /* 494 * Adjust the start of the KPTD and KPTmap so that the implementation 495 * of pmap_kextract() and pmap_growkernel() can be made simpler. 496 */ 497 KPTD -= KPTDI; 498 KPTmap -= i386_btop(KPTDI << PDRSHIFT); 499 500 /* 501 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(), 502 * respectively. 503 */ 504 SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1) 505 SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1) 506 507 mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); 508 509 virtual_avail = va; 510 511 /* 512 * Leave in place an identity mapping (virt == phys) for the low 1 MB 513 * physical memory region that is used by the ACPI wakeup code. This 514 * mapping must not have PG_G set. 515 */ 516#ifdef XBOX 517 /* FIXME: This is gross, but needed for the XBOX. Since we are in such 518 * an early stadium, we cannot yet neatly map video memory ... :-( 519 * Better fixes are very welcome! */ 520 if (!arch_i386_is_xbox) 521#endif 522 for (i = 1; i < NKPT; i++) 523 PTD[i] = 0; 524 525 /* 526 * Initialize the PAT MSR if present. 527 * pmap_init_pat() clears and sets CR4_PGE, which, as a 528 * side-effect, invalidates stale PG_G TLB entries that might 529 * have been created in our pre-boot environment. We assume 530 * that PAT support implies PGE and in reverse, PGE presence 531 * comes with PAT. Both features were added for Pentium Pro. 532 */ 533 pmap_init_pat(); 534 535 /* Turn on PG_G on kernel page(s) */ 536 pmap_set_pg(); 537} 538 539static void 540pmap_init_reserved_pages(void) 541{ 542 struct pcpu *pc; 543 vm_offset_t pages; 544 int i; 545 546 CPU_FOREACH(i) { 547 pc = pcpu_find(i); 548 /* 549 * Skip if the mapping has already been initialized, 550 * i.e. this is the BSP. 551 */ 552 if (pc->pc_cmap_addr1 != 0) 553 continue; 554 mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); 555 pages = kva_alloc(PAGE_SIZE * 3); 556 if (pages == 0) 557 panic("%s: unable to allocate KVA", __func__); 558 pc->pc_cmap_pte1 = vtopte(pages); 559 pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE); 560 pc->pc_cmap_addr1 = (caddr_t)pages; 561 pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE); 562 pc->pc_qmap_addr = pages + (PAGE_SIZE * 2); 563 } 564} 565 566SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); 567 568/* 569 * Setup the PAT MSR. 570 */ 571void 572pmap_init_pat(void) 573{ 574 int pat_table[PAT_INDEX_SIZE]; 575 uint64_t pat_msr; 576 u_long cr0, cr4; 577 int i; 578 579 /* Set default PAT index table. 
*/ 580 for (i = 0; i < PAT_INDEX_SIZE; i++) 581 pat_table[i] = -1; 582 pat_table[PAT_WRITE_BACK] = 0; 583 pat_table[PAT_WRITE_THROUGH] = 1; 584 pat_table[PAT_UNCACHEABLE] = 3; 585 pat_table[PAT_WRITE_COMBINING] = 3; 586 pat_table[PAT_WRITE_PROTECTED] = 3; 587 pat_table[PAT_UNCACHED] = 3; 588 589 /* 590 * Bail if this CPU doesn't implement PAT. 591 * We assume that PAT support implies PGE. 592 */ 593 if ((cpu_feature & CPUID_PAT) == 0) { 594 for (i = 0; i < PAT_INDEX_SIZE; i++) 595 pat_index[i] = pat_table[i]; 596 pat_works = 0; 597 return; 598 } 599 600 /* 601 * Due to some Intel errata, we can only safely use the lower 4 602 * PAT entries. 603 * 604 * Intel Pentium III Processor Specification Update 605 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B 606 * or Mode C Paging) 607 * 608 * Intel Pentium IV Processor Specification Update 609 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) 610 */ 611 if (cpu_vendor_id == CPU_VENDOR_INTEL && 612 !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) 613 pat_works = 0; 614 615 /* Initialize default PAT entries. */ 616 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | 617 PAT_VALUE(1, PAT_WRITE_THROUGH) | 618 PAT_VALUE(2, PAT_UNCACHED) | 619 PAT_VALUE(3, PAT_UNCACHEABLE) | 620 PAT_VALUE(4, PAT_WRITE_BACK) | 621 PAT_VALUE(5, PAT_WRITE_THROUGH) | 622 PAT_VALUE(6, PAT_UNCACHED) | 623 PAT_VALUE(7, PAT_UNCACHEABLE); 624 625 if (pat_works) { 626 /* 627 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. 628 * Program 5 and 6 as WP and WC. 629 * Leave 4 and 7 as WB and UC. 630 */ 631 pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); 632 pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | 633 PAT_VALUE(6, PAT_WRITE_COMBINING); 634 pat_table[PAT_UNCACHED] = 2; 635 pat_table[PAT_WRITE_PROTECTED] = 5; 636 pat_table[PAT_WRITE_COMBINING] = 6; 637 } else { 638 /* 639 * Just replace PAT Index 2 with WC instead of UC-. 640 */ 641 pat_msr &= ~PAT_MASK(2); 642 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); 643 pat_table[PAT_WRITE_COMBINING] = 2; 644 } 645 646 /* Disable PGE. */ 647 cr4 = rcr4(); 648 load_cr4(cr4 & ~CR4_PGE); 649 650 /* Disable caches (CD = 1, NW = 0). */ 651 cr0 = rcr0(); 652 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 653 654 /* Flushes caches and TLBs. */ 655 wbinvd(); 656 invltlb(); 657 658 /* Update PAT and index table. */ 659 wrmsr(MSR_PAT, pat_msr); 660 for (i = 0; i < PAT_INDEX_SIZE; i++) 661 pat_index[i] = pat_table[i]; 662 663 /* Flush caches and TLBs again. */ 664 wbinvd(); 665 invltlb(); 666 667 /* Restore caches and PGE. */ 668 load_cr0(cr0); 669 load_cr4(cr4); 670} 671 672/* 673 * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. 674 */ 675static void 676pmap_set_pg(void) 677{ 678 pt_entry_t *pte; 679 vm_offset_t va, endva; 680 681 if (pgeflag == 0) 682 return; 683 684 endva = KERNBASE + KERNend; 685 686 if (pseflag) { 687 va = KERNBASE + KERNLOAD; 688 while (va < endva) { 689 pdir_pde(PTD, va) |= pgeflag; 690 invltlb(); /* Flush non-PG_G entries. */ 691 va += NBPDR; 692 } 693 } else { 694 va = (vm_offset_t)btext; 695 while (va < endva) { 696 pte = vtopte(va); 697 if (*pte) 698 *pte |= pgeflag; 699 invltlb(); /* Flush non-PG_G entries. */ 700 va += PAGE_SIZE; 701 } 702 } 703} 704 705/* 706 * Initialize a vm_page's machine-dependent fields. 
707 */ 708void 709pmap_page_init(vm_page_t m) 710{ 711 712 TAILQ_INIT(&m->md.pv_list); 713 m->md.pat_mode = PAT_WRITE_BACK; 714} 715 716#if defined(PAE) || defined(PAE_TABLES) 717static void * 718pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) 719{ 720 721 /* Inform UMA that this allocator uses kernel_map/object. */ 722 *flags = UMA_SLAB_KERNEL; 723 return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL, 724 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); 725} 726#endif 727 728/* 729 * Abuse the pte nodes for unmapped kva to thread a kva freelist through. 730 * Requirements: 731 * - Must deal with pages in order to ensure that none of the PG_* bits 732 * are ever set, PG_V in particular. 733 * - Assumes we can write to ptes without pte_store() atomic ops, even 734 * on PAE systems. This should be ok. 735 * - Assumes nothing will ever test these addresses for 0 to indicate 736 * no mapping instead of correctly checking PG_V. 737 * - Assumes a vm_offset_t will fit in a pte (true for i386). 738 * Because PG_V is never set, there can be no mappings to invalidate. 739 */ 740static vm_offset_t 741pmap_ptelist_alloc(vm_offset_t *head) 742{ 743 pt_entry_t *pte; 744 vm_offset_t va; 745 746 va = *head; 747 if (va == 0) 748 panic("pmap_ptelist_alloc: exhausted ptelist KVA"); 749 pte = vtopte(va); 750 *head = *pte; 751 if (*head & PG_V) 752 panic("pmap_ptelist_alloc: va with PG_V set!"); 753 *pte = 0; 754 return (va); 755} 756 757static void 758pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) 759{ 760 pt_entry_t *pte; 761 762 if (va & PG_V) 763 panic("pmap_ptelist_free: freeing va with PG_V set!"); 764 pte = vtopte(va); 765 *pte = *head; /* virtual! PG_V is 0 though */ 766 *head = va; 767} 768 769static void 770pmap_ptelist_init(vm_offset_t *head, void *base, int npages) 771{ 772 int i; 773 vm_offset_t va; 774 775 *head = 0; 776 for (i = npages - 1; i >= 0; i--) { 777 va = (vm_offset_t)base + i * PAGE_SIZE; 778 pmap_ptelist_free(head, va); 779 } 780} 781 782 783/* 784 * Initialize the pmap module. 785 * Called by vm_init, to initialize any structures that the pmap 786 * system needs to map virtual memory. 787 */ 788void 789pmap_init(void) 790{ 791 struct pmap_preinit_mapping *ppim; 792 vm_page_t mpte; 793 vm_size_t s; 794 int i, pv_npg; 795 796 /* 797 * Initialize the vm page array entries for the kernel pmap's 798 * page table pages. 799 */ 800 PMAP_LOCK(kernel_pmap); 801 for (i = 0; i < NKPT; i++) { 802 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); 803 KASSERT(mpte >= vm_page_array && 804 mpte < &vm_page_array[vm_page_array_size], 805 ("pmap_init: page table page is out of range")); 806 mpte->pindex = i + KPTDI; 807 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); 808 mpte->wire_count = 1; 809 if (pseflag != 0 && 810 KERNBASE <= i << PDRSHIFT && i << PDRSHIFT < KERNend && 811 pmap_insert_pt_page(kernel_pmap, mpte)) 812 panic("pmap_init: pmap_insert_pt_page failed"); 813 } 814 PMAP_UNLOCK(kernel_pmap); 815 atomic_add_int(&vm_cnt.v_wire_count, NKPT); 816 817 /* 818 * Initialize the address space (zone) for the pv entries. Set a 819 * high water mark so that the system can recover from excessive 820 * numbers of pv entries. 
821 */ 822 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 823 pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; 824 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 825 pv_entry_max = roundup(pv_entry_max, _NPCPV); 826 pv_entry_high_water = 9 * (pv_entry_max / 10); 827 828 /* 829 * If the kernel is running on a virtual machine, then it must assume 830 * that MCA is enabled by the hypervisor. Moreover, the kernel must 831 * be prepared for the hypervisor changing the vendor and family that 832 * are reported by CPUID. Consequently, the workaround for AMD Family 833 * 10h Erratum 383 is enabled if the processor's feature set does not 834 * include at least one feature that is only supported by older Intel 835 * or newer AMD processors. 836 */ 837 if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && 838 (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | 839 CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | 840 AMDID2_FMA4)) == 0) 841 workaround_erratum383 = 1; 842 843 /* 844 * Are large page mappings supported and enabled? 845 */ 846 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); 847 if (pseflag == 0) 848 pg_ps_enabled = 0; 849 else if (pg_ps_enabled) { 850 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 851 ("pmap_init: can't assign to pagesizes[1]")); 852 pagesizes[1] = NBPDR; 853 } 854 855 /* 856 * Calculate the size of the pv head table for superpages. 857 * Handle the possibility that "vm_phys_segs[...].end" is zero. 858 */ 859 pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end - 860 PAGE_SIZE) / NBPDR + 1; 861 862 /* 863 * Allocate memory for the pv head table for superpages. 864 */ 865 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 866 s = round_page(s); 867 pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, 868 M_WAITOK | M_ZERO); 869 for (i = 0; i < pv_npg; i++) 870 TAILQ_INIT(&pv_table[i].pv_list); 871 872 pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); 873 pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); 874 if (pv_chunkbase == NULL) 875 panic("pmap_init: not enough kvm for pv chunks"); 876 pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); 877#if defined(PAE) || defined(PAE_TABLES) 878 pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, 879 NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, 880 UMA_ZONE_VM | UMA_ZONE_NOFREE); 881 uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); 882#endif 883 884 pmap_initialized = 1; 885 if (!bootverbose) 886 return; 887 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 888 ppim = pmap_preinit_mapping + i; 889 if (ppim->va == 0) 890 continue; 891 printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i, 892 (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode); 893 } 894} 895 896 897SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, 898 "Max number of PV entries"); 899SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, 900 "Page share factor per proc"); 901 902static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, 903 "2/4MB page mapping counters"); 904 905static u_long pmap_pde_demotions; 906SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, 907 &pmap_pde_demotions, 0, "2/4MB page demotions"); 908 909static u_long pmap_pde_mappings; 910SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, 911 &pmap_pde_mappings, 0, "2/4MB page mappings"); 912 913static u_long pmap_pde_p_failures; 914SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, 
CTLFLAG_RD, 915 &pmap_pde_p_failures, 0, "2/4MB page promotion failures"); 916 917static u_long pmap_pde_promotions; 918SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, 919 &pmap_pde_promotions, 0, "2/4MB page promotions"); 920 921/*************************************************** 922 * Low level helper routines..... 923 ***************************************************/ 924 925/* 926 * Determine the appropriate bits to set in a PTE or PDE for a specified 927 * caching mode. 928 */ 929int 930pmap_cache_bits(int mode, boolean_t is_pde) 931{ 932 int cache_bits, pat_flag, pat_idx; 933 934 if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) 935 panic("Unknown caching mode %d\n", mode); 936 937 /* The PAT bit is different for PTE's and PDE's. */ 938 pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; 939 940 /* Map the caching mode to a PAT index. */ 941 pat_idx = pat_index[mode]; 942 943 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ 944 cache_bits = 0; 945 if (pat_idx & 0x4) 946 cache_bits |= pat_flag; 947 if (pat_idx & 0x2) 948 cache_bits |= PG_NC_PCD; 949 if (pat_idx & 0x1) 950 cache_bits |= PG_NC_PWT; 951 return (cache_bits); 952} 953 954/* 955 * The caller is responsible for maintaining TLB consistency. 956 */ 957static void 958pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde) 959{ 960 pd_entry_t *pde; 961 pmap_t pmap; 962 boolean_t PTD_updated; 963 964 PTD_updated = FALSE; 965 mtx_lock_spin(&allpmaps_lock); 966 LIST_FOREACH(pmap, &allpmaps, pm_list) { 967 if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & 968 PG_FRAME)) 969 PTD_updated = TRUE; 970 pde = pmap_pde(pmap, va); 971 pde_store(pde, newpde); 972 } 973 mtx_unlock_spin(&allpmaps_lock); 974 KASSERT(PTD_updated, 975 ("pmap_kenter_pde: current page table is not in allpmaps")); 976} 977 978/* 979 * After changing the page size for the specified virtual address in the page 980 * table, flush the corresponding entries from the processor's TLB. Only the 981 * calling processor's TLB is affected. 982 * 983 * The calling thread must be pinned to a processor. 984 */ 985static void 986pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) 987{ 988 u_long cr4; 989 990 if ((newpde & PG_PS) == 0) 991 /* Demotion: flush a specific 2MB page mapping. */ 992 invlpg(va); 993 else if ((newpde & PG_G) == 0) 994 /* 995 * Promotion: flush every 4KB page mapping from the TLB 996 * because there are too many to flush individually. 997 */ 998 invltlb(); 999 else { 1000 /* 1001 * Promotion: flush every 4KB page mapping from the TLB, 1002 * including any global (PG_G) mappings. 1003 */ 1004 cr4 = rcr4(); 1005 load_cr4(cr4 & ~CR4_PGE); 1006 /* 1007 * Although preemption at this point could be detrimental to 1008 * performance, it would not lead to an error. PG_G is simply 1009 * ignored if CR4.PGE is clear. Moreover, in case this block 1010 * is re-entered, the load_cr4() either above or below will 1011 * modify CR4.PGE flushing the TLB. 1012 */ 1013 load_cr4(cr4 | CR4_PGE); 1014 } 1015} 1016 1017void 1018invltlb_glob(void) 1019{ 1020 uint64_t cr4; 1021 1022 if (pgeflag == 0) { 1023 invltlb(); 1024 } else { 1025 cr4 = rcr4(); 1026 load_cr4(cr4 & ~CR4_PGE); 1027 load_cr4(cr4 | CR4_PGE); 1028 } 1029} 1030 1031 1032#ifdef SMP 1033/* 1034 * For SMP, these functions have to use the IPI mechanism for coherence. 1035 * 1036 * N.B.: Before calling any of the following TLB invalidation functions, 1037 * the calling processor must ensure that all stores updating a non- 1038 * kernel page table are globally performed. 
Otherwise, another 1039 * processor could cache an old, pre-update entry without being 1040 * invalidated. This can happen one of two ways: (1) The pmap becomes 1041 * active on another processor after its pm_active field is checked by 1042 * one of the following functions but before a store updating the page 1043 * table is globally performed. (2) The pmap becomes active on another 1044 * processor before its pm_active field is checked but due to 1045 * speculative loads one of the following functions stills reads the 1046 * pmap as inactive on the other processor. 1047 * 1048 * The kernel page table is exempt because its pm_active field is 1049 * immutable. The kernel page table is always active on every 1050 * processor. 1051 */ 1052void 1053pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1054{ 1055 cpuset_t *mask, other_cpus; 1056 u_int cpuid; 1057 1058 sched_pin(); 1059 if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { 1060 invlpg(va); 1061 mask = &all_cpus; 1062 } else { 1063 cpuid = PCPU_GET(cpuid); 1064 other_cpus = all_cpus; 1065 CPU_CLR(cpuid, &other_cpus); 1066 if (CPU_ISSET(cpuid, &pmap->pm_active)) 1067 invlpg(va); 1068 CPU_AND(&other_cpus, &pmap->pm_active); 1069 mask = &other_cpus; 1070 } 1071 smp_masked_invlpg(*mask, va, pmap); 1072 sched_unpin(); 1073} 1074 1075/* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ 1076#define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) 1077 1078void 1079pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1080{ 1081 cpuset_t *mask, other_cpus; 1082 vm_offset_t addr; 1083 u_int cpuid; 1084 1085 if (eva - sva >= PMAP_INVLPG_THRESHOLD) { 1086 pmap_invalidate_all(pmap); 1087 return; 1088 } 1089 1090 sched_pin(); 1091 if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { 1092 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1093 invlpg(addr); 1094 mask = &all_cpus; 1095 } else { 1096 cpuid = PCPU_GET(cpuid); 1097 other_cpus = all_cpus; 1098 CPU_CLR(cpuid, &other_cpus); 1099 if (CPU_ISSET(cpuid, &pmap->pm_active)) 1100 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1101 invlpg(addr); 1102 CPU_AND(&other_cpus, &pmap->pm_active); 1103 mask = &other_cpus; 1104 } 1105 smp_masked_invlpg_range(*mask, sva, eva, pmap); 1106 sched_unpin(); 1107} 1108 1109void 1110pmap_invalidate_all(pmap_t pmap) 1111{ 1112 cpuset_t *mask, other_cpus; 1113 u_int cpuid; 1114 1115 sched_pin(); 1116 if (pmap == kernel_pmap) { 1117 invltlb_glob(); 1118 mask = &all_cpus; 1119 } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { 1120 invltlb(); 1121 mask = &all_cpus; 1122 } else { 1123 cpuid = PCPU_GET(cpuid); 1124 other_cpus = all_cpus; 1125 CPU_CLR(cpuid, &other_cpus); 1126 if (CPU_ISSET(cpuid, &pmap->pm_active)) 1127 invltlb(); 1128 CPU_AND(&other_cpus, &pmap->pm_active); 1129 mask = &other_cpus; 1130 } 1131 smp_masked_invltlb(*mask, pmap); 1132 sched_unpin(); 1133} 1134 1135void 1136pmap_invalidate_cache(void) 1137{ 1138 1139 sched_pin(); 1140 wbinvd(); 1141 smp_cache_flush(); 1142 sched_unpin(); 1143} 1144 1145struct pde_action { 1146 cpuset_t invalidate; /* processors that invalidate their TLB */ 1147 vm_offset_t va; 1148 pd_entry_t *pde; 1149 pd_entry_t newpde; 1150 u_int store; /* processor that updates the PDE */ 1151}; 1152 1153static void 1154pmap_update_pde_kernel(void *arg) 1155{ 1156 struct pde_action *act = arg; 1157 pd_entry_t *pde; 1158 pmap_t pmap; 1159 1160 if (act->store == PCPU_GET(cpuid)) { 1161 1162 /* 1163 * Elsewhere, this operation requires allpmaps_lock for 1164 * synchronization. 
Here, it does not because it is being 1165 * performed in the context of an all_cpus rendezvous. 1166 */ 1167 LIST_FOREACH(pmap, &allpmaps, pm_list) { 1168 pde = pmap_pde(pmap, act->va); 1169 pde_store(pde, act->newpde); 1170 } 1171 } 1172} 1173 1174static void 1175pmap_update_pde_user(void *arg) 1176{ 1177 struct pde_action *act = arg; 1178 1179 if (act->store == PCPU_GET(cpuid)) 1180 pde_store(act->pde, act->newpde); 1181} 1182 1183static void 1184pmap_update_pde_teardown(void *arg) 1185{ 1186 struct pde_action *act = arg; 1187 1188 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) 1189 pmap_update_pde_invalidate(act->va, act->newpde); 1190} 1191 1192/* 1193 * Change the page size for the specified virtual address in a way that 1194 * prevents any possibility of the TLB ever having two entries that map the 1195 * same virtual address using different page sizes. This is the recommended 1196 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a 1197 * machine check exception for a TLB state that is improperly diagnosed as a 1198 * hardware error. 1199 */ 1200static void 1201pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1202{ 1203 struct pde_action act; 1204 cpuset_t active, other_cpus; 1205 u_int cpuid; 1206 1207 sched_pin(); 1208 cpuid = PCPU_GET(cpuid); 1209 other_cpus = all_cpus; 1210 CPU_CLR(cpuid, &other_cpus); 1211 if (pmap == kernel_pmap) 1212 active = all_cpus; 1213 else 1214 active = pmap->pm_active; 1215 if (CPU_OVERLAP(&active, &other_cpus)) { 1216 act.store = cpuid; 1217 act.invalidate = active; 1218 act.va = va; 1219 act.pde = pde; 1220 act.newpde = newpde; 1221 CPU_SET(cpuid, &active); 1222 smp_rendezvous_cpus(active, 1223 smp_no_rendezvous_barrier, pmap == kernel_pmap ? 1224 pmap_update_pde_kernel : pmap_update_pde_user, 1225 pmap_update_pde_teardown, &act); 1226 } else { 1227 if (pmap == kernel_pmap) 1228 pmap_kenter_pde(va, newpde); 1229 else 1230 pde_store(pde, newpde); 1231 if (CPU_ISSET(cpuid, &active)) 1232 pmap_update_pde_invalidate(va, newpde); 1233 } 1234 sched_unpin(); 1235} 1236#else /* !SMP */ 1237/* 1238 * Normal, non-SMP, 486+ invalidation functions. 1239 * We inline these within pmap.c for speed. 
1240 */ 1241PMAP_INLINE void 1242pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1243{ 1244 1245 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1246 invlpg(va); 1247} 1248 1249PMAP_INLINE void 1250pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1251{ 1252 vm_offset_t addr; 1253 1254 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1255 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1256 invlpg(addr); 1257} 1258 1259PMAP_INLINE void 1260pmap_invalidate_all(pmap_t pmap) 1261{ 1262 1263 if (pmap == kernel_pmap) 1264 invltlb_glob(); 1265 else if (!CPU_EMPTY(&pmap->pm_active)) 1266 invltlb(); 1267} 1268 1269PMAP_INLINE void 1270pmap_invalidate_cache(void) 1271{ 1272 1273 wbinvd(); 1274} 1275 1276static void 1277pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1278{ 1279 1280 if (pmap == kernel_pmap) 1281 pmap_kenter_pde(va, newpde); 1282 else 1283 pde_store(pde, newpde); 1284 if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) 1285 pmap_update_pde_invalidate(va, newpde); 1286} 1287#endif /* !SMP */ 1288 1289static void 1290pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) 1291{ 1292 1293 /* 1294 * When the PDE has PG_PROMOTED set, the 2- or 4MB page mapping was 1295 * created by a promotion that did not invalidate the 512 or 1024 4KB 1296 * page mappings that might exist in the TLB. Consequently, at this 1297 * point, the TLB may hold both 4KB and 2- or 4MB page mappings for 1298 * the address range [va, va + NBPDR). Therefore, the entire range 1299 * must be invalidated here. In contrast, when PG_PROMOTED is clear, 1300 * the TLB will not hold any 4KB page mappings for the address range 1301 * [va, va + NBPDR), and so a single INVLPG suffices to invalidate the 1302 * 2- or 4MB page mapping from the TLB. 1303 */ 1304 if ((pde & PG_PROMOTED) != 0) 1305 pmap_invalidate_range(pmap, va, va + NBPDR - 1); 1306 else 1307 pmap_invalidate_page(pmap, va); 1308} 1309 1310#define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) 1311 1312void 1313pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) 1314{ 1315 1316 if (force) { 1317 sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); 1318 } else { 1319 KASSERT((sva & PAGE_MASK) == 0, 1320 ("pmap_invalidate_cache_range: sva not page-aligned")); 1321 KASSERT((eva & PAGE_MASK) == 0, 1322 ("pmap_invalidate_cache_range: eva not page-aligned")); 1323 } 1324 1325 if ((cpu_feature & CPUID_SS) != 0 && !force) 1326 ; /* If "Self Snoop" is supported and allowed, do nothing. */ 1327 else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 && 1328 eva - sva < PMAP_CLFLUSH_THRESHOLD) { 1329#ifdef DEV_APIC 1330 /* 1331 * XXX: Some CPUs fault, hang, or trash the local APIC 1332 * registers if we use CLFLUSH on the local APIC 1333 * range. The local APIC is always uncached, so we 1334 * don't need to flush for that range anyway. 1335 */ 1336 if (pmap_kextract(sva) == lapic_paddr) 1337 return; 1338#endif 1339 /* 1340 * Otherwise, do per-cache line flush. Use the sfence 1341 * instruction to insure that previous stores are 1342 * included in the write-back. The processor 1343 * propagates flush to other processors in the cache 1344 * coherence domain. 
1345 */ 1346 sfence(); 1347 for (; sva < eva; sva += cpu_clflush_line_size) 1348 clflushopt(sva); 1349 sfence(); 1350 } else if ((cpu_feature & CPUID_CLFSH) != 0 && 1351 eva - sva < PMAP_CLFLUSH_THRESHOLD) { 1352#ifdef DEV_APIC 1353 if (pmap_kextract(sva) == lapic_paddr) 1354 return; 1355#endif 1356 /* 1357 * Writes are ordered by CLFLUSH on Intel CPUs. 1358 */ 1359 if (cpu_vendor_id != CPU_VENDOR_INTEL) 1360 mfence(); 1361 for (; sva < eva; sva += cpu_clflush_line_size) 1362 clflush(sva); 1363 if (cpu_vendor_id != CPU_VENDOR_INTEL) 1364 mfence(); 1365 } else { 1366 1367 /* 1368 * No targeted cache flush methods are supported by CPU, 1369 * or the supplied range is bigger than 2MB. 1370 * Globally invalidate cache. 1371 */ 1372 pmap_invalidate_cache(); 1373 } 1374} 1375 1376void 1377pmap_invalidate_cache_pages(vm_page_t *pages, int count) 1378{ 1379 int i; 1380 1381 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 1382 (cpu_feature & CPUID_CLFSH) == 0) { 1383 pmap_invalidate_cache(); 1384 } else { 1385 for (i = 0; i < count; i++) 1386 pmap_flush_page(pages[i]); 1387 } 1388} 1389 1390/* 1391 * Are we current address space or kernel? 1392 */ 1393static __inline int 1394pmap_is_current(pmap_t pmap) 1395{ 1396 1397 return (pmap == kernel_pmap || pmap == 1398 vmspace_pmap(curthread->td_proc->p_vmspace)); 1399} 1400 1401/* 1402 * If the given pmap is not the current or kernel pmap, the returned pte must 1403 * be released by passing it to pmap_pte_release(). 1404 */ 1405pt_entry_t * 1406pmap_pte(pmap_t pmap, vm_offset_t va) 1407{ 1408 pd_entry_t newpf; 1409 pd_entry_t *pde; 1410 1411 pde = pmap_pde(pmap, va); 1412 if (*pde & PG_PS) 1413 return (pde); 1414 if (*pde != 0) { 1415 /* are we current address space or kernel? */ 1416 if (pmap_is_current(pmap)) 1417 return (vtopte(va)); 1418 mtx_lock(&PMAP2mutex); 1419 newpf = *pde & PG_FRAME; 1420 if ((*PMAP2 & PG_FRAME) != newpf) { 1421 *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; 1422 pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); 1423 } 1424 return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); 1425 } 1426 return (NULL); 1427} 1428 1429/* 1430 * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte 1431 * being NULL. 1432 */ 1433static __inline void 1434pmap_pte_release(pt_entry_t *pte) 1435{ 1436 1437 if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) 1438 mtx_unlock(&PMAP2mutex); 1439} 1440 1441/* 1442 * NB: The sequence of updating a page table followed by accesses to the 1443 * corresponding pages is subject to the situation described in the "AMD64 1444 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23, 1445 * "7.3.1 Special Coherency Considerations". Therefore, issuing the INVLPG 1446 * right after modifying the PTE bits is crucial. 1447 */ 1448static __inline void 1449invlcaddr(void *caddr) 1450{ 1451 1452 invlpg((u_int)caddr); 1453} 1454 1455/* 1456 * Super fast pmap_pte routine best used when scanning 1457 * the pv lists. This eliminates many coarse-grained 1458 * invltlb calls. Note that many of the pv list 1459 * scans are across different pmaps. It is very wasteful 1460 * to do an entire invltlb for checking a single mapping. 1461 * 1462 * If the given pmap is not the current pmap, pvh_global_lock 1463 * must be held and curthread pinned to a CPU. 
1464 */ 1465static pt_entry_t * 1466pmap_pte_quick(pmap_t pmap, vm_offset_t va) 1467{ 1468 pd_entry_t newpf; 1469 pd_entry_t *pde; 1470 1471 pde = pmap_pde(pmap, va); 1472 if (*pde & PG_PS) 1473 return (pde); 1474 if (*pde != 0) { 1475 /* are we current address space or kernel? */ 1476 if (pmap_is_current(pmap)) 1477 return (vtopte(va)); 1478 rw_assert(&pvh_global_lock, RA_WLOCKED); 1479 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 1480 newpf = *pde & PG_FRAME; 1481 if ((*PMAP1 & PG_FRAME) != newpf) { 1482 *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; 1483#ifdef SMP 1484 PMAP1cpu = PCPU_GET(cpuid); 1485#endif 1486 invlcaddr(PADDR1); 1487 PMAP1changed++; 1488 } else 1489#ifdef SMP 1490 if (PMAP1cpu != PCPU_GET(cpuid)) { 1491 PMAP1cpu = PCPU_GET(cpuid); 1492 invlcaddr(PADDR1); 1493 PMAP1changedcpu++; 1494 } else 1495#endif 1496 PMAP1unchanged++; 1497 return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); 1498 } 1499 return (0); 1500} 1501 1502/* 1503 * Routine: pmap_extract 1504 * Function: 1505 * Extract the physical page address associated 1506 * with the given map/virtual_address pair. 1507 */ 1508vm_paddr_t 1509pmap_extract(pmap_t pmap, vm_offset_t va) 1510{ 1511 vm_paddr_t rtval; 1512 pt_entry_t *pte; 1513 pd_entry_t pde; 1514 1515 rtval = 0; 1516 PMAP_LOCK(pmap); 1517 pde = pmap->pm_pdir[va >> PDRSHIFT]; 1518 if (pde != 0) { 1519 if ((pde & PG_PS) != 0) 1520 rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); 1521 else { 1522 pte = pmap_pte(pmap, va); 1523 rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); 1524 pmap_pte_release(pte); 1525 } 1526 } 1527 PMAP_UNLOCK(pmap); 1528 return (rtval); 1529} 1530 1531/* 1532 * Routine: pmap_extract_and_hold 1533 * Function: 1534 * Atomically extract and hold the physical page 1535 * with the given pmap and virtual address pair 1536 * if that mapping permits the given protection. 1537 */ 1538vm_page_t 1539pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1540{ 1541 pd_entry_t pde; 1542 pt_entry_t pte, *ptep; 1543 vm_page_t m; 1544 vm_paddr_t pa; 1545 1546 pa = 0; 1547 m = NULL; 1548 PMAP_LOCK(pmap); 1549retry: 1550 pde = *pmap_pde(pmap, va); 1551 if (pde != 0) { 1552 if (pde & PG_PS) { 1553 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { 1554 if (vm_page_pa_tryrelock(pmap, (pde & 1555 PG_PS_FRAME) | (va & PDRMASK), &pa)) 1556 goto retry; 1557 m = PHYS_TO_VM_PAGE(pa); 1558 } 1559 } else { 1560 ptep = pmap_pte(pmap, va); 1561 pte = *ptep; 1562 pmap_pte_release(ptep); 1563 if (pte != 0 && 1564 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { 1565 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, 1566 &pa)) 1567 goto retry; 1568 m = PHYS_TO_VM_PAGE(pa); 1569 } 1570 } 1571 if (m != NULL) 1572 vm_page_hold(m); 1573 } 1574 PA_UNLOCK_COND(pa); 1575 PMAP_UNLOCK(pmap); 1576 return (m); 1577} 1578 1579/*************************************************** 1580 * Low level mapping routines..... 1581 ***************************************************/ 1582 1583/* 1584 * Add a wired page to the kva. 1585 * Note: not SMP coherent. 1586 * 1587 * This function may be used before pmap_bootstrap() is called. 
1588 */ 1589PMAP_INLINE void 1590pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1591{ 1592 pt_entry_t *pte; 1593 1594 pte = vtopte(va); 1595 pte_store(pte, pa | PG_RW | PG_V | pgeflag); 1596} 1597 1598static __inline void 1599pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 1600{ 1601 pt_entry_t *pte; 1602 1603 pte = vtopte(va); 1604 pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); 1605} 1606 1607/* 1608 * Remove a page from the kernel pagetables. 1609 * Note: not SMP coherent. 1610 * 1611 * This function may be used before pmap_bootstrap() is called. 1612 */ 1613PMAP_INLINE void 1614pmap_kremove(vm_offset_t va) 1615{ 1616 pt_entry_t *pte; 1617 1618 pte = vtopte(va); 1619 pte_clear(pte); 1620} 1621 1622/* 1623 * Used to map a range of physical addresses into kernel 1624 * virtual address space. 1625 * 1626 * The value passed in '*virt' is a suggested virtual address for 1627 * the mapping. Architectures which can support a direct-mapped 1628 * physical to virtual region can return the appropriate address 1629 * within that region, leaving '*virt' unchanged. Other 1630 * architectures should map the pages starting at '*virt' and 1631 * update '*virt' with the first usable address after the mapped 1632 * region. 1633 */ 1634vm_offset_t 1635pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1636{ 1637 vm_offset_t va, sva; 1638 vm_paddr_t superpage_offset; 1639 pd_entry_t newpde; 1640 1641 va = *virt; 1642 /* 1643 * Does the physical address range's size and alignment permit at 1644 * least one superpage mapping to be created? 1645 */ 1646 superpage_offset = start & PDRMASK; 1647 if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) { 1648 /* 1649 * Increase the starting virtual address so that its alignment 1650 * does not preclude the use of superpage mappings. 1651 */ 1652 if ((va & PDRMASK) < superpage_offset) 1653 va = (va & ~PDRMASK) + superpage_offset; 1654 else if ((va & PDRMASK) > superpage_offset) 1655 va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset; 1656 } 1657 sva = va; 1658 while (start < end) { 1659 if ((start & PDRMASK) == 0 && end - start >= NBPDR && 1660 pseflag) { 1661 KASSERT((va & PDRMASK) == 0, 1662 ("pmap_map: misaligned va %#x", va)); 1663 newpde = start | PG_PS | pgeflag | PG_RW | PG_V; 1664 pmap_kenter_pde(va, newpde); 1665 va += NBPDR; 1666 start += NBPDR; 1667 } else { 1668 pmap_kenter(va, start); 1669 va += PAGE_SIZE; 1670 start += PAGE_SIZE; 1671 } 1672 } 1673 pmap_invalidate_range(kernel_pmap, sva, va); 1674 *virt = va; 1675 return (sva); 1676} 1677 1678 1679/* 1680 * Add a list of wired pages to the kva 1681 * this routine is only used for temporary 1682 * kernel mappings that do not need to have 1683 * page modification or references recorded. 1684 * Note that old mappings are simply written 1685 * over. The page *must* be wired. 1686 * Note: SMP coherent. Uses a ranged shootdown IPI. 
1687 */ 1688void 1689pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1690{ 1691 pt_entry_t *endpte, oldpte, pa, *pte; 1692 vm_page_t m; 1693 1694 oldpte = 0; 1695 pte = vtopte(sva); 1696 endpte = pte + count; 1697 while (pte < endpte) { 1698 m = *ma++; 1699 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); 1700 if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { 1701 oldpte |= *pte; 1702 pte_store(pte, pa | pgeflag | PG_RW | PG_V); 1703 } 1704 pte++; 1705 } 1706 if (__predict_false((oldpte & PG_V) != 0)) 1707 pmap_invalidate_range(kernel_pmap, sva, sva + count * 1708 PAGE_SIZE); 1709} 1710 1711/* 1712 * This routine tears out page mappings from the 1713 * kernel -- it is meant only for temporary mappings. 1714 * Note: SMP coherent. Uses a ranged shootdown IPI. 1715 */ 1716void 1717pmap_qremove(vm_offset_t sva, int count) 1718{ 1719 vm_offset_t va; 1720 1721 va = sva; 1722 while (count-- > 0) { 1723 pmap_kremove(va); 1724 va += PAGE_SIZE; 1725 } 1726 pmap_invalidate_range(kernel_pmap, sva, va); 1727} 1728 1729/*************************************************** 1730 * Page table page management routines..... 1731 ***************************************************/ 1732static __inline void 1733pmap_free_zero_pages(struct spglist *free) 1734{ 1735 vm_page_t m; 1736 int count; 1737 1738 for (count = 0; (m = SLIST_FIRST(free)) != NULL; count++) { 1739 SLIST_REMOVE_HEAD(free, plinks.s.ss); 1740 /* Preserve the page's PG_ZERO setting. */ 1741 vm_page_free_toq(m); 1742 } 1743 atomic_subtract_int(&vm_cnt.v_wire_count, count); 1744} 1745 1746/* 1747 * Schedule the specified unused page table page to be freed. Specifically, 1748 * add the page to the specified list of pages that will be released to the 1749 * physical memory manager after the TLB has been updated. 1750 */ 1751static __inline void 1752pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, 1753 boolean_t set_PG_ZERO) 1754{ 1755 1756 if (set_PG_ZERO) 1757 m->flags |= PG_ZERO; 1758 else 1759 m->flags &= ~PG_ZERO; 1760 SLIST_INSERT_HEAD(free, m, plinks.s.ss); 1761} 1762 1763/* 1764 * Inserts the specified page table page into the specified pmap's collection 1765 * of idle page table pages. Each of a pmap's page table pages is responsible 1766 * for mapping a distinct range of virtual addresses. The pmap's collection is 1767 * ordered by this virtual address range. 1768 */ 1769static __inline int 1770pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) 1771{ 1772 1773 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1774 return (vm_radix_insert(&pmap->pm_root, mpte)); 1775} 1776 1777/* 1778 * Removes the page table page mapping the specified virtual address from the 1779 * specified pmap's collection of idle page table pages, and returns it. 1780 * Otherwise, returns NULL if there is no page table page corresponding to the 1781 * specified virtual address. 1782 */ 1783static __inline vm_page_t 1784pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) 1785{ 1786 1787 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1788 return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT)); 1789} 1790 1791/* 1792 * Decrements a page table page's wire count, which is used to record the 1793 * number of valid page table entries within the page. If the wire count 1794 * drops to zero, then the page table page is unmapped. Returns TRUE if the 1795 * page table page was unmapped and FALSE otherwise. 
1796 */ 1797static inline boolean_t 1798pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) 1799{ 1800 1801 --m->wire_count; 1802 if (m->wire_count == 0) { 1803 _pmap_unwire_ptp(pmap, m, free); 1804 return (TRUE); 1805 } else 1806 return (FALSE); 1807} 1808 1809static void 1810_pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) 1811{ 1812 vm_offset_t pteva; 1813 1814 /* 1815 * unmap the page table page 1816 */ 1817 pmap->pm_pdir[m->pindex] = 0; 1818 --pmap->pm_stats.resident_count; 1819 1820 /* 1821 * Do an invltlb to make the invalidated mapping 1822 * take effect immediately. 1823 */ 1824 pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); 1825 pmap_invalidate_page(pmap, pteva); 1826 1827 /* 1828 * Put page on a list so that it is released after 1829 * *ALL* TLB shootdown is done 1830 */ 1831 pmap_add_delayed_free_list(m, free, TRUE); 1832} 1833 1834/* 1835 * After removing a page table entry, this routine is used to 1836 * conditionally free the page, and manage the hold/wire counts. 1837 */ 1838static int 1839pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free) 1840{ 1841 pd_entry_t ptepde; 1842 vm_page_t mpte; 1843 1844 if (va >= VM_MAXUSER_ADDRESS) 1845 return (0); 1846 ptepde = *pmap_pde(pmap, va); 1847 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 1848 return (pmap_unwire_ptp(pmap, mpte, free)); 1849} 1850 1851/* 1852 * Initialize the pmap for the swapper process. 1853 */ 1854void 1855pmap_pinit0(pmap_t pmap) 1856{ 1857 1858 PMAP_LOCK_INIT(pmap); 1859 /* 1860 * Since the page table directory is shared with the kernel pmap, 1861 * which is already included in the list "allpmaps", this pmap does 1862 * not need to be inserted into that list. 1863 */ 1864 pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); 1865#if defined(PAE) || defined(PAE_TABLES) 1866 pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); 1867#endif 1868 pmap->pm_root.rt_root = 0; 1869 CPU_ZERO(&pmap->pm_active); 1870 TAILQ_INIT(&pmap->pm_pvchunk); 1871 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1872 pmap_activate_boot(pmap); 1873} 1874 1875/* 1876 * Initialize a preallocated and zeroed pmap structure, 1877 * such as one in a vmspace structure. 1878 */ 1879int 1880pmap_pinit(pmap_t pmap) 1881{ 1882 vm_page_t m, ptdpg[NPGPTD]; 1883 vm_paddr_t pa; 1884 int i; 1885 1886 /* 1887 * No need to allocate page table space yet but we do need a valid 1888 * page directory table. 
1889 */ 1890 if (pmap->pm_pdir == NULL) { 1891 pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD); 1892 if (pmap->pm_pdir == NULL) 1893 return (0); 1894#if defined(PAE) || defined(PAE_TABLES) 1895 pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); 1896 KASSERT(((vm_offset_t)pmap->pm_pdpt & 1897 ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, 1898 ("pmap_pinit: pdpt misaligned")); 1899 KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), 1900 ("pmap_pinit: pdpt above 4g")); 1901#endif 1902 pmap->pm_root.rt_root = 0; 1903 } 1904 KASSERT(vm_radix_is_empty(&pmap->pm_root), 1905 ("pmap_pinit: pmap has reserved page table page(s)")); 1906 1907 /* 1908 * allocate the page directory page(s) 1909 */ 1910 for (i = 0; i < NPGPTD;) { 1911 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 1912 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 1913 if (m == NULL) 1914 VM_WAIT; 1915 else { 1916 ptdpg[i++] = m; 1917 } 1918 } 1919 1920 pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); 1921 1922 for (i = 0; i < NPGPTD; i++) 1923 if ((ptdpg[i]->flags & PG_ZERO) == 0) 1924 pagezero(pmap->pm_pdir + (i * NPDEPG)); 1925 1926 mtx_lock_spin(&allpmaps_lock); 1927 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 1928 /* Copy the kernel page table directory entries. */ 1929 bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); 1930 mtx_unlock_spin(&allpmaps_lock); 1931 1932 /* install self-referential address mapping entry(s) */ 1933 for (i = 0; i < NPGPTD; i++) { 1934 pa = VM_PAGE_TO_PHYS(ptdpg[i]); 1935 pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; 1936#if defined(PAE) || defined(PAE_TABLES) 1937 pmap->pm_pdpt[i] = pa | PG_V; 1938#endif 1939 } 1940 1941 CPU_ZERO(&pmap->pm_active); 1942 TAILQ_INIT(&pmap->pm_pvchunk); 1943 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1944 1945 return (1); 1946} 1947 1948/* 1949 * this routine is called if the page table page is not 1950 * mapped correctly. 1951 */ 1952static vm_page_t 1953_pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags) 1954{ 1955 vm_paddr_t ptepa; 1956 vm_page_t m; 1957 1958 /* 1959 * Allocate a page table page. 1960 */ 1961 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 1962 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 1963 if ((flags & PMAP_ENTER_NOSLEEP) == 0) { 1964 PMAP_UNLOCK(pmap); 1965 rw_wunlock(&pvh_global_lock); 1966 VM_WAIT; 1967 rw_wlock(&pvh_global_lock); 1968 PMAP_LOCK(pmap); 1969 } 1970 1971 /* 1972 * Indicate the need to retry. While waiting, the page table 1973 * page may have been allocated. 1974 */ 1975 return (NULL); 1976 } 1977 if ((m->flags & PG_ZERO) == 0) 1978 pmap_zero_page(m); 1979 1980 /* 1981 * Map the pagetable page into the process address space, if 1982 * it isn't already there. 1983 */ 1984 1985 pmap->pm_stats.resident_count++; 1986 1987 ptepa = VM_PAGE_TO_PHYS(m); 1988 pmap->pm_pdir[ptepindex] = 1989 (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); 1990 1991 return (m); 1992} 1993 1994static vm_page_t 1995pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) 1996{ 1997 u_int ptepindex; 1998 pd_entry_t ptepa; 1999 vm_page_t m; 2000 2001 /* 2002 * Calculate pagetable page index 2003 */ 2004 ptepindex = va >> PDRSHIFT; 2005retry: 2006 /* 2007 * Get the page directory entry 2008 */ 2009 ptepa = pmap->pm_pdir[ptepindex]; 2010 2011 /* 2012 * This supports switching from a 4MB page to a 2013 * normal 4K page. 
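	 * If the demotion succeeds, the PDE now points at a page table page
	 * whose wire count is simply incremented below.  If it fails, the
	 * 4MB mapping is destroyed and a new page table page is allocated.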
2014 */ 2015 if (ptepa & PG_PS) { 2016 (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); 2017 ptepa = pmap->pm_pdir[ptepindex]; 2018 } 2019 2020 /* 2021 * If the page table page is mapped, we just increment the 2022 * hold count, and activate it. 2023 */ 2024 if (ptepa) { 2025 m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); 2026 m->wire_count++; 2027 } else { 2028 /* 2029 * Here if the pte page isn't mapped, or if it has 2030 * been deallocated. 2031 */ 2032 m = _pmap_allocpte(pmap, ptepindex, flags); 2033 if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) 2034 goto retry; 2035 } 2036 return (m); 2037} 2038 2039 2040/*************************************************** 2041* Pmap allocation/deallocation routines. 2042 ***************************************************/ 2043 2044/* 2045 * Release any resources held by the given physical map. 2046 * Called when a pmap initialized by pmap_pinit is being released. 2047 * Should only be called if the map contains no valid mappings. 2048 */ 2049void 2050pmap_release(pmap_t pmap) 2051{ 2052 vm_page_t m, ptdpg[NPGPTD]; 2053 int i; 2054 2055 KASSERT(pmap->pm_stats.resident_count == 0, 2056 ("pmap_release: pmap resident count %ld != 0", 2057 pmap->pm_stats.resident_count)); 2058 KASSERT(vm_radix_is_empty(&pmap->pm_root), 2059 ("pmap_release: pmap has reserved page table page(s)")); 2060 KASSERT(CPU_EMPTY(&pmap->pm_active), 2061 ("releasing active pmap %p", pmap)); 2062 2063 mtx_lock_spin(&allpmaps_lock); 2064 LIST_REMOVE(pmap, pm_list); 2065 mtx_unlock_spin(&allpmaps_lock); 2066 2067 for (i = 0; i < NPGPTD; i++) 2068 ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] & 2069 PG_FRAME); 2070 2071 bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * 2072 sizeof(*pmap->pm_pdir)); 2073 2074 pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); 2075 2076 for (i = 0; i < NPGPTD; i++) { 2077 m = ptdpg[i]; 2078#if defined(PAE) || defined(PAE_TABLES) 2079 KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), 2080 ("pmap_release: got wrong ptd page")); 2081#endif 2082 m->wire_count--; 2083 vm_page_free_zero(m); 2084 } 2085 atomic_subtract_int(&vm_cnt.v_wire_count, NPGPTD); 2086} 2087 2088static int 2089kvm_size(SYSCTL_HANDLER_ARGS) 2090{ 2091 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; 2092 2093 return (sysctl_handle_long(oidp, &ksize, 0, req)); 2094} 2095SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 2096 0, 0, kvm_size, "IU", "Size of KVM"); 2097 2098static int 2099kvm_free(SYSCTL_HANDLER_ARGS) 2100{ 2101 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 2102 2103 return (sysctl_handle_long(oidp, &kfree, 0, req)); 2104} 2105SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 2106 0, 0, kvm_free, "IU", "Amount of KVM free"); 2107 2108/* 2109 * grow the number of kernel page table entries, if needed 2110 */ 2111void 2112pmap_growkernel(vm_offset_t addr) 2113{ 2114 vm_paddr_t ptppaddr; 2115 vm_page_t nkpg; 2116 pd_entry_t newpdir; 2117 2118 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2119 addr = roundup2(addr, NBPDR); 2120 if (addr - 1 >= vm_map_max(kernel_map)) 2121 addr = vm_map_max(kernel_map); 2122 while (kernel_vm_end < addr) { 2123 if (pdir_pde(PTD, kernel_vm_end)) { 2124 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2125 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 2126 kernel_vm_end = vm_map_max(kernel_map); 2127 break; 2128 } 2129 continue; 2130 } 2131 2132 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, 2133 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 2134 
VM_ALLOC_ZERO); 2135 if (nkpg == NULL) 2136 panic("pmap_growkernel: no memory to grow kernel"); 2137 2138 nkpt++; 2139 2140 if ((nkpg->flags & PG_ZERO) == 0) 2141 pmap_zero_page(nkpg); 2142 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 2143 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); 2144 pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir; 2145 2146 pmap_kenter_pde(kernel_vm_end, newpdir); 2147 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2148 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { 2149 kernel_vm_end = vm_map_max(kernel_map); 2150 break; 2151 } 2152 } 2153} 2154 2155 2156/*************************************************** 2157 * page management routines. 2158 ***************************************************/ 2159 2160CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 2161CTASSERT(_NPCM == 11); 2162CTASSERT(_NPCPV == 336); 2163 2164static __inline struct pv_chunk * 2165pv_to_chunk(pv_entry_t pv) 2166{ 2167 2168 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 2169} 2170 2171#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2172 2173#define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ 2174#define PC_FREE10 0x0000fffful /* Free values for index 10 */ 2175 2176static const uint32_t pc_freemask[_NPCM] = { 2177 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2178 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2179 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2180 PC_FREE0_9, PC_FREE10 2181}; 2182 2183SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2184 "Current number of pv entries"); 2185 2186#ifdef PV_STATS 2187static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2188 2189SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2190 "Current number of pv entry chunks"); 2191SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2192 "Current number of pv entry chunks allocated"); 2193SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2194 "Current number of pv entry chunks frees"); 2195SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2196 "Number of times tried to get a chunk page but failed."); 2197 2198static long pv_entry_frees, pv_entry_allocs; 2199static int pv_entry_spare; 2200 2201SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2202 "Current number of pv entry frees"); 2203SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2204 "Current number of pv entry allocs"); 2205SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2206 "Current number of spare pv entries"); 2207#endif 2208 2209/* 2210 * We are in a serious low memory condition. Resort to 2211 * drastic measures to free some pages so we can allocate 2212 * another pv entry chunk. 
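 * Chunks are scanned in LRU order (the pv_chunks tailq).  Within each
 * chunk every non-wired 4KB mapping is destroyed and its pv entry
 * reclaimed; the scan stops once an entire chunk has been emptied, a
 * page table page has been freed, or a pv entry has been released in
 * locked_pmap itself.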
2213 */ 2214static vm_page_t 2215pmap_pv_reclaim(pmap_t locked_pmap) 2216{ 2217 struct pch newtail; 2218 struct pv_chunk *pc; 2219 struct md_page *pvh; 2220 pd_entry_t *pde; 2221 pmap_t pmap; 2222 pt_entry_t *pte, tpte; 2223 pv_entry_t pv; 2224 vm_offset_t va; 2225 vm_page_t m, m_pc; 2226 struct spglist free; 2227 uint32_t inuse; 2228 int bit, field, freed; 2229 2230 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2231 pmap = NULL; 2232 m_pc = NULL; 2233 SLIST_INIT(&free); 2234 TAILQ_INIT(&newtail); 2235 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || 2236 SLIST_EMPTY(&free))) { 2237 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2238 if (pmap != pc->pc_pmap) { 2239 if (pmap != NULL) { 2240 pmap_invalidate_all(pmap); 2241 if (pmap != locked_pmap) 2242 PMAP_UNLOCK(pmap); 2243 } 2244 pmap = pc->pc_pmap; 2245 /* Avoid deadlock and lock recursion. */ 2246 if (pmap > locked_pmap) 2247 PMAP_LOCK(pmap); 2248 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { 2249 pmap = NULL; 2250 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2251 continue; 2252 } 2253 } 2254 2255 /* 2256 * Destroy every non-wired, 4 KB page mapping in the chunk. 2257 */ 2258 freed = 0; 2259 for (field = 0; field < _NPCM; field++) { 2260 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2261 inuse != 0; inuse &= ~(1UL << bit)) { 2262 bit = bsfl(inuse); 2263 pv = &pc->pc_pventry[field * 32 + bit]; 2264 va = pv->pv_va; 2265 pde = pmap_pde(pmap, va); 2266 if ((*pde & PG_PS) != 0) 2267 continue; 2268 pte = pmap_pte(pmap, va); 2269 tpte = *pte; 2270 if ((tpte & PG_W) == 0) 2271 tpte = pte_load_clear(pte); 2272 pmap_pte_release(pte); 2273 if ((tpte & PG_W) != 0) 2274 continue; 2275 KASSERT(tpte != 0, 2276 ("pmap_pv_reclaim: pmap %p va %x zero pte", 2277 pmap, va)); 2278 if ((tpte & PG_G) != 0) 2279 pmap_invalidate_page(pmap, va); 2280 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 2281 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2282 vm_page_dirty(m); 2283 if ((tpte & PG_A) != 0) 2284 vm_page_aflag_set(m, PGA_REFERENCED); 2285 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 2286 if (TAILQ_EMPTY(&m->md.pv_list) && 2287 (m->flags & PG_FICTITIOUS) == 0) { 2288 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2289 if (TAILQ_EMPTY(&pvh->pv_list)) { 2290 vm_page_aflag_clear(m, 2291 PGA_WRITEABLE); 2292 } 2293 } 2294 pc->pc_map[field] |= 1UL << bit; 2295 pmap_unuse_pt(pmap, va, &free); 2296 freed++; 2297 } 2298 } 2299 if (freed == 0) { 2300 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2301 continue; 2302 } 2303 /* Every freed mapping is for a 4 KB page. */ 2304 pmap->pm_stats.resident_count -= freed; 2305 PV_STAT(pv_entry_frees += freed); 2306 PV_STAT(pv_entry_spare += freed); 2307 pv_entry_count -= freed; 2308 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2309 for (field = 0; field < _NPCM; field++) 2310 if (pc->pc_map[field] != pc_freemask[field]) { 2311 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2312 pc_list); 2313 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2314 2315 /* 2316 * One freed pv entry in locked_pmap is 2317 * sufficient. 2318 */ 2319 if (pmap == locked_pmap) 2320 goto out; 2321 break; 2322 } 2323 if (field == _NPCM) { 2324 PV_STAT(pv_entry_spare -= _NPCPV); 2325 PV_STAT(pc_chunk_count--); 2326 PV_STAT(pc_chunk_frees++); 2327 /* Entire chunk is free; return it. 
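			 * Its KVA is handed back to pv_vafree and the backing
			 * page becomes the caller's m_pc.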
			 */
			m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
			pmap_qremove((vm_offset_t)pc, 1);
			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
			break;
		}
	}
out:
	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
	if (pmap != NULL) {
		pmap_invalidate_all(pmap);
		if (pmap != locked_pmap)
			PMAP_UNLOCK(pmap);
	}
	if (m_pc == NULL && pv_vafree != 0 && !SLIST_EMPTY(&free)) {
		m_pc = SLIST_FIRST(&free);
		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
		/* Recycle a freed page table page. */
		m_pc->wire_count = 1;
	}
	pmap_free_zero_pages(&free);
	return (m_pc);
}

/*
 * Free the pv_entry back to the free list.
 */
static void
free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
	struct pv_chunk *pc;
	int idx, field, bit;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PV_STAT(pv_entry_frees++);
	PV_STAT(pv_entry_spare++);
	pv_entry_count--;
	pc = pv_to_chunk(pv);
	idx = pv - &pc->pc_pventry[0];
	field = idx / 32;
	bit = idx % 32;
	pc->pc_map[field] |= 1ul << bit;
	for (idx = 0; idx < _NPCM; idx++)
		if (pc->pc_map[idx] != pc_freemask[idx]) {
			/*
			 * 98% of the time, pc is already at the head of the
			 * list.  If it isn't already, move it to the head.
			 */
			if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
			    pc)) {
				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
				    pc_list);
			}
			return;
		}
	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
	free_pv_chunk(pc);
}

static void
free_pv_chunk(struct pv_chunk *pc)
{
	vm_page_t m;

	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
	PV_STAT(pv_entry_spare -= _NPCPV);
	PV_STAT(pc_chunk_count--);
	PV_STAT(pc_chunk_frees++);
	/* The entire chunk is free; return its page to the VM system. */
	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
	pmap_qremove((vm_offset_t)pc, 1);
	vm_page_unwire(m, PQ_NONE);
	vm_page_free(m);
	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
}

/*
 * Get a new pv_entry, allocating a block from the system
 * when needed.
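 * A chunk is one page holding _NPCPV (336) pv entries; the _NPCM (11)
 * words of pc_map form a bitmap in which a set bit marks a free slot.
 * For example, the entry at index 100 within a chunk is tracked by
 * pc_map word 100 / 32 = 3, bit 100 % 32 = 4.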
2408 */ 2409static pv_entry_t 2410get_pv_entry(pmap_t pmap, boolean_t try) 2411{ 2412 static const struct timeval printinterval = { 60, 0 }; 2413 static struct timeval lastprint; 2414 int bit, field; 2415 pv_entry_t pv; 2416 struct pv_chunk *pc; 2417 vm_page_t m; 2418 2419 rw_assert(&pvh_global_lock, RA_WLOCKED); 2420 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2421 PV_STAT(pv_entry_allocs++); 2422 pv_entry_count++; 2423 if (pv_entry_count > pv_entry_high_water) 2424 if (ratecheck(&lastprint, &printinterval)) 2425 printf("Approaching the limit on PV entries, consider " 2426 "increasing either the vm.pmap.shpgperproc or the " 2427 "vm.pmap.pv_entries tunable.\n"); 2428retry: 2429 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2430 if (pc != NULL) { 2431 for (field = 0; field < _NPCM; field++) { 2432 if (pc->pc_map[field]) { 2433 bit = bsfl(pc->pc_map[field]); 2434 break; 2435 } 2436 } 2437 if (field < _NPCM) { 2438 pv = &pc->pc_pventry[field * 32 + bit]; 2439 pc->pc_map[field] &= ~(1ul << bit); 2440 /* If this was the last item, move it to tail */ 2441 for (field = 0; field < _NPCM; field++) 2442 if (pc->pc_map[field] != 0) { 2443 PV_STAT(pv_entry_spare--); 2444 return (pv); /* not full, return */ 2445 } 2446 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2447 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2448 PV_STAT(pv_entry_spare--); 2449 return (pv); 2450 } 2451 } 2452 /* 2453 * Access to the ptelist "pv_vafree" is synchronized by the pvh 2454 * global lock. If "pv_vafree" is currently non-empty, it will 2455 * remain non-empty until pmap_ptelist_alloc() completes. 2456 */ 2457 if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2458 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 2459 if (try) { 2460 pv_entry_count--; 2461 PV_STAT(pc_chunk_tryfail++); 2462 return (NULL); 2463 } 2464 m = pmap_pv_reclaim(pmap); 2465 if (m == NULL) 2466 goto retry; 2467 } 2468 PV_STAT(pc_chunk_count++); 2469 PV_STAT(pc_chunk_allocs++); 2470 pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); 2471 pmap_qenter((vm_offset_t)pc, &m, 1); 2472 pc->pc_pmap = pmap; 2473 pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ 2474 for (field = 1; field < _NPCM; field++) 2475 pc->pc_map[field] = pc_freemask[field]; 2476 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 2477 pv = &pc->pc_pventry[0]; 2478 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2479 PV_STAT(pv_entry_spare += _NPCPV - 1); 2480 return (pv); 2481} 2482 2483static __inline pv_entry_t 2484pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2485{ 2486 pv_entry_t pv; 2487 2488 rw_assert(&pvh_global_lock, RA_WLOCKED); 2489 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 2490 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 2491 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 2492 break; 2493 } 2494 } 2495 return (pv); 2496} 2497 2498static void 2499pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2500{ 2501 struct md_page *pvh; 2502 pv_entry_t pv; 2503 vm_offset_t va_last; 2504 vm_page_t m; 2505 2506 rw_assert(&pvh_global_lock, RA_WLOCKED); 2507 KASSERT((pa & PDRMASK) == 0, 2508 ("pmap_pv_demote_pde: pa is not 4mpage aligned")); 2509 2510 /* 2511 * Transfer the 4mpage's pv entry for this mapping to the first 2512 * page's pv list. 
2513 */ 2514 pvh = pa_to_pvh(pa); 2515 va = trunc_4mpage(va); 2516 pv = pmap_pvh_remove(pvh, pmap, va); 2517 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 2518 m = PHYS_TO_VM_PAGE(pa); 2519 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2520 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 2521 va_last = va + NBPDR - PAGE_SIZE; 2522 do { 2523 m++; 2524 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2525 ("pmap_pv_demote_pde: page %p is not managed", m)); 2526 va += PAGE_SIZE; 2527 pmap_insert_entry(pmap, va, m); 2528 } while (va < va_last); 2529} 2530 2531#if VM_NRESERVLEVEL > 0 2532static void 2533pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2534{ 2535 struct md_page *pvh; 2536 pv_entry_t pv; 2537 vm_offset_t va_last; 2538 vm_page_t m; 2539 2540 rw_assert(&pvh_global_lock, RA_WLOCKED); 2541 KASSERT((pa & PDRMASK) == 0, 2542 ("pmap_pv_promote_pde: pa is not 4mpage aligned")); 2543 2544 /* 2545 * Transfer the first page's pv entry for this mapping to the 2546 * 4mpage's pv list. Aside from avoiding the cost of a call 2547 * to get_pv_entry(), a transfer avoids the possibility that 2548 * get_pv_entry() calls pmap_collect() and that pmap_collect() 2549 * removes one of the mappings that is being promoted. 2550 */ 2551 m = PHYS_TO_VM_PAGE(pa); 2552 va = trunc_4mpage(va); 2553 pv = pmap_pvh_remove(&m->md, pmap, va); 2554 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 2555 pvh = pa_to_pvh(pa); 2556 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2557 /* Free the remaining NPTEPG - 1 pv entries. */ 2558 va_last = va + NBPDR - PAGE_SIZE; 2559 do { 2560 m++; 2561 va += PAGE_SIZE; 2562 pmap_pvh_free(&m->md, pmap, va); 2563 } while (va < va_last); 2564} 2565#endif /* VM_NRESERVLEVEL > 0 */ 2566 2567static void 2568pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2569{ 2570 pv_entry_t pv; 2571 2572 pv = pmap_pvh_remove(pvh, pmap, va); 2573 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 2574 free_pv_entry(pmap, pv); 2575} 2576 2577static void 2578pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 2579{ 2580 struct md_page *pvh; 2581 2582 rw_assert(&pvh_global_lock, RA_WLOCKED); 2583 pmap_pvh_free(&m->md, pmap, va); 2584 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { 2585 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2586 if (TAILQ_EMPTY(&pvh->pv_list)) 2587 vm_page_aflag_clear(m, PGA_WRITEABLE); 2588 } 2589} 2590 2591/* 2592 * Create a pv entry for page at pa for 2593 * (pmap, va). 2594 */ 2595static void 2596pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2597{ 2598 pv_entry_t pv; 2599 2600 rw_assert(&pvh_global_lock, RA_WLOCKED); 2601 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2602 pv = get_pv_entry(pmap, FALSE); 2603 pv->pv_va = va; 2604 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2605} 2606 2607/* 2608 * Conditionally create a pv entry. 2609 */ 2610static boolean_t 2611pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2612{ 2613 pv_entry_t pv; 2614 2615 rw_assert(&pvh_global_lock, RA_WLOCKED); 2616 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2617 if (pv_entry_count < pv_entry_high_water && 2618 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2619 pv->pv_va = va; 2620 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 2621 return (TRUE); 2622 } else 2623 return (FALSE); 2624} 2625 2626/* 2627 * Create the pv entries for each of the pages within a superpage. 
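 * Only a single pv entry is actually allocated: it is placed on the
 * 4mpage's pv list and stands in for every 4KB page that the superpage
 * covers.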
2628 */ 2629static boolean_t 2630pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2631{ 2632 struct md_page *pvh; 2633 pv_entry_t pv; 2634 2635 rw_assert(&pvh_global_lock, RA_WLOCKED); 2636 if (pv_entry_count < pv_entry_high_water && 2637 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2638 pv->pv_va = va; 2639 pvh = pa_to_pvh(pa); 2640 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 2641 return (TRUE); 2642 } else 2643 return (FALSE); 2644} 2645 2646/* 2647 * Fills a page table page with mappings to consecutive physical pages. 2648 */ 2649static void 2650pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 2651{ 2652 pt_entry_t *pte; 2653 2654 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 2655 *pte = newpte; 2656 newpte += PAGE_SIZE; 2657 } 2658} 2659 2660/* 2661 * Tries to demote a 2- or 4MB page mapping. If demotion fails, the 2662 * 2- or 4MB page mapping is invalidated. 2663 */ 2664static boolean_t 2665pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2666{ 2667 pd_entry_t newpde, oldpde; 2668 pt_entry_t *firstpte, newpte; 2669 vm_paddr_t mptepa; 2670 vm_page_t mpte; 2671 struct spglist free; 2672 vm_offset_t sva; 2673 2674 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2675 oldpde = *pde; 2676 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 2677 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 2678 if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == 2679 NULL) { 2680 KASSERT((oldpde & PG_W) == 0, 2681 ("pmap_demote_pde: page table page for a wired mapping" 2682 " is missing")); 2683 2684 /* 2685 * Invalidate the 2- or 4MB page mapping and return 2686 * "failure" if the mapping was never accessed or the 2687 * allocation of the new page table page fails. 2688 */ 2689 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 2690 va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | 2691 VM_ALLOC_WIRED)) == NULL) { 2692 SLIST_INIT(&free); 2693 sva = trunc_4mpage(va); 2694 pmap_remove_pde(pmap, pde, sva, &free); 2695 if ((oldpde & PG_G) == 0) 2696 pmap_invalidate_pde_page(pmap, sva, oldpde); 2697 pmap_free_zero_pages(&free); 2698 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" 2699 " in pmap %p", va, pmap); 2700 return (FALSE); 2701 } 2702 if (va < VM_MAXUSER_ADDRESS) 2703 pmap->pm_stats.resident_count++; 2704 } 2705 mptepa = VM_PAGE_TO_PHYS(mpte); 2706 2707 /* 2708 * If the page mapping is in the kernel's address space, then the 2709 * KPTmap can provide access to the page table page. Otherwise, 2710 * temporarily map the page table page (mpte) into the kernel's 2711 * address space at either PADDR1 or PADDR2. 
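	 * PADDR1 may be used only when the current thread is pinned and the
	 * pv list lock is write-locked; otherwise PADDR2, serialized by
	 * PMAP2mutex, is used.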
2712 */ 2713 if (va >= KERNBASE) 2714 firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; 2715 else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { 2716 if ((*PMAP1 & PG_FRAME) != mptepa) { 2717 *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; 2718#ifdef SMP 2719 PMAP1cpu = PCPU_GET(cpuid); 2720#endif 2721 invlcaddr(PADDR1); 2722 PMAP1changed++; 2723 } else 2724#ifdef SMP 2725 if (PMAP1cpu != PCPU_GET(cpuid)) { 2726 PMAP1cpu = PCPU_GET(cpuid); 2727 invlcaddr(PADDR1); 2728 PMAP1changedcpu++; 2729 } else 2730#endif 2731 PMAP1unchanged++; 2732 firstpte = PADDR1; 2733 } else { 2734 mtx_lock(&PMAP2mutex); 2735 if ((*PMAP2 & PG_FRAME) != mptepa) { 2736 *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; 2737 pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); 2738 } 2739 firstpte = PADDR2; 2740 } 2741 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 2742 KASSERT((oldpde & PG_A) != 0, 2743 ("pmap_demote_pde: oldpde is missing PG_A")); 2744 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 2745 ("pmap_demote_pde: oldpde is missing PG_M")); 2746 newpte = oldpde & ~PG_PS; 2747 if ((newpte & PG_PDE_PAT) != 0) 2748 newpte ^= PG_PDE_PAT | PG_PTE_PAT; 2749 2750 /* 2751 * If the page table page is new, initialize it. 2752 */ 2753 if (mpte->wire_count == 1) { 2754 mpte->wire_count = NPTEPG; 2755 pmap_fill_ptp(firstpte, newpte); 2756 } 2757 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 2758 ("pmap_demote_pde: firstpte and newpte map different physical" 2759 " addresses")); 2760 2761 /* 2762 * If the mapping has changed attributes, update the page table 2763 * entries. 2764 */ 2765 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 2766 pmap_fill_ptp(firstpte, newpte); 2767 2768 /* 2769 * Demote the mapping. This pmap is locked. The old PDE has 2770 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 2771 * set. Thus, there is no danger of a race with another 2772 * processor changing the setting of PG_A and/or PG_M between 2773 * the read above and the store below. 2774 */ 2775 if (workaround_erratum383) 2776 pmap_update_pde(pmap, va, pde, newpde); 2777 else if (pmap == kernel_pmap) 2778 pmap_kenter_pde(va, newpde); 2779 else 2780 pde_store(pde, newpde); 2781 if (firstpte == PADDR2) 2782 mtx_unlock(&PMAP2mutex); 2783 2784 /* 2785 * Invalidate the recursive mapping of the page table page. 2786 */ 2787 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 2788 2789 /* 2790 * Demote the pv entry. This depends on the earlier demotion 2791 * of the mapping. Specifically, the (re)creation of a per- 2792 * page pv entry might trigger the execution of pmap_collect(), 2793 * which might reclaim a newly (re)created per-page pv entry 2794 * and destroy the associated mapping. In order to destroy 2795 * the mapping, the PDE must have already changed from mapping 2796 * the 2mpage to referencing the page table page. 2797 */ 2798 if ((oldpde & PG_MANAGED) != 0) 2799 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); 2800 2801 pmap_pde_demotions++; 2802 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" 2803 " in pmap %p", va, pmap); 2804 return (TRUE); 2805} 2806 2807/* 2808 * Removes a 2- or 4MB page mapping from the kernel pmap. 
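 * Rather than leaving a hole, the PDE is re-pointed at the range's page
 * table page, which is zeroed through KPTmap first, so the kernel
 * address range remains mapped by 4KB PTEs.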
2809 */ 2810static void 2811pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2812{ 2813 pd_entry_t newpde; 2814 vm_paddr_t mptepa; 2815 vm_page_t mpte; 2816 2817 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2818 mpte = pmap_remove_pt_page(pmap, va); 2819 if (mpte == NULL) 2820 panic("pmap_remove_kernel_pde: Missing pt page."); 2821 2822 mptepa = VM_PAGE_TO_PHYS(mpte); 2823 newpde = mptepa | PG_M | PG_A | PG_RW | PG_V; 2824 2825 /* 2826 * Initialize the page table page. 2827 */ 2828 pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]); 2829 2830 /* 2831 * Remove the mapping. 2832 */ 2833 if (workaround_erratum383) 2834 pmap_update_pde(pmap, va, pde, newpde); 2835 else 2836 pmap_kenter_pde(va, newpde); 2837 2838 /* 2839 * Invalidate the recursive mapping of the page table page. 2840 */ 2841 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 2842} 2843 2844/* 2845 * pmap_remove_pde: do the things to unmap a superpage in a process 2846 */ 2847static void 2848pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 2849 struct spglist *free) 2850{ 2851 struct md_page *pvh; 2852 pd_entry_t oldpde; 2853 vm_offset_t eva, va; 2854 vm_page_t m, mpte; 2855 2856 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2857 KASSERT((sva & PDRMASK) == 0, 2858 ("pmap_remove_pde: sva is not 4mpage aligned")); 2859 oldpde = pte_load_clear(pdq); 2860 if (oldpde & PG_W) 2861 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 2862 2863 /* 2864 * Machines that don't support invlpg, also don't support 2865 * PG_G. 2866 */ 2867 if ((oldpde & PG_G) != 0) 2868 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 2869 2870 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2871 if (oldpde & PG_MANAGED) { 2872 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 2873 pmap_pvh_free(pvh, pmap, sva); 2874 eva = sva + NBPDR; 2875 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 2876 va < eva; va += PAGE_SIZE, m++) { 2877 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2878 vm_page_dirty(m); 2879 if (oldpde & PG_A) 2880 vm_page_aflag_set(m, PGA_REFERENCED); 2881 if (TAILQ_EMPTY(&m->md.pv_list) && 2882 TAILQ_EMPTY(&pvh->pv_list)) 2883 vm_page_aflag_clear(m, PGA_WRITEABLE); 2884 } 2885 } 2886 if (pmap == kernel_pmap) { 2887 pmap_remove_kernel_pde(pmap, pdq, sva); 2888 } else { 2889 mpte = pmap_remove_pt_page(pmap, sva); 2890 if (mpte != NULL) { 2891 pmap->pm_stats.resident_count--; 2892 KASSERT(mpte->wire_count == NPTEPG, 2893 ("pmap_remove_pde: pte page wire count error")); 2894 mpte->wire_count = 0; 2895 pmap_add_delayed_free_list(mpte, free, FALSE); 2896 } 2897 } 2898} 2899 2900/* 2901 * pmap_remove_pte: do the things to unmap a page in a process 2902 */ 2903static int 2904pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 2905 struct spglist *free) 2906{ 2907 pt_entry_t oldpte; 2908 vm_page_t m; 2909 2910 rw_assert(&pvh_global_lock, RA_WLOCKED); 2911 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2912 oldpte = pte_load_clear(ptq); 2913 KASSERT(oldpte != 0, 2914 ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va)); 2915 if (oldpte & PG_W) 2916 pmap->pm_stats.wired_count -= 1; 2917 /* 2918 * Machines that don't support invlpg, also don't support 2919 * PG_G. 
2920 */ 2921 if (oldpte & PG_G) 2922 pmap_invalidate_page(kernel_pmap, va); 2923 pmap->pm_stats.resident_count -= 1; 2924 if (oldpte & PG_MANAGED) { 2925 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 2926 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2927 vm_page_dirty(m); 2928 if (oldpte & PG_A) 2929 vm_page_aflag_set(m, PGA_REFERENCED); 2930 pmap_remove_entry(pmap, m, va); 2931 } 2932 return (pmap_unuse_pt(pmap, va, free)); 2933} 2934 2935/* 2936 * Remove a single page from a process address space 2937 */ 2938static void 2939pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) 2940{ 2941 pt_entry_t *pte; 2942 2943 rw_assert(&pvh_global_lock, RA_WLOCKED); 2944 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 2945 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2946 if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) 2947 return; 2948 pmap_remove_pte(pmap, pte, va, free); 2949 pmap_invalidate_page(pmap, va); 2950} 2951 2952/* 2953 * Remove the given range of addresses from the specified map. 2954 * 2955 * It is assumed that the start and end are properly 2956 * rounded to the page size. 2957 */ 2958void 2959pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2960{ 2961 vm_offset_t pdnxt; 2962 pd_entry_t ptpaddr; 2963 pt_entry_t *pte; 2964 struct spglist free; 2965 int anyvalid; 2966 2967 /* 2968 * Perform an unsynchronized read. This is, however, safe. 2969 */ 2970 if (pmap->pm_stats.resident_count == 0) 2971 return; 2972 2973 anyvalid = 0; 2974 SLIST_INIT(&free); 2975 2976 rw_wlock(&pvh_global_lock); 2977 sched_pin(); 2978 PMAP_LOCK(pmap); 2979 2980 /* 2981 * special handling of removing one page. a very 2982 * common operation and easy to short circuit some 2983 * code. 2984 */ 2985 if ((sva + PAGE_SIZE == eva) && 2986 ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { 2987 pmap_remove_page(pmap, sva, &free); 2988 goto out; 2989 } 2990 2991 for (; sva < eva; sva = pdnxt) { 2992 u_int pdirindex; 2993 2994 /* 2995 * Calculate index for next page table. 2996 */ 2997 pdnxt = (sva + NBPDR) & ~PDRMASK; 2998 if (pdnxt < sva) 2999 pdnxt = eva; 3000 if (pmap->pm_stats.resident_count == 0) 3001 break; 3002 3003 pdirindex = sva >> PDRSHIFT; 3004 ptpaddr = pmap->pm_pdir[pdirindex]; 3005 3006 /* 3007 * Weed out invalid mappings. Note: we assume that the page 3008 * directory table is always allocated, and in kernel virtual. 3009 */ 3010 if (ptpaddr == 0) 3011 continue; 3012 3013 /* 3014 * Check for large page. 3015 */ 3016 if ((ptpaddr & PG_PS) != 0) { 3017 /* 3018 * Are we removing the entire large page? If not, 3019 * demote the mapping and fall through. 3020 */ 3021 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 3022 /* 3023 * The TLB entry for a PG_G mapping is 3024 * invalidated by pmap_remove_pde(). 3025 */ 3026 if ((ptpaddr & PG_G) == 0) 3027 anyvalid = 1; 3028 pmap_remove_pde(pmap, 3029 &pmap->pm_pdir[pdirindex], sva, &free); 3030 continue; 3031 } else if (!pmap_demote_pde(pmap, 3032 &pmap->pm_pdir[pdirindex], sva)) { 3033 /* The large page mapping was destroyed. */ 3034 continue; 3035 } 3036 } 3037 3038 /* 3039 * Limit our scan to either the end of the va represented 3040 * by the current page table page, or to the end of the 3041 * range being removed. 3042 */ 3043 if (pdnxt > eva) 3044 pdnxt = eva; 3045 3046 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 3047 sva += PAGE_SIZE) { 3048 if (*pte == 0) 3049 continue; 3050 3051 /* 3052 * The TLB entry for a PG_G mapping is invalidated 3053 * by pmap_remove_pte(). 
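			 * Only non-global mappings are deferred to the single
			 * pmap_invalidate_all() that "anyvalid" requests after
			 * the loop.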
3054 */ 3055 if ((*pte & PG_G) == 0) 3056 anyvalid = 1; 3057 if (pmap_remove_pte(pmap, pte, sva, &free)) 3058 break; 3059 } 3060 } 3061out: 3062 sched_unpin(); 3063 if (anyvalid) 3064 pmap_invalidate_all(pmap); 3065 rw_wunlock(&pvh_global_lock); 3066 PMAP_UNLOCK(pmap); 3067 pmap_free_zero_pages(&free); 3068} 3069 3070/* 3071 * Routine: pmap_remove_all 3072 * Function: 3073 * Removes this physical page from 3074 * all physical maps in which it resides. 3075 * Reflects back modify bits to the pager. 3076 * 3077 * Notes: 3078 * Original versions of this routine were very 3079 * inefficient because they iteratively called 3080 * pmap_remove (slow...) 3081 */ 3082 3083void 3084pmap_remove_all(vm_page_t m) 3085{ 3086 struct md_page *pvh; 3087 pv_entry_t pv; 3088 pmap_t pmap; 3089 pt_entry_t *pte, tpte; 3090 pd_entry_t *pde; 3091 vm_offset_t va; 3092 struct spglist free; 3093 3094 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3095 ("pmap_remove_all: page %p is not managed", m)); 3096 SLIST_INIT(&free); 3097 rw_wlock(&pvh_global_lock); 3098 sched_pin(); 3099 if ((m->flags & PG_FICTITIOUS) != 0) 3100 goto small_mappings; 3101 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3102 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 3103 va = pv->pv_va; 3104 pmap = PV_PMAP(pv); 3105 PMAP_LOCK(pmap); 3106 pde = pmap_pde(pmap, va); 3107 (void)pmap_demote_pde(pmap, pde, va); 3108 PMAP_UNLOCK(pmap); 3109 } 3110small_mappings: 3111 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3112 pmap = PV_PMAP(pv); 3113 PMAP_LOCK(pmap); 3114 pmap->pm_stats.resident_count--; 3115 pde = pmap_pde(pmap, pv->pv_va); 3116 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 3117 " a 4mpage in page %p's pv list", m)); 3118 pte = pmap_pte_quick(pmap, pv->pv_va); 3119 tpte = pte_load_clear(pte); 3120 KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", 3121 pmap, pv->pv_va)); 3122 if (tpte & PG_W) 3123 pmap->pm_stats.wired_count--; 3124 if (tpte & PG_A) 3125 vm_page_aflag_set(m, PGA_REFERENCED); 3126 3127 /* 3128 * Update the vm_page_t clean and reference bits. 3129 */ 3130 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3131 vm_page_dirty(m); 3132 pmap_unuse_pt(pmap, pv->pv_va, &free); 3133 pmap_invalidate_page(pmap, pv->pv_va); 3134 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 3135 free_pv_entry(pmap, pv); 3136 PMAP_UNLOCK(pmap); 3137 } 3138 vm_page_aflag_clear(m, PGA_WRITEABLE); 3139 sched_unpin(); 3140 rw_wunlock(&pvh_global_lock); 3141 pmap_free_zero_pages(&free); 3142} 3143 3144/* 3145 * pmap_protect_pde: do the things to protect a 4mpage in a process 3146 */ 3147static boolean_t 3148pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 3149{ 3150 pd_entry_t newpde, oldpde; 3151 vm_offset_t eva, va; 3152 vm_page_t m; 3153 boolean_t anychanged; 3154 3155 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3156 KASSERT((sva & PDRMASK) == 0, 3157 ("pmap_protect_pde: sva is not 4mpage aligned")); 3158 anychanged = FALSE; 3159retry: 3160 oldpde = newpde = *pde; 3161 if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == 3162 (PG_MANAGED | PG_M | PG_RW)) { 3163 eva = sva + NBPDR; 3164 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 3165 va < eva; va += PAGE_SIZE, m++) 3166 vm_page_dirty(m); 3167 } 3168 if ((prot & VM_PROT_WRITE) == 0) 3169 newpde &= ~(PG_RW | PG_M); 3170#if defined(PAE) || defined(PAE_TABLES) 3171 if ((prot & VM_PROT_EXECUTE) == 0) 3172 newpde |= pg_nx; 3173#endif 3174 if (newpde != oldpde) { 3175 /* 3176 * As an optimization to future operations on this PDE, clear 3177 * PG_PROMOTED. 
The impending invalidation will remove any 3178 * lingering 4KB page mappings from the TLB. 3179 */ 3180 if (!pde_cmpset(pde, oldpde, newpde & ~PG_PROMOTED)) 3181 goto retry; 3182 if ((oldpde & PG_G) != 0) 3183 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); 3184 else 3185 anychanged = TRUE; 3186 } 3187 return (anychanged); 3188} 3189 3190/* 3191 * Set the physical protection on the 3192 * specified range of this map as requested. 3193 */ 3194void 3195pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 3196{ 3197 vm_offset_t pdnxt; 3198 pd_entry_t ptpaddr; 3199 pt_entry_t *pte; 3200 boolean_t anychanged, pv_lists_locked; 3201 3202 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); 3203 if (prot == VM_PROT_NONE) { 3204 pmap_remove(pmap, sva, eva); 3205 return; 3206 } 3207 3208#if defined(PAE) || defined(PAE_TABLES) 3209 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 3210 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 3211 return; 3212#else 3213 if (prot & VM_PROT_WRITE) 3214 return; 3215#endif 3216 3217 if (pmap_is_current(pmap)) 3218 pv_lists_locked = FALSE; 3219 else { 3220 pv_lists_locked = TRUE; 3221resume: 3222 rw_wlock(&pvh_global_lock); 3223 sched_pin(); 3224 } 3225 anychanged = FALSE; 3226 3227 PMAP_LOCK(pmap); 3228 for (; sva < eva; sva = pdnxt) { 3229 pt_entry_t obits, pbits; 3230 u_int pdirindex; 3231 3232 pdnxt = (sva + NBPDR) & ~PDRMASK; 3233 if (pdnxt < sva) 3234 pdnxt = eva; 3235 3236 pdirindex = sva >> PDRSHIFT; 3237 ptpaddr = pmap->pm_pdir[pdirindex]; 3238 3239 /* 3240 * Weed out invalid mappings. Note: we assume that the page 3241 * directory table is always allocated, and in kernel virtual. 3242 */ 3243 if (ptpaddr == 0) 3244 continue; 3245 3246 /* 3247 * Check for large page. 3248 */ 3249 if ((ptpaddr & PG_PS) != 0) { 3250 /* 3251 * Are we protecting the entire large page? If not, 3252 * demote the mapping and fall through. 3253 */ 3254 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 3255 /* 3256 * The TLB entry for a PG_G mapping is 3257 * invalidated by pmap_protect_pde(). 3258 */ 3259 if (pmap_protect_pde(pmap, 3260 &pmap->pm_pdir[pdirindex], sva, prot)) 3261 anychanged = TRUE; 3262 continue; 3263 } else { 3264 if (!pv_lists_locked) { 3265 pv_lists_locked = TRUE; 3266 if (!rw_try_wlock(&pvh_global_lock)) { 3267 if (anychanged) 3268 pmap_invalidate_all( 3269 pmap); 3270 PMAP_UNLOCK(pmap); 3271 goto resume; 3272 } 3273 sched_pin(); 3274 } 3275 if (!pmap_demote_pde(pmap, 3276 &pmap->pm_pdir[pdirindex], sva)) { 3277 /* 3278 * The large page mapping was 3279 * destroyed. 3280 */ 3281 continue; 3282 } 3283 } 3284 } 3285 3286 if (pdnxt > eva) 3287 pdnxt = eva; 3288 3289 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 3290 sva += PAGE_SIZE) { 3291 vm_page_t m; 3292 3293retry: 3294 /* 3295 * Regardless of whether a pte is 32 or 64 bits in 3296 * size, PG_RW, PG_A, and PG_M are among the least 3297 * significant 32 bits. 
3298 */ 3299 obits = pbits = *pte; 3300 if ((pbits & PG_V) == 0) 3301 continue; 3302 3303 if ((prot & VM_PROT_WRITE) == 0) { 3304 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 3305 (PG_MANAGED | PG_M | PG_RW)) { 3306 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 3307 vm_page_dirty(m); 3308 } 3309 pbits &= ~(PG_RW | PG_M); 3310 } 3311#if defined(PAE) || defined(PAE_TABLES) 3312 if ((prot & VM_PROT_EXECUTE) == 0) 3313 pbits |= pg_nx; 3314#endif 3315 3316 if (pbits != obits) { 3317#if defined(PAE) || defined(PAE_TABLES) 3318 if (!atomic_cmpset_64(pte, obits, pbits)) 3319 goto retry; 3320#else 3321 if (!atomic_cmpset_int((u_int *)pte, obits, 3322 pbits)) 3323 goto retry; 3324#endif 3325 if (obits & PG_G) 3326 pmap_invalidate_page(pmap, sva); 3327 else 3328 anychanged = TRUE; 3329 } 3330 } 3331 } 3332 if (anychanged) 3333 pmap_invalidate_all(pmap); 3334 if (pv_lists_locked) { 3335 sched_unpin(); 3336 rw_wunlock(&pvh_global_lock); 3337 } 3338 PMAP_UNLOCK(pmap); 3339} 3340 3341#if VM_NRESERVLEVEL > 0 3342/* 3343 * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are 3344 * within a single page table page (PTP) to a single 2- or 4MB page mapping. 3345 * For promotion to occur, two conditions must be met: (1) the 4KB page 3346 * mappings must map aligned, contiguous physical memory and (2) the 4KB page 3347 * mappings must have identical characteristics. 3348 * 3349 * Managed (PG_MANAGED) mappings within the kernel address space are not 3350 * promoted. The reason is that kernel PDEs are replicated in each pmap but 3351 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel 3352 * pmap. 3353 */ 3354static void 3355pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 3356{ 3357 pd_entry_t newpde; 3358 pt_entry_t *firstpte, oldpte, pa, *pte; 3359 vm_offset_t oldpteva; 3360 vm_page_t mpte; 3361 3362 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3363 3364 /* 3365 * Examine the first PTE in the specified PTP. Abort if this PTE is 3366 * either invalid, unused, or does not map the first 4KB physical page 3367 * within a 2- or 4MB page. 3368 */ 3369 firstpte = pmap_pte_quick(pmap, trunc_4mpage(va)); 3370setpde: 3371 newpde = *firstpte; 3372 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 3373 pmap_pde_p_failures++; 3374 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3375 " in pmap %p", va, pmap); 3376 return; 3377 } 3378 if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { 3379 pmap_pde_p_failures++; 3380 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3381 " in pmap %p", va, pmap); 3382 return; 3383 } 3384 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 3385 /* 3386 * When PG_M is already clear, PG_RW can be cleared without 3387 * a TLB invalidation. 3388 */ 3389 if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & 3390 ~PG_RW)) 3391 goto setpde; 3392 newpde &= ~PG_RW; 3393 } 3394 3395 /* 3396 * Examine each of the other PTEs in the specified PTP. Abort if this 3397 * PTE maps an unexpected 4KB physical page or does not have identical 3398 * characteristics to the first PTE. 
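	 * The loop below walks the PTEs from last to first; "pa" carries the
	 * physical address, plus the required attribute bits, that each
	 * successive PTE must match, and is decremented by PAGE_SIZE per
	 * step.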
3399 */ 3400 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; 3401 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 3402setpte: 3403 oldpte = *pte; 3404 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 3405 pmap_pde_p_failures++; 3406 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3407 " in pmap %p", va, pmap); 3408 return; 3409 } 3410 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 3411 /* 3412 * When PG_M is already clear, PG_RW can be cleared 3413 * without a TLB invalidation. 3414 */ 3415 if (!atomic_cmpset_int((u_int *)pte, oldpte, 3416 oldpte & ~PG_RW)) 3417 goto setpte; 3418 oldpte &= ~PG_RW; 3419 oldpteva = (oldpte & PG_FRAME & PDRMASK) | 3420 (va & ~PDRMASK); 3421 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" 3422 " in pmap %p", oldpteva, pmap); 3423 } 3424 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 3425 pmap_pde_p_failures++; 3426 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3427 " in pmap %p", va, pmap); 3428 return; 3429 } 3430 pa -= PAGE_SIZE; 3431 } 3432 3433 /* 3434 * Save the page table page in its current state until the PDE 3435 * mapping the superpage is demoted by pmap_demote_pde() or 3436 * destroyed by pmap_remove_pde(). 3437 */ 3438 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 3439 KASSERT(mpte >= vm_page_array && 3440 mpte < &vm_page_array[vm_page_array_size], 3441 ("pmap_promote_pde: page table page is out of range")); 3442 KASSERT(mpte->pindex == va >> PDRSHIFT, 3443 ("pmap_promote_pde: page table page's pindex is wrong")); 3444 if (pmap_insert_pt_page(pmap, mpte)) { 3445 pmap_pde_p_failures++; 3446 CTR2(KTR_PMAP, 3447 "pmap_promote_pde: failure for va %#x in pmap %p", va, 3448 pmap); 3449 return; 3450 } 3451 3452 /* 3453 * Promote the pv entries. 3454 */ 3455 if ((newpde & PG_MANAGED) != 0) 3456 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); 3457 3458 /* 3459 * Propagate the PAT index to its proper position. 3460 */ 3461 if ((newpde & PG_PTE_PAT) != 0) 3462 newpde ^= PG_PDE_PAT | PG_PTE_PAT; 3463 3464 /* 3465 * Map the superpage. 3466 */ 3467 if (workaround_erratum383) 3468 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 3469 else if (pmap == kernel_pmap) 3470 pmap_kenter_pde(va, PG_PROMOTED | PG_PS | newpde); 3471 else 3472 pde_store(pde, PG_PROMOTED | PG_PS | newpde); 3473 3474 pmap_pde_promotions++; 3475 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" 3476 " in pmap %p", va, pmap); 3477} 3478#endif /* VM_NRESERVLEVEL > 0 */ 3479 3480/* 3481 * Insert the given physical page (p) at 3482 * the specified virtual address (v) in the 3483 * target physical map with the protection requested. 3484 * 3485 * If specified, the page will be wired down, meaning 3486 * that the related pte can not be reclaimed. 3487 * 3488 * NB: This is the only routine which MAY NOT lazy-evaluate 3489 * or lose information. That is, this routine must actually 3490 * insert this page into the given map NOW. 
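 * Returns KERN_SUCCESS, or KERN_RESOURCE_SHORTAGE when PMAP_ENTER_NOSLEEP
 * was specified and a page table page could not be allocated.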
3491 */ 3492int 3493pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 3494 u_int flags, int8_t psind) 3495{ 3496 pd_entry_t *pde; 3497 pt_entry_t *pte; 3498 pt_entry_t newpte, origpte; 3499 pv_entry_t pv; 3500 vm_paddr_t opa, pa; 3501 vm_page_t mpte, om; 3502 boolean_t invlva, wired; 3503 3504 va = trunc_page(va); 3505 mpte = NULL; 3506 wired = (flags & PMAP_ENTER_WIRED) != 0; 3507 3508 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 3509 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 3510 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", 3511 va)); 3512 if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) 3513 VM_OBJECT_ASSERT_LOCKED(m->object); 3514 3515 rw_wlock(&pvh_global_lock); 3516 PMAP_LOCK(pmap); 3517 sched_pin(); 3518 3519 pde = pmap_pde(pmap, va); 3520 if (va < VM_MAXUSER_ADDRESS) { 3521 /* 3522 * va is for UVA. 3523 * In the case that a page table page is not resident, 3524 * we are creating it here. pmap_allocpte() handles 3525 * demotion. 3526 */ 3527 mpte = pmap_allocpte(pmap, va, flags); 3528 if (mpte == NULL) { 3529 KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, 3530 ("pmap_allocpte failed with sleep allowed")); 3531 sched_unpin(); 3532 rw_wunlock(&pvh_global_lock); 3533 PMAP_UNLOCK(pmap); 3534 return (KERN_RESOURCE_SHORTAGE); 3535 } 3536 } else { 3537 /* 3538 * va is for KVA, so pmap_demote_pde() will never fail 3539 * to install a page table page. PG_V is also 3540 * asserted by pmap_demote_pde(). 3541 */ 3542 KASSERT(pde != NULL && (*pde & PG_V) != 0, 3543 ("KVA %#x invalid pde pdir %#jx", va, 3544 (uintmax_t)pmap->pm_pdir[PTDPTDI])); 3545 if ((*pde & PG_PS) != 0) 3546 pmap_demote_pde(pmap, pde, va); 3547 } 3548 pte = pmap_pte_quick(pmap, va); 3549 3550 /* 3551 * Page Directory table entry is not valid, which should not 3552 * happen. We should have either allocated the page table 3553 * page or demoted the existing mapping above. 3554 */ 3555 if (pte == NULL) { 3556 panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", 3557 (uintmax_t)pmap->pm_pdir[PTDPTDI], va); 3558 } 3559 3560 pa = VM_PAGE_TO_PHYS(m); 3561 om = NULL; 3562 origpte = *pte; 3563 opa = origpte & PG_FRAME; 3564 3565 /* 3566 * Mapping has not changed, must be protection or wiring change. 3567 */ 3568 if (origpte && (opa == pa)) { 3569 /* 3570 * Wiring change, just update stats. We don't worry about 3571 * wiring PT pages as they remain resident as long as there 3572 * are valid mappings in them. Hence, if a user page is wired, 3573 * the PT page will be also. 3574 */ 3575 if (wired && ((origpte & PG_W) == 0)) 3576 pmap->pm_stats.wired_count++; 3577 else if (!wired && (origpte & PG_W)) 3578 pmap->pm_stats.wired_count--; 3579 3580 /* 3581 * Remove extra pte reference 3582 */ 3583 if (mpte) 3584 mpte->wire_count--; 3585 3586 if (origpte & PG_MANAGED) { 3587 om = m; 3588 pa |= PG_MANAGED; 3589 } 3590 goto validate; 3591 } 3592 3593 pv = NULL; 3594 3595 /* 3596 * Mapping has changed, invalidate old range and fall through to 3597 * handle validating new mapping. 
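	 * If the old mapping was managed, its pv entry is detached here and
	 * then either reused for the new page or freed below.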
3598 */ 3599 if (opa) { 3600 if (origpte & PG_W) 3601 pmap->pm_stats.wired_count--; 3602 if (origpte & PG_MANAGED) { 3603 om = PHYS_TO_VM_PAGE(opa); 3604 pv = pmap_pvh_remove(&om->md, pmap, va); 3605 } 3606 if (mpte != NULL) { 3607 mpte->wire_count--; 3608 KASSERT(mpte->wire_count > 0, 3609 ("pmap_enter: missing reference to page table page," 3610 " va: 0x%x", va)); 3611 } 3612 } else 3613 pmap->pm_stats.resident_count++; 3614 3615 /* 3616 * Enter on the PV list if part of our managed memory. 3617 */ 3618 if ((m->oflags & VPO_UNMANAGED) == 0) { 3619 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, 3620 ("pmap_enter: managed mapping within the clean submap")); 3621 if (pv == NULL) 3622 pv = get_pv_entry(pmap, FALSE); 3623 pv->pv_va = va; 3624 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 3625 pa |= PG_MANAGED; 3626 } else if (pv != NULL) 3627 free_pv_entry(pmap, pv); 3628 3629 /* 3630 * Increment counters 3631 */ 3632 if (wired) 3633 pmap->pm_stats.wired_count++; 3634 3635validate: 3636 /* 3637 * Now validate mapping with desired protection/wiring. 3638 */ 3639 newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V); 3640 if ((prot & VM_PROT_WRITE) != 0) { 3641 newpte |= PG_RW; 3642 if ((newpte & PG_MANAGED) != 0) 3643 vm_page_aflag_set(m, PGA_WRITEABLE); 3644 } 3645#if defined(PAE) || defined(PAE_TABLES) 3646 if ((prot & VM_PROT_EXECUTE) == 0) 3647 newpte |= pg_nx; 3648#endif 3649 if (wired) 3650 newpte |= PG_W; 3651 if (va < VM_MAXUSER_ADDRESS) 3652 newpte |= PG_U; 3653 if (pmap == kernel_pmap) 3654 newpte |= pgeflag; 3655 3656 /* 3657 * if the mapping or permission bits are different, we need 3658 * to update the pte. 3659 */ 3660 if ((origpte & ~(PG_M|PG_A)) != newpte) { 3661 newpte |= PG_A; 3662 if ((flags & VM_PROT_WRITE) != 0) 3663 newpte |= PG_M; 3664 if (origpte & PG_V) { 3665 invlva = FALSE; 3666 origpte = pte_load_store(pte, newpte); 3667 if (origpte & PG_A) { 3668 if (origpte & PG_MANAGED) 3669 vm_page_aflag_set(om, PGA_REFERENCED); 3670 if (opa != VM_PAGE_TO_PHYS(m)) 3671 invlva = TRUE; 3672#if defined(PAE) || defined(PAE_TABLES) 3673 if ((origpte & PG_NX) == 0 && 3674 (newpte & PG_NX) != 0) 3675 invlva = TRUE; 3676#endif 3677 } 3678 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 3679 if ((origpte & PG_MANAGED) != 0) 3680 vm_page_dirty(om); 3681 if ((prot & VM_PROT_WRITE) == 0) 3682 invlva = TRUE; 3683 } 3684 if ((origpte & PG_MANAGED) != 0 && 3685 TAILQ_EMPTY(&om->md.pv_list) && 3686 ((om->flags & PG_FICTITIOUS) != 0 || 3687 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 3688 vm_page_aflag_clear(om, PGA_WRITEABLE); 3689 if (invlva) 3690 pmap_invalidate_page(pmap, va); 3691 } else 3692 pte_store(pte, newpte); 3693 } 3694 3695#if VM_NRESERVLEVEL > 0 3696 /* 3697 * If both the page table page and the reservation are fully 3698 * populated, then attempt promotion. 3699 */ 3700 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 3701 pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && 3702 vm_reserv_level_iffullpop(m) == 0) 3703 pmap_promote_pde(pmap, pde, va); 3704#endif 3705 3706 sched_unpin(); 3707 rw_wunlock(&pvh_global_lock); 3708 PMAP_UNLOCK(pmap); 3709 return (KERN_SUCCESS); 3710} 3711 3712/* 3713 * Tries to create a 2- or 4MB page mapping. Returns TRUE if successful and 3714 * FALSE otherwise. Fails if (1) a page table page cannot be allocated without 3715 * blocking, (2) a mapping already exists at the specified virtual address, or 3716 * (3) a pv entry cannot be allocated without reclaiming another pv entry. 
3717 */ 3718static boolean_t 3719pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3720{ 3721 pd_entry_t *pde, newpde; 3722 3723 rw_assert(&pvh_global_lock, RA_WLOCKED); 3724 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3725 pde = pmap_pde(pmap, va); 3726 if (*pde != 0) { 3727 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3728 " in pmap %p", va, pmap); 3729 return (FALSE); 3730 } 3731 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) | 3732 PG_PS | PG_V; 3733 if ((m->oflags & VPO_UNMANAGED) == 0) { 3734 newpde |= PG_MANAGED; 3735 3736 /* 3737 * Abort this mapping if its PV entry could not be created. 3738 */ 3739 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) { 3740 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3741 " in pmap %p", va, pmap); 3742 return (FALSE); 3743 } 3744 } 3745#if defined(PAE) || defined(PAE_TABLES) 3746 if ((prot & VM_PROT_EXECUTE) == 0) 3747 newpde |= pg_nx; 3748#endif 3749 if (va < VM_MAXUSER_ADDRESS) 3750 newpde |= PG_U; 3751 3752 /* 3753 * Increment counters. 3754 */ 3755 pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; 3756 3757 /* 3758 * Map the superpage. (This is not a promoted mapping; there will not 3759 * be any lingering 4KB page mappings in the TLB.) 3760 */ 3761 pde_store(pde, newpde); 3762 3763 pmap_pde_mappings++; 3764 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 3765 " in pmap %p", va, pmap); 3766 return (TRUE); 3767} 3768 3769/* 3770 * Maps a sequence of resident pages belonging to the same object. 3771 * The sequence begins with the given page m_start. This page is 3772 * mapped at the given virtual address start. Each subsequent page is 3773 * mapped at a virtual address that is offset from start by the same 3774 * amount as the page is offset from m_start within the object. The 3775 * last page in the sequence is the page with the largest offset from 3776 * m_start that can be mapped at a virtual address less than the given 3777 * virtual address end. Not every virtual page between start and end 3778 * is mapped; only those for which a resident page exists with the 3779 * corresponding offset from m_start are mapped. 3780 */ 3781void 3782pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3783 vm_page_t m_start, vm_prot_t prot) 3784{ 3785 vm_offset_t va; 3786 vm_page_t m, mpte; 3787 vm_pindex_t diff, psize; 3788 3789 VM_OBJECT_ASSERT_LOCKED(m_start->object); 3790 3791 psize = atop(end - start); 3792 mpte = NULL; 3793 m = m_start; 3794 rw_wlock(&pvh_global_lock); 3795 PMAP_LOCK(pmap); 3796 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3797 va = start + ptoa(diff); 3798 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 3799 m->psind == 1 && pg_ps_enabled && 3800 pmap_enter_pde(pmap, va, m, prot)) 3801 m = &m[NBPDR / PAGE_SIZE - 1]; 3802 else 3803 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 3804 mpte); 3805 m = TAILQ_NEXT(m, listq); 3806 } 3807 rw_wunlock(&pvh_global_lock); 3808 PMAP_UNLOCK(pmap); 3809} 3810 3811/* 3812 * this code makes some *MAJOR* assumptions: 3813 * 1. Current pmap & pmap exists. 3814 * 2. Not wired. 3815 * 3. Read access. 3816 * 4. No page table pages. 3817 * but is *MUCH* faster than pmap_enter... 
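 * If a mapping already exists at the given address it is left untouched,
 * and the new mapping is silently skipped.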
3818 */ 3819 3820void 3821pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3822{ 3823 3824 rw_wlock(&pvh_global_lock); 3825 PMAP_LOCK(pmap); 3826 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); 3827 rw_wunlock(&pvh_global_lock); 3828 PMAP_UNLOCK(pmap); 3829} 3830 3831static vm_page_t 3832pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3833 vm_prot_t prot, vm_page_t mpte) 3834{ 3835 pt_entry_t newpte, *pte; 3836 struct spglist free; 3837 3838 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 3839 (m->oflags & VPO_UNMANAGED) != 0, 3840 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3841 rw_assert(&pvh_global_lock, RA_WLOCKED); 3842 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3843 3844 /* 3845 * In the case that a page table page is not 3846 * resident, we are creating it here. 3847 */ 3848 if (va < VM_MAXUSER_ADDRESS) { 3849 u_int ptepindex; 3850 pd_entry_t ptepa; 3851 3852 /* 3853 * Calculate pagetable page index 3854 */ 3855 ptepindex = va >> PDRSHIFT; 3856 if (mpte && (mpte->pindex == ptepindex)) { 3857 mpte->wire_count++; 3858 } else { 3859 /* 3860 * Get the page directory entry 3861 */ 3862 ptepa = pmap->pm_pdir[ptepindex]; 3863 3864 /* 3865 * If the page table page is mapped, we just increment 3866 * the hold count, and activate it. 3867 */ 3868 if (ptepa) { 3869 if (ptepa & PG_PS) 3870 return (NULL); 3871 mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); 3872 mpte->wire_count++; 3873 } else { 3874 mpte = _pmap_allocpte(pmap, ptepindex, 3875 PMAP_ENTER_NOSLEEP); 3876 if (mpte == NULL) 3877 return (mpte); 3878 } 3879 } 3880 } else { 3881 mpte = NULL; 3882 } 3883 3884 /* 3885 * This call to vtopte makes the assumption that we are 3886 * entering the page into the current pmap. In order to support 3887 * quick entry into any pmap, one would likely use pmap_pte_quick. 3888 * But that isn't as quick as vtopte. 3889 */ 3890 pte = vtopte(va); 3891 if (*pte) { 3892 if (mpte != NULL) { 3893 mpte->wire_count--; 3894 mpte = NULL; 3895 } 3896 return (mpte); 3897 } 3898 3899 /* 3900 * Enter on the PV list if part of our managed memory. 3901 */ 3902 if ((m->oflags & VPO_UNMANAGED) == 0 && 3903 !pmap_try_insert_pv_entry(pmap, va, m)) { 3904 if (mpte != NULL) { 3905 SLIST_INIT(&free); 3906 if (pmap_unwire_ptp(pmap, mpte, &free)) { 3907 pmap_invalidate_page(pmap, va); 3908 pmap_free_zero_pages(&free); 3909 } 3910 3911 mpte = NULL; 3912 } 3913 return (mpte); 3914 } 3915 3916 /* 3917 * Increment counters 3918 */ 3919 pmap->pm_stats.resident_count++; 3920 3921 newpte = VM_PAGE_TO_PHYS(m) | PG_V | 3922 pmap_cache_bits(m->md.pat_mode, 0); 3923 if ((m->oflags & VPO_UNMANAGED) == 0) 3924 newpte |= PG_MANAGED; 3925#if defined(PAE) || defined(PAE_TABLES) 3926 if ((prot & VM_PROT_EXECUTE) == 0) 3927 newpte |= pg_nx; 3928#endif 3929 if (pmap != kernel_pmap) 3930 newpte |= PG_U; 3931 pte_store(pte, newpte); 3932 return (mpte); 3933} 3934 3935/* 3936 * Make a temporary mapping for a physical address. This is only intended 3937 * to be used for panic dumps. 3938 */ 3939void * 3940pmap_kenter_temporary(vm_paddr_t pa, int i) 3941{ 3942 vm_offset_t va; 3943 3944 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 3945 pmap_kenter(va, pa); 3946 invlpg(va); 3947 return ((void *)crashdumpmap); 3948} 3949 3950/* 3951 * This code maps large physical mmap regions into the 3952 * processor address space. Note that some shortcuts 3953 * are taken, but the code works. 
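 * Only device and SG objects are handled, and only when PSE is available
 * (pseflag) and both "addr" and "size" are 2/4MB aligned; any
 * misalignment or physical discontiguity simply leaves the range
 * unmapped.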
3954 */ 3955void 3956pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3957 vm_pindex_t pindex, vm_size_t size) 3958{ 3959 pd_entry_t *pde; 3960 vm_paddr_t pa, ptepa; 3961 vm_page_t p; 3962 int pat_mode; 3963 3964 VM_OBJECT_ASSERT_WLOCKED(object); 3965 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3966 ("pmap_object_init_pt: non-device object")); 3967 if (pseflag && 3968 (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 3969 if (!vm_object_populate(object, pindex, pindex + atop(size))) 3970 return; 3971 p = vm_page_lookup(object, pindex); 3972 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3973 ("pmap_object_init_pt: invalid page %p", p)); 3974 pat_mode = p->md.pat_mode; 3975 3976 /* 3977 * Abort the mapping if the first page is not physically 3978 * aligned to a 2/4MB page boundary. 3979 */ 3980 ptepa = VM_PAGE_TO_PHYS(p); 3981 if (ptepa & (NBPDR - 1)) 3982 return; 3983 3984 /* 3985 * Skip the first page. Abort the mapping if the rest of 3986 * the pages are not physically contiguous or have differing 3987 * memory attributes. 3988 */ 3989 p = TAILQ_NEXT(p, listq); 3990 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 3991 pa += PAGE_SIZE) { 3992 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3993 ("pmap_object_init_pt: invalid page %p", p)); 3994 if (pa != VM_PAGE_TO_PHYS(p) || 3995 pat_mode != p->md.pat_mode) 3996 return; 3997 p = TAILQ_NEXT(p, listq); 3998 } 3999 4000 /* 4001 * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and 4002 * "size" is a multiple of 2/4M, adding the PAT setting to 4003 * "pa" will not affect the termination of this loop. 4004 */ 4005 PMAP_LOCK(pmap); 4006 for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa + 4007 size; pa += NBPDR) { 4008 pde = pmap_pde(pmap, addr); 4009 if (*pde == 0) { 4010 pde_store(pde, pa | PG_PS | PG_M | PG_A | 4011 PG_U | PG_RW | PG_V); 4012 pmap->pm_stats.resident_count += NBPDR / 4013 PAGE_SIZE; 4014 pmap_pde_mappings++; 4015 } 4016 /* Else continue on if the PDE is already valid. */ 4017 addr += NBPDR; 4018 } 4019 PMAP_UNLOCK(pmap); 4020 } 4021} 4022 4023/* 4024 * Clear the wired attribute from the mappings for the specified range of 4025 * addresses in the given pmap. Every valid mapping within that range 4026 * must have the wired attribute set. In contrast, invalid mappings 4027 * cannot have the wired attribute set, so they are ignored. 4028 * 4029 * The wired attribute of the page table entry is not a hardware feature, 4030 * so there is no need to invalidate any TLB entries. 4031 */ 4032void 4033pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 4034{ 4035 vm_offset_t pdnxt; 4036 pd_entry_t *pde; 4037 pt_entry_t *pte; 4038 boolean_t pv_lists_locked; 4039 4040 if (pmap_is_current(pmap)) 4041 pv_lists_locked = FALSE; 4042 else { 4043 pv_lists_locked = TRUE; 4044resume: 4045 rw_wlock(&pvh_global_lock); 4046 sched_pin(); 4047 } 4048 PMAP_LOCK(pmap); 4049 for (; sva < eva; sva = pdnxt) { 4050 pdnxt = (sva + NBPDR) & ~PDRMASK; 4051 if (pdnxt < sva) 4052 pdnxt = eva; 4053 pde = pmap_pde(pmap, sva); 4054 if ((*pde & PG_V) == 0) 4055 continue; 4056 if ((*pde & PG_PS) != 0) { 4057 if ((*pde & PG_W) == 0) 4058 panic("pmap_unwire: pde %#jx is missing PG_W", 4059 (uintmax_t)*pde); 4060 4061 /* 4062 * Are we unwiring the entire large page? If not, 4063 * demote the mapping and fall through. 4064 */ 4065 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 4066 /* 4067 * Regardless of whether a pde (or pte) is 32 4068 * or 64 bits in size, PG_W is among the least 4069 * significant 32 bits. 
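 *
 * As a sketch of why this is safe (assuming the little-endian layout
 * used here): under PAE a pd_entry_t is 8 bytes, but PG_W and the other
 * flag bits all sit in bits 0..31, so
 *
 *	atomic_clear_int((u_int *)pde, PG_W);	// touches only the low word
 *
 * clears the flag correctly whether the entry is 4 or 8 bytes wide; the
 * high word, which holds the upper address bits (and NX), is never
 * written.  The same reasoning is applied to PG_A, PG_M and PG_RW
 * elsewhere in this file.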
4070 */ 4071 atomic_clear_int((u_int *)pde, PG_W); 4072 pmap->pm_stats.wired_count -= NBPDR / 4073 PAGE_SIZE; 4074 continue; 4075 } else { 4076 if (!pv_lists_locked) { 4077 pv_lists_locked = TRUE; 4078 if (!rw_try_wlock(&pvh_global_lock)) { 4079 PMAP_UNLOCK(pmap); 4080 /* Repeat sva. */ 4081 goto resume; 4082 } 4083 sched_pin(); 4084 } 4085 if (!pmap_demote_pde(pmap, pde, sva)) 4086 panic("pmap_unwire: demotion failed"); 4087 } 4088 } 4089 if (pdnxt > eva) 4090 pdnxt = eva; 4091 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 4092 sva += PAGE_SIZE) { 4093 if ((*pte & PG_V) == 0) 4094 continue; 4095 if ((*pte & PG_W) == 0) 4096 panic("pmap_unwire: pte %#jx is missing PG_W", 4097 (uintmax_t)*pte); 4098 4099 /* 4100 * PG_W must be cleared atomically. Although the pmap 4101 * lock synchronizes access to PG_W, another processor 4102 * could be setting PG_M and/or PG_A concurrently. 4103 * 4104 * PG_W is among the least significant 32 bits. 4105 */ 4106 atomic_clear_int((u_int *)pte, PG_W); 4107 pmap->pm_stats.wired_count--; 4108 } 4109 } 4110 if (pv_lists_locked) { 4111 sched_unpin(); 4112 rw_wunlock(&pvh_global_lock); 4113 } 4114 PMAP_UNLOCK(pmap); 4115} 4116 4117 4118/* 4119 * Copy the range specified by src_addr/len 4120 * from the source map to the range dst_addr/len 4121 * in the destination map. 4122 * 4123 * This routine is only advisory and need not do anything. 4124 */ 4125 4126void 4127pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 4128 vm_offset_t src_addr) 4129{ 4130 struct spglist free; 4131 vm_offset_t addr; 4132 vm_offset_t end_addr = src_addr + len; 4133 vm_offset_t pdnxt; 4134 4135 if (dst_addr != src_addr) 4136 return; 4137 4138 if (!pmap_is_current(src_pmap)) 4139 return; 4140 4141 rw_wlock(&pvh_global_lock); 4142 if (dst_pmap < src_pmap) { 4143 PMAP_LOCK(dst_pmap); 4144 PMAP_LOCK(src_pmap); 4145 } else { 4146 PMAP_LOCK(src_pmap); 4147 PMAP_LOCK(dst_pmap); 4148 } 4149 sched_pin(); 4150 for (addr = src_addr; addr < end_addr; addr = pdnxt) { 4151 pt_entry_t *src_pte, *dst_pte; 4152 vm_page_t dstmpte, srcmpte; 4153 pd_entry_t srcptepaddr; 4154 u_int ptepindex; 4155 4156 KASSERT(addr < UPT_MIN_ADDRESS, 4157 ("pmap_copy: invalid to pmap_copy page tables")); 4158 4159 pdnxt = (addr + NBPDR) & ~PDRMASK; 4160 if (pdnxt < addr) 4161 pdnxt = end_addr; 4162 ptepindex = addr >> PDRSHIFT; 4163 4164 srcptepaddr = src_pmap->pm_pdir[ptepindex]; 4165 if (srcptepaddr == 0) 4166 continue; 4167 4168 if (srcptepaddr & PG_PS) { 4169 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 4170 continue; 4171 if (dst_pmap->pm_pdir[ptepindex] == 0 && 4172 ((srcptepaddr & PG_MANAGED) == 0 || 4173 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & 4174 PG_PS_FRAME))) { 4175 dst_pmap->pm_pdir[ptepindex] = srcptepaddr & 4176 ~PG_W; 4177 dst_pmap->pm_stats.resident_count += 4178 NBPDR / PAGE_SIZE; 4179 pmap_pde_mappings++; 4180 } 4181 continue; 4182 } 4183 4184 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); 4185 KASSERT(srcmpte->wire_count > 0, 4186 ("pmap_copy: source page table page is unused")); 4187 4188 if (pdnxt > end_addr) 4189 pdnxt = end_addr; 4190 4191 src_pte = vtopte(addr); 4192 while (addr < pdnxt) { 4193 pt_entry_t ptetemp; 4194 ptetemp = *src_pte; 4195 /* 4196 * we only virtual copy managed pages 4197 */ 4198 if ((ptetemp & PG_MANAGED) != 0) { 4199 dstmpte = pmap_allocpte(dst_pmap, addr, 4200 PMAP_ENTER_NOSLEEP); 4201 if (dstmpte == NULL) 4202 goto out; 4203 dst_pte = pmap_pte_quick(dst_pmap, addr); 4204 if (*dst_pte == 0 && 4205 
pmap_try_insert_pv_entry(dst_pmap, addr, 4206 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { 4207 /* 4208 * Clear the wired, modified, and 4209 * accessed (referenced) bits 4210 * during the copy. 4211 */ 4212 *dst_pte = ptetemp & ~(PG_W | PG_M | 4213 PG_A); 4214 dst_pmap->pm_stats.resident_count++; 4215 } else { 4216 SLIST_INIT(&free); 4217 if (pmap_unwire_ptp(dst_pmap, dstmpte, 4218 &free)) { 4219 pmap_invalidate_page(dst_pmap, 4220 addr); 4221 pmap_free_zero_pages(&free); 4222 } 4223 goto out; 4224 } 4225 if (dstmpte->wire_count >= srcmpte->wire_count) 4226 break; 4227 } 4228 addr += PAGE_SIZE; 4229 src_pte++; 4230 } 4231 } 4232out: 4233 sched_unpin(); 4234 rw_wunlock(&pvh_global_lock); 4235 PMAP_UNLOCK(src_pmap); 4236 PMAP_UNLOCK(dst_pmap); 4237} 4238 4239static __inline void 4240pagezero(void *page) 4241{ 4242#if defined(I686_CPU) 4243 if (cpu_class == CPUCLASS_686) { 4244 if (cpu_feature & CPUID_SSE2) 4245 sse2_pagezero(page); 4246 else 4247 i686_pagezero(page); 4248 } else 4249#endif 4250 bzero(page, PAGE_SIZE); 4251} 4252 4253/* 4254 * pmap_zero_page zeros the specified hardware page by mapping 4255 * the page into KVM and using bzero to clear its contents. 4256 */ 4257void 4258pmap_zero_page(vm_page_t m) 4259{ 4260 pt_entry_t *cmap_pte2; 4261 struct pcpu *pc; 4262 4263 sched_pin(); 4264 pc = get_pcpu(); 4265 cmap_pte2 = pc->pc_cmap_pte2; 4266 mtx_lock(&pc->pc_cmap_lock); 4267 if (*cmap_pte2) 4268 panic("pmap_zero_page: CMAP2 busy"); 4269 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4270 pmap_cache_bits(m->md.pat_mode, 0); 4271 invlcaddr(pc->pc_cmap_addr2); 4272 pagezero(pc->pc_cmap_addr2); 4273 *cmap_pte2 = 0; 4274 4275 /* 4276 * Unpin the thread before releasing the lock. Otherwise the thread 4277 * could be rescheduled while still bound to the current CPU, only 4278 * to unpin itself immediately upon resuming execution. 4279 */ 4280 sched_unpin(); 4281 mtx_unlock(&pc->pc_cmap_lock); 4282} 4283 4284/* 4285 * pmap_zero_page_area zeros the specified hardware page by mapping 4286 * the page into KVM and using bzero to clear its contents. 4287 * 4288 * off and size may not cover an area beyond a single hardware page. 4289 */ 4290void 4291pmap_zero_page_area(vm_page_t m, int off, int size) 4292{ 4293 pt_entry_t *cmap_pte2; 4294 struct pcpu *pc; 4295 4296 sched_pin(); 4297 pc = get_pcpu(); 4298 cmap_pte2 = pc->pc_cmap_pte2; 4299 mtx_lock(&pc->pc_cmap_lock); 4300 if (*cmap_pte2) 4301 panic("pmap_zero_page_area: CMAP2 busy"); 4302 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4303 pmap_cache_bits(m->md.pat_mode, 0); 4304 invlcaddr(pc->pc_cmap_addr2); 4305 if (off == 0 && size == PAGE_SIZE) 4306 pagezero(pc->pc_cmap_addr2); 4307 else 4308 bzero(pc->pc_cmap_addr2 + off, size); 4309 *cmap_pte2 = 0; 4310 sched_unpin(); 4311 mtx_unlock(&pc->pc_cmap_lock); 4312} 4313 4314/* 4315 * pmap_zero_page_idle zeros the specified hardware page by mapping 4316 * the page into KVM and using bzero to clear its contents. This 4317 * is intended to be called from the vm_pagezero process only and 4318 * outside of Giant. 
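 *
 * Like pmap_zero_page() and pmap_copy_page() above, this depends on the
 * fact that i386 has no direct map: a physical page can only be touched
 * through a temporary kernel mapping.  The recurring pattern, roughly,
 * is
 *
 *	sched_pin();				// stay on this CPU
 *	*cmap_pte = PG_V | PG_RW | pa | ...;	// install a throwaway PTE
 *	invlcaddr(cmap_addr);			// discard any stale TLB entry
 *	... operate on cmap_addr ...
 *	*cmap_pte = 0;				// tear the mapping down
 *	sched_unpin();
 *
 * with the per-CPU variants additionally holding pc_cmap_lock around the
 * window.  This function uses the global CMAP3/CADDR3 slot instead
 * because it is only expected to run from the vm_pagezero thread.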
4319 */ 4320void 4321pmap_zero_page_idle(vm_page_t m) 4322{ 4323 4324 if (*CMAP3) 4325 panic("pmap_zero_page_idle: CMAP3 busy"); 4326 sched_pin(); 4327 *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4328 pmap_cache_bits(m->md.pat_mode, 0); 4329 invlcaddr(CADDR3); 4330 pagezero(CADDR3); 4331 *CMAP3 = 0; 4332 sched_unpin(); 4333} 4334 4335/* 4336 * pmap_copy_page copies the specified (machine independent) 4337 * page by mapping the page into virtual memory and using 4338 * bcopy to copy the page, one machine dependent page at a 4339 * time. 4340 */ 4341void 4342pmap_copy_page(vm_page_t src, vm_page_t dst) 4343{ 4344 pt_entry_t *cmap_pte1, *cmap_pte2; 4345 struct pcpu *pc; 4346 4347 sched_pin(); 4348 pc = get_pcpu(); 4349 cmap_pte1 = pc->pc_cmap_pte1; 4350 cmap_pte2 = pc->pc_cmap_pte2; 4351 mtx_lock(&pc->pc_cmap_lock); 4352 if (*cmap_pte1) 4353 panic("pmap_copy_page: CMAP1 busy"); 4354 if (*cmap_pte2) 4355 panic("pmap_copy_page: CMAP2 busy"); 4356 *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A | 4357 pmap_cache_bits(src->md.pat_mode, 0); 4358 invlcaddr(pc->pc_cmap_addr1); 4359 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M | 4360 pmap_cache_bits(dst->md.pat_mode, 0); 4361 invlcaddr(pc->pc_cmap_addr2); 4362 bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE); 4363 *cmap_pte1 = 0; 4364 *cmap_pte2 = 0; 4365 sched_unpin(); 4366 mtx_unlock(&pc->pc_cmap_lock); 4367} 4368 4369int unmapped_buf_allowed = 1; 4370 4371void 4372pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 4373 vm_offset_t b_offset, int xfersize) 4374{ 4375 vm_page_t a_pg, b_pg; 4376 char *a_cp, *b_cp; 4377 vm_offset_t a_pg_offset, b_pg_offset; 4378 pt_entry_t *cmap_pte1, *cmap_pte2; 4379 struct pcpu *pc; 4380 int cnt; 4381 4382 sched_pin(); 4383 pc = get_pcpu(); 4384 cmap_pte1 = pc->pc_cmap_pte1; 4385 cmap_pte2 = pc->pc_cmap_pte2; 4386 mtx_lock(&pc->pc_cmap_lock); 4387 if (*cmap_pte1 != 0) 4388 panic("pmap_copy_pages: CMAP1 busy"); 4389 if (*cmap_pte2 != 0) 4390 panic("pmap_copy_pages: CMAP2 busy"); 4391 while (xfersize > 0) { 4392 a_pg = ma[a_offset >> PAGE_SHIFT]; 4393 a_pg_offset = a_offset & PAGE_MASK; 4394 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 4395 b_pg = mb[b_offset >> PAGE_SHIFT]; 4396 b_pg_offset = b_offset & PAGE_MASK; 4397 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 4398 *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A | 4399 pmap_cache_bits(a_pg->md.pat_mode, 0); 4400 invlcaddr(pc->pc_cmap_addr1); 4401 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A | 4402 PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0); 4403 invlcaddr(pc->pc_cmap_addr2); 4404 a_cp = pc->pc_cmap_addr1 + a_pg_offset; 4405 b_cp = pc->pc_cmap_addr2 + b_pg_offset; 4406 bcopy(a_cp, b_cp, cnt); 4407 a_offset += cnt; 4408 b_offset += cnt; 4409 xfersize -= cnt; 4410 } 4411 *cmap_pte1 = 0; 4412 *cmap_pte2 = 0; 4413 sched_unpin(); 4414 mtx_unlock(&pc->pc_cmap_lock); 4415} 4416 4417/* 4418 * Returns true if the pmap's pv is one of the first 4419 * 16 pvs linked to from this page. This count may 4420 * be changed upwards or downwards in the future; it 4421 * is only necessary that true be returned for a small 4422 * subset of pmaps for proper page aging. 
4423 */ 4424boolean_t 4425pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 4426{ 4427 struct md_page *pvh; 4428 pv_entry_t pv; 4429 int loops = 0; 4430 boolean_t rv; 4431 4432 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4433 ("pmap_page_exists_quick: page %p is not managed", m)); 4434 rv = FALSE; 4435 rw_wlock(&pvh_global_lock); 4436 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4437 if (PV_PMAP(pv) == pmap) { 4438 rv = TRUE; 4439 break; 4440 } 4441 loops++; 4442 if (loops >= 16) 4443 break; 4444 } 4445 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 4446 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4447 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4448 if (PV_PMAP(pv) == pmap) { 4449 rv = TRUE; 4450 break; 4451 } 4452 loops++; 4453 if (loops >= 16) 4454 break; 4455 } 4456 } 4457 rw_wunlock(&pvh_global_lock); 4458 return (rv); 4459} 4460 4461/* 4462 * pmap_page_wired_mappings: 4463 * 4464 * Return the number of managed mappings to the given physical page 4465 * that are wired. 4466 */ 4467int 4468pmap_page_wired_mappings(vm_page_t m) 4469{ 4470 int count; 4471 4472 count = 0; 4473 if ((m->oflags & VPO_UNMANAGED) != 0) 4474 return (count); 4475 rw_wlock(&pvh_global_lock); 4476 count = pmap_pvh_wired_mappings(&m->md, count); 4477 if ((m->flags & PG_FICTITIOUS) == 0) { 4478 count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), 4479 count); 4480 } 4481 rw_wunlock(&pvh_global_lock); 4482 return (count); 4483} 4484 4485/* 4486 * pmap_pvh_wired_mappings: 4487 * 4488 * Return the updated number "count" of managed mappings that are wired. 4489 */ 4490static int 4491pmap_pvh_wired_mappings(struct md_page *pvh, int count) 4492{ 4493 pmap_t pmap; 4494 pt_entry_t *pte; 4495 pv_entry_t pv; 4496 4497 rw_assert(&pvh_global_lock, RA_WLOCKED); 4498 sched_pin(); 4499 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4500 pmap = PV_PMAP(pv); 4501 PMAP_LOCK(pmap); 4502 pte = pmap_pte_quick(pmap, pv->pv_va); 4503 if ((*pte & PG_W) != 0) 4504 count++; 4505 PMAP_UNLOCK(pmap); 4506 } 4507 sched_unpin(); 4508 return (count); 4509} 4510 4511/* 4512 * Returns TRUE if the given page is mapped individually or as part of 4513 * a 4mpage. Otherwise, returns FALSE. 4514 */ 4515boolean_t 4516pmap_page_is_mapped(vm_page_t m) 4517{ 4518 boolean_t rv; 4519 4520 if ((m->oflags & VPO_UNMANAGED) != 0) 4521 return (FALSE); 4522 rw_wlock(&pvh_global_lock); 4523 rv = !TAILQ_EMPTY(&m->md.pv_list) || 4524 ((m->flags & PG_FICTITIOUS) == 0 && 4525 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 4526 rw_wunlock(&pvh_global_lock); 4527 return (rv); 4528} 4529 4530/* 4531 * Remove all pages from specified address space 4532 * this aids process exit speeds. Also, this code 4533 * is special cased for current process only, but 4534 * can have the more generic (and slightly slower) 4535 * mode enabled. This is much faster than pmap_remove 4536 * in the case of running down an entire address space. 
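 *
 * The walk below never scans page tables; it enumerates this pmap's PV
 * entries by scanning the allocation bitmaps of its pv_chunks.  The
 * inner loop is the usual "peel off the lowest set bit" idiom, roughly:
 *
 *	inuse = ~pc->pc_map[field] & pc_freemask[field];  // allocated slots
 *	while (inuse != 0) {
 *		bit = bsfl(inuse);		// index of the lowest 1 bit
 *		inuse &= ~(1UL << bit);		// consume it
 *		// pc->pc_pventry[field * 32 + bit] is a live pv_entry
 *	}
 *
 * so the cost is proportional to the number of mappings rather than to
 * the size of the address space.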
4537 */ 4538void 4539pmap_remove_pages(pmap_t pmap) 4540{ 4541 pt_entry_t *pte, tpte; 4542 vm_page_t m, mpte, mt; 4543 pv_entry_t pv; 4544 struct md_page *pvh; 4545 struct pv_chunk *pc, *npc; 4546 struct spglist free; 4547 int field, idx; 4548 int32_t bit; 4549 uint32_t inuse, bitmask; 4550 int allfree; 4551 4552 if (pmap != PCPU_GET(curpmap)) { 4553 printf("warning: pmap_remove_pages called with non-current pmap\n"); 4554 return; 4555 } 4556 SLIST_INIT(&free); 4557 rw_wlock(&pvh_global_lock); 4558 PMAP_LOCK(pmap); 4559 sched_pin(); 4560 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 4561 KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, 4562 pc->pc_pmap)); 4563 allfree = 1; 4564 for (field = 0; field < _NPCM; field++) { 4565 inuse = ~pc->pc_map[field] & pc_freemask[field]; 4566 while (inuse != 0) { 4567 bit = bsfl(inuse); 4568 bitmask = 1UL << bit; 4569 idx = field * 32 + bit; 4570 pv = &pc->pc_pventry[idx]; 4571 inuse &= ~bitmask; 4572 4573 pte = pmap_pde(pmap, pv->pv_va); 4574 tpte = *pte; 4575 if ((tpte & PG_PS) == 0) { 4576 pte = vtopte(pv->pv_va); 4577 tpte = *pte & ~PG_PTE_PAT; 4578 } 4579 4580 if (tpte == 0) { 4581 printf( 4582 "TPTE at %p IS ZERO @ VA %08x\n", 4583 pte, pv->pv_va); 4584 panic("bad pte"); 4585 } 4586 4587/* 4588 * We cannot remove wired pages from a process' mapping at this time 4589 */ 4590 if (tpte & PG_W) { 4591 allfree = 0; 4592 continue; 4593 } 4594 4595 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 4596 KASSERT(m->phys_addr == (tpte & PG_FRAME), 4597 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 4598 m, (uintmax_t)m->phys_addr, 4599 (uintmax_t)tpte)); 4600 4601 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4602 m < &vm_page_array[vm_page_array_size], 4603 ("pmap_remove_pages: bad tpte %#jx", 4604 (uintmax_t)tpte)); 4605 4606 pte_clear(pte); 4607 4608 /* 4609 * Update the vm_page_t clean/reference bits. 
4610 */ 4611 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 4612 if ((tpte & PG_PS) != 0) { 4613 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4614 vm_page_dirty(mt); 4615 } else 4616 vm_page_dirty(m); 4617 } 4618 4619 /* Mark free */ 4620 PV_STAT(pv_entry_frees++); 4621 PV_STAT(pv_entry_spare++); 4622 pv_entry_count--; 4623 pc->pc_map[field] |= bitmask; 4624 if ((tpte & PG_PS) != 0) { 4625 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 4626 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 4627 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4628 if (TAILQ_EMPTY(&pvh->pv_list)) { 4629 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4630 if (TAILQ_EMPTY(&mt->md.pv_list)) 4631 vm_page_aflag_clear(mt, PGA_WRITEABLE); 4632 } 4633 mpte = pmap_remove_pt_page(pmap, pv->pv_va); 4634 if (mpte != NULL) { 4635 pmap->pm_stats.resident_count--; 4636 KASSERT(mpte->wire_count == NPTEPG, 4637 ("pmap_remove_pages: pte page wire count error")); 4638 mpte->wire_count = 0; 4639 pmap_add_delayed_free_list(mpte, &free, FALSE); 4640 } 4641 } else { 4642 pmap->pm_stats.resident_count--; 4643 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4644 if (TAILQ_EMPTY(&m->md.pv_list) && 4645 (m->flags & PG_FICTITIOUS) == 0) { 4646 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4647 if (TAILQ_EMPTY(&pvh->pv_list)) 4648 vm_page_aflag_clear(m, PGA_WRITEABLE); 4649 } 4650 pmap_unuse_pt(pmap, pv->pv_va, &free); 4651 } 4652 } 4653 } 4654 if (allfree) { 4655 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4656 free_pv_chunk(pc); 4657 } 4658 } 4659 sched_unpin(); 4660 pmap_invalidate_all(pmap); 4661 rw_wunlock(&pvh_global_lock); 4662 PMAP_UNLOCK(pmap); 4663 pmap_free_zero_pages(&free); 4664} 4665 4666/* 4667 * pmap_is_modified: 4668 * 4669 * Return whether or not the specified physical page was modified 4670 * in any physical maps. 4671 */ 4672boolean_t 4673pmap_is_modified(vm_page_t m) 4674{ 4675 boolean_t rv; 4676 4677 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4678 ("pmap_is_modified: page %p is not managed", m)); 4679 4680 /* 4681 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 4682 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE 4683 * is clear, no PTEs can have PG_M set. 4684 */ 4685 VM_OBJECT_ASSERT_WLOCKED(m->object); 4686 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 4687 return (FALSE); 4688 rw_wlock(&pvh_global_lock); 4689 rv = pmap_is_modified_pvh(&m->md) || 4690 ((m->flags & PG_FICTITIOUS) == 0 && 4691 pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 4692 rw_wunlock(&pvh_global_lock); 4693 return (rv); 4694} 4695 4696/* 4697 * Returns TRUE if any of the given mappings were used to modify 4698 * physical memory. Otherwise, returns FALSE. Both page and 2mpage 4699 * mappings are supported. 4700 */ 4701static boolean_t 4702pmap_is_modified_pvh(struct md_page *pvh) 4703{ 4704 pv_entry_t pv; 4705 pt_entry_t *pte; 4706 pmap_t pmap; 4707 boolean_t rv; 4708 4709 rw_assert(&pvh_global_lock, RA_WLOCKED); 4710 rv = FALSE; 4711 sched_pin(); 4712 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4713 pmap = PV_PMAP(pv); 4714 PMAP_LOCK(pmap); 4715 pte = pmap_pte_quick(pmap, pv->pv_va); 4716 rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); 4717 PMAP_UNLOCK(pmap); 4718 if (rv) 4719 break; 4720 } 4721 sched_unpin(); 4722 return (rv); 4723} 4724 4725/* 4726 * pmap_is_prefaultable: 4727 * 4728 * Return whether or not the specified virtual address is eligible 4729 * for prefault.
4730 */ 4731boolean_t 4732pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 4733{ 4734 pd_entry_t *pde; 4735 pt_entry_t *pte; 4736 boolean_t rv; 4737 4738 rv = FALSE; 4739 PMAP_LOCK(pmap); 4740 pde = pmap_pde(pmap, addr); 4741 if (*pde != 0 && (*pde & PG_PS) == 0) { 4742 pte = vtopte(addr); 4743 rv = *pte == 0; 4744 } 4745 PMAP_UNLOCK(pmap); 4746 return (rv); 4747} 4748 4749/* 4750 * pmap_is_referenced: 4751 * 4752 * Return whether or not the specified physical page was referenced 4753 * in any physical maps. 4754 */ 4755boolean_t 4756pmap_is_referenced(vm_page_t m) 4757{ 4758 boolean_t rv; 4759 4760 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4761 ("pmap_is_referenced: page %p is not managed", m)); 4762 rw_wlock(&pvh_global_lock); 4763 rv = pmap_is_referenced_pvh(&m->md) || 4764 ((m->flags & PG_FICTITIOUS) == 0 && 4765 pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 4766 rw_wunlock(&pvh_global_lock); 4767 return (rv); 4768} 4769 4770/* 4771 * Returns TRUE if any of the given mappings were referenced and FALSE 4772 * otherwise. Both page and 4mpage mappings are supported. 4773 */ 4774static boolean_t 4775pmap_is_referenced_pvh(struct md_page *pvh) 4776{ 4777 pv_entry_t pv; 4778 pt_entry_t *pte; 4779 pmap_t pmap; 4780 boolean_t rv; 4781 4782 rw_assert(&pvh_global_lock, RA_WLOCKED); 4783 rv = FALSE; 4784 sched_pin(); 4785 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { 4786 pmap = PV_PMAP(pv); 4787 PMAP_LOCK(pmap); 4788 pte = pmap_pte_quick(pmap, pv->pv_va); 4789 rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); 4790 PMAP_UNLOCK(pmap); 4791 if (rv) 4792 break; 4793 } 4794 sched_unpin(); 4795 return (rv); 4796} 4797 4798/* 4799 * Clear the write and modified bits in each of the given page's mappings. 4800 */ 4801void 4802pmap_remove_write(vm_page_t m) 4803{ 4804 struct md_page *pvh; 4805 pv_entry_t next_pv, pv; 4806 pmap_t pmap; 4807 pd_entry_t *pde; 4808 pt_entry_t oldpte, *pte; 4809 vm_offset_t va; 4810 4811 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4812 ("pmap_remove_write: page %p is not managed", m)); 4813 4814 /* 4815 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be 4816 * set by another thread while the object is locked. Thus, 4817 * if PGA_WRITEABLE is clear, no page table entries need updating. 4818 */ 4819 VM_OBJECT_ASSERT_WLOCKED(m->object); 4820 if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) 4821 return; 4822 rw_wlock(&pvh_global_lock); 4823 sched_pin(); 4824 if ((m->flags & PG_FICTITIOUS) != 0) 4825 goto small_mappings; 4826 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4827 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 4828 va = pv->pv_va; 4829 pmap = PV_PMAP(pv); 4830 PMAP_LOCK(pmap); 4831 pde = pmap_pde(pmap, va); 4832 if ((*pde & PG_RW) != 0) 4833 (void)pmap_demote_pde(pmap, pde, va); 4834 PMAP_UNLOCK(pmap); 4835 } 4836small_mappings: 4837 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 4838 pmap = PV_PMAP(pv); 4839 PMAP_LOCK(pmap); 4840 pde = pmap_pde(pmap, pv->pv_va); 4841 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" 4842 " a 4mpage in page %p's pv list", m)); 4843 pte = pmap_pte_quick(pmap, pv->pv_va); 4844retry: 4845 oldpte = *pte; 4846 if ((oldpte & PG_RW) != 0) { 4847 /* 4848 * Regardless of whether a pte is 32 or 64 bits 4849 * in size, PG_RW and PG_M are among the least 4850 * significant 32 bits. 
4851 */ 4852 if (!atomic_cmpset_int((u_int *)pte, oldpte, 4853 oldpte & ~(PG_RW | PG_M))) 4854 goto retry; 4855 if ((oldpte & PG_M) != 0) 4856 vm_page_dirty(m); 4857 pmap_invalidate_page(pmap, pv->pv_va); 4858 } 4859 PMAP_UNLOCK(pmap); 4860 } 4861 vm_page_aflag_clear(m, PGA_WRITEABLE); 4862 sched_unpin(); 4863 rw_wunlock(&pvh_global_lock); 4864} 4865 4866/* 4867 * pmap_ts_referenced: 4868 * 4869 * Return a count of reference bits for a page, clearing those bits. 4870 * It is not necessary for every reference bit to be cleared, but it 4871 * is necessary that 0 only be returned when there are truly no 4872 * reference bits set. 4873 * 4874 * As an optimization, update the page's dirty field if a modified bit is 4875 * found while counting reference bits. This opportunistic update can be 4876 * performed at low cost and can eliminate the need for some future calls 4877 * to pmap_is_modified(). However, since this function stops after 4878 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some 4879 * dirty pages. Those dirty pages will only be detected by a future call 4880 * to pmap_is_modified(). 4881 */ 4882int 4883pmap_ts_referenced(vm_page_t m) 4884{ 4885 struct md_page *pvh; 4886 pv_entry_t pv, pvf; 4887 pmap_t pmap; 4888 pd_entry_t *pde; 4889 pt_entry_t *pte; 4890 vm_paddr_t pa; 4891 int rtval = 0; 4892 4893 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4894 ("pmap_ts_referenced: page %p is not managed", m)); 4895 pa = VM_PAGE_TO_PHYS(m); 4896 pvh = pa_to_pvh(pa); 4897 rw_wlock(&pvh_global_lock); 4898 sched_pin(); 4899 if ((m->flags & PG_FICTITIOUS) != 0 || 4900 (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) 4901 goto small_mappings; 4902 pv = pvf; 4903 do { 4904 pmap = PV_PMAP(pv); 4905 PMAP_LOCK(pmap); 4906 pde = pmap_pde(pmap, pv->pv_va); 4907 if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 4908 /* 4909 * Although "*pde" is mapping a 2/4MB page, because 4910 * this function is called at a 4KB page granularity, 4911 * we only update the 4KB page under test. 4912 */ 4913 vm_page_dirty(m); 4914 } 4915 if ((*pde & PG_A) != 0) { 4916 /* 4917 * Since this reference bit is shared by either 1024 4918 * or 512 4KB pages, it should not be cleared every 4919 * time it is tested. Apply a simple "hash" function 4920 * on the physical page number, the virtual superpage 4921 * number, and the pmap address to select one 4KB page 4922 * out of the 1024 or 512 on which testing the 4923 * reference bit will result in clearing that bit. 4924 * This function is designed to avoid the selection of 4925 * the same 4KB page for every 2- or 4MB page mapping. 4926 * 4927 * On demotion, a mapping that hasn't been referenced 4928 * is simply destroyed. To avoid the possibility of a 4929 * subsequent page fault on a demoted wired mapping, 4930 * always leave its reference bit set. Moreover, 4931 * since the superpage is wired, the current state of 4932 * its reference bit won't affect page replacement. 4933 */ 4934 if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ 4935 (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && 4936 (*pde & PG_W) == 0) { 4937 atomic_clear_int((u_int *)pde, PG_A); 4938 pmap_invalidate_page(pmap, pv->pv_va); 4939 } 4940 rtval++; 4941 } 4942 PMAP_UNLOCK(pmap); 4943 /* Rotate the PV list if it has more than one entry. 
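 *
 * As a worked example of the hash above (non-PAE, NPTEPG == 1024): PG_A
 * is cleared only when
 *
 *	((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) % 1024 == 0
 *
 * and, for a fixed superpage mapping, pa >> PAGE_SHIFT runs through 1024
 * consecutive values as the tested 4KB page varies, so exactly one page
 * per superpage mapping has its reference bit cleared by a full scan.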
*/ 4944 if (TAILQ_NEXT(pv, pv_next) != NULL) { 4945 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); 4946 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); 4947 } 4948 if (rtval >= PMAP_TS_REFERENCED_MAX) 4949 goto out; 4950 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); 4951small_mappings: 4952 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) 4953 goto out; 4954 pv = pvf; 4955 do { 4956 pmap = PV_PMAP(pv); 4957 PMAP_LOCK(pmap); 4958 pde = pmap_pde(pmap, pv->pv_va); 4959 KASSERT((*pde & PG_PS) == 0, 4960 ("pmap_ts_referenced: found a 4mpage in page %p's pv list", 4961 m)); 4962 pte = pmap_pte_quick(pmap, pv->pv_va); 4963 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 4964 vm_page_dirty(m); 4965 if ((*pte & PG_A) != 0) { 4966 atomic_clear_int((u_int *)pte, PG_A); 4967 pmap_invalidate_page(pmap, pv->pv_va); 4968 rtval++; 4969 } 4970 PMAP_UNLOCK(pmap); 4971 /* Rotate the PV list if it has more than one entry. */ 4972 if (TAILQ_NEXT(pv, pv_next) != NULL) { 4973 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); 4974 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); 4975 } 4976 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < 4977 PMAP_TS_REFERENCED_MAX); 4978out: 4979 sched_unpin(); 4980 rw_wunlock(&pvh_global_lock); 4981 return (rtval); 4982} 4983 4984/* 4985 * Apply the given advice to the specified range of addresses within the 4986 * given pmap. Depending on the advice, clear the referenced and/or 4987 * modified flags in each mapping and set the mapped page's dirty field. 4988 */ 4989void 4990pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) 4991{ 4992 pd_entry_t oldpde, *pde; 4993 pt_entry_t *pte; 4994 vm_offset_t va, pdnxt; 4995 vm_page_t m; 4996 boolean_t anychanged, pv_lists_locked; 4997 4998 if (advice != MADV_DONTNEED && advice != MADV_FREE) 4999 return; 5000 if (pmap_is_current(pmap)) 5001 pv_lists_locked = FALSE; 5002 else { 5003 pv_lists_locked = TRUE; 5004resume: 5005 rw_wlock(&pvh_global_lock); 5006 sched_pin(); 5007 } 5008 anychanged = FALSE; 5009 PMAP_LOCK(pmap); 5010 for (; sva < eva; sva = pdnxt) { 5011 pdnxt = (sva + NBPDR) & ~PDRMASK; 5012 if (pdnxt < sva) 5013 pdnxt = eva; 5014 pde = pmap_pde(pmap, sva); 5015 oldpde = *pde; 5016 if ((oldpde & PG_V) == 0) 5017 continue; 5018 else if ((oldpde & PG_PS) != 0) { 5019 if ((oldpde & PG_MANAGED) == 0) 5020 continue; 5021 if (!pv_lists_locked) { 5022 pv_lists_locked = TRUE; 5023 if (!rw_try_wlock(&pvh_global_lock)) { 5024 if (anychanged) 5025 pmap_invalidate_all(pmap); 5026 PMAP_UNLOCK(pmap); 5027 goto resume; 5028 } 5029 sched_pin(); 5030 } 5031 if (!pmap_demote_pde(pmap, pde, sva)) { 5032 /* 5033 * The large page mapping was destroyed. 5034 */ 5035 continue; 5036 } 5037 5038 /* 5039 * Unless the page mappings are wired, remove the 5040 * mapping to a single page so that a subsequent 5041 * access may repromote. Since the underlying page 5042 * table page is fully populated, this removal never 5043 * frees a page table page. 
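 *
 * Only the two madvise(2) advice values that act on existing mappings
 * reach this function; loosely, the effect per resident page is
 *
 *	madvise(addr, len, MADV_DONTNEED);  // clear PG_A/PG_M, dirty the page
 *	madvise(addr, len, MADV_FREE);      // clear PG_A/PG_M only
 *
 * (the calls shown are the user-level requests, not code from this
 * file); any other advice is rejected by the check at the top of the
 * function.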
5044 */ 5045 if ((oldpde & PG_W) == 0) { 5046 pte = pmap_pte_quick(pmap, sva); 5047 KASSERT((*pte & PG_V) != 0, 5048 ("pmap_advise: invalid PTE")); 5049 pmap_remove_pte(pmap, pte, sva, NULL); 5050 anychanged = TRUE; 5051 } 5052 } 5053 if (pdnxt > eva) 5054 pdnxt = eva; 5055 va = pdnxt; 5056 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 5057 sva += PAGE_SIZE) { 5058 if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) 5059 goto maybe_invlrng; 5060 else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5061 if (advice == MADV_DONTNEED) { 5062 /* 5063 * Future calls to pmap_is_modified() 5064 * can be avoided by making the page 5065 * dirty now. 5066 */ 5067 m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); 5068 vm_page_dirty(m); 5069 } 5070 atomic_clear_int((u_int *)pte, PG_M | PG_A); 5071 } else if ((*pte & PG_A) != 0) 5072 atomic_clear_int((u_int *)pte, PG_A); 5073 else 5074 goto maybe_invlrng; 5075 if ((*pte & PG_G) != 0) { 5076 if (va == pdnxt) 5077 va = sva; 5078 } else 5079 anychanged = TRUE; 5080 continue; 5081maybe_invlrng: 5082 if (va != pdnxt) { 5083 pmap_invalidate_range(pmap, va, sva); 5084 va = pdnxt; 5085 } 5086 } 5087 if (va != pdnxt) 5088 pmap_invalidate_range(pmap, va, sva); 5089 } 5090 if (anychanged) 5091 pmap_invalidate_all(pmap); 5092 if (pv_lists_locked) { 5093 sched_unpin(); 5094 rw_wunlock(&pvh_global_lock); 5095 } 5096 PMAP_UNLOCK(pmap); 5097} 5098 5099/* 5100 * Clear the modify bits on the specified physical page. 5101 */ 5102void 5103pmap_clear_modify(vm_page_t m) 5104{ 5105 struct md_page *pvh; 5106 pv_entry_t next_pv, pv; 5107 pmap_t pmap; 5108 pd_entry_t oldpde, *pde; 5109 pt_entry_t oldpte, *pte; 5110 vm_offset_t va; 5111 5112 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 5113 ("pmap_clear_modify: page %p is not managed", m)); 5114 VM_OBJECT_ASSERT_WLOCKED(m->object); 5115 KASSERT(!vm_page_xbusied(m), 5116 ("pmap_clear_modify: page %p is exclusive busied", m)); 5117 5118 /* 5119 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 5120 * If the object containing the page is locked and the page is not 5121 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. 5122 */ 5123 if ((m->aflags & PGA_WRITEABLE) == 0) 5124 return; 5125 rw_wlock(&pvh_global_lock); 5126 sched_pin(); 5127 if ((m->flags & PG_FICTITIOUS) != 0) 5128 goto small_mappings; 5129 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 5130 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { 5131 va = pv->pv_va; 5132 pmap = PV_PMAP(pv); 5133 PMAP_LOCK(pmap); 5134 pde = pmap_pde(pmap, va); 5135 oldpde = *pde; 5136 if ((oldpde & PG_RW) != 0) { 5137 if (pmap_demote_pde(pmap, pde, va)) { 5138 if ((oldpde & PG_W) == 0) { 5139 /* 5140 * Write protect the mapping to a 5141 * single page so that a subsequent 5142 * write access may repromote. 5143 */ 5144 va += VM_PAGE_TO_PHYS(m) - (oldpde & 5145 PG_PS_FRAME); 5146 pte = pmap_pte_quick(pmap, va); 5147 oldpte = *pte; 5148 if ((oldpte & PG_V) != 0) { 5149 /* 5150 * Regardless of whether a pte is 32 or 64 bits 5151 * in size, PG_RW and PG_M are among the least 5152 * significant 32 bits. 
5153 */ 5154 while (!atomic_cmpset_int((u_int *)pte, 5155 oldpte, 5156 oldpte & ~(PG_M | PG_RW))) 5157 oldpte = *pte; 5158 vm_page_dirty(m); 5159 pmap_invalidate_page(pmap, va); 5160 } 5161 } 5162 } 5163 } 5164 PMAP_UNLOCK(pmap); 5165 } 5166small_mappings: 5167 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { 5168 pmap = PV_PMAP(pv); 5169 PMAP_LOCK(pmap); 5170 pde = pmap_pde(pmap, pv->pv_va); 5171 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 5172 " a 4mpage in page %p's pv list", m)); 5173 pte = pmap_pte_quick(pmap, pv->pv_va); 5174 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 5175 /* 5176 * Regardless of whether a pte is 32 or 64 bits 5177 * in size, PG_M is among the least significant 5178 * 32 bits. 5179 */ 5180 atomic_clear_int((u_int *)pte, PG_M); 5181 pmap_invalidate_page(pmap, pv->pv_va); 5182 } 5183 PMAP_UNLOCK(pmap); 5184 } 5185 sched_unpin(); 5186 rw_wunlock(&pvh_global_lock); 5187} 5188 5189/* 5190 * Miscellaneous support routines follow 5191 */ 5192 5193/* Adjust the cache mode for a 4KB page mapped via a PTE. */ 5194static __inline void 5195pmap_pte_attr(pt_entry_t *pte, int cache_bits) 5196{ 5197 u_int opte, npte; 5198 5199 /* 5200 * The cache mode bits are all in the low 32-bits of the 5201 * PTE, so we can just spin on updating the low 32-bits. 5202 */ 5203 do { 5204 opte = *(u_int *)pte; 5205 npte = opte & ~PG_PTE_CACHE; 5206 npte |= cache_bits; 5207 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); 5208} 5209 5210/* Adjust the cache mode for a 2/4MB page mapped via a PDE. */ 5211static __inline void 5212pmap_pde_attr(pd_entry_t *pde, int cache_bits) 5213{ 5214 u_int opde, npde; 5215 5216 /* 5217 * The cache mode bits are all in the low 32-bits of the 5218 * PDE, so we can just spin on updating the low 32-bits. 5219 */ 5220 do { 5221 opde = *(u_int *)pde; 5222 npde = opde & ~PG_PDE_CACHE; 5223 npde |= cache_bits; 5224 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); 5225} 5226 5227/* 5228 * Map a set of physical memory pages into the kernel virtual 5229 * address space. Return a pointer to where it is mapped. This 5230 * routine is intended to be used for mapping device memory, 5231 * NOT real memory. 5232 */ 5233static void * 5234pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, int flags) 5235{ 5236 struct pmap_preinit_mapping *ppim; 5237 vm_offset_t va, offset; 5238 vm_page_t m; 5239 vm_size_t tmpsize; 5240 int i; 5241 5242 offset = pa & PAGE_MASK; 5243 size = round_page(offset + size); 5244 pa = pa & PG_FRAME; 5245 5246 if (pa < KERNLOAD && pa + size <= KERNLOAD) { 5247 va = KERNBASE + pa; 5248 if ((flags & MAPDEV_SETATTR) == 0) 5249 return ((void *)(va + offset)); 5250 } else if (!pmap_initialized) { 5251 va = 0; 5252 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5253 ppim = pmap_preinit_mapping + i; 5254 if (ppim->va == 0) { 5255 ppim->pa = pa; 5256 ppim->sz = size; 5257 ppim->mode = mode; 5258 ppim->va = virtual_avail; 5259 virtual_avail += size; 5260 va = ppim->va; 5261 break; 5262 } 5263 } 5264 if (va == 0) 5265 panic("%s: too many preinit mappings", __func__); 5266 } else { 5267 /* 5268 * If we have a preinit mapping, re-use it. 
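 *
 * The page rounding at the top of this function lets callers pass
 * byte-granular requests.  For example, asking for 0x200 bytes of
 * registers at physical address 0xfede0104 works out, roughly, as
 *
 *	offset = 0x104,  size = round_page(0x104 + 0x200) = PAGE_SIZE,
 *	pa     = 0xfede0000,
 *
 * so a single 4KB page is mapped and the returned pointer is va + 0x104,
 * pointing at the first requested byte.  (The address is made up for the
 * example.)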
5269 */ 5270 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5271 ppim = pmap_preinit_mapping + i; 5272 if (ppim->pa == pa && ppim->sz == size && 5273 (ppim->mode == mode || 5274 (flags & MAPDEV_SETATTR) == 0)) 5275 return ((void *)(ppim->va + offset)); 5276 } 5277 va = kva_alloc(size); 5278 if (va == 0) 5279 panic("%s: Couldn't allocate KVA", __func__); 5280 } 5281 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) { 5282 if ((flags & MAPDEV_SETATTR) == 0 && pmap_initialized) { 5283 m = PHYS_TO_VM_PAGE(pa); 5284 if (m != NULL && VM_PAGE_TO_PHYS(m) == pa) { 5285 pmap_kenter_attr(va + tmpsize, pa + tmpsize, 5286 m->md.pat_mode); 5287 continue; 5288 } 5289 } 5290 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 5291 } 5292 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 5293 pmap_invalidate_cache_range(va, va + size, FALSE); 5294 return ((void *)(va + offset)); 5295} 5296 5297void * 5298pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 5299{ 5300 5301 return (pmap_mapdev_internal(pa, size, mode, MAPDEV_SETATTR)); 5302} 5303 5304void * 5305pmap_mapdev(vm_paddr_t pa, vm_size_t size) 5306{ 5307 5308 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 5309} 5310 5311void * 5312pmap_mapbios(vm_paddr_t pa, vm_size_t size) 5313{ 5314 5315 return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK, 0)); 5316} 5317 5318void 5319pmap_unmapdev(vm_offset_t va, vm_size_t size) 5320{ 5321 struct pmap_preinit_mapping *ppim; 5322 vm_offset_t offset; 5323 int i; 5324 5325 if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) 5326 return; 5327 offset = va & PAGE_MASK; 5328 size = round_page(offset + size); 5329 va = trunc_page(va); 5330 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { 5331 ppim = pmap_preinit_mapping + i; 5332 if (ppim->va == va && ppim->sz == size) { 5333 if (pmap_initialized) 5334 return; 5335 ppim->pa = 0; 5336 ppim->va = 0; 5337 ppim->sz = 0; 5338 ppim->mode = 0; 5339 if (va + size == virtual_avail) 5340 virtual_avail = va; 5341 return; 5342 } 5343 } 5344 if (pmap_initialized) 5345 kva_free(va, size); 5346} 5347 5348/* 5349 * Sets the memory attribute for the specified page. 5350 */ 5351void 5352pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5353{ 5354 5355 m->md.pat_mode = ma; 5356 if ((m->flags & PG_FICTITIOUS) != 0) 5357 return; 5358 5359 /* 5360 * If "m" is a normal page, flush it from the cache. 5361 * See pmap_invalidate_cache_range(). 5362 * 5363 * First, try to find an existing mapping of the page by sf 5364 * buffer. sf_buf_invalidate_cache() modifies mapping and 5365 * flushes the cache. 5366 */ 5367 if (sf_buf_invalidate_cache(m)) 5368 return; 5369 5370 /* 5371 * If page is not mapped by sf buffer, but CPU does not 5372 * support self snoop, map the page transient and do 5373 * invalidation. In the worst case, whole cache is flushed by 5374 * pmap_invalidate_cache_range(). 
5375 */ 5376 if ((cpu_feature & CPUID_SS) == 0) 5377 pmap_flush_page(m); 5378} 5379 5380static void 5381pmap_flush_page(vm_page_t m) 5382{ 5383 pt_entry_t *cmap_pte2; 5384 struct pcpu *pc; 5385 vm_offset_t sva, eva; 5386 bool useclflushopt; 5387 5388 useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; 5389 if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) { 5390 sched_pin(); 5391 pc = get_pcpu(); 5392 cmap_pte2 = pc->pc_cmap_pte2; 5393 mtx_lock(&pc->pc_cmap_lock); 5394 if (*cmap_pte2) 5395 panic("pmap_flush_page: CMAP2 busy"); 5396 *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | 5397 PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); 5398 invlcaddr(pc->pc_cmap_addr2); 5399 sva = (vm_offset_t)pc->pc_cmap_addr2; 5400 eva = sva + PAGE_SIZE; 5401 5402 /* 5403 * Use mfence or sfence despite the ordering implied by 5404 * mtx_{un,}lock() because clflush on non-Intel CPUs 5405 * and clflushopt are not guaranteed to be ordered by 5406 * any other instruction. 5407 */ 5408 if (useclflushopt) 5409 sfence(); 5410 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 5411 mfence(); 5412 for (; sva < eva; sva += cpu_clflush_line_size) { 5413 if (useclflushopt) 5414 clflushopt(sva); 5415 else 5416 clflush(sva); 5417 } 5418 if (useclflushopt) 5419 sfence(); 5420 else if (cpu_vendor_id != CPU_VENDOR_INTEL) 5421 mfence(); 5422 *cmap_pte2 = 0; 5423 sched_unpin(); 5424 mtx_unlock(&pc->pc_cmap_lock); 5425 } else 5426 pmap_invalidate_cache(); 5427} 5428 5429/* 5430 * Changes the specified virtual address range's memory type to that given by 5431 * the parameter "mode". The specified virtual address range must be 5432 * completely contained within the kernel map. 5433 * 5434 * Returns zero if the change completed successfully, and either EINVAL or 5435 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 5436 * of the virtual address range was not mapped, and ENOMEM is returned if 5437 * there was insufficient memory available to complete the change. 5438 */ 5439int 5440pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 5441{ 5442 vm_offset_t base, offset, tmpva; 5443 pd_entry_t *pde; 5444 pt_entry_t *pte; 5445 int cache_bits_pte, cache_bits_pde; 5446 boolean_t changed; 5447 5448 base = trunc_page(va); 5449 offset = va & PAGE_MASK; 5450 size = round_page(offset + size); 5451 5452 /* 5453 * Only supported on kernel virtual addresses above the recursive map. 5454 */ 5455 if (base < VM_MIN_KERNEL_ADDRESS) 5456 return (EINVAL); 5457 5458 cache_bits_pde = pmap_cache_bits(mode, 1); 5459 cache_bits_pte = pmap_cache_bits(mode, 0); 5460 changed = FALSE; 5461 5462 /* 5463 * Pages that aren't mapped aren't supported. Also break down 5464 * 2/4MB pages into 4KB pages if required. 5465 */ 5466 PMAP_LOCK(kernel_pmap); 5467 for (tmpva = base; tmpva < base + size; ) { 5468 pde = pmap_pde(kernel_pmap, tmpva); 5469 if (*pde == 0) { 5470 PMAP_UNLOCK(kernel_pmap); 5471 return (EINVAL); 5472 } 5473 if (*pde & PG_PS) { 5474 /* 5475 * If the current 2/4MB page already has 5476 * the required memory type, then we need not 5477 * demote this page. Just increment tmpva to 5478 * the next 2/4MB page frame. 5479 */ 5480 if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { 5481 tmpva = trunc_4mpage(tmpva) + NBPDR; 5482 continue; 5483 } 5484 5485 /* 5486 * If the current offset aligns with a 2/4MB 5487 * page frame and there is at least 2/4MB left 5488 * within the range, then we need not break 5489 * down this page into 4KB pages.
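 *
 * As a worked example (non-PAE, NBPDR == 0x400000, PDRMASK == 0x3fffff):
 * for a request covering exactly [0xc0800000, 0xc0c00000) and
 * tmpva == 0xc0800000,
 *
 *	(tmpva & PDRMASK) == 0   and   tmpva + PDRMASK < base + size,
 *
 * so an existing superpage there keeps its PG_PS mapping and only has
 * its cache bits rewritten in the second pass; demotion is needed only
 * when the request covers part of a superpage.  (The addresses are made
 * up for the example.)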
5490 */ 5491 if ((tmpva & PDRMASK) == 0 && 5492 tmpva + PDRMASK < base + size) { 5493 tmpva += NBPDR; 5494 continue; 5495 } 5496 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) { 5497 PMAP_UNLOCK(kernel_pmap); 5498 return (ENOMEM); 5499 } 5500 } 5501 pte = vtopte(tmpva); 5502 if (*pte == 0) { 5503 PMAP_UNLOCK(kernel_pmap); 5504 return (EINVAL); 5505 } 5506 tmpva += PAGE_SIZE; 5507 } 5508 PMAP_UNLOCK(kernel_pmap); 5509 5510 /* 5511 * Ok, all the pages exist, so run through them updating their 5512 * cache mode if required. 5513 */ 5514 for (tmpva = base; tmpva < base + size; ) { 5515 pde = pmap_pde(kernel_pmap, tmpva); 5516 if (*pde & PG_PS) { 5517 if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { 5518 pmap_pde_attr(pde, cache_bits_pde); 5519 changed = TRUE; 5520 } 5521 tmpva = trunc_4mpage(tmpva) + NBPDR; 5522 } else { 5523 pte = vtopte(tmpva); 5524 if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { 5525 pmap_pte_attr(pte, cache_bits_pte); 5526 changed = TRUE; 5527 } 5528 tmpva += PAGE_SIZE; 5529 } 5530 } 5531 5532 /* 5533 * Flush CPU caches to make sure any data isn't cached that 5534 * shouldn't be, etc. 5535 */ 5536 if (changed) { 5537 pmap_invalidate_range(kernel_pmap, base, tmpva); 5538 pmap_invalidate_cache_range(base, tmpva, FALSE); 5539 } 5540 return (0); 5541} 5542 5543/* 5544 * perform the pmap work for mincore 5545 */ 5546int 5547pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 5548{ 5549 pd_entry_t *pdep; 5550 pt_entry_t *ptep, pte; 5551 vm_paddr_t pa; 5552 int val; 5553 5554 PMAP_LOCK(pmap); 5555retry: 5556 pdep = pmap_pde(pmap, addr); 5557 if (*pdep != 0) { 5558 if (*pdep & PG_PS) { 5559 pte = *pdep; 5560 /* Compute the physical address of the 4KB page. */ 5561 pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & 5562 PG_FRAME; 5563 val = MINCORE_SUPER; 5564 } else { 5565 ptep = pmap_pte(pmap, addr); 5566 pte = *ptep; 5567 pmap_pte_release(ptep); 5568 pa = pte & PG_FRAME; 5569 val = 0; 5570 } 5571 } else { 5572 pte = 0; 5573 pa = 0; 5574 val = 0; 5575 } 5576 if ((pte & PG_V) != 0) { 5577 val |= MINCORE_INCORE; 5578 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5579 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 5580 if ((pte & PG_A) != 0) 5581 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 5582 } 5583 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 5584 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 5585 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 5586 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. 
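 *
 * As a worked example of the 4MB case above: if *pdep maps physical
 * 0x08000000 and addr == 0xbfc01234 (both made up for the example), then
 *
 *	pa = (0x08000000 | (addr & PDRMASK)) & PG_FRAME = 0x08001000,
 *
 * i.e. the 4KB physical page backing that particular offset within the
 * superpage, and that is the page the MINCORE_* flags are reported for.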
*/ 5587 if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) 5588 goto retry; 5589 } else 5590 PA_UNLOCK_COND(*locked_pa); 5591 PMAP_UNLOCK(pmap); 5592 return (val); 5593} 5594 5595void 5596pmap_activate(struct thread *td) 5597{ 5598 pmap_t pmap, oldpmap; 5599 u_int cpuid; 5600 u_int32_t cr3; 5601 5602 critical_enter(); 5603 pmap = vmspace_pmap(td->td_proc->p_vmspace); 5604 oldpmap = PCPU_GET(curpmap); 5605 cpuid = PCPU_GET(cpuid); 5606#if defined(SMP) 5607 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 5608 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 5609#else 5610 CPU_CLR(cpuid, &oldpmap->pm_active); 5611 CPU_SET(cpuid, &pmap->pm_active); 5612#endif 5613#if defined(PAE) || defined(PAE_TABLES) 5614 cr3 = vtophys(pmap->pm_pdpt); 5615#else 5616 cr3 = vtophys(pmap->pm_pdir); 5617#endif 5618 /* 5619 * pmap_activate is for the current thread on the current cpu 5620 */ 5621 td->td_pcb->pcb_cr3 = cr3; 5622 load_cr3(cr3); 5623 PCPU_SET(curpmap, pmap); 5624 critical_exit(); 5625} 5626 5627void 5628pmap_activate_boot(pmap_t pmap) 5629{ 5630 u_int cpuid; 5631 5632 cpuid = PCPU_GET(cpuid); 5633#if defined(SMP) 5634 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 5635#else 5636 CPU_SET(cpuid, &pmap->pm_active); 5637#endif 5638 PCPU_SET(curpmap, pmap); 5639} 5640 5641void 5642pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 5643{ 5644} 5645 5646/* 5647 * Increase the starting virtual address of the given mapping if a 5648 * different alignment might result in more superpage mappings. 5649 */ 5650void 5651pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 5652 vm_offset_t *addr, vm_size_t size) 5653{ 5654 vm_offset_t superpage_offset; 5655 5656 if (size < NBPDR) 5657 return; 5658 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 5659 offset += ptoa(object->pg_color); 5660 superpage_offset = offset & PDRMASK; 5661 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 5662 (*addr & PDRMASK) == superpage_offset) 5663 return; 5664 if ((*addr & PDRMASK) < superpage_offset) 5665 *addr = (*addr & ~PDRMASK) + superpage_offset; 5666 else 5667 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 5668} 5669 5670vm_offset_t 5671pmap_quick_enter_page(vm_page_t m) 5672{ 5673 vm_offset_t qaddr; 5674 pt_entry_t *pte; 5675 5676 critical_enter(); 5677 qaddr = PCPU_GET(qmap_addr); 5678 pte = vtopte(qaddr); 5679 5680 KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy")); 5681 *pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 5682 pmap_cache_bits(pmap_page_get_memattr(m), 0); 5683 invlpg(qaddr); 5684 5685 return (qaddr); 5686} 5687 5688void 5689pmap_quick_remove_page(vm_offset_t addr) 5690{ 5691 vm_offset_t qaddr; 5692 pt_entry_t *pte; 5693 5694 qaddr = PCPU_GET(qmap_addr); 5695 pte = vtopte(qaddr); 5696 5697 KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use")); 5698 KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address")); 5699 5700 *pte = 0; 5701 critical_exit(); 5702} 5703 5704#if defined(PMAP_DEBUG) 5705pmap_pid_dump(int pid) 5706{ 5707 pmap_t pmap; 5708 struct proc *p; 5709 int npte = 0; 5710 int index; 5711 5712 sx_slock(&allproc_lock); 5713 FOREACH_PROC_IN_SYSTEM(p) { 5714 if (p->p_pid != pid) 5715 continue; 5716 5717 if (p->p_vmspace) { 5718 int i,j; 5719 index = 0; 5720 pmap = vmspace_pmap(p->p_vmspace); 5721 for (i = 0; i < NPDEPTD; i++) { 5722 pd_entry_t *pde; 5723 pt_entry_t *pte; 5724 vm_offset_t base = i << PDRSHIFT; 5725 5726 pde = &pmap->pm_pdir[i]; 5727 if (pde && pmap_pde_v(pde)) { 5728 for (j = 0; j < NPTEPG; j++) { 5729 vm_offset_t va = base + (j << 
PAGE_SHIFT); 5730 if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { 5731 if (index) { 5732 index = 0; 5733 printf("\n"); 5734 } 5735 sx_sunlock(&allproc_lock); 5736 return (npte); 5737 } 5738 pte = pmap_pte(pmap, va); 5739 if (pte && pmap_pte_v(pte)) { 5740 pt_entry_t pa; 5741 vm_page_t m; 5742 pa = *pte; 5743 m = PHYS_TO_VM_PAGE(pa & PG_FRAME); 5744 printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", 5745 va, pa, m->hold_count, m->wire_count, m->flags); 5746 npte++; 5747 index++; 5748 if (index >= 2) { 5749 index = 0; 5750 printf("\n"); 5751 } else { 5752 printf(" "); 5753 } 5754 } 5755 } 5756 } 5757 } 5758 } 5759 } 5760 sx_sunlock(&allproc_lock); 5761 return (npte); 5762} 5763#endif 5764