pmap.c revision 245577
1/*- 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 9 * All rights reserved. 10 * 11 * This code is derived from software contributed to Berkeley by 12 * the Systems Programming Group of the University of Utah Computer 13 * Science Department and William Jolitz of UUNET Technologies Inc. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 3. All advertising materials mentioning features or use of this software 24 * must display the following acknowledgement: 25 * This product includes software developed by the University of 26 * California, Berkeley and its contributors. 27 * 4. Neither the name of the University nor the names of its contributors 28 * may be used to endorse or promote products derived from this software 29 * without specific prior written permission. 30 * 31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 41 * SUCH DAMAGE. 42 * 43 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 44 */ 45/*- 46 * Copyright (c) 2003 Networks Associates Technology, Inc. 47 * All rights reserved. 48 * 49 * This software was developed for the FreeBSD Project by Jake Burkholder, 50 * Safeport Network Services, and Network Associates Laboratories, the 51 * Security Research Division of Network Associates, Inc. under 52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 53 * CHATS research program. 54 * 55 * Redistribution and use in source and binary forms, with or without 56 * modification, are permitted provided that the following conditions 57 * are met: 58 * 1. Redistributions of source code must retain the above copyright 59 * notice, this list of conditions and the following disclaimer. 60 * 2. Redistributions in binary form must reproduce the above copyright 61 * notice, this list of conditions and the following disclaimer in the 62 * documentation and/or other materials provided with the distribution. 63 * 64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 67 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 74 * SUCH DAMAGE. 75 */ 76 77#include <sys/cdefs.h> 78__FBSDID("$FreeBSD: head/sys/i386/i386/pmap.c 245577 2013-01-17 21:32:25Z jhb $"); 79 80/* 81 * Manages physical address maps. 82 * 83 * Since the information managed by this module is 84 * also stored by the logical address mapping module, 85 * this module may throw away valid virtual-to-physical 86 * mappings at almost any time. However, invalidations 87 * of virtual-to-physical mappings must be done as 88 * requested. 89 * 90 * In order to cope with hardware architectures which 91 * make virtual-to-physical map invalidates expensive, 92 * this module may delay invalidate or reduced protection 93 * operations until such time as they are actually 94 * necessary. This module is given full information as 95 * to which processors are currently using which maps, 96 * and to when physical maps must be made correct. 97 */ 98 99#include "opt_apic.h" 100#include "opt_cpu.h" 101#include "opt_pmap.h" 102#include "opt_smp.h" 103#include "opt_xbox.h" 104 105#include <sys/param.h> 106#include <sys/systm.h> 107#include <sys/kernel.h> 108#include <sys/ktr.h> 109#include <sys/lock.h> 110#include <sys/malloc.h> 111#include <sys/mman.h> 112#include <sys/msgbuf.h> 113#include <sys/mutex.h> 114#include <sys/proc.h> 115#include <sys/rwlock.h> 116#include <sys/sf_buf.h> 117#include <sys/sx.h> 118#include <sys/vmmeter.h> 119#include <sys/sched.h> 120#include <sys/sysctl.h> 121#ifdef SMP 122#include <sys/smp.h> 123#else 124#include <sys/cpuset.h> 125#endif 126 127#include <vm/vm.h> 128#include <vm/vm_param.h> 129#include <vm/vm_kern.h> 130#include <vm/vm_page.h> 131#include <vm/vm_map.h> 132#include <vm/vm_object.h> 133#include <vm/vm_extern.h> 134#include <vm/vm_pageout.h> 135#include <vm/vm_pager.h> 136#include <vm/vm_reserv.h> 137#include <vm/uma.h> 138 139#include <machine/cpu.h> 140#include <machine/cputypes.h> 141#include <machine/md_var.h> 142#include <machine/pcb.h> 143#include <machine/specialreg.h> 144#ifdef SMP 145#include <machine/smp.h> 146#endif 147 148#ifdef XBOX 149#include <machine/xbox.h> 150#endif 151 152#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) 153#define CPU_ENABLE_SSE 154#endif 155 156#ifndef PMAP_SHPGPERPROC 157#define PMAP_SHPGPERPROC 200 158#endif 159 160#if !defined(DIAGNOSTIC) 161#ifdef __GNUC_GNU_INLINE__ 162#define PMAP_INLINE __attribute__((__gnu_inline__)) inline 163#else 164#define PMAP_INLINE extern inline 165#endif 166#else 167#define PMAP_INLINE 168#endif 169 170#ifdef PV_STATS 171#define PV_STAT(x) do { x ; } while (0) 172#else 173#define PV_STAT(x) do { } while (0) 174#endif 175 176#define pa_index(pa) ((pa) >> PDRSHIFT) 177#define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 178 179/* 180 * Get PDEs and PTEs for user/kernel address space 181 */ 182#define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) 183#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) 184 185#define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) 186#define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) 187#define 
pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) 188#define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) 189#define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) 190 191#define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ 192 atomic_clear_int((u_int *)(pte), PG_W)) 193#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) 194 195struct pmap kernel_pmap_store; 196LIST_HEAD(pmaplist, pmap); 197static struct pmaplist allpmaps; 198static struct mtx allpmaps_lock; 199 200vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 201vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 202int pgeflag = 0; /* PG_G or-in */ 203int pseflag = 0; /* PG_PS or-in */ 204 205static int nkpt = NKPT; 206vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR; 207extern u_int32_t KERNend; 208extern u_int32_t KPTphys; 209 210#ifdef PAE 211pt_entry_t pg_nx; 212static uma_zone_t pdptzone; 213#endif 214 215static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); 216 217static int pat_works = 1; 218SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1, 219 "Is page attribute table fully functional?"); 220 221static int pg_ps_enabled = 1; 222SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0, 223 "Are large page mappings enabled?"); 224 225#define PAT_INDEX_SIZE 8 226static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ 227 228static struct rwlock_padalign pvh_global_lock; 229 230/* 231 * Data for the pv entry allocation mechanism 232 */ 233static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); 234static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; 235static struct md_page *pv_table; 236static int shpgperproc = PMAP_SHPGPERPROC; 237 238struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ 239int pv_maxchunks; /* How many chunks we have KVA for */ 240vm_offset_t pv_vafree; /* freelist stored in the PTE */ 241 242/* 243 * All those kernel PT submaps that BSD is so fond of 244 */ 245struct sysmaps { 246 struct mtx lock; 247 pt_entry_t *CMAP1; 248 pt_entry_t *CMAP2; 249 caddr_t CADDR1; 250 caddr_t CADDR2; 251}; 252static struct sysmaps sysmaps_pcpu[MAXCPU]; 253pt_entry_t *CMAP1 = 0; 254static pt_entry_t *CMAP3; 255static pd_entry_t *KPTD; 256caddr_t CADDR1 = 0, ptvmmap = 0; 257static caddr_t CADDR3; 258struct msgbuf *msgbufp = 0; 259 260/* 261 * Crashdump maps. 
 */
static caddr_t crashdumpmap;

static pt_entry_t *PMAP1 = 0, *PMAP2;
static pt_entry_t *PADDR1 = 0, *PADDR2;
#ifdef SMP
static int PMAP1cpu;
static int PMAP1changedcpu;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
    &PMAP1changedcpu, 0,
    "Number of times pmap_pte_quick changed CPU with same PMAP1");
#endif
static int PMAP1changed;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
    &PMAP1changed, 0,
    "Number of times pmap_pte_quick changed PMAP1");
static int PMAP1unchanged;
SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
    &PMAP1unchanged, 0,
    "Number of times pmap_pte_quick didn't change PMAP1");
static struct mtx PMAP2mutex;

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);
static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);

static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
static void pmap_flush_page(vm_page_t m);
static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
    vm_prot_t prot);
static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    vm_page_t *free);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
    vm_page_t *free);
static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
    vm_page_t *free);
static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
    vm_offset_t va);
static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m);
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    pd_entry_t newpde);
static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);

static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);

static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, int flags);
static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free);
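
/*
 * PMAP1/PADDR1 and PMAP2/PADDR2 above are reserved PTE slots and the
 * kernel virtual addresses they map, used to temporarily map another
 * pmap's page table pages: pmap_pte_quick() borrows PMAP1/PADDR1 and
 * requires the calling thread to be pinned, while pmap_pte() borrows
 * PMAP2/PADDR2 under PMAP2mutex.
 */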
333static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); 334static void pmap_pte_release(pt_entry_t *pte); 335static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *); 336#ifdef PAE 337static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait); 338#endif 339static void pmap_set_pg(void); 340 341static __inline void pagezero(void *page); 342 343CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); 344CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); 345 346/* 347 * If you get an error here, then you set KVA_PAGES wrong! See the 348 * description of KVA_PAGES in sys/i386/include/pmap.h. It must be 349 * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE. 350 */ 351CTASSERT(KERNBASE % (1 << 24) == 0); 352 353/* 354 * Bootstrap the system enough to run with virtual memory. 355 * 356 * On the i386 this is called after mapping has already been enabled 357 * and just syncs the pmap module with what has already been done. 358 * [We can't call it easily with mapping off since the kernel is not 359 * mapped with PA == VA, hence we would have to relocate every address 360 * from the linked base (virtual) address "KERNBASE" to the actual 361 * (physical) address starting relative to 0] 362 */ 363void 364pmap_bootstrap(vm_paddr_t firstaddr) 365{ 366 vm_offset_t va; 367 pt_entry_t *pte, *unused; 368 struct sysmaps *sysmaps; 369 int i; 370 371 /* 372 * Initialize the first available kernel virtual address. However, 373 * using "firstaddr" may waste a few pages of the kernel virtual 374 * address space, because locore may not have mapped every physical 375 * page that it allocated. Preferably, locore would provide a first 376 * unused virtual address in addition to "firstaddr". 377 */ 378 virtual_avail = (vm_offset_t) KERNBASE + firstaddr; 379 380 virtual_end = VM_MAX_KERNEL_ADDRESS; 381 382 /* 383 * Initialize the kernel pmap (which is statically allocated). 384 */ 385 PMAP_LOCK_INIT(kernel_pmap); 386 kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); 387#ifdef PAE 388 kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); 389#endif 390 kernel_pmap->pm_root = NULL; 391 CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ 392 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 393 394 /* 395 * Initialize the global pv list lock. 396 */ 397 rw_init(&pvh_global_lock, "pmap pv global"); 398 399 LIST_INIT(&allpmaps); 400 401 /* 402 * Request a spin mutex so that changes to allpmaps cannot be 403 * preempted by smp_rendezvous_cpus(). Otherwise, 404 * pmap_update_pde_kernel() could access allpmaps while it is 405 * being changed. 406 */ 407 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); 408 mtx_lock_spin(&allpmaps_lock); 409 LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); 410 mtx_unlock_spin(&allpmaps_lock); 411 412 /* 413 * Reserve some special page table entries/VA space for temporary 414 * mapping of pages. 415 */ 416#define SYSMAP(c, p, v, n) \ 417 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 418 419 va = virtual_avail; 420 pte = vtopte(va); 421 422 /* 423 * CMAP1/CMAP2 are used for zeroing and copying pages. 424 * CMAP3 is used for the idle process page zeroing. 425 */ 426 for (i = 0; i < MAXCPU; i++) { 427 sysmaps = &sysmaps_pcpu[i]; 428 mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF); 429 SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1) 430 SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1) 431 } 432 SYSMAP(caddr_t, CMAP1, CADDR1, 1) 433 SYSMAP(caddr_t, CMAP3, CADDR3, 1) 434 435 /* 436 * Crashdump maps. 
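 * (MAXDUMPPGS pages of KVA are reserved here so that the crash dump
 * code can map arbitrary physical pages while a dump is being written.)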
 */
	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)

	/*
	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
	 */
	SYSMAP(caddr_t, unused, ptvmmap, 1)

	/*
	 * msgbufp is used to map the system message buffer.
	 */
	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))

	/*
	 * KPTmap is used by pmap_kextract().
	 *
	 * KPTmap is first initialized by locore.  However, that initial
	 * KPTmap can only support NKPT page table pages.  Here, a larger
	 * KPTmap is created that can support KVA_PAGES page table pages.
	 */
	SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)

	for (i = 0; i < NKPT; i++)
		KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;

	/*
	 * Adjust the start of the KPTD and KPTmap so that the implementation
	 * of pmap_kextract() and pmap_growkernel() can be made simpler.
	 */
	KPTD -= KPTDI;
	KPTmap -= i386_btop(KPTDI << PDRSHIFT);

	/*
	 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
	 * respectively.
	 */
	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)

	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);

	virtual_avail = va;

	/*
	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
	 * physical memory region that is used by the ACPI wakeup code.  This
	 * mapping must not have PG_G set.
	 */
#ifdef XBOX
	/*
	 * FIXME: This is gross, but needed for the XBOX.  Since we are at
	 * such an early stage, we cannot yet neatly map video memory ... :-(
	 * Better fixes are very welcome!
	 */
	if (!arch_i386_is_xbox)
#endif
	for (i = 1; i < NKPT; i++)
		PTD[i] = 0;

	/* Initialize the PAT MSR if present. */
	pmap_init_pat();

	/* Turn on PG_G on kernel page(s). */
	pmap_set_pg();
}

/*
 * Set up the PAT MSR.
 */
void
pmap_init_pat(void)
{
	int pat_table[PAT_INDEX_SIZE];
	uint64_t pat_msr;
	u_long cr0, cr4;
	int i;

	/* Set default PAT index table. */
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_table[i] = -1;
	pat_table[PAT_WRITE_BACK] = 0;
	pat_table[PAT_WRITE_THROUGH] = 1;
	pat_table[PAT_UNCACHEABLE] = 3;
	pat_table[PAT_WRITE_COMBINING] = 3;
	pat_table[PAT_WRITE_PROTECTED] = 3;
	pat_table[PAT_UNCACHED] = 3;

	/* Bail if this CPU doesn't implement PAT. */
	if ((cpu_feature & CPUID_PAT) == 0) {
		for (i = 0; i < PAT_INDEX_SIZE; i++)
			pat_index[i] = pat_table[i];
		pat_works = 0;
		return;
	}

	/*
	 * Due to some Intel errata, we can only safely use the lower 4
	 * PAT entries.
	 *
	 *   Intel Pentium III Processor Specification Update
	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
	 * or Mode C Paging)
	 *
	 *   Intel Pentium IV Processor Specification Update
	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
		pat_works = 0;

	/* Initialize default PAT entries. */
	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
	    PAT_VALUE(2, PAT_UNCACHED) |
	    PAT_VALUE(3, PAT_UNCACHEABLE) |
	    PAT_VALUE(4, PAT_WRITE_BACK) |
	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
	    PAT_VALUE(6, PAT_UNCACHED) |
	    PAT_VALUE(7, PAT_UNCACHEABLE);

	if (pat_works) {
		/*
		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
558 * Program 5 and 6 as WP and WC. 559 * Leave 4 and 7 as WB and UC. 560 */ 561 pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); 562 pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | 563 PAT_VALUE(6, PAT_WRITE_COMBINING); 564 pat_table[PAT_UNCACHED] = 2; 565 pat_table[PAT_WRITE_PROTECTED] = 5; 566 pat_table[PAT_WRITE_COMBINING] = 6; 567 } else { 568 /* 569 * Just replace PAT Index 2 with WC instead of UC-. 570 */ 571 pat_msr &= ~PAT_MASK(2); 572 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); 573 pat_table[PAT_WRITE_COMBINING] = 2; 574 } 575 576 /* Disable PGE. */ 577 cr4 = rcr4(); 578 load_cr4(cr4 & ~CR4_PGE); 579 580 /* Disable caches (CD = 1, NW = 0). */ 581 cr0 = rcr0(); 582 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 583 584 /* Flushes caches and TLBs. */ 585 wbinvd(); 586 invltlb(); 587 588 /* Update PAT and index table. */ 589 wrmsr(MSR_PAT, pat_msr); 590 for (i = 0; i < PAT_INDEX_SIZE; i++) 591 pat_index[i] = pat_table[i]; 592 593 /* Flush caches and TLBs again. */ 594 wbinvd(); 595 invltlb(); 596 597 /* Restore caches and PGE. */ 598 load_cr0(cr0); 599 load_cr4(cr4); 600} 601 602/* 603 * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. 604 */ 605static void 606pmap_set_pg(void) 607{ 608 pt_entry_t *pte; 609 vm_offset_t va, endva; 610 611 if (pgeflag == 0) 612 return; 613 614 endva = KERNBASE + KERNend; 615 616 if (pseflag) { 617 va = KERNBASE + KERNLOAD; 618 while (va < endva) { 619 pdir_pde(PTD, va) |= pgeflag; 620 invltlb(); /* Play it safe, invltlb() every time */ 621 va += NBPDR; 622 } 623 } else { 624 va = (vm_offset_t)btext; 625 while (va < endva) { 626 pte = vtopte(va); 627 if (*pte) 628 *pte |= pgeflag; 629 invltlb(); /* Play it safe, invltlb() every time */ 630 va += PAGE_SIZE; 631 } 632 } 633} 634 635/* 636 * Initialize a vm_page's machine-dependent fields. 637 */ 638void 639pmap_page_init(vm_page_t m) 640{ 641 642 TAILQ_INIT(&m->md.pv_list); 643 m->md.pat_mode = PAT_WRITE_BACK; 644} 645 646#ifdef PAE 647static void * 648pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) 649{ 650 651 /* Inform UMA that this allocator uses kernel_map/object. */ 652 *flags = UMA_SLAB_KERNEL; 653 return ((void *)kmem_alloc_contig(kernel_map, bytes, wait, 0x0ULL, 654 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); 655} 656#endif 657 658/* 659 * ABuse the pte nodes for unmapped kva to thread a kva freelist through. 660 * Requirements: 661 * - Must deal with pages in order to ensure that none of the PG_* bits 662 * are ever set, PG_V in particular. 663 * - Assumes we can write to ptes without pte_store() atomic ops, even 664 * on PAE systems. This should be ok. 665 * - Assumes nothing will ever test these addresses for 0 to indicate 666 * no mapping instead of correctly checking PG_V. 667 * - Assumes a vm_offset_t will fit in a pte (true for i386). 668 * Because PG_V is never set, there can be no mappings to invalidate. 669 */ 670static vm_offset_t 671pmap_ptelist_alloc(vm_offset_t *head) 672{ 673 pt_entry_t *pte; 674 vm_offset_t va; 675 676 va = *head; 677 if (va == 0) 678 return (va); /* Out of memory */ 679 pte = vtopte(va); 680 *head = *pte; 681 if (*head & PG_V) 682 panic("pmap_ptelist_alloc: va with PG_V set!"); 683 *pte = 0; 684 return (va); 685} 686 687static void 688pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) 689{ 690 pt_entry_t *pte; 691 692 if (va & PG_V) 693 panic("pmap_ptelist_free: freeing va with PG_V set!"); 694 pte = vtopte(va); 695 *pte = *head; /* virtual! 
PG_V is 0 though */ 696 *head = va; 697} 698 699static void 700pmap_ptelist_init(vm_offset_t *head, void *base, int npages) 701{ 702 int i; 703 vm_offset_t va; 704 705 *head = 0; 706 for (i = npages - 1; i >= 0; i--) { 707 va = (vm_offset_t)base + i * PAGE_SIZE; 708 pmap_ptelist_free(head, va); 709 } 710} 711 712 713/* 714 * Initialize the pmap module. 715 * Called by vm_init, to initialize any structures that the pmap 716 * system needs to map virtual memory. 717 */ 718void 719pmap_init(void) 720{ 721 vm_page_t mpte; 722 vm_size_t s; 723 int i, pv_npg; 724 725 /* 726 * Initialize the vm page array entries for the kernel pmap's 727 * page table pages. 728 */ 729 for (i = 0; i < NKPT; i++) { 730 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); 731 KASSERT(mpte >= vm_page_array && 732 mpte < &vm_page_array[vm_page_array_size], 733 ("pmap_init: page table page is out of range")); 734 mpte->pindex = i + KPTDI; 735 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); 736 } 737 738 /* 739 * Initialize the address space (zone) for the pv entries. Set a 740 * high water mark so that the system can recover from excessive 741 * numbers of pv entries. 742 */ 743 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 744 pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; 745 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 746 pv_entry_max = roundup(pv_entry_max, _NPCPV); 747 pv_entry_high_water = 9 * (pv_entry_max / 10); 748 749 /* 750 * If the kernel is running in a virtual machine on an AMD Family 10h 751 * processor, then it must assume that MCA is enabled by the virtual 752 * machine monitor. 753 */ 754 if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD && 755 CPUID_TO_FAMILY(cpu_id) == 0x10) 756 workaround_erratum383 = 1; 757 758 /* 759 * Are large page mappings supported and enabled? 760 */ 761 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); 762 if (pseflag == 0) 763 pg_ps_enabled = 0; 764 else if (pg_ps_enabled) { 765 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 766 ("pmap_init: can't assign to pagesizes[1]")); 767 pagesizes[1] = NBPDR; 768 } 769 770 /* 771 * Calculate the size of the pv head table for superpages. 772 */ 773 for (i = 0; phys_avail[i + 1]; i += 2); 774 pv_npg = round_4mpage(phys_avail[(i - 2) + 1]) / NBPDR; 775 776 /* 777 * Allocate memory for the pv head table for superpages. 
778 */ 779 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 780 s = round_page(s); 781 pv_table = (struct md_page *)kmem_alloc(kernel_map, s); 782 for (i = 0; i < pv_npg; i++) 783 TAILQ_INIT(&pv_table[i].pv_list); 784 785 pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); 786 pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map, 787 PAGE_SIZE * pv_maxchunks); 788 if (pv_chunkbase == NULL) 789 panic("pmap_init: not enough kvm for pv chunks"); 790 pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); 791#ifdef PAE 792 pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, 793 NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, 794 UMA_ZONE_VM | UMA_ZONE_NOFREE); 795 uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); 796#endif 797} 798 799 800SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, 801 "Max number of PV entries"); 802SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, 803 "Page share factor per proc"); 804 805static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, 806 "2/4MB page mapping counters"); 807 808static u_long pmap_pde_demotions; 809SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, 810 &pmap_pde_demotions, 0, "2/4MB page demotions"); 811 812static u_long pmap_pde_mappings; 813SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, 814 &pmap_pde_mappings, 0, "2/4MB page mappings"); 815 816static u_long pmap_pde_p_failures; 817SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, 818 &pmap_pde_p_failures, 0, "2/4MB page promotion failures"); 819 820static u_long pmap_pde_promotions; 821SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, 822 &pmap_pde_promotions, 0, "2/4MB page promotions"); 823 824/*************************************************** 825 * Low level helper routines..... 826 ***************************************************/ 827 828/* 829 * Determine the appropriate bits to set in a PTE or PDE for a specified 830 * caching mode. 831 */ 832int 833pmap_cache_bits(int mode, boolean_t is_pde) 834{ 835 int cache_bits, pat_flag, pat_idx; 836 837 if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) 838 panic("Unknown caching mode %d\n", mode); 839 840 /* The PAT bit is different for PTE's and PDE's. */ 841 pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; 842 843 /* Map the caching mode to a PAT index. */ 844 pat_idx = pat_index[mode]; 845 846 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ 847 cache_bits = 0; 848 if (pat_idx & 0x4) 849 cache_bits |= pat_flag; 850 if (pat_idx & 0x2) 851 cache_bits |= PG_NC_PCD; 852 if (pat_idx & 0x1) 853 cache_bits |= PG_NC_PWT; 854 return (cache_bits); 855} 856 857/* 858 * The caller is responsible for maintaining TLB consistency. 859 */ 860static void 861pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde) 862{ 863 pd_entry_t *pde; 864 pmap_t pmap; 865 boolean_t PTD_updated; 866 867 PTD_updated = FALSE; 868 mtx_lock_spin(&allpmaps_lock); 869 LIST_FOREACH(pmap, &allpmaps, pm_list) { 870 if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & 871 PG_FRAME)) 872 PTD_updated = TRUE; 873 pde = pmap_pde(pmap, va); 874 pde_store(pde, newpde); 875 } 876 mtx_unlock_spin(&allpmaps_lock); 877 KASSERT(PTD_updated, 878 ("pmap_kenter_pde: current page table is not in allpmaps")); 879} 880 881/* 882 * After changing the page size for the specified virtual address in the page 883 * table, flush the corresponding entries from the processor's TLB. Only the 884 * calling processor's TLB is affected. 
 *
 * The calling thread must be pinned to a processor.
 */
static void
pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
{
	u_long cr4;

	if ((newpde & PG_PS) == 0)
		/* Demotion: flush a specific 2MB page mapping. */
		invlpg(va);
	else if ((newpde & PG_G) == 0)
		/*
		 * Promotion: flush every 4KB page mapping from the TLB
		 * because there are too many to flush individually.
		 */
		invltlb();
	else {
		/*
		 * Promotion: flush every 4KB page mapping from the TLB,
		 * including any global (PG_G) mappings.
		 */
		cr4 = rcr4();
		load_cr4(cr4 & ~CR4_PGE);
		/*
		 * Although preemption at this point could be detrimental to
		 * performance, it would not lead to an error.  PG_G is simply
		 * ignored if CR4.PGE is clear.  Moreover, in case this block
		 * is re-entered, the load_cr4() either above or below will
		 * modify CR4.PGE, flushing the TLB.
		 */
		load_cr4(cr4 | CR4_PGE);
	}
}
#ifdef SMP
/*
 * For SMP, these functions have to use the IPI mechanism for coherence.
 *
 * N.B.: Before calling any of the following TLB invalidation functions,
 * the calling processor must ensure that all stores updating a non-
 * kernel page table are globally performed.  Otherwise, another
 * processor could cache an old, pre-update entry without being
 * invalidated.  This can happen one of two ways: (1) The pmap becomes
 * active on another processor after its pm_active field is checked by
 * one of the following functions but before a store updating the page
 * table is globally performed.  (2) The pmap becomes active on another
 * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions still reads the
 * pmap as inactive on the other processor.
 *
 * The kernel page table is exempt because its pm_active field is
 * immutable.  The kernel page table is always active on every
 * processor.
938 */ 939void 940pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 941{ 942 cpuset_t other_cpus; 943 u_int cpuid; 944 945 sched_pin(); 946 if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { 947 invlpg(va); 948 smp_invlpg(va); 949 } else { 950 cpuid = PCPU_GET(cpuid); 951 other_cpus = all_cpus; 952 CPU_CLR(cpuid, &other_cpus); 953 if (CPU_ISSET(cpuid, &pmap->pm_active)) 954 invlpg(va); 955 CPU_AND(&other_cpus, &pmap->pm_active); 956 if (!CPU_EMPTY(&other_cpus)) 957 smp_masked_invlpg(other_cpus, va); 958 } 959 sched_unpin(); 960} 961 962void 963pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 964{ 965 cpuset_t other_cpus; 966 vm_offset_t addr; 967 u_int cpuid; 968 969 sched_pin(); 970 if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { 971 for (addr = sva; addr < eva; addr += PAGE_SIZE) 972 invlpg(addr); 973 smp_invlpg_range(sva, eva); 974 } else { 975 cpuid = PCPU_GET(cpuid); 976 other_cpus = all_cpus; 977 CPU_CLR(cpuid, &other_cpus); 978 if (CPU_ISSET(cpuid, &pmap->pm_active)) 979 for (addr = sva; addr < eva; addr += PAGE_SIZE) 980 invlpg(addr); 981 CPU_AND(&other_cpus, &pmap->pm_active); 982 if (!CPU_EMPTY(&other_cpus)) 983 smp_masked_invlpg_range(other_cpus, sva, eva); 984 } 985 sched_unpin(); 986} 987 988void 989pmap_invalidate_all(pmap_t pmap) 990{ 991 cpuset_t other_cpus; 992 u_int cpuid; 993 994 sched_pin(); 995 if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { 996 invltlb(); 997 smp_invltlb(); 998 } else { 999 cpuid = PCPU_GET(cpuid); 1000 other_cpus = all_cpus; 1001 CPU_CLR(cpuid, &other_cpus); 1002 if (CPU_ISSET(cpuid, &pmap->pm_active)) 1003 invltlb(); 1004 CPU_AND(&other_cpus, &pmap->pm_active); 1005 if (!CPU_EMPTY(&other_cpus)) 1006 smp_masked_invltlb(other_cpus); 1007 } 1008 sched_unpin(); 1009} 1010 1011void 1012pmap_invalidate_cache(void) 1013{ 1014 1015 sched_pin(); 1016 wbinvd(); 1017 smp_cache_flush(); 1018 sched_unpin(); 1019} 1020 1021struct pde_action { 1022 cpuset_t invalidate; /* processors that invalidate their TLB */ 1023 vm_offset_t va; 1024 pd_entry_t *pde; 1025 pd_entry_t newpde; 1026 u_int store; /* processor that updates the PDE */ 1027}; 1028 1029static void 1030pmap_update_pde_kernel(void *arg) 1031{ 1032 struct pde_action *act = arg; 1033 pd_entry_t *pde; 1034 pmap_t pmap; 1035 1036 if (act->store == PCPU_GET(cpuid)) { 1037 1038 /* 1039 * Elsewhere, this operation requires allpmaps_lock for 1040 * synchronization. Here, it does not because it is being 1041 * performed in the context of an all_cpus rendezvous. 1042 */ 1043 LIST_FOREACH(pmap, &allpmaps, pm_list) { 1044 pde = pmap_pde(pmap, act->va); 1045 pde_store(pde, act->newpde); 1046 } 1047 } 1048} 1049 1050static void 1051pmap_update_pde_user(void *arg) 1052{ 1053 struct pde_action *act = arg; 1054 1055 if (act->store == PCPU_GET(cpuid)) 1056 pde_store(act->pde, act->newpde); 1057} 1058 1059static void 1060pmap_update_pde_teardown(void *arg) 1061{ 1062 struct pde_action *act = arg; 1063 1064 if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) 1065 pmap_update_pde_invalidate(act->va, act->newpde); 1066} 1067 1068/* 1069 * Change the page size for the specified virtual address in a way that 1070 * prevents any possibility of the TLB ever having two entries that map the 1071 * same virtual address using different page sizes. This is the recommended 1072 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a 1073 * machine check exception for a TLB state that is improperly diagnosed as a 1074 * hardware error. 
 */
static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{
	struct pde_action act;
	cpuset_t active, other_cpus;
	u_int cpuid;

	sched_pin();
	cpuid = PCPU_GET(cpuid);
	other_cpus = all_cpus;
	CPU_CLR(cpuid, &other_cpus);
	if (pmap == kernel_pmap)
		active = all_cpus;
	else
		active = pmap->pm_active;
	if (CPU_OVERLAP(&active, &other_cpus)) {
		act.store = cpuid;
		act.invalidate = active;
		act.va = va;
		act.pde = pde;
		act.newpde = newpde;
		CPU_SET(cpuid, &active);
		smp_rendezvous_cpus(active,
		    smp_no_rendevous_barrier, pmap == kernel_pmap ?
		    pmap_update_pde_kernel : pmap_update_pde_user,
		    pmap_update_pde_teardown, &act);
	} else {
		if (pmap == kernel_pmap)
			pmap_kenter_pde(va, newpde);
		else
			pde_store(pde, newpde);
		if (CPU_ISSET(cpuid, &active))
			pmap_update_pde_invalidate(va, newpde);
	}
	sched_unpin();
}
#else /* !SMP */
/*
 * Normal, non-SMP, 486+ invalidation functions.
 * We inline these within pmap.c for speed.
 */
PMAP_INLINE void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{

	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		invlpg(va);
}

PMAP_INLINE void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t addr;

	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
}

PMAP_INLINE void
pmap_invalidate_all(pmap_t pmap)
{

	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		invltlb();
}

PMAP_INLINE void
pmap_invalidate_cache(void)
{

	wbinvd();
}

static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{

	if (pmap == kernel_pmap)
		pmap_kenter_pde(va, newpde);
	else
		pde_store(pde, newpde);
	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		pmap_update_pde_invalidate(va, newpde);
}
#endif /* !SMP */

#define	PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)

void
pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
{

	KASSERT((sva & PAGE_MASK) == 0,
	    ("pmap_invalidate_cache_range: sva not page-aligned"));
	KASSERT((eva & PAGE_MASK) == 0,
	    ("pmap_invalidate_cache_range: eva not page-aligned"));

	if (cpu_feature & CPUID_SS)
		; /* If "Self Snoop" is supported, do nothing. */
	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {

#ifdef DEV_APIC
		/*
		 * XXX: Some CPUs fault, hang, or trash the local APIC
		 * registers if we use CLFLUSH on the local APIC
		 * range.  The local APIC is always uncached, so we
		 * don't need to flush for that range anyway.
		 */
		if (pmap_kextract(sva) == lapic_paddr)
			return;
#endif
		/*
		 * Otherwise, do a per-cache line flush.  Use the mfence
		 * instruction to ensure that previous stores are
		 * included in the write-back.  The processor
		 * propagates the flush to other processors in the cache
		 * coherence domain.
		 */
		mfence();
		for (; sva < eva; sva += cpu_clflush_line_size)
			clflush(sva);
		mfence();
	} else {

		/*
		 * No targeted cache flush methods are supported by CPU,
		 * or the supplied range is bigger than 2MB.
1205 * Globally invalidate cache. 1206 */ 1207 pmap_invalidate_cache(); 1208 } 1209} 1210 1211void 1212pmap_invalidate_cache_pages(vm_page_t *pages, int count) 1213{ 1214 int i; 1215 1216 if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || 1217 (cpu_feature & CPUID_CLFSH) == 0) { 1218 pmap_invalidate_cache(); 1219 } else { 1220 for (i = 0; i < count; i++) 1221 pmap_flush_page(pages[i]); 1222 } 1223} 1224 1225/* 1226 * Are we current address space or kernel? N.B. We return FALSE when 1227 * a pmap's page table is in use because a kernel thread is borrowing 1228 * it. The borrowed page table can change spontaneously, making any 1229 * dependence on its continued use subject to a race condition. 1230 */ 1231static __inline int 1232pmap_is_current(pmap_t pmap) 1233{ 1234 1235 return (pmap == kernel_pmap || 1236 (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) && 1237 (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME))); 1238} 1239 1240/* 1241 * If the given pmap is not the current or kernel pmap, the returned pte must 1242 * be released by passing it to pmap_pte_release(). 1243 */ 1244pt_entry_t * 1245pmap_pte(pmap_t pmap, vm_offset_t va) 1246{ 1247 pd_entry_t newpf; 1248 pd_entry_t *pde; 1249 1250 pde = pmap_pde(pmap, va); 1251 if (*pde & PG_PS) 1252 return (pde); 1253 if (*pde != 0) { 1254 /* are we current address space or kernel? */ 1255 if (pmap_is_current(pmap)) 1256 return (vtopte(va)); 1257 mtx_lock(&PMAP2mutex); 1258 newpf = *pde & PG_FRAME; 1259 if ((*PMAP2 & PG_FRAME) != newpf) { 1260 *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; 1261 pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); 1262 } 1263 return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); 1264 } 1265 return (NULL); 1266} 1267 1268/* 1269 * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte 1270 * being NULL. 1271 */ 1272static __inline void 1273pmap_pte_release(pt_entry_t *pte) 1274{ 1275 1276 if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) 1277 mtx_unlock(&PMAP2mutex); 1278} 1279 1280static __inline void 1281invlcaddr(void *caddr) 1282{ 1283 1284 invlpg((u_int)caddr); 1285} 1286 1287/* 1288 * Super fast pmap_pte routine best used when scanning 1289 * the pv lists. This eliminates many coarse-grained 1290 * invltlb calls. Note that many of the pv list 1291 * scans are across different pmaps. It is very wasteful 1292 * to do an entire invltlb for checking a single mapping. 1293 * 1294 * If the given pmap is not the current pmap, pvh_global_lock 1295 * must be held and curthread pinned to a CPU. 1296 */ 1297static pt_entry_t * 1298pmap_pte_quick(pmap_t pmap, vm_offset_t va) 1299{ 1300 pd_entry_t newpf; 1301 pd_entry_t *pde; 1302 1303 pde = pmap_pde(pmap, va); 1304 if (*pde & PG_PS) 1305 return (pde); 1306 if (*pde != 0) { 1307 /* are we current address space or kernel? 
*/ 1308 if (pmap_is_current(pmap)) 1309 return (vtopte(va)); 1310 rw_assert(&pvh_global_lock, RA_WLOCKED); 1311 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 1312 newpf = *pde & PG_FRAME; 1313 if ((*PMAP1 & PG_FRAME) != newpf) { 1314 *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; 1315#ifdef SMP 1316 PMAP1cpu = PCPU_GET(cpuid); 1317#endif 1318 invlcaddr(PADDR1); 1319 PMAP1changed++; 1320 } else 1321#ifdef SMP 1322 if (PMAP1cpu != PCPU_GET(cpuid)) { 1323 PMAP1cpu = PCPU_GET(cpuid); 1324 invlcaddr(PADDR1); 1325 PMAP1changedcpu++; 1326 } else 1327#endif 1328 PMAP1unchanged++; 1329 return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); 1330 } 1331 return (0); 1332} 1333 1334/* 1335 * Routine: pmap_extract 1336 * Function: 1337 * Extract the physical page address associated 1338 * with the given map/virtual_address pair. 1339 */ 1340vm_paddr_t 1341pmap_extract(pmap_t pmap, vm_offset_t va) 1342{ 1343 vm_paddr_t rtval; 1344 pt_entry_t *pte; 1345 pd_entry_t pde; 1346 1347 rtval = 0; 1348 PMAP_LOCK(pmap); 1349 pde = pmap->pm_pdir[va >> PDRSHIFT]; 1350 if (pde != 0) { 1351 if ((pde & PG_PS) != 0) 1352 rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); 1353 else { 1354 pte = pmap_pte(pmap, va); 1355 rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); 1356 pmap_pte_release(pte); 1357 } 1358 } 1359 PMAP_UNLOCK(pmap); 1360 return (rtval); 1361} 1362 1363/* 1364 * Routine: pmap_extract_and_hold 1365 * Function: 1366 * Atomically extract and hold the physical page 1367 * with the given pmap and virtual address pair 1368 * if that mapping permits the given protection. 1369 */ 1370vm_page_t 1371pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1372{ 1373 pd_entry_t pde; 1374 pt_entry_t pte, *ptep; 1375 vm_page_t m; 1376 vm_paddr_t pa; 1377 1378 pa = 0; 1379 m = NULL; 1380 PMAP_LOCK(pmap); 1381retry: 1382 pde = *pmap_pde(pmap, va); 1383 if (pde != 0) { 1384 if (pde & PG_PS) { 1385 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { 1386 if (vm_page_pa_tryrelock(pmap, (pde & 1387 PG_PS_FRAME) | (va & PDRMASK), &pa)) 1388 goto retry; 1389 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | 1390 (va & PDRMASK)); 1391 vm_page_hold(m); 1392 } 1393 } else { 1394 ptep = pmap_pte(pmap, va); 1395 pte = *ptep; 1396 pmap_pte_release(ptep); 1397 if (pte != 0 && 1398 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { 1399 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, 1400 &pa)) 1401 goto retry; 1402 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 1403 vm_page_hold(m); 1404 } 1405 } 1406 } 1407 PA_UNLOCK_COND(pa); 1408 PMAP_UNLOCK(pmap); 1409 return (m); 1410} 1411 1412/*************************************************** 1413 * Low level mapping routines..... 1414 ***************************************************/ 1415 1416/* 1417 * Add a wired page to the kva. 1418 * Note: not SMP coherent. 1419 * 1420 * This function may be used before pmap_bootstrap() is called. 1421 */ 1422PMAP_INLINE void 1423pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1424{ 1425 pt_entry_t *pte; 1426 1427 pte = vtopte(va); 1428 pte_store(pte, pa | PG_RW | PG_V | pgeflag); 1429} 1430 1431static __inline void 1432pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 1433{ 1434 pt_entry_t *pte; 1435 1436 pte = vtopte(va); 1437 pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); 1438} 1439 1440/* 1441 * Remove a page from the kernel pagetables. 1442 * Note: not SMP coherent. 1443 * 1444 * This function may be used before pmap_bootstrap() is called. 
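 * Callers are responsible for any needed TLB invalidation; see
 * pmap_qremove() below, which follows its pmap_kremove() calls with a
 * ranged invalidation.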
1445 */ 1446PMAP_INLINE void 1447pmap_kremove(vm_offset_t va) 1448{ 1449 pt_entry_t *pte; 1450 1451 pte = vtopte(va); 1452 pte_clear(pte); 1453} 1454 1455/* 1456 * Used to map a range of physical addresses into kernel 1457 * virtual address space. 1458 * 1459 * The value passed in '*virt' is a suggested virtual address for 1460 * the mapping. Architectures which can support a direct-mapped 1461 * physical to virtual region can return the appropriate address 1462 * within that region, leaving '*virt' unchanged. Other 1463 * architectures should map the pages starting at '*virt' and 1464 * update '*virt' with the first usable address after the mapped 1465 * region. 1466 */ 1467vm_offset_t 1468pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1469{ 1470 vm_offset_t va, sva; 1471 vm_paddr_t superpage_offset; 1472 pd_entry_t newpde; 1473 1474 va = *virt; 1475 /* 1476 * Does the physical address range's size and alignment permit at 1477 * least one superpage mapping to be created? 1478 */ 1479 superpage_offset = start & PDRMASK; 1480 if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) { 1481 /* 1482 * Increase the starting virtual address so that its alignment 1483 * does not preclude the use of superpage mappings. 1484 */ 1485 if ((va & PDRMASK) < superpage_offset) 1486 va = (va & ~PDRMASK) + superpage_offset; 1487 else if ((va & PDRMASK) > superpage_offset) 1488 va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset; 1489 } 1490 sva = va; 1491 while (start < end) { 1492 if ((start & PDRMASK) == 0 && end - start >= NBPDR && 1493 pseflag) { 1494 KASSERT((va & PDRMASK) == 0, 1495 ("pmap_map: misaligned va %#x", va)); 1496 newpde = start | PG_PS | pgeflag | PG_RW | PG_V; 1497 pmap_kenter_pde(va, newpde); 1498 va += NBPDR; 1499 start += NBPDR; 1500 } else { 1501 pmap_kenter(va, start); 1502 va += PAGE_SIZE; 1503 start += PAGE_SIZE; 1504 } 1505 } 1506 pmap_invalidate_range(kernel_pmap, sva, va); 1507 *virt = va; 1508 return (sva); 1509} 1510 1511 1512/* 1513 * Add a list of wired pages to the kva 1514 * this routine is only used for temporary 1515 * kernel mappings that do not need to have 1516 * page modification or references recorded. 1517 * Note that old mappings are simply written 1518 * over. The page *must* be wired. 1519 * Note: SMP coherent. Uses a ranged shootdown IPI. 1520 */ 1521void 1522pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1523{ 1524 pt_entry_t *endpte, oldpte, pa, *pte; 1525 vm_page_t m; 1526 1527 oldpte = 0; 1528 pte = vtopte(sva); 1529 endpte = pte + count; 1530 while (pte < endpte) { 1531 m = *ma++; 1532 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); 1533 if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { 1534 oldpte |= *pte; 1535 pte_store(pte, pa | pgeflag | PG_RW | PG_V); 1536 } 1537 pte++; 1538 } 1539 if (__predict_false((oldpte & PG_V) != 0)) 1540 pmap_invalidate_range(kernel_pmap, sva, sva + count * 1541 PAGE_SIZE); 1542} 1543 1544/* 1545 * This routine tears out page mappings from the 1546 * kernel -- it is meant only for temporary mappings. 1547 * Note: SMP coherent. Uses a ranged shootdown IPI. 1548 */ 1549void 1550pmap_qremove(vm_offset_t sva, int count) 1551{ 1552 vm_offset_t va; 1553 1554 va = sva; 1555 while (count-- > 0) { 1556 pmap_kremove(va); 1557 va += PAGE_SIZE; 1558 } 1559 pmap_invalidate_range(kernel_pmap, sva, va); 1560} 1561 1562/*************************************************** 1563 * Page table page management routines..... 
1564 ***************************************************/ 1565static __inline void 1566pmap_free_zero_pages(vm_page_t free) 1567{ 1568 vm_page_t m; 1569 1570 while (free != NULL) { 1571 m = free; 1572 free = m->right; 1573 /* Preserve the page's PG_ZERO setting. */ 1574 vm_page_free_toq(m); 1575 } 1576} 1577 1578/* 1579 * Schedule the specified unused page table page to be freed. Specifically, 1580 * add the page to the specified list of pages that will be released to the 1581 * physical memory manager after the TLB has been updated. 1582 */ 1583static __inline void 1584pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO) 1585{ 1586 1587 if (set_PG_ZERO) 1588 m->flags |= PG_ZERO; 1589 else 1590 m->flags &= ~PG_ZERO; 1591 m->right = *free; 1592 *free = m; 1593} 1594 1595/* 1596 * Inserts the specified page table page into the specified pmap's collection 1597 * of idle page table pages. Each of a pmap's page table pages is responsible 1598 * for mapping a distinct range of virtual addresses. The pmap's collection is 1599 * ordered by this virtual address range. 1600 */ 1601static void 1602pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) 1603{ 1604 vm_page_t root; 1605 1606 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1607 root = pmap->pm_root; 1608 if (root == NULL) { 1609 mpte->left = NULL; 1610 mpte->right = NULL; 1611 } else { 1612 root = vm_page_splay(mpte->pindex, root); 1613 if (mpte->pindex < root->pindex) { 1614 mpte->left = root->left; 1615 mpte->right = root; 1616 root->left = NULL; 1617 } else if (mpte->pindex == root->pindex) 1618 panic("pmap_insert_pt_page: pindex already inserted"); 1619 else { 1620 mpte->right = root->right; 1621 mpte->left = root; 1622 root->right = NULL; 1623 } 1624 } 1625 pmap->pm_root = mpte; 1626} 1627 1628/* 1629 * Looks for a page table page mapping the specified virtual address in the 1630 * specified pmap's collection of idle page table pages. Returns NULL if there 1631 * is no page table page corresponding to the specified virtual address. 1632 */ 1633static vm_page_t 1634pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) 1635{ 1636 vm_page_t mpte; 1637 vm_pindex_t pindex = va >> PDRSHIFT; 1638 1639 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1640 if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) { 1641 mpte = vm_page_splay(pindex, mpte); 1642 if ((pmap->pm_root = mpte)->pindex != pindex) 1643 mpte = NULL; 1644 } 1645 return (mpte); 1646} 1647 1648/* 1649 * Removes the specified page table page from the specified pmap's collection 1650 * of idle page table pages. The specified page table page must be a member of 1651 * the pmap's collection. 1652 */ 1653static void 1654pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) 1655{ 1656 vm_page_t root; 1657 1658 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1659 if (mpte != pmap->pm_root) 1660 vm_page_splay(mpte->pindex, pmap->pm_root); 1661 if (mpte->left == NULL) 1662 root = mpte->right; 1663 else { 1664 root = vm_page_splay(mpte->pindex, mpte->left); 1665 root->right = mpte->right; 1666 } 1667 pmap->pm_root = root; 1668} 1669 1670/* 1671 * Decrements a page table page's wire count, which is used to record the 1672 * number of valid page table entries within the page. If the wire count 1673 * drops to zero, then the page table page is unmapped. Returns TRUE if the 1674 * page table page was unmapped and FALSE otherwise. 
1675 */ 1676static inline boolean_t 1677pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free) 1678{ 1679 1680 --m->wire_count; 1681 if (m->wire_count == 0) { 1682 _pmap_unwire_ptp(pmap, m, free); 1683 return (TRUE); 1684 } else 1685 return (FALSE); 1686} 1687 1688static void 1689_pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free) 1690{ 1691 vm_offset_t pteva; 1692 1693 /* 1694 * unmap the page table page 1695 */ 1696 pmap->pm_pdir[m->pindex] = 0; 1697 --pmap->pm_stats.resident_count; 1698 1699 /* 1700 * This is a release store so that the ordinary store unmapping 1701 * the page table page is globally performed before TLB shoot- 1702 * down is begun. 1703 */ 1704 atomic_subtract_rel_int(&cnt.v_wire_count, 1); 1705 1706 /* 1707 * Do an invltlb to make the invalidated mapping 1708 * take effect immediately. 1709 */ 1710 pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); 1711 pmap_invalidate_page(pmap, pteva); 1712 1713 /* 1714 * Put page on a list so that it is released after 1715 * *ALL* TLB shootdown is done 1716 */ 1717 pmap_add_delayed_free_list(m, free, TRUE); 1718} 1719 1720/* 1721 * After removing a page table entry, this routine is used to 1722 * conditionally free the page, and manage the hold/wire counts. 1723 */ 1724static int 1725pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free) 1726{ 1727 pd_entry_t ptepde; 1728 vm_page_t mpte; 1729 1730 if (va >= VM_MAXUSER_ADDRESS) 1731 return (0); 1732 ptepde = *pmap_pde(pmap, va); 1733 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 1734 return (pmap_unwire_ptp(pmap, mpte, free)); 1735} 1736 1737/* 1738 * Initialize the pmap for the swapper process. 1739 */ 1740void 1741pmap_pinit0(pmap_t pmap) 1742{ 1743 1744 PMAP_LOCK_INIT(pmap); 1745 /* 1746 * Since the page table directory is shared with the kernel pmap, 1747 * which is already included in the list "allpmaps", this pmap does 1748 * not need to be inserted into that list. 1749 */ 1750 pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); 1751#ifdef PAE 1752 pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); 1753#endif 1754 pmap->pm_root = NULL; 1755 CPU_ZERO(&pmap->pm_active); 1756 PCPU_SET(curpmap, pmap); 1757 TAILQ_INIT(&pmap->pm_pvchunk); 1758 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1759} 1760 1761/* 1762 * Initialize a preallocated and zeroed pmap structure, 1763 * such as one in a vmspace structure. 1764 */ 1765int 1766pmap_pinit(pmap_t pmap) 1767{ 1768 vm_page_t m, ptdpg[NPGPTD]; 1769 vm_paddr_t pa; 1770 int i; 1771 1772 PMAP_LOCK_INIT(pmap); 1773 1774 /* 1775 * No need to allocate page table space yet but we do need a valid 1776 * page directory table. 
1777 */ 1778 if (pmap->pm_pdir == NULL) { 1779 pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map, 1780 NBPTD); 1781 if (pmap->pm_pdir == NULL) { 1782 PMAP_LOCK_DESTROY(pmap); 1783 return (0); 1784 } 1785#ifdef PAE 1786 pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); 1787 KASSERT(((vm_offset_t)pmap->pm_pdpt & 1788 ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, 1789 ("pmap_pinit: pdpt misaligned")); 1790 KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), 1791 ("pmap_pinit: pdpt above 4g")); 1792#endif 1793 pmap->pm_root = NULL; 1794 } 1795 KASSERT(pmap->pm_root == NULL, 1796 ("pmap_pinit: pmap has reserved page table page(s)")); 1797 1798 /* 1799 * allocate the page directory page(s) 1800 */ 1801 for (i = 0; i < NPGPTD;) { 1802 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | 1803 VM_ALLOC_WIRED | VM_ALLOC_ZERO); 1804 if (m == NULL) 1805 VM_WAIT; 1806 else { 1807 ptdpg[i++] = m; 1808 } 1809 } 1810 1811 pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); 1812 1813 for (i = 0; i < NPGPTD; i++) 1814 if ((ptdpg[i]->flags & PG_ZERO) == 0) 1815 pagezero(pmap->pm_pdir + (i * NPDEPG)); 1816 1817 mtx_lock_spin(&allpmaps_lock); 1818 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 1819 /* Copy the kernel page table directory entries. */ 1820 bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); 1821 mtx_unlock_spin(&allpmaps_lock); 1822 1823 /* install self-referential address mapping entry(s) */ 1824 for (i = 0; i < NPGPTD; i++) { 1825 pa = VM_PAGE_TO_PHYS(ptdpg[i]); 1826 pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; 1827#ifdef PAE 1828 pmap->pm_pdpt[i] = pa | PG_V; 1829#endif 1830 } 1831 1832 CPU_ZERO(&pmap->pm_active); 1833 TAILQ_INIT(&pmap->pm_pvchunk); 1834 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1835 1836 return (1); 1837} 1838 1839/* 1840 * this routine is called if the page table page is not 1841 * mapped correctly. 1842 */ 1843static vm_page_t 1844_pmap_allocpte(pmap_t pmap, u_int ptepindex, int flags) 1845{ 1846 vm_paddr_t ptepa; 1847 vm_page_t m; 1848 1849 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || 1850 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, 1851 ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); 1852 1853 /* 1854 * Allocate a page table page. 1855 */ 1856 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 1857 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 1858 if (flags & M_WAITOK) { 1859 PMAP_UNLOCK(pmap); 1860 rw_wunlock(&pvh_global_lock); 1861 VM_WAIT; 1862 rw_wlock(&pvh_global_lock); 1863 PMAP_LOCK(pmap); 1864 } 1865 1866 /* 1867 * Indicate the need to retry. While waiting, the page table 1868 * page may have been allocated. 1869 */ 1870 return (NULL); 1871 } 1872 if ((m->flags & PG_ZERO) == 0) 1873 pmap_zero_page(m); 1874 1875 /* 1876 * Map the pagetable page into the process address space, if 1877 * it isn't already there. 
1878 */ 1879 1880 pmap->pm_stats.resident_count++; 1881 1882 ptepa = VM_PAGE_TO_PHYS(m); 1883 pmap->pm_pdir[ptepindex] = 1884 (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); 1885 1886 return (m); 1887} 1888 1889static vm_page_t 1890pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags) 1891{ 1892 u_int ptepindex; 1893 pd_entry_t ptepa; 1894 vm_page_t m; 1895 1896 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || 1897 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, 1898 ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); 1899 1900 /* 1901 * Calculate pagetable page index 1902 */ 1903 ptepindex = va >> PDRSHIFT; 1904retry: 1905 /* 1906 * Get the page directory entry 1907 */ 1908 ptepa = pmap->pm_pdir[ptepindex]; 1909 1910 /* 1911 * This supports switching from a 4MB page to a 1912 * normal 4K page. 1913 */ 1914 if (ptepa & PG_PS) { 1915 (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); 1916 ptepa = pmap->pm_pdir[ptepindex]; 1917 } 1918 1919 /* 1920 * If the page table page is mapped, we just increment the 1921 * hold count, and activate it. 1922 */ 1923 if (ptepa) { 1924 m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); 1925 m->wire_count++; 1926 } else { 1927 /* 1928 * Here if the pte page isn't mapped, or if it has 1929 * been deallocated. 1930 */ 1931 m = _pmap_allocpte(pmap, ptepindex, flags); 1932 if (m == NULL && (flags & M_WAITOK)) 1933 goto retry; 1934 } 1935 return (m); 1936} 1937 1938 1939/*************************************************** 1940* Pmap allocation/deallocation routines. 1941 ***************************************************/ 1942 1943#ifdef SMP 1944/* 1945 * Deal with a SMP shootdown of other users of the pmap that we are 1946 * trying to dispose of. This can be a bit hairy. 1947 */ 1948static cpuset_t *lazymask; 1949static u_int lazyptd; 1950static volatile u_int lazywait; 1951 1952void pmap_lazyfix_action(void); 1953 1954void 1955pmap_lazyfix_action(void) 1956{ 1957 1958#ifdef COUNT_IPIS 1959 (*ipi_lazypmap_counts[PCPU_GET(cpuid)])++; 1960#endif 1961 if (rcr3() == lazyptd) 1962 load_cr3(curpcb->pcb_cr3); 1963 CPU_CLR_ATOMIC(PCPU_GET(cpuid), lazymask); 1964 atomic_store_rel_int(&lazywait, 1); 1965} 1966 1967static void 1968pmap_lazyfix_self(u_int cpuid) 1969{ 1970 1971 if (rcr3() == lazyptd) 1972 load_cr3(curpcb->pcb_cr3); 1973 CPU_CLR_ATOMIC(cpuid, lazymask); 1974} 1975 1976 1977static void 1978pmap_lazyfix(pmap_t pmap) 1979{ 1980 cpuset_t mymask, mask; 1981 u_int cpuid, spins; 1982 int lsb; 1983 1984 mask = pmap->pm_active; 1985 while (!CPU_EMPTY(&mask)) { 1986 spins = 50000000; 1987 1988 /* Find least significant set bit. */ 1989 lsb = cpusetobj_ffs(&mask); 1990 MPASS(lsb != 0); 1991 lsb--; 1992 CPU_SETOF(lsb, &mask); 1993 mtx_lock_spin(&smp_ipi_mtx); 1994#ifdef PAE 1995 lazyptd = vtophys(pmap->pm_pdpt); 1996#else 1997 lazyptd = vtophys(pmap->pm_pdir); 1998#endif 1999 cpuid = PCPU_GET(cpuid); 2000 2001 /* Use a cpuset just for having an easy check. 
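 *
 * "mymask" holds only the current CPU, and CPU_CMP() returns zero when the
 * two sets are identical, so if the sole remaining user of this pmap is the
 * CPU we are running on, the cr3 switch is done locally in
 * pmap_lazyfix_self() and the IPI round trip through pmap_lazyfix_action()
 * is skipped entirely.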
*/ 2002 CPU_SETOF(cpuid, &mymask); 2003 if (!CPU_CMP(&mask, &mymask)) { 2004 lazymask = &pmap->pm_active; 2005 pmap_lazyfix_self(cpuid); 2006 } else { 2007 atomic_store_rel_int((u_int *)&lazymask, 2008 (u_int)&pmap->pm_active); 2009 atomic_store_rel_int(&lazywait, 0); 2010 ipi_selected(mask, IPI_LAZYPMAP); 2011 while (lazywait == 0) { 2012 ia32_pause(); 2013 if (--spins == 0) 2014 break; 2015 } 2016 } 2017 mtx_unlock_spin(&smp_ipi_mtx); 2018 if (spins == 0) 2019 printf("pmap_lazyfix: spun for 50000000\n"); 2020 mask = pmap->pm_active; 2021 } 2022} 2023 2024#else /* SMP */ 2025 2026/* 2027 * Cleaning up on uniprocessor is easy. For various reasons, we're 2028 * unlikely to have to even execute this code, including the fact 2029 * that the cleanup is deferred until the parent does a wait(2), which 2030 * means that another userland process has run. 2031 */ 2032static void 2033pmap_lazyfix(pmap_t pmap) 2034{ 2035 u_int cr3; 2036 2037 cr3 = vtophys(pmap->pm_pdir); 2038 if (cr3 == rcr3()) { 2039 load_cr3(curpcb->pcb_cr3); 2040 CPU_CLR(PCPU_GET(cpuid), &pmap->pm_active); 2041 } 2042} 2043#endif /* SMP */ 2044 2045/* 2046 * Release any resources held by the given physical map. 2047 * Called when a pmap initialized by pmap_pinit is being released. 2048 * Should only be called if the map contains no valid mappings. 2049 */ 2050void 2051pmap_release(pmap_t pmap) 2052{ 2053 vm_page_t m, ptdpg[NPGPTD]; 2054 int i; 2055 2056 KASSERT(pmap->pm_stats.resident_count == 0, 2057 ("pmap_release: pmap resident count %ld != 0", 2058 pmap->pm_stats.resident_count)); 2059 KASSERT(pmap->pm_root == NULL, 2060 ("pmap_release: pmap has reserved page table page(s)")); 2061 2062 pmap_lazyfix(pmap); 2063 mtx_lock_spin(&allpmaps_lock); 2064 LIST_REMOVE(pmap, pm_list); 2065 mtx_unlock_spin(&allpmaps_lock); 2066 2067 for (i = 0; i < NPGPTD; i++) 2068 ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] & 2069 PG_FRAME); 2070 2071 bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * 2072 sizeof(*pmap->pm_pdir)); 2073 2074 pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); 2075 2076 for (i = 0; i < NPGPTD; i++) { 2077 m = ptdpg[i]; 2078#ifdef PAE 2079 KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), 2080 ("pmap_release: got wrong ptd page")); 2081#endif 2082 m->wire_count--; 2083 atomic_subtract_int(&cnt.v_wire_count, 1); 2084 vm_page_free_zero(m); 2085 } 2086 PMAP_LOCK_DESTROY(pmap); 2087} 2088 2089static int 2090kvm_size(SYSCTL_HANDLER_ARGS) 2091{ 2092 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; 2093 2094 return (sysctl_handle_long(oidp, &ksize, 0, req)); 2095} 2096SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 2097 0, 0, kvm_size, "IU", "Size of KVM"); 2098 2099static int 2100kvm_free(SYSCTL_HANDLER_ARGS) 2101{ 2102 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 2103 2104 return (sysctl_handle_long(oidp, &kfree, 0, req)); 2105} 2106SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 2107 0, 0, kvm_free, "IU", "Amount of KVM free"); 2108 2109/* 2110 * grow the number of kernel page table entries, if needed 2111 */ 2112void 2113pmap_growkernel(vm_offset_t addr) 2114{ 2115 vm_paddr_t ptppaddr; 2116 vm_page_t nkpg; 2117 pd_entry_t newpdir; 2118 2119 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2120 addr = roundup2(addr, NBPDR); 2121 if (addr - 1 >= kernel_map->max_offset) 2122 addr = kernel_map->max_offset; 2123 while (kernel_vm_end < addr) { 2124 if (pdir_pde(PTD, kernel_vm_end)) { 2125 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2126 if (kernel_vm_end - 1 
>= kernel_map->max_offset) { 2127 kernel_vm_end = kernel_map->max_offset; 2128 break; 2129 } 2130 continue; 2131 } 2132 2133 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, 2134 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 2135 VM_ALLOC_ZERO); 2136 if (nkpg == NULL) 2137 panic("pmap_growkernel: no memory to grow kernel"); 2138 2139 nkpt++; 2140 2141 if ((nkpg->flags & PG_ZERO) == 0) 2142 pmap_zero_page(nkpg); 2143 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 2144 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); 2145 pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir; 2146 2147 pmap_kenter_pde(kernel_vm_end, newpdir); 2148 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2149 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2150 kernel_vm_end = kernel_map->max_offset; 2151 break; 2152 } 2153 } 2154} 2155 2156 2157/*************************************************** 2158 * page management routines. 2159 ***************************************************/ 2160 2161CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 2162CTASSERT(_NPCM == 11); 2163CTASSERT(_NPCPV == 336); 2164 2165static __inline struct pv_chunk * 2166pv_to_chunk(pv_entry_t pv) 2167{ 2168 2169 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 2170} 2171 2172#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2173 2174#define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ 2175#define PC_FREE10 0x0000fffful /* Free values for index 10 */ 2176 2177static const uint32_t pc_freemask[_NPCM] = { 2178 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2179 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2180 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2181 PC_FREE0_9, PC_FREE10 2182}; 2183 2184SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2185 "Current number of pv entries"); 2186 2187#ifdef PV_STATS 2188static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2189 2190SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2191 "Current number of pv entry chunks"); 2192SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2193 "Current number of pv entry chunks allocated"); 2194SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2195 "Current number of pv entry chunks frees"); 2196SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2197 "Number of times tried to get a chunk page but failed."); 2198 2199static long pv_entry_frees, pv_entry_allocs; 2200static int pv_entry_spare; 2201 2202SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2203 "Current number of pv entry frees"); 2204SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2205 "Current number of pv entry allocs"); 2206SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2207 "Current number of spare pv entries"); 2208#endif 2209 2210/* 2211 * We are in a serious low memory condition. Resort to 2212 * drastic measures to free some pages so we can allocate 2213 * another pv entry chunk. 
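 *
 * Each chunk is a single wired page holding _NPCPV (336) pv entries whose
 * allocation state is tracked by the _NPCM (11) word bitmap pc_map[], as
 * the CTASSERTs above require.  The strategy below is to walk the global
 * pv_chunks list in LRU order and tear down non-wired 4 KB managed mappings
 * until either a whole chunk page can be handed back to the caller or at
 * least one pv entry has been freed in "locked_pmap"; a page table page
 * freed along the way may be recycled as the new chunk page instead.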
2214 */ 2215static vm_page_t 2216pmap_pv_reclaim(pmap_t locked_pmap) 2217{ 2218 struct pch newtail; 2219 struct pv_chunk *pc; 2220 struct md_page *pvh; 2221 pd_entry_t *pde; 2222 pmap_t pmap; 2223 pt_entry_t *pte, tpte; 2224 pv_entry_t pv; 2225 vm_offset_t va; 2226 vm_page_t free, m, m_pc; 2227 uint32_t inuse; 2228 int bit, field, freed; 2229 2230 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2231 pmap = NULL; 2232 free = m_pc = NULL; 2233 TAILQ_INIT(&newtail); 2234 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || 2235 free == NULL)) { 2236 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2237 if (pmap != pc->pc_pmap) { 2238 if (pmap != NULL) { 2239 pmap_invalidate_all(pmap); 2240 if (pmap != locked_pmap) 2241 PMAP_UNLOCK(pmap); 2242 } 2243 pmap = pc->pc_pmap; 2244 /* Avoid deadlock and lock recursion. */ 2245 if (pmap > locked_pmap) 2246 PMAP_LOCK(pmap); 2247 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { 2248 pmap = NULL; 2249 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2250 continue; 2251 } 2252 } 2253 2254 /* 2255 * Destroy every non-wired, 4 KB page mapping in the chunk. 2256 */ 2257 freed = 0; 2258 for (field = 0; field < _NPCM; field++) { 2259 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2260 inuse != 0; inuse &= ~(1UL << bit)) { 2261 bit = bsfl(inuse); 2262 pv = &pc->pc_pventry[field * 32 + bit]; 2263 va = pv->pv_va; 2264 pde = pmap_pde(pmap, va); 2265 if ((*pde & PG_PS) != 0) 2266 continue; 2267 pte = pmap_pte(pmap, va); 2268 tpte = *pte; 2269 if ((tpte & PG_W) == 0) 2270 tpte = pte_load_clear(pte); 2271 pmap_pte_release(pte); 2272 if ((tpte & PG_W) != 0) 2273 continue; 2274 KASSERT(tpte != 0, 2275 ("pmap_pv_reclaim: pmap %p va %x zero pte", 2276 pmap, va)); 2277 if ((tpte & PG_G) != 0) 2278 pmap_invalidate_page(pmap, va); 2279 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 2280 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2281 vm_page_dirty(m); 2282 if ((tpte & PG_A) != 0) 2283 vm_page_aflag_set(m, PGA_REFERENCED); 2284 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2285 if (TAILQ_EMPTY(&m->md.pv_list) && 2286 (m->flags & PG_FICTITIOUS) == 0) { 2287 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2288 if (TAILQ_EMPTY(&pvh->pv_list)) { 2289 vm_page_aflag_clear(m, 2290 PGA_WRITEABLE); 2291 } 2292 } 2293 pc->pc_map[field] |= 1UL << bit; 2294 pmap_unuse_pt(pmap, va, &free); 2295 freed++; 2296 } 2297 } 2298 if (freed == 0) { 2299 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2300 continue; 2301 } 2302 /* Every freed mapping is for a 4 KB page. */ 2303 pmap->pm_stats.resident_count -= freed; 2304 PV_STAT(pv_entry_frees += freed); 2305 PV_STAT(pv_entry_spare += freed); 2306 pv_entry_count -= freed; 2307 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2308 for (field = 0; field < _NPCM; field++) 2309 if (pc->pc_map[field] != pc_freemask[field]) { 2310 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2311 pc_list); 2312 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2313 2314 /* 2315 * One freed pv entry in locked_pmap is 2316 * sufficient. 2317 */ 2318 if (pmap == locked_pmap) 2319 goto out; 2320 break; 2321 } 2322 if (field == _NPCM) { 2323 PV_STAT(pv_entry_spare -= _NPCPV); 2324 PV_STAT(pc_chunk_count--); 2325 PV_STAT(pc_chunk_frees++); 2326 /* Entire chunk is free; return it. 
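 *
 * The chunk's backing page is recovered from its kernel VA with
 * pmap_kextract(), unmapped with pmap_qremove(), and the VA is returned
 * to pv_vafree; that page then satisfies the caller's chunk allocation.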
*/ 2327 m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2328 pmap_qremove((vm_offset_t)pc, 1); 2329 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 2330 break; 2331 } 2332 } 2333out: 2334 TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); 2335 if (pmap != NULL) { 2336 pmap_invalidate_all(pmap); 2337 if (pmap != locked_pmap) 2338 PMAP_UNLOCK(pmap); 2339 } 2340 if (m_pc == NULL && pv_vafree != 0 && free != NULL) { 2341 m_pc = free; 2342 free = m_pc->right; 2343 /* Recycle a freed page table page. */ 2344 m_pc->wire_count = 1; 2345 atomic_add_int(&cnt.v_wire_count, 1); 2346 } 2347 pmap_free_zero_pages(free); 2348 return (m_pc); 2349} 2350 2351/* 2352 * free the pv_entry back to the free list 2353 */ 2354static void 2355free_pv_entry(pmap_t pmap, pv_entry_t pv) 2356{ 2357 struct pv_chunk *pc; 2358 int idx, field, bit; 2359 2360 rw_assert(&pvh_global_lock, RA_WLOCKED); 2361 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2362 PV_STAT(pv_entry_frees++); 2363 PV_STAT(pv_entry_spare++); 2364 pv_entry_count--; 2365 pc = pv_to_chunk(pv); 2366 idx = pv - &pc->pc_pventry[0]; 2367 field = idx / 32; 2368 bit = idx % 32; 2369 pc->pc_map[field] |= 1ul << bit; 2370 for (idx = 0; idx < _NPCM; idx++) 2371 if (pc->pc_map[idx] != pc_freemask[idx]) { 2372 /* 2373 * 98% of the time, pc is already at the head of the 2374 * list. If it isn't already, move it to the head. 2375 */ 2376 if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != 2377 pc)) { 2378 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2379 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2380 pc_list); 2381 } 2382 return; 2383 } 2384 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2385 free_pv_chunk(pc); 2386} 2387 2388static void 2389free_pv_chunk(struct pv_chunk *pc) 2390{ 2391 vm_page_t m; 2392 2393 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2394 PV_STAT(pv_entry_spare -= _NPCPV); 2395 PV_STAT(pc_chunk_count--); 2396 PV_STAT(pc_chunk_frees++); 2397 /* entire chunk is free, return it */ 2398 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2399 pmap_qremove((vm_offset_t)pc, 1); 2400 vm_page_unwire(m, 0); 2401 vm_page_free(m); 2402 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 2403} 2404 2405/* 2406 * get a new pv_entry, allocating a block from the system 2407 * when needed. 
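 *
 * Free slots within a chunk are found with bsfl() on the pc_map[] bitmap;
 * a set bit means the corresponding entry is free.  As an illustrative
 * example, if pc_map[2] == 0x00000030 then bsfl() yields bit 4 and the
 * entry handed out is pc_pventry[2 * 32 + 4], i.e. slot 68 of the chunk.
 * Once every pc_map[] word reads zero the chunk is completely in use and
 * is moved to the tail of pm_pvchunk.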
2408 */ 2409static pv_entry_t 2410get_pv_entry(pmap_t pmap, boolean_t try) 2411{ 2412 static const struct timeval printinterval = { 60, 0 }; 2413 static struct timeval lastprint; 2414 int bit, field; 2415 pv_entry_t pv; 2416 struct pv_chunk *pc; 2417 vm_page_t m; 2418 2419 rw_assert(&pvh_global_lock, RA_WLOCKED); 2420 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2421 PV_STAT(pv_entry_allocs++); 2422 pv_entry_count++; 2423 if (pv_entry_count > pv_entry_high_water) 2424 if (ratecheck(&lastprint, &printinterval)) 2425 printf("Approaching the limit on PV entries, consider " 2426 "increasing either the vm.pmap.shpgperproc or the " 2427 "vm.pmap.pv_entry_max tunable.\n"); 2428retry: 2429 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2430 if (pc != NULL) { 2431 for (field = 0; field < _NPCM; field++) { 2432 if (pc->pc_map[field]) { 2433 bit = bsfl(pc->pc_map[field]); 2434 break; 2435 } 2436 } 2437 if (field < _NPCM) { 2438 pv = &pc->pc_pventry[field * 32 + bit]; 2439 pc->pc_map[field] &= ~(1ul << bit); 2440 /* If this was the last item, move it to tail */ 2441 for (field = 0; field < _NPCM; field++) 2442 if (pc->pc_map[field] != 0) { 2443 PV_STAT(pv_entry_spare--); 2444 return (pv); /* not full, return */ 2445 } 2446 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2447 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2448 PV_STAT(pv_entry_spare--); 2449 return (pv); 2450 } 2451 } 2452 /* 2453 * Access to the ptelist "pv_vafree" is synchronized by the pvh 2454 * global lock. If "pv_vafree" is currently non-empty, it will 2455 * remain non-empty until pmap_ptelist_alloc() completes. 2456 */ 2457 if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2458 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 2459 if (try) { 2460 pv_entry_count--; 2461 PV_STAT(pc_chunk_tryfail++); 2462 return (NULL); 2463 } 2464 m = pmap_pv_reclaim(pmap); 2465 if (m == NULL) 2466 goto retry; 2467 } 2468 PV_STAT(pc_chunk_count++); 2469 PV_STAT(pc_chunk_allocs++); 2470 pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); 2471 pmap_qenter((vm_offset_t)pc, &m, 1); 2472 pc->pc_pmap = pmap; 2473 pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ 2474 for (field = 1; field < _NPCM; field++) 2475 pc->pc_map[field] = pc_freemask[field]; 2476 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 2477 pv = &pc->pc_pventry[0]; 2478 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2479 PV_STAT(pv_entry_spare += _NPCPV - 1); 2480 return (pv); 2481} 2482 2483static __inline pv_entry_t 2484pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2485{ 2486 pv_entry_t pv; 2487 2488 rw_assert(&pvh_global_lock, RA_WLOCKED); 2489 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { 2490 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 2491 TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); 2492 break; 2493 } 2494 } 2495 return (pv); 2496} 2497 2498static void 2499pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2500{ 2501 struct md_page *pvh; 2502 pv_entry_t pv; 2503 vm_offset_t va_last; 2504 vm_page_t m; 2505 2506 rw_assert(&pvh_global_lock, RA_WLOCKED); 2507 KASSERT((pa & PDRMASK) == 0, 2508 ("pmap_pv_demote_pde: pa is not 4mpage aligned")); 2509 2510 /* 2511 * Transfer the 4mpage's pv entry for this mapping to the first 2512 * page's pv list. 
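 *
 * Only the first 4 KB page inherits the existing pv entry; the loop below
 * then calls pmap_insert_entry() for each of the remaining NPTEPG - 1
 * pages, so demoting a managed superpage can itself create pv entry
 * pressure and, through get_pv_entry(), trigger reclamation.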
2513 */ 2514 pvh = pa_to_pvh(pa); 2515 va = trunc_4mpage(va); 2516 pv = pmap_pvh_remove(pvh, pmap, va); 2517 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 2518 m = PHYS_TO_VM_PAGE(pa); 2519 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2520 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 2521 va_last = va + NBPDR - PAGE_SIZE; 2522 do { 2523 m++; 2524 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2525 ("pmap_pv_demote_pde: page %p is not managed", m)); 2526 va += PAGE_SIZE; 2527 pmap_insert_entry(pmap, va, m); 2528 } while (va < va_last); 2529} 2530 2531static void 2532pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2533{ 2534 struct md_page *pvh; 2535 pv_entry_t pv; 2536 vm_offset_t va_last; 2537 vm_page_t m; 2538 2539 rw_assert(&pvh_global_lock, RA_WLOCKED); 2540 KASSERT((pa & PDRMASK) == 0, 2541 ("pmap_pv_promote_pde: pa is not 4mpage aligned")); 2542 2543 /* 2544 * Transfer the first page's pv entry for this mapping to the 2545 * 4mpage's pv list. Aside from avoiding the cost of a call 2546 * to get_pv_entry(), a transfer avoids the possibility that 2547 * get_pv_entry() calls pmap_collect() and that pmap_collect() 2548 * removes one of the mappings that is being promoted. 2549 */ 2550 m = PHYS_TO_VM_PAGE(pa); 2551 va = trunc_4mpage(va); 2552 pv = pmap_pvh_remove(&m->md, pmap, va); 2553 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 2554 pvh = pa_to_pvh(pa); 2555 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); 2556 /* Free the remaining NPTEPG - 1 pv entries. */ 2557 va_last = va + NBPDR - PAGE_SIZE; 2558 do { 2559 m++; 2560 va += PAGE_SIZE; 2561 pmap_pvh_free(&m->md, pmap, va); 2562 } while (va < va_last); 2563} 2564 2565static void 2566pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2567{ 2568 pv_entry_t pv; 2569 2570 pv = pmap_pvh_remove(pvh, pmap, va); 2571 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 2572 free_pv_entry(pmap, pv); 2573} 2574 2575static void 2576pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 2577{ 2578 struct md_page *pvh; 2579 2580 rw_assert(&pvh_global_lock, RA_WLOCKED); 2581 pmap_pvh_free(&m->md, pmap, va); 2582 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { 2583 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2584 if (TAILQ_EMPTY(&pvh->pv_list)) 2585 vm_page_aflag_clear(m, PGA_WRITEABLE); 2586 } 2587} 2588 2589/* 2590 * Create a pv entry for page at pa for 2591 * (pmap, va). 2592 */ 2593static void 2594pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2595{ 2596 pv_entry_t pv; 2597 2598 rw_assert(&pvh_global_lock, RA_WLOCKED); 2599 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2600 pv = get_pv_entry(pmap, FALSE); 2601 pv->pv_va = va; 2602 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2603} 2604 2605/* 2606 * Conditionally create a pv entry. 2607 */ 2608static boolean_t 2609pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2610{ 2611 pv_entry_t pv; 2612 2613 rw_assert(&pvh_global_lock, RA_WLOCKED); 2614 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2615 if (pv_entry_count < pv_entry_high_water && 2616 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2617 pv->pv_va = va; 2618 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2619 return (TRUE); 2620 } else 2621 return (FALSE); 2622} 2623 2624/* 2625 * Create the pv entries for each of the pages within a superpage. 
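 *
 * Despite the per-page phrasing above, a 2/4MB mapping is represented by a
 * single pv entry on the 4mpage's pv list (pa_to_pvh(pa)); per-4KB-page
 * entries only come into existence if the mapping is later demoted by
 * pmap_pv_demote_pde().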
2626 */ 2627static boolean_t 2628pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2629{ 2630 struct md_page *pvh; 2631 pv_entry_t pv; 2632 2633 rw_assert(&pvh_global_lock, RA_WLOCKED); 2634 if (pv_entry_count < pv_entry_high_water && 2635 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2636 pv->pv_va = va; 2637 pvh = pa_to_pvh(pa); 2638 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); 2639 return (TRUE); 2640 } else 2641 return (FALSE); 2642} 2643 2644/* 2645 * Fills a page table page with mappings to consecutive physical pages. 2646 */ 2647static void 2648pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 2649{ 2650 pt_entry_t *pte; 2651 2652 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 2653 *pte = newpte; 2654 newpte += PAGE_SIZE; 2655 } 2656} 2657 2658/* 2659 * Tries to demote a 2- or 4MB page mapping. If demotion fails, the 2660 * 2- or 4MB page mapping is invalidated. 2661 */ 2662static boolean_t 2663pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2664{ 2665 pd_entry_t newpde, oldpde; 2666 pt_entry_t *firstpte, newpte; 2667 vm_paddr_t mptepa; 2668 vm_page_t free, mpte; 2669 2670 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2671 oldpde = *pde; 2672 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 2673 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 2674 mpte = pmap_lookup_pt_page(pmap, va); 2675 if (mpte != NULL) 2676 pmap_remove_pt_page(pmap, mpte); 2677 else { 2678 KASSERT((oldpde & PG_W) == 0, 2679 ("pmap_demote_pde: page table page for a wired mapping" 2680 " is missing")); 2681 2682 /* 2683 * Invalidate the 2- or 4MB page mapping and return 2684 * "failure" if the mapping was never accessed or the 2685 * allocation of the new page table page fails. 2686 */ 2687 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 2688 va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | 2689 VM_ALLOC_WIRED)) == NULL) { 2690 free = NULL; 2691 pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free); 2692 pmap_invalidate_page(pmap, trunc_4mpage(va)); 2693 pmap_free_zero_pages(free); 2694 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" 2695 " in pmap %p", va, pmap); 2696 return (FALSE); 2697 } 2698 if (va < VM_MAXUSER_ADDRESS) 2699 pmap->pm_stats.resident_count++; 2700 } 2701 mptepa = VM_PAGE_TO_PHYS(mpte); 2702 2703 /* 2704 * If the page mapping is in the kernel's address space, then the 2705 * KPTmap can provide access to the page table page. Otherwise, 2706 * temporarily map the page table page (mpte) into the kernel's 2707 * address space at either PADDR1 or PADDR2. 
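 *
 * Three cases are handled below: kernel addresses are reached through
 * KPTmap directly; when the current thread is pinned and the pv list lock
 * is write-held, the PADDR1 window (PMAP1) may be used, with PMAP1cpu
 * recording which CPU last invalidated it; otherwise the PADDR2 window is
 * borrowed under PMAP2mutex and dropped once the new page table has been
 * filled in.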
2708 */ 2709 if (va >= KERNBASE) 2710 firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; 2711 else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { 2712 if ((*PMAP1 & PG_FRAME) != mptepa) { 2713 *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; 2714#ifdef SMP 2715 PMAP1cpu = PCPU_GET(cpuid); 2716#endif 2717 invlcaddr(PADDR1); 2718 PMAP1changed++; 2719 } else 2720#ifdef SMP 2721 if (PMAP1cpu != PCPU_GET(cpuid)) { 2722 PMAP1cpu = PCPU_GET(cpuid); 2723 invlcaddr(PADDR1); 2724 PMAP1changedcpu++; 2725 } else 2726#endif 2727 PMAP1unchanged++; 2728 firstpte = PADDR1; 2729 } else { 2730 mtx_lock(&PMAP2mutex); 2731 if ((*PMAP2 & PG_FRAME) != mptepa) { 2732 *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; 2733 pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); 2734 } 2735 firstpte = PADDR2; 2736 } 2737 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 2738 KASSERT((oldpde & PG_A) != 0, 2739 ("pmap_demote_pde: oldpde is missing PG_A")); 2740 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 2741 ("pmap_demote_pde: oldpde is missing PG_M")); 2742 newpte = oldpde & ~PG_PS; 2743 if ((newpte & PG_PDE_PAT) != 0) 2744 newpte ^= PG_PDE_PAT | PG_PTE_PAT; 2745 2746 /* 2747 * If the page table page is new, initialize it. 2748 */ 2749 if (mpte->wire_count == 1) { 2750 mpte->wire_count = NPTEPG; 2751 pmap_fill_ptp(firstpte, newpte); 2752 } 2753 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 2754 ("pmap_demote_pde: firstpte and newpte map different physical" 2755 " addresses")); 2756 2757 /* 2758 * If the mapping has changed attributes, update the page table 2759 * entries. 2760 */ 2761 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 2762 pmap_fill_ptp(firstpte, newpte); 2763 2764 /* 2765 * Demote the mapping. This pmap is locked. The old PDE has 2766 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 2767 * set. Thus, there is no danger of a race with another 2768 * processor changing the setting of PG_A and/or PG_M between 2769 * the read above and the store below. 2770 */ 2771 if (workaround_erratum383) 2772 pmap_update_pde(pmap, va, pde, newpde); 2773 else if (pmap == kernel_pmap) 2774 pmap_kenter_pde(va, newpde); 2775 else 2776 pde_store(pde, newpde); 2777 if (firstpte == PADDR2) 2778 mtx_unlock(&PMAP2mutex); 2779 2780 /* 2781 * Invalidate the recursive mapping of the page table page. 2782 */ 2783 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 2784 2785 /* 2786 * Demote the pv entry. This depends on the earlier demotion 2787 * of the mapping. Specifically, the (re)creation of a per- 2788 * page pv entry might trigger the execution of pmap_collect(), 2789 * which might reclaim a newly (re)created per-page pv entry 2790 * and destroy the associated mapping. In order to destroy 2791 * the mapping, the PDE must have already changed from mapping 2792 * the 2mpage to referencing the page table page. 
2793 */ 2794 if ((oldpde & PG_MANAGED) != 0) 2795 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); 2796 2797 pmap_pde_demotions++; 2798 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" 2799 " in pmap %p", va, pmap); 2800 return (TRUE); 2801} 2802 2803/* 2804 * pmap_remove_pde: do the things to unmap a superpage in a process 2805 */ 2806static void 2807pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 2808 vm_page_t *free) 2809{ 2810 struct md_page *pvh; 2811 pd_entry_t oldpde; 2812 vm_offset_t eva, va; 2813 vm_page_t m, mpte; 2814 2815 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2816 KASSERT((sva & PDRMASK) == 0, 2817 ("pmap_remove_pde: sva is not 4mpage aligned")); 2818 oldpde = pte_load_clear(pdq); 2819 if (oldpde & PG_W) 2820 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 2821 2822 /* 2823 * Machines that don't support invlpg, also don't support 2824 * PG_G. 2825 */ 2826 if (oldpde & PG_G) 2827 pmap_invalidate_page(kernel_pmap, sva); 2828 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2829 if (oldpde & PG_MANAGED) { 2830 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 2831 pmap_pvh_free(pvh, pmap, sva); 2832 eva = sva + NBPDR; 2833 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 2834 va < eva; va += PAGE_SIZE, m++) { 2835 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2836 vm_page_dirty(m); 2837 if (oldpde & PG_A) 2838 vm_page_aflag_set(m, PGA_REFERENCED); 2839 if (TAILQ_EMPTY(&m->md.pv_list) && 2840 TAILQ_EMPTY(&pvh->pv_list)) 2841 vm_page_aflag_clear(m, PGA_WRITEABLE); 2842 } 2843 } 2844 if (pmap == kernel_pmap) { 2845 if (!pmap_demote_pde(pmap, pdq, sva)) 2846 panic("pmap_remove_pde: failed demotion"); 2847 } else { 2848 mpte = pmap_lookup_pt_page(pmap, sva); 2849 if (mpte != NULL) { 2850 pmap_remove_pt_page(pmap, mpte); 2851 pmap->pm_stats.resident_count--; 2852 KASSERT(mpte->wire_count == NPTEPG, 2853 ("pmap_remove_pde: pte page wire count error")); 2854 mpte->wire_count = 0; 2855 pmap_add_delayed_free_list(mpte, free, FALSE); 2856 atomic_subtract_int(&cnt.v_wire_count, 1); 2857 } 2858 } 2859} 2860 2861/* 2862 * pmap_remove_pte: do the things to unmap a page in a process 2863 */ 2864static int 2865pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free) 2866{ 2867 pt_entry_t oldpte; 2868 vm_page_t m; 2869 2870 rw_assert(&pvh_global_lock, RA_WLOCKED); 2871 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2872 oldpte = pte_load_clear(ptq); 2873 KASSERT(oldpte != 0, 2874 ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va)); 2875 if (oldpte & PG_W) 2876 pmap->pm_stats.wired_count -= 1; 2877 /* 2878 * Machines that don't support invlpg, also don't support 2879 * PG_G. 
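 *
 * In practice a PG_G mapping can only belong to the kernel pmap (pgeflag
 * is only added by pmap_enter() for kernel mappings), so its TLB entry is
 * flushed against kernel_pmap here and now; ordinary user mappings are
 * left for the caller, which batches the shootdown (see the "anyvalid"
 * handling in pmap_remove()).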
2880 */ 2881 if (oldpte & PG_G) 2882 pmap_invalidate_page(kernel_pmap, va); 2883 pmap->pm_stats.resident_count -= 1; 2884 if (oldpte & PG_MANAGED) { 2885 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 2886 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2887 vm_page_dirty(m); 2888 if (oldpte & PG_A) 2889 vm_page_aflag_set(m, PGA_REFERENCED); 2890 pmap_remove_entry(pmap, m, va); 2891 } 2892 return (pmap_unuse_pt(pmap, va, free)); 2893} 2894 2895/* 2896 * Remove a single page from a process address space 2897 */ 2898static void 2899pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free) 2900{ 2901 pt_entry_t *pte; 2902 2903 rw_assert(&pvh_global_lock, RA_WLOCKED); 2904 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 2905 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2906 if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) 2907 return; 2908 pmap_remove_pte(pmap, pte, va, free); 2909 pmap_invalidate_page(pmap, va); 2910} 2911 2912/* 2913 * Remove the given range of addresses from the specified map. 2914 * 2915 * It is assumed that the start and end are properly 2916 * rounded to the page size. 2917 */ 2918void 2919pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2920{ 2921 vm_offset_t pdnxt; 2922 pd_entry_t ptpaddr; 2923 pt_entry_t *pte; 2924 vm_page_t free = NULL; 2925 int anyvalid; 2926 2927 /* 2928 * Perform an unsynchronized read. This is, however, safe. 2929 */ 2930 if (pmap->pm_stats.resident_count == 0) 2931 return; 2932 2933 anyvalid = 0; 2934 2935 rw_wlock(&pvh_global_lock); 2936 sched_pin(); 2937 PMAP_LOCK(pmap); 2938 2939 /* 2940 * special handling of removing one page. a very 2941 * common operation and easy to short circuit some 2942 * code. 2943 */ 2944 if ((sva + PAGE_SIZE == eva) && 2945 ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { 2946 pmap_remove_page(pmap, sva, &free); 2947 goto out; 2948 } 2949 2950 for (; sva < eva; sva = pdnxt) { 2951 u_int pdirindex; 2952 2953 /* 2954 * Calculate index for next page table. 2955 */ 2956 pdnxt = (sva + NBPDR) & ~PDRMASK; 2957 if (pdnxt < sva) 2958 pdnxt = eva; 2959 if (pmap->pm_stats.resident_count == 0) 2960 break; 2961 2962 pdirindex = sva >> PDRSHIFT; 2963 ptpaddr = pmap->pm_pdir[pdirindex]; 2964 2965 /* 2966 * Weed out invalid mappings. Note: we assume that the page 2967 * directory table is always allocated, and in kernel virtual. 2968 */ 2969 if (ptpaddr == 0) 2970 continue; 2971 2972 /* 2973 * Check for large page. 2974 */ 2975 if ((ptpaddr & PG_PS) != 0) { 2976 /* 2977 * Are we removing the entire large page? If not, 2978 * demote the mapping and fall through. 2979 */ 2980 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 2981 /* 2982 * The TLB entry for a PG_G mapping is 2983 * invalidated by pmap_remove_pde(). 2984 */ 2985 if ((ptpaddr & PG_G) == 0) 2986 anyvalid = 1; 2987 pmap_remove_pde(pmap, 2988 &pmap->pm_pdir[pdirindex], sva, &free); 2989 continue; 2990 } else if (!pmap_demote_pde(pmap, 2991 &pmap->pm_pdir[pdirindex], sva)) { 2992 /* The large page mapping was destroyed. */ 2993 continue; 2994 } 2995 } 2996 2997 /* 2998 * Limit our scan to either the end of the va represented 2999 * by the current page table page, or to the end of the 3000 * range being removed. 3001 */ 3002 if (pdnxt > eva) 3003 pdnxt = eva; 3004 3005 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 3006 sva += PAGE_SIZE) { 3007 if (*pte == 0) 3008 continue; 3009 3010 /* 3011 * The TLB entry for a PG_G mapping is invalidated 3012 * by pmap_remove_pte(). 
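 *
 * Hence "anyvalid" is only set for non-global entries; those are flushed
 * with one pmap_invalidate_all() call once the scan below is finished,
 * instead of one invlpg per page.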
3013 */ 3014 if ((*pte & PG_G) == 0) 3015 anyvalid = 1; 3016 if (pmap_remove_pte(pmap, pte, sva, &free)) 3017 break; 3018 } 3019 } 3020out: 3021 sched_unpin(); 3022 if (anyvalid) 3023 pmap_invalidate_all(pmap); 3024 rw_wunlock(&pvh_global_lock); 3025 PMAP_UNLOCK(pmap); 3026 pmap_free_zero_pages(free); 3027} 3028 3029/* 3030 * Routine: pmap_remove_all 3031 * Function: 3032 * Removes this physical page from 3033 * all physical maps in which it resides. 3034 * Reflects back modify bits to the pager. 3035 * 3036 * Notes: 3037 * Original versions of this routine were very 3038 * inefficient because they iteratively called 3039 * pmap_remove (slow...) 3040 */ 3041 3042void 3043pmap_remove_all(vm_page_t m) 3044{ 3045 struct md_page *pvh; 3046 pv_entry_t pv; 3047 pmap_t pmap; 3048 pt_entry_t *pte, tpte; 3049 pd_entry_t *pde; 3050 vm_offset_t va; 3051 vm_page_t free; 3052 3053 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3054 ("pmap_remove_all: page %p is not managed", m)); 3055 free = NULL; 3056 rw_wlock(&pvh_global_lock); 3057 sched_pin(); 3058 if ((m->flags & PG_FICTITIOUS) != 0) 3059 goto small_mappings; 3060 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3061 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 3062 va = pv->pv_va; 3063 pmap = PV_PMAP(pv); 3064 PMAP_LOCK(pmap); 3065 pde = pmap_pde(pmap, va); 3066 (void)pmap_demote_pde(pmap, pde, va); 3067 PMAP_UNLOCK(pmap); 3068 } 3069small_mappings: 3070 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3071 pmap = PV_PMAP(pv); 3072 PMAP_LOCK(pmap); 3073 pmap->pm_stats.resident_count--; 3074 pde = pmap_pde(pmap, pv->pv_va); 3075 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 3076 " a 4mpage in page %p's pv list", m)); 3077 pte = pmap_pte_quick(pmap, pv->pv_va); 3078 tpte = pte_load_clear(pte); 3079 KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", 3080 pmap, pv->pv_va)); 3081 if (tpte & PG_W) 3082 pmap->pm_stats.wired_count--; 3083 if (tpte & PG_A) 3084 vm_page_aflag_set(m, PGA_REFERENCED); 3085 3086 /* 3087 * Update the vm_page_t clean and reference bits. 
3088 */ 3089 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3090 vm_page_dirty(m); 3091 pmap_unuse_pt(pmap, pv->pv_va, &free); 3092 pmap_invalidate_page(pmap, pv->pv_va); 3093 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 3094 free_pv_entry(pmap, pv); 3095 PMAP_UNLOCK(pmap); 3096 } 3097 vm_page_aflag_clear(m, PGA_WRITEABLE); 3098 sched_unpin(); 3099 rw_wunlock(&pvh_global_lock); 3100 pmap_free_zero_pages(free); 3101} 3102 3103/* 3104 * pmap_protect_pde: do the things to protect a 4mpage in a process 3105 */ 3106static boolean_t 3107pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 3108{ 3109 pd_entry_t newpde, oldpde; 3110 vm_offset_t eva, va; 3111 vm_page_t m; 3112 boolean_t anychanged; 3113 3114 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3115 KASSERT((sva & PDRMASK) == 0, 3116 ("pmap_protect_pde: sva is not 4mpage aligned")); 3117 anychanged = FALSE; 3118retry: 3119 oldpde = newpde = *pde; 3120 if (oldpde & PG_MANAGED) { 3121 eva = sva + NBPDR; 3122 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 3123 va < eva; va += PAGE_SIZE, m++) 3124 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3125 vm_page_dirty(m); 3126 } 3127 if ((prot & VM_PROT_WRITE) == 0) 3128 newpde &= ~(PG_RW | PG_M); 3129#ifdef PAE 3130 if ((prot & VM_PROT_EXECUTE) == 0) 3131 newpde |= pg_nx; 3132#endif 3133 if (newpde != oldpde) { 3134 if (!pde_cmpset(pde, oldpde, newpde)) 3135 goto retry; 3136 if (oldpde & PG_G) 3137 pmap_invalidate_page(pmap, sva); 3138 else 3139 anychanged = TRUE; 3140 } 3141 return (anychanged); 3142} 3143 3144/* 3145 * Set the physical protection on the 3146 * specified range of this map as requested. 3147 */ 3148void 3149pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 3150{ 3151 vm_offset_t pdnxt; 3152 pd_entry_t ptpaddr; 3153 pt_entry_t *pte; 3154 boolean_t anychanged, pv_lists_locked; 3155 3156 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 3157 pmap_remove(pmap, sva, eva); 3158 return; 3159 } 3160 3161#ifdef PAE 3162 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 3163 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 3164 return; 3165#else 3166 if (prot & VM_PROT_WRITE) 3167 return; 3168#endif 3169 3170 if (pmap_is_current(pmap)) 3171 pv_lists_locked = FALSE; 3172 else { 3173 pv_lists_locked = TRUE; 3174resume: 3175 rw_wlock(&pvh_global_lock); 3176 sched_pin(); 3177 } 3178 anychanged = FALSE; 3179 3180 PMAP_LOCK(pmap); 3181 for (; sva < eva; sva = pdnxt) { 3182 pt_entry_t obits, pbits; 3183 u_int pdirindex; 3184 3185 pdnxt = (sva + NBPDR) & ~PDRMASK; 3186 if (pdnxt < sva) 3187 pdnxt = eva; 3188 3189 pdirindex = sva >> PDRSHIFT; 3190 ptpaddr = pmap->pm_pdir[pdirindex]; 3191 3192 /* 3193 * Weed out invalid mappings. Note: we assume that the page 3194 * directory table is always allocated, and in kernel virtual. 3195 */ 3196 if (ptpaddr == 0) 3197 continue; 3198 3199 /* 3200 * Check for large page. 3201 */ 3202 if ((ptpaddr & PG_PS) != 0) { 3203 /* 3204 * Are we protecting the entire large page? If not, 3205 * demote the mapping and fall through. 3206 */ 3207 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 3208 /* 3209 * The TLB entry for a PG_G mapping is 3210 * invalidated by pmap_protect_pde(). 
3211 */ 3212 if (pmap_protect_pde(pmap, 3213 &pmap->pm_pdir[pdirindex], sva, prot)) 3214 anychanged = TRUE; 3215 continue; 3216 } else { 3217 if (!pv_lists_locked) { 3218 pv_lists_locked = TRUE; 3219 if (!rw_try_wlock(&pvh_global_lock)) { 3220 if (anychanged) 3221 pmap_invalidate_all( 3222 pmap); 3223 PMAP_UNLOCK(pmap); 3224 goto resume; 3225 } 3226 sched_pin(); 3227 } 3228 if (!pmap_demote_pde(pmap, 3229 &pmap->pm_pdir[pdirindex], sva)) { 3230 /* 3231 * The large page mapping was 3232 * destroyed. 3233 */ 3234 continue; 3235 } 3236 } 3237 } 3238 3239 if (pdnxt > eva) 3240 pdnxt = eva; 3241 3242 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 3243 sva += PAGE_SIZE) { 3244 vm_page_t m; 3245 3246retry: 3247 /* 3248 * Regardless of whether a pte is 32 or 64 bits in 3249 * size, PG_RW, PG_A, and PG_M are among the least 3250 * significant 32 bits. 3251 */ 3252 obits = pbits = *pte; 3253 if ((pbits & PG_V) == 0) 3254 continue; 3255 3256 if ((prot & VM_PROT_WRITE) == 0) { 3257 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 3258 (PG_MANAGED | PG_M | PG_RW)) { 3259 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 3260 vm_page_dirty(m); 3261 } 3262 pbits &= ~(PG_RW | PG_M); 3263 } 3264#ifdef PAE 3265 if ((prot & VM_PROT_EXECUTE) == 0) 3266 pbits |= pg_nx; 3267#endif 3268 3269 if (pbits != obits) { 3270#ifdef PAE 3271 if (!atomic_cmpset_64(pte, obits, pbits)) 3272 goto retry; 3273#else 3274 if (!atomic_cmpset_int((u_int *)pte, obits, 3275 pbits)) 3276 goto retry; 3277#endif 3278 if (obits & PG_G) 3279 pmap_invalidate_page(pmap, sva); 3280 else 3281 anychanged = TRUE; 3282 } 3283 } 3284 } 3285 if (anychanged) 3286 pmap_invalidate_all(pmap); 3287 if (pv_lists_locked) { 3288 sched_unpin(); 3289 rw_wunlock(&pvh_global_lock); 3290 } 3291 PMAP_UNLOCK(pmap); 3292} 3293 3294/* 3295 * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are 3296 * within a single page table page (PTP) to a single 2- or 4MB page mapping. 3297 * For promotion to occur, two conditions must be met: (1) the 4KB page 3298 * mappings must map aligned, contiguous physical memory and (2) the 4KB page 3299 * mappings must have identical characteristics. 3300 * 3301 * Managed (PG_MANAGED) mappings within the kernel address space are not 3302 * promoted. The reason is that kernel PDEs are replicated in each pmap but 3303 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel 3304 * pmap. 3305 */ 3306static void 3307pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 3308{ 3309 pd_entry_t newpde; 3310 pt_entry_t *firstpte, oldpte, pa, *pte; 3311 vm_offset_t oldpteva; 3312 vm_page_t mpte; 3313 3314 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3315 3316 /* 3317 * Examine the first PTE in the specified PTP. Abort if this PTE is 3318 * either invalid, unused, or does not map the first 4KB physical page 3319 * within a 2- or 4MB page. 3320 */ 3321 firstpte = pmap_pte_quick(pmap, trunc_4mpage(va)); 3322setpde: 3323 newpde = *firstpte; 3324 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 3325 pmap_pde_p_failures++; 3326 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3327 " in pmap %p", va, pmap); 3328 return; 3329 } 3330 if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { 3331 pmap_pde_p_failures++; 3332 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3333 " in pmap %p", va, pmap); 3334 return; 3335 } 3336 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 3337 /* 3338 * When PG_M is already clear, PG_RW can be cleared without 3339 * a TLB invalidation. 
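 *
 * The atomic_cmpset_int() below retries if the hardware sets PG_M between
 * the read and the update, so no PTE is left in the ambiguous
 * clean-but-writable state; this is what allows a single PDE to summarize
 * the PG_M/PG_RW state of the whole run of PTEs being promoted.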
3340 */ 3341 if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & 3342 ~PG_RW)) 3343 goto setpde; 3344 newpde &= ~PG_RW; 3345 } 3346 3347 /* 3348 * Examine each of the other PTEs in the specified PTP. Abort if this 3349 * PTE maps an unexpected 4KB physical page or does not have identical 3350 * characteristics to the first PTE. 3351 */ 3352 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; 3353 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 3354setpte: 3355 oldpte = *pte; 3356 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 3357 pmap_pde_p_failures++; 3358 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3359 " in pmap %p", va, pmap); 3360 return; 3361 } 3362 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 3363 /* 3364 * When PG_M is already clear, PG_RW can be cleared 3365 * without a TLB invalidation. 3366 */ 3367 if (!atomic_cmpset_int((u_int *)pte, oldpte, 3368 oldpte & ~PG_RW)) 3369 goto setpte; 3370 oldpte &= ~PG_RW; 3371 oldpteva = (oldpte & PG_FRAME & PDRMASK) | 3372 (va & ~PDRMASK); 3373 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" 3374 " in pmap %p", oldpteva, pmap); 3375 } 3376 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 3377 pmap_pde_p_failures++; 3378 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3379 " in pmap %p", va, pmap); 3380 return; 3381 } 3382 pa -= PAGE_SIZE; 3383 } 3384 3385 /* 3386 * Save the page table page in its current state until the PDE 3387 * mapping the superpage is demoted by pmap_demote_pde() or 3388 * destroyed by pmap_remove_pde(). 3389 */ 3390 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 3391 KASSERT(mpte >= vm_page_array && 3392 mpte < &vm_page_array[vm_page_array_size], 3393 ("pmap_promote_pde: page table page is out of range")); 3394 KASSERT(mpte->pindex == va >> PDRSHIFT, 3395 ("pmap_promote_pde: page table page's pindex is wrong")); 3396 pmap_insert_pt_page(pmap, mpte); 3397 3398 /* 3399 * Promote the pv entries. 3400 */ 3401 if ((newpde & PG_MANAGED) != 0) 3402 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); 3403 3404 /* 3405 * Propagate the PAT index to its proper position. 3406 */ 3407 if ((newpde & PG_PTE_PAT) != 0) 3408 newpde ^= PG_PDE_PAT | PG_PTE_PAT; 3409 3410 /* 3411 * Map the superpage. 3412 */ 3413 if (workaround_erratum383) 3414 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 3415 else if (pmap == kernel_pmap) 3416 pmap_kenter_pde(va, PG_PS | newpde); 3417 else 3418 pde_store(pde, PG_PS | newpde); 3419 3420 pmap_pde_promotions++; 3421 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" 3422 " in pmap %p", va, pmap); 3423} 3424 3425/* 3426 * Insert the given physical page (p) at 3427 * the specified virtual address (v) in the 3428 * target physical map with the protection requested. 3429 * 3430 * If specified, the page will be wired down, meaning 3431 * that the related pte can not be reclaimed. 3432 * 3433 * NB: This is the only routine which MAY NOT lazy-evaluate 3434 * or lose information. That is, this routine must actually 3435 * insert this page into the given map NOW. 
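 *
 * For user addresses this means pmap_enter() may sleep in
 * pmap_allocpte(M_WAITOK) until a page table page is available, and on
 * success it finishes by attempting a superpage promotion once the page
 * table page and the reservation are fully populated.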
3436 */ 3437void 3438pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m, 3439 vm_prot_t prot, boolean_t wired) 3440{ 3441 pd_entry_t *pde; 3442 pt_entry_t *pte; 3443 pt_entry_t newpte, origpte; 3444 pv_entry_t pv; 3445 vm_paddr_t opa, pa; 3446 vm_page_t mpte, om; 3447 boolean_t invlva; 3448 3449 va = trunc_page(va); 3450 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 3451 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 3452 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", 3453 va)); 3454 KASSERT((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) != 0 || 3455 VM_OBJECT_LOCKED(m->object), 3456 ("pmap_enter: page %p is not busy", m)); 3457 3458 mpte = NULL; 3459 3460 rw_wlock(&pvh_global_lock); 3461 PMAP_LOCK(pmap); 3462 sched_pin(); 3463 3464 /* 3465 * In the case that a page table page is not 3466 * resident, we are creating it here. 3467 */ 3468 if (va < VM_MAXUSER_ADDRESS) { 3469 mpte = pmap_allocpte(pmap, va, M_WAITOK); 3470 } 3471 3472 pde = pmap_pde(pmap, va); 3473 if ((*pde & PG_PS) != 0) 3474 panic("pmap_enter: attempted pmap_enter on 4MB page"); 3475 pte = pmap_pte_quick(pmap, va); 3476 3477 /* 3478 * Page Directory table entry not valid, we need a new PT page 3479 */ 3480 if (pte == NULL) { 3481 panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", 3482 (uintmax_t)pmap->pm_pdir[PTDPTDI], va); 3483 } 3484 3485 pa = VM_PAGE_TO_PHYS(m); 3486 om = NULL; 3487 origpte = *pte; 3488 opa = origpte & PG_FRAME; 3489 3490 /* 3491 * Mapping has not changed, must be protection or wiring change. 3492 */ 3493 if (origpte && (opa == pa)) { 3494 /* 3495 * Wiring change, just update stats. We don't worry about 3496 * wiring PT pages as they remain resident as long as there 3497 * are valid mappings in them. Hence, if a user page is wired, 3498 * the PT page will be also. 3499 */ 3500 if (wired && ((origpte & PG_W) == 0)) 3501 pmap->pm_stats.wired_count++; 3502 else if (!wired && (origpte & PG_W)) 3503 pmap->pm_stats.wired_count--; 3504 3505 /* 3506 * Remove extra pte reference 3507 */ 3508 if (mpte) 3509 mpte->wire_count--; 3510 3511 if (origpte & PG_MANAGED) { 3512 om = m; 3513 pa |= PG_MANAGED; 3514 } 3515 goto validate; 3516 } 3517 3518 pv = NULL; 3519 3520 /* 3521 * Mapping has changed, invalidate old range and fall through to 3522 * handle validating new mapping. 3523 */ 3524 if (opa) { 3525 if (origpte & PG_W) 3526 pmap->pm_stats.wired_count--; 3527 if (origpte & PG_MANAGED) { 3528 om = PHYS_TO_VM_PAGE(opa); 3529 pv = pmap_pvh_remove(&om->md, pmap, va); 3530 } 3531 if (mpte != NULL) { 3532 mpte->wire_count--; 3533 KASSERT(mpte->wire_count > 0, 3534 ("pmap_enter: missing reference to page table page," 3535 " va: 0x%x", va)); 3536 } 3537 } else 3538 pmap->pm_stats.resident_count++; 3539 3540 /* 3541 * Enter on the PV list if part of our managed memory. 3542 */ 3543 if ((m->oflags & VPO_UNMANAGED) == 0) { 3544 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, 3545 ("pmap_enter: managed mapping within the clean submap")); 3546 if (pv == NULL) 3547 pv = get_pv_entry(pmap, FALSE); 3548 pv->pv_va = va; 3549 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 3550 pa |= PG_MANAGED; 3551 } else if (pv != NULL) 3552 free_pv_entry(pmap, pv); 3553 3554 /* 3555 * Increment counters 3556 */ 3557 if (wired) 3558 pmap->pm_stats.wired_count++; 3559 3560validate: 3561 /* 3562 * Now validate mapping with desired protection/wiring. 
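 *
 * The new PTE is built from the physical address, the PAT cache bits and
 * PG_V, then PG_RW, PG_U, PG_W, pg_nx and pgeflag are folded in as
 * dictated by the protection, wiring and pmap.  For example, a wired,
 * writable user mapping ends up as
 * pa | cachebits | PG_V | PG_RW | PG_U | PG_W | PG_A, with PG_M also
 * preset when the faulting access was a write.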
3563 */ 3564 newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V); 3565 if ((prot & VM_PROT_WRITE) != 0) { 3566 newpte |= PG_RW; 3567 if ((newpte & PG_MANAGED) != 0) 3568 vm_page_aflag_set(m, PGA_WRITEABLE); 3569 } 3570#ifdef PAE 3571 if ((prot & VM_PROT_EXECUTE) == 0) 3572 newpte |= pg_nx; 3573#endif 3574 if (wired) 3575 newpte |= PG_W; 3576 if (va < VM_MAXUSER_ADDRESS) 3577 newpte |= PG_U; 3578 if (pmap == kernel_pmap) 3579 newpte |= pgeflag; 3580 3581 /* 3582 * if the mapping or permission bits are different, we need 3583 * to update the pte. 3584 */ 3585 if ((origpte & ~(PG_M|PG_A)) != newpte) { 3586 newpte |= PG_A; 3587 if ((access & VM_PROT_WRITE) != 0) 3588 newpte |= PG_M; 3589 if (origpte & PG_V) { 3590 invlva = FALSE; 3591 origpte = pte_load_store(pte, newpte); 3592 if (origpte & PG_A) { 3593 if (origpte & PG_MANAGED) 3594 vm_page_aflag_set(om, PGA_REFERENCED); 3595 if (opa != VM_PAGE_TO_PHYS(m)) 3596 invlva = TRUE; 3597#ifdef PAE 3598 if ((origpte & PG_NX) == 0 && 3599 (newpte & PG_NX) != 0) 3600 invlva = TRUE; 3601#endif 3602 } 3603 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 3604 if ((origpte & PG_MANAGED) != 0) 3605 vm_page_dirty(om); 3606 if ((prot & VM_PROT_WRITE) == 0) 3607 invlva = TRUE; 3608 } 3609 if ((origpte & PG_MANAGED) != 0 && 3610 TAILQ_EMPTY(&om->md.pv_list) && 3611 ((om->flags & PG_FICTITIOUS) != 0 || 3612 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 3613 vm_page_aflag_clear(om, PGA_WRITEABLE); 3614 if (invlva) 3615 pmap_invalidate_page(pmap, va); 3616 } else 3617 pte_store(pte, newpte); 3618 } 3619 3620 /* 3621 * If both the page table page and the reservation are fully 3622 * populated, then attempt promotion. 3623 */ 3624 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 3625 pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && 3626 vm_reserv_level_iffullpop(m) == 0) 3627 pmap_promote_pde(pmap, pde, va); 3628 3629 sched_unpin(); 3630 rw_wunlock(&pvh_global_lock); 3631 PMAP_UNLOCK(pmap); 3632} 3633 3634/* 3635 * Tries to create a 2- or 4MB page mapping. Returns TRUE if successful and 3636 * FALSE otherwise. Fails if (1) a page table page cannot be allocated without 3637 * blocking, (2) a mapping already exists at the specified virtual address, or 3638 * (3) a pv entry cannot be allocated without reclaiming another pv entry. 3639 */ 3640static boolean_t 3641pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3642{ 3643 pd_entry_t *pde, newpde; 3644 3645 rw_assert(&pvh_global_lock, RA_WLOCKED); 3646 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3647 pde = pmap_pde(pmap, va); 3648 if (*pde != 0) { 3649 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3650 " in pmap %p", va, pmap); 3651 return (FALSE); 3652 } 3653 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) | 3654 PG_PS | PG_V; 3655 if ((m->oflags & VPO_UNMANAGED) == 0) { 3656 newpde |= PG_MANAGED; 3657 3658 /* 3659 * Abort this mapping if its PV entry could not be created. 3660 */ 3661 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) { 3662 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3663 " in pmap %p", va, pmap); 3664 return (FALSE); 3665 } 3666 } 3667#ifdef PAE 3668 if ((prot & VM_PROT_EXECUTE) == 0) 3669 newpde |= pg_nx; 3670#endif 3671 if (va < VM_MAXUSER_ADDRESS) 3672 newpde |= PG_U; 3673 3674 /* 3675 * Increment counters. 3676 */ 3677 pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; 3678 3679 /* 3680 * Map the superpage. 
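 *
 * A single pde_store() publishes the whole 2/4MB mapping; no page table
 * page and no per-page PTEs are involved, so the bookkeeping above is
 * limited to one optional pv entry and a resident count bump of
 * NBPDR / PAGE_SIZE.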
3681 */ 3682 pde_store(pde, newpde); 3683 3684 pmap_pde_mappings++; 3685 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 3686 " in pmap %p", va, pmap); 3687 return (TRUE); 3688} 3689 3690/* 3691 * Maps a sequence of resident pages belonging to the same object. 3692 * The sequence begins with the given page m_start. This page is 3693 * mapped at the given virtual address start. Each subsequent page is 3694 * mapped at a virtual address that is offset from start by the same 3695 * amount as the page is offset from m_start within the object. The 3696 * last page in the sequence is the page with the largest offset from 3697 * m_start that can be mapped at a virtual address less than the given 3698 * virtual address end. Not every virtual page between start and end 3699 * is mapped; only those for which a resident page exists with the 3700 * corresponding offset from m_start are mapped. 3701 */ 3702void 3703pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3704 vm_page_t m_start, vm_prot_t prot) 3705{ 3706 vm_offset_t va; 3707 vm_page_t m, mpte; 3708 vm_pindex_t diff, psize; 3709 3710 VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED); 3711 psize = atop(end - start); 3712 mpte = NULL; 3713 m = m_start; 3714 rw_wlock(&pvh_global_lock); 3715 PMAP_LOCK(pmap); 3716 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3717 va = start + ptoa(diff); 3718 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 3719 (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 && 3720 pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 && 3721 pmap_enter_pde(pmap, va, m, prot)) 3722 m = &m[NBPDR / PAGE_SIZE - 1]; 3723 else 3724 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 3725 mpte); 3726 m = TAILQ_NEXT(m, listq); 3727 } 3728 rw_wunlock(&pvh_global_lock); 3729 PMAP_UNLOCK(pmap); 3730} 3731 3732/* 3733 * this code makes some *MAJOR* assumptions: 3734 * 1. Current pmap & pmap exists. 3735 * 2. Not wired. 3736 * 3. Read access. 3737 * 4. No page table pages. 3738 * but is *MUCH* faster than pmap_enter... 3739 */ 3740 3741void 3742pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3743{ 3744 3745 rw_wlock(&pvh_global_lock); 3746 PMAP_LOCK(pmap); 3747 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); 3748 rw_wunlock(&pvh_global_lock); 3749 PMAP_UNLOCK(pmap); 3750} 3751 3752static vm_page_t 3753pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3754 vm_prot_t prot, vm_page_t mpte) 3755{ 3756 pt_entry_t *pte; 3757 vm_paddr_t pa; 3758 vm_page_t free; 3759 3760 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 3761 (m->oflags & VPO_UNMANAGED) != 0, 3762 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3763 rw_assert(&pvh_global_lock, RA_WLOCKED); 3764 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3765 3766 /* 3767 * In the case that a page table page is not 3768 * resident, we are creating it here. 3769 */ 3770 if (va < VM_MAXUSER_ADDRESS) { 3771 u_int ptepindex; 3772 pd_entry_t ptepa; 3773 3774 /* 3775 * Calculate pagetable page index 3776 */ 3777 ptepindex = va >> PDRSHIFT; 3778 if (mpte && (mpte->pindex == ptepindex)) { 3779 mpte->wire_count++; 3780 } else { 3781 /* 3782 * Get the page directory entry 3783 */ 3784 ptepa = pmap->pm_pdir[ptepindex]; 3785 3786 /* 3787 * If the page table page is mapped, we just increment 3788 * the hold count, and activate it. 
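 *
 * If the PDE already maps a 2/4MB superpage, the quick path returns NULL
 * without entering anything; this routine is opportunistic and never
 * demotes or replaces an existing mapping (note the *pte check further
 * below as well).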
3789 */ 3790 if (ptepa) { 3791 if (ptepa & PG_PS) 3792 return (NULL); 3793 mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); 3794 mpte->wire_count++; 3795 } else { 3796 mpte = _pmap_allocpte(pmap, ptepindex, 3797 M_NOWAIT); 3798 if (mpte == NULL) 3799 return (mpte); 3800 } 3801 } 3802 } else { 3803 mpte = NULL; 3804 } 3805 3806 /* 3807 * This call to vtopte makes the assumption that we are 3808 * entering the page into the current pmap. In order to support 3809 * quick entry into any pmap, one would likely use pmap_pte_quick. 3810 * But that isn't as quick as vtopte. 3811 */ 3812 pte = vtopte(va); 3813 if (*pte) { 3814 if (mpte != NULL) { 3815 mpte->wire_count--; 3816 mpte = NULL; 3817 } 3818 return (mpte); 3819 } 3820 3821 /* 3822 * Enter on the PV list if part of our managed memory. 3823 */ 3824 if ((m->oflags & VPO_UNMANAGED) == 0 && 3825 !pmap_try_insert_pv_entry(pmap, va, m)) { 3826 if (mpte != NULL) { 3827 free = NULL; 3828 if (pmap_unwire_ptp(pmap, mpte, &free)) { 3829 pmap_invalidate_page(pmap, va); 3830 pmap_free_zero_pages(free); 3831 } 3832 3833 mpte = NULL; 3834 } 3835 return (mpte); 3836 } 3837 3838 /* 3839 * Increment counters 3840 */ 3841 pmap->pm_stats.resident_count++; 3842 3843 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); 3844#ifdef PAE 3845 if ((prot & VM_PROT_EXECUTE) == 0) 3846 pa |= pg_nx; 3847#endif 3848 3849 /* 3850 * Now validate mapping with RO protection 3851 */ 3852 if ((m->oflags & VPO_UNMANAGED) != 0) 3853 pte_store(pte, pa | PG_V | PG_U); 3854 else 3855 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); 3856 return (mpte); 3857} 3858 3859/* 3860 * Make a temporary mapping for a physical address. This is only intended 3861 * to be used for panic dumps. 3862 */ 3863void * 3864pmap_kenter_temporary(vm_paddr_t pa, int i) 3865{ 3866 vm_offset_t va; 3867 3868 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 3869 pmap_kenter(va, pa); 3870 invlpg(va); 3871 return ((void *)crashdumpmap); 3872} 3873 3874/* 3875 * This code maps large physical mmap regions into the 3876 * processor address space. Note that some shortcuts 3877 * are taken, but the code works. 3878 */ 3879void 3880pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3881 vm_pindex_t pindex, vm_size_t size) 3882{ 3883 pd_entry_t *pde; 3884 vm_paddr_t pa, ptepa; 3885 vm_page_t p; 3886 int pat_mode; 3887 3888 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 3889 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3890 ("pmap_object_init_pt: non-device object")); 3891 if (pseflag && 3892 (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 3893 if (!vm_object_populate(object, pindex, pindex + atop(size))) 3894 return; 3895 p = vm_page_lookup(object, pindex); 3896 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3897 ("pmap_object_init_pt: invalid page %p", p)); 3898 pat_mode = p->md.pat_mode; 3899 3900 /* 3901 * Abort the mapping if the first page is not physically 3902 * aligned to a 2/4MB page boundary. 3903 */ 3904 ptepa = VM_PAGE_TO_PHYS(p); 3905 if (ptepa & (NBPDR - 1)) 3906 return; 3907 3908 /* 3909 * Skip the first page. Abort the mapping if the rest of 3910 * the pages are not physically contiguous or have differing 3911 * memory attributes. 
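 *
 * Once the run has been validated, the loop further down installs one
 * PG_PS page directory entry per NBPDR of the range.  As an illustration,
 * a 16 MB device object that is suitably aligned maps with just four 4 MB
 * PDEs on a non-PAE kernel (eight 2 MB PDEs under PAE).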
3912 */ 3913 p = TAILQ_NEXT(p, listq); 3914 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 3915 pa += PAGE_SIZE) { 3916 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3917 ("pmap_object_init_pt: invalid page %p", p)); 3918 if (pa != VM_PAGE_TO_PHYS(p) || 3919 pat_mode != p->md.pat_mode) 3920 return; 3921 p = TAILQ_NEXT(p, listq); 3922 } 3923 3924 /* 3925 * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and 3926 * "size" is a multiple of 2/4M, adding the PAT setting to 3927 * "pa" will not affect the termination of this loop. 3928 */ 3929 PMAP_LOCK(pmap); 3930 for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa + 3931 size; pa += NBPDR) { 3932 pde = pmap_pde(pmap, addr); 3933 if (*pde == 0) { 3934 pde_store(pde, pa | PG_PS | PG_M | PG_A | 3935 PG_U | PG_RW | PG_V); 3936 pmap->pm_stats.resident_count += NBPDR / 3937 PAGE_SIZE; 3938 pmap_pde_mappings++; 3939 } 3940 /* Else continue on if the PDE is already valid. */ 3941 addr += NBPDR; 3942 } 3943 PMAP_UNLOCK(pmap); 3944 } 3945} 3946 3947/* 3948 * Routine: pmap_change_wiring 3949 * Function: Change the wiring attribute for a map/virtual-address 3950 * pair. 3951 * In/out conditions: 3952 * The mapping must already exist in the pmap. 3953 */ 3954void 3955pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) 3956{ 3957 pd_entry_t *pde; 3958 pt_entry_t *pte; 3959 boolean_t are_queues_locked; 3960 3961 are_queues_locked = FALSE; 3962retry: 3963 PMAP_LOCK(pmap); 3964 pde = pmap_pde(pmap, va); 3965 if ((*pde & PG_PS) != 0) { 3966 if (!wired != ((*pde & PG_W) == 0)) { 3967 if (!are_queues_locked) { 3968 are_queues_locked = TRUE; 3969 if (!rw_try_wlock(&pvh_global_lock)) { 3970 PMAP_UNLOCK(pmap); 3971 rw_wlock(&pvh_global_lock); 3972 goto retry; 3973 } 3974 } 3975 if (!pmap_demote_pde(pmap, pde, va)) 3976 panic("pmap_change_wiring: demotion failed"); 3977 } else 3978 goto out; 3979 } 3980 pte = pmap_pte(pmap, va); 3981 3982 if (wired && !pmap_pte_w(pte)) 3983 pmap->pm_stats.wired_count++; 3984 else if (!wired && pmap_pte_w(pte)) 3985 pmap->pm_stats.wired_count--; 3986 3987 /* 3988 * Wiring is not a hardware characteristic so there is no need to 3989 * invalidate TLB. 3990 */ 3991 pmap_pte_set_w(pte, wired); 3992 pmap_pte_release(pte); 3993out: 3994 if (are_queues_locked) 3995 rw_wunlock(&pvh_global_lock); 3996 PMAP_UNLOCK(pmap); 3997} 3998 3999 4000 4001/* 4002 * Copy the range specified by src_addr/len 4003 * from the source map to the range dst_addr/len 4004 * in the destination map. 4005 * 4006 * This routine is only advisory and need not do anything. 
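 * The copy loop below walks the source range one page-directory entry at
 * a time; the boundary arithmetic is easy to check in isolation.  A
 * minimal user-space sketch, assuming the non-PAE 4 MB NBPDR (PAE would
 * use 2 MB):
 *
 *    #include <stdint.h>
 *    #include <stdio.h>
 *
 *    #define NBPDR   0x400000u            // 4 MB, non-PAE i386
 *    #define PDRMASK (NBPDR - 1)
 *
 *    int main(void) {
 *        uint32_t src = 0x003ff000u, end = 0x00802000u, pdnxt;
 *
 *        for (uint32_t addr = src; addr < end; addr = pdnxt) {
 *            pdnxt = (addr + NBPDR) & ~PDRMASK;   // start of the next PDE
 *            if (pdnxt < addr)                    // wrapped past 4 GB
 *                pdnxt = end;
 *            if (pdnxt > end)                     // clamp the last chunk
 *                pdnxt = end;
 *            printf("chunk %#x..%#x\n", (unsigned)addr, (unsigned)pdnxt);
 *        }
 *        return 0;
 *    }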
4007 */ 4008 4009void 4010pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 4011 vm_offset_t src_addr) 4012{ 4013 vm_page_t free; 4014 vm_offset_t addr; 4015 vm_offset_t end_addr = src_addr + len; 4016 vm_offset_t pdnxt; 4017 4018 if (dst_addr != src_addr) 4019 return; 4020 4021 if (!pmap_is_current(src_pmap)) 4022 return; 4023 4024 rw_wlock(&pvh_global_lock); 4025 if (dst_pmap < src_pmap) { 4026 PMAP_LOCK(dst_pmap); 4027 PMAP_LOCK(src_pmap); 4028 } else { 4029 PMAP_LOCK(src_pmap); 4030 PMAP_LOCK(dst_pmap); 4031 } 4032 sched_pin(); 4033 for (addr = src_addr; addr < end_addr; addr = pdnxt) { 4034 pt_entry_t *src_pte, *dst_pte; 4035 vm_page_t dstmpte, srcmpte; 4036 pd_entry_t srcptepaddr; 4037 u_int ptepindex; 4038 4039 KASSERT(addr < UPT_MIN_ADDRESS, 4040 ("pmap_copy: invalid to pmap_copy page tables")); 4041 4042 pdnxt = (addr + NBPDR) & ~PDRMASK; 4043 if (pdnxt < addr) 4044 pdnxt = end_addr; 4045 ptepindex = addr >> PDRSHIFT; 4046 4047 srcptepaddr = src_pmap->pm_pdir[ptepindex]; 4048 if (srcptepaddr == 0) 4049 continue; 4050 4051 if (srcptepaddr & PG_PS) { 4052 if (dst_pmap->pm_pdir[ptepindex] == 0 && 4053 ((srcptepaddr & PG_MANAGED) == 0 || 4054 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & 4055 PG_PS_FRAME))) { 4056 dst_pmap->pm_pdir[ptepindex] = srcptepaddr & 4057 ~PG_W; 4058 dst_pmap->pm_stats.resident_count += 4059 NBPDR / PAGE_SIZE; 4060 } 4061 continue; 4062 } 4063 4064 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); 4065 KASSERT(srcmpte->wire_count > 0, 4066 ("pmap_copy: source page table page is unused")); 4067 4068 if (pdnxt > end_addr) 4069 pdnxt = end_addr; 4070 4071 src_pte = vtopte(addr); 4072 while (addr < pdnxt) { 4073 pt_entry_t ptetemp; 4074 ptetemp = *src_pte; 4075 /* 4076 * we only virtual copy managed pages 4077 */ 4078 if ((ptetemp & PG_MANAGED) != 0) { 4079 dstmpte = pmap_allocpte(dst_pmap, addr, 4080 M_NOWAIT); 4081 if (dstmpte == NULL) 4082 goto out; 4083 dst_pte = pmap_pte_quick(dst_pmap, addr); 4084 if (*dst_pte == 0 && 4085 pmap_try_insert_pv_entry(dst_pmap, addr, 4086 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { 4087 /* 4088 * Clear the wired, modified, and 4089 * accessed (referenced) bits 4090 * during the copy. 4091 */ 4092 *dst_pte = ptetemp & ~(PG_W | PG_M | 4093 PG_A); 4094 dst_pmap->pm_stats.resident_count++; 4095 } else { 4096 free = NULL; 4097 if (pmap_unwire_ptp(dst_pmap, dstmpte, 4098 &free)) { 4099 pmap_invalidate_page(dst_pmap, 4100 addr); 4101 pmap_free_zero_pages(free); 4102 } 4103 goto out; 4104 } 4105 if (dstmpte->wire_count >= srcmpte->wire_count) 4106 break; 4107 } 4108 addr += PAGE_SIZE; 4109 src_pte++; 4110 } 4111 } 4112out: 4113 sched_unpin(); 4114 rw_wunlock(&pvh_global_lock); 4115 PMAP_UNLOCK(src_pmap); 4116 PMAP_UNLOCK(dst_pmap); 4117} 4118 4119static __inline void 4120pagezero(void *page) 4121{ 4122#if defined(I686_CPU) 4123 if (cpu_class == CPUCLASS_686) { 4124#if defined(CPU_ENABLE_SSE) 4125 if (cpu_feature & CPUID_SSE2) 4126 sse2_pagezero(page); 4127 else 4128#endif 4129 i686_pagezero(page); 4130 } else 4131#endif 4132 bzero(page, PAGE_SIZE); 4133} 4134 4135/* 4136 * pmap_zero_page zeros the specified hardware page by mapping 4137 * the page into KVM and using bzero to clear its contents. 
4138 */ 4139void 4140pmap_zero_page(vm_page_t m) 4141{ 4142 struct sysmaps *sysmaps; 4143 4144 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 4145 mtx_lock(&sysmaps->lock); 4146 if (*sysmaps->CMAP2) 4147 panic("pmap_zero_page: CMAP2 busy"); 4148 sched_pin(); 4149 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4150 pmap_cache_bits(m->md.pat_mode, 0); 4151 invlcaddr(sysmaps->CADDR2); 4152 pagezero(sysmaps->CADDR2); 4153 *sysmaps->CMAP2 = 0; 4154 sched_unpin(); 4155 mtx_unlock(&sysmaps->lock); 4156} 4157 4158/* 4159 * pmap_zero_page_area zeros the specified hardware page by mapping 4160 * the page into KVM and using bzero to clear its contents. 4161 * 4162 * off and size may not cover an area beyond a single hardware page. 4163 */ 4164void 4165pmap_zero_page_area(vm_page_t m, int off, int size) 4166{ 4167 struct sysmaps *sysmaps; 4168 4169 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 4170 mtx_lock(&sysmaps->lock); 4171 if (*sysmaps->CMAP2) 4172 panic("pmap_zero_page_area: CMAP2 busy"); 4173 sched_pin(); 4174 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4175 pmap_cache_bits(m->md.pat_mode, 0); 4176 invlcaddr(sysmaps->CADDR2); 4177 if (off == 0 && size == PAGE_SIZE) 4178 pagezero(sysmaps->CADDR2); 4179 else 4180 bzero((char *)sysmaps->CADDR2 + off, size); 4181 *sysmaps->CMAP2 = 0; 4182 sched_unpin(); 4183 mtx_unlock(&sysmaps->lock); 4184} 4185 4186/* 4187 * pmap_zero_page_idle zeros the specified hardware page by mapping 4188 * the page into KVM and using bzero to clear its contents. This 4189 * is intended to be called from the vm_pagezero process only and 4190 * outside of Giant. 4191 */ 4192void 4193pmap_zero_page_idle(vm_page_t m) 4194{ 4195 4196 if (*CMAP3) 4197 panic("pmap_zero_page_idle: CMAP3 busy"); 4198 sched_pin(); 4199 *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4200 pmap_cache_bits(m->md.pat_mode, 0); 4201 invlcaddr(CADDR3); 4202 pagezero(CADDR3); 4203 *CMAP3 = 0; 4204 sched_unpin(); 4205} 4206 4207/* 4208 * pmap_copy_page copies the specified (machine independent) 4209 * page by mapping the page into virtual memory and using 4210 * bcopy to copy the page, one machine dependent page at a 4211 * time. 4212 */ 4213void 4214pmap_copy_page(vm_page_t src, vm_page_t dst) 4215{ 4216 struct sysmaps *sysmaps; 4217 4218 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 4219 mtx_lock(&sysmaps->lock); 4220 if (*sysmaps->CMAP1) 4221 panic("pmap_copy_page: CMAP1 busy"); 4222 if (*sysmaps->CMAP2) 4223 panic("pmap_copy_page: CMAP2 busy"); 4224 sched_pin(); 4225 invlpg((u_int)sysmaps->CADDR1); 4226 invlpg((u_int)sysmaps->CADDR2); 4227 *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A | 4228 pmap_cache_bits(src->md.pat_mode, 0); 4229 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M | 4230 pmap_cache_bits(dst->md.pat_mode, 0); 4231 bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE); 4232 *sysmaps->CMAP1 = 0; 4233 *sysmaps->CMAP2 = 0; 4234 sched_unpin(); 4235 mtx_unlock(&sysmaps->lock); 4236} 4237 4238/* 4239 * Returns true if the pmap's pv is one of the first 4240 * 16 pvs linked to from this page. This count may 4241 * be changed upwards or downwards in the future; it 4242 * is only necessary that true be returned for a small 4243 * subset of pmaps for proper page aging. 
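 * A user-space sketch of that deliberately bounded scan (the pv struct and
 * pmap_id field are hypothetical, and 16 is simply the current cutoff):
 *
 *    #include <stddef.h>
 *    #include <stdio.h>
 *
 *    struct pv { int pmap_id; struct pv *next; };
 *
 *    // Look for pmap_id among at most the first 16 entries of the list.
 *    static int page_exists_quick(const struct pv *head, int pmap_id) {
 *        int loops = 0;
 *        for (const struct pv *p = head; p != NULL && loops < 16;
 *            p = p->next) {
 *            if (p->pmap_id == pmap_id)
 *                return 1;
 *            loops++;
 *        }
 *        return 0;
 *    }
 *
 *    int main(void) {
 *        struct pv c = {3, NULL}, b = {2, &c}, a = {1, &b};
 *        printf("%d %d\n", page_exists_quick(&a, 3),
 *            page_exists_quick(&a, 9));           // prints 1 0
 *        return 0;
 *    }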
4244 */ 4245boolean_t 4246pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 4247{ 4248 struct md_page *pvh; 4249 pv_entry_t pv; 4250 int loops = 0; 4251 boolean_t rv; 4252 4253 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4254 ("pmap_page_exists_quick: page %p is not managed", m)); 4255 rv = FALSE; 4256 rw_wlock(&pvh_global_lock); 4257 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 4258 if (PV_PMAP(pv) == pmap) { 4259 rv = TRUE; 4260 break; 4261 } 4262 loops++; 4263 if (loops >= 16) 4264 break; 4265 } 4266 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 4267 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4268 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { 4269 if (PV_PMAP(pv) == pmap) { 4270 rv = TRUE; 4271 break; 4272 } 4273 loops++; 4274 if (loops >= 16) 4275 break; 4276 } 4277 } 4278 rw_wunlock(&pvh_global_lock); 4279 return (rv); 4280} 4281 4282/* 4283 * pmap_page_wired_mappings: 4284 * 4285 * Return the number of managed mappings to the given physical page 4286 * that are wired. 4287 */ 4288int 4289pmap_page_wired_mappings(vm_page_t m) 4290{ 4291 int count; 4292 4293 count = 0; 4294 if ((m->oflags & VPO_UNMANAGED) != 0) 4295 return (count); 4296 rw_wlock(&pvh_global_lock); 4297 count = pmap_pvh_wired_mappings(&m->md, count); 4298 if ((m->flags & PG_FICTITIOUS) == 0) { 4299 count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), 4300 count); 4301 } 4302 rw_wunlock(&pvh_global_lock); 4303 return (count); 4304} 4305 4306/* 4307 * pmap_pvh_wired_mappings: 4308 * 4309 * Return the updated number "count" of managed mappings that are wired. 4310 */ 4311static int 4312pmap_pvh_wired_mappings(struct md_page *pvh, int count) 4313{ 4314 pmap_t pmap; 4315 pt_entry_t *pte; 4316 pv_entry_t pv; 4317 4318 rw_assert(&pvh_global_lock, RA_WLOCKED); 4319 sched_pin(); 4320 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { 4321 pmap = PV_PMAP(pv); 4322 PMAP_LOCK(pmap); 4323 pte = pmap_pte_quick(pmap, pv->pv_va); 4324 if ((*pte & PG_W) != 0) 4325 count++; 4326 PMAP_UNLOCK(pmap); 4327 } 4328 sched_unpin(); 4329 return (count); 4330} 4331 4332/* 4333 * Returns TRUE if the given page is mapped individually or as part of 4334 * a 4mpage. Otherwise, returns FALSE. 4335 */ 4336boolean_t 4337pmap_page_is_mapped(vm_page_t m) 4338{ 4339 boolean_t rv; 4340 4341 if ((m->oflags & VPO_UNMANAGED) != 0) 4342 return (FALSE); 4343 rw_wlock(&pvh_global_lock); 4344 rv = !TAILQ_EMPTY(&m->md.pv_list) || 4345 ((m->flags & PG_FICTITIOUS) == 0 && 4346 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 4347 rw_wunlock(&pvh_global_lock); 4348 return (rv); 4349} 4350 4351/* 4352 * Remove all pages from specified address space 4353 * this aids process exit speeds. Also, this code 4354 * is special cased for current process only, but 4355 * can have the more generic (and slightly slower) 4356 * mode enabled. This is much faster than pmap_remove 4357 * in the case of running down an entire address space. 
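 * The loop below finds every allocated pv entry in a chunk by scanning the
 * inverse of the free bitmap with bsfl().  A user-space sketch of the same
 * scan using ffs(); the two-word array is a reduced stand-in for the
 * chunk's pc_map/pc_freemask, so the sizes here are illustrative only:
 *
 *    #include <stdio.h>
 *    #include <strings.h>                 // ffs()
 *
 *    #define NFIELDS 2                    // reduced for the example
 *
 *    int main(void) {
 *        // A set bit in pc_map means that pv slot is free; allocated slots
 *        // are therefore the complement, masked to the bits that exist.
 *        unsigned pc_map[NFIELDS]   = { 0xfffffff0u, 0x7fffffffu };
 *        unsigned freemask[NFIELDS] = { 0xffffffffu, 0xffffffffu };
 *
 *        for (int field = 0; field < NFIELDS; field++) {
 *            unsigned inuse = ~pc_map[field] & freemask[field];
 *            while (inuse != 0) {
 *                int bit = ffs((int)inuse) - 1;   // kernel: bsfl()
 *                int idx = field * 32 + bit;      // index into pc_pventry[]
 *                inuse &= ~(1u << bit);
 *                printf("allocated pv slot %d\n", idx);
 *            }
 *        }
 *        return 0;
 *    }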
4358 */ 4359void 4360pmap_remove_pages(pmap_t pmap) 4361{ 4362 pt_entry_t *pte, tpte; 4363 vm_page_t free = NULL; 4364 vm_page_t m, mpte, mt; 4365 pv_entry_t pv; 4366 struct md_page *pvh; 4367 struct pv_chunk *pc, *npc; 4368 int field, idx; 4369 int32_t bit; 4370 uint32_t inuse, bitmask; 4371 int allfree; 4372 4373 if (pmap != PCPU_GET(curpmap)) { 4374 printf("warning: pmap_remove_pages called with non-current pmap\n"); 4375 return; 4376 } 4377 rw_wlock(&pvh_global_lock); 4378 PMAP_LOCK(pmap); 4379 sched_pin(); 4380 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 4381 KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, 4382 pc->pc_pmap)); 4383 allfree = 1; 4384 for (field = 0; field < _NPCM; field++) { 4385 inuse = ~pc->pc_map[field] & pc_freemask[field]; 4386 while (inuse != 0) { 4387 bit = bsfl(inuse); 4388 bitmask = 1UL << bit; 4389 idx = field * 32 + bit; 4390 pv = &pc->pc_pventry[idx]; 4391 inuse &= ~bitmask; 4392 4393 pte = pmap_pde(pmap, pv->pv_va); 4394 tpte = *pte; 4395 if ((tpte & PG_PS) == 0) { 4396 pte = vtopte(pv->pv_va); 4397 tpte = *pte & ~PG_PTE_PAT; 4398 } 4399 4400 if (tpte == 0) { 4401 printf( 4402 "TPTE at %p IS ZERO @ VA %08x\n", 4403 pte, pv->pv_va); 4404 panic("bad pte"); 4405 } 4406 4407/* 4408 * We cannot remove wired pages from a process' mapping at this time 4409 */ 4410 if (tpte & PG_W) { 4411 allfree = 0; 4412 continue; 4413 } 4414 4415 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 4416 KASSERT(m->phys_addr == (tpte & PG_FRAME), 4417 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 4418 m, (uintmax_t)m->phys_addr, 4419 (uintmax_t)tpte)); 4420 4421 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4422 m < &vm_page_array[vm_page_array_size], 4423 ("pmap_remove_pages: bad tpte %#jx", 4424 (uintmax_t)tpte)); 4425 4426 pte_clear(pte); 4427 4428 /* 4429 * Update the vm_page_t clean/reference bits. 
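 * The test just below only treats a mapping as having dirtied memory when
 * it was writable and the MMU set PG_M; for a 4 MB mapping every
 * constituent 4 KB page is then dirtied.  A tiny sketch of that arithmetic
 * (non-PAE constants and the standard x86 bit values assumed):
 *
 *    #include <stdint.h>
 *    #include <stdio.h>
 *
 *    #define PG_RW     0x002u
 *    #define PG_M      0x040u
 *    #define PG_PS     0x080u
 *    #define NBPDR     0x400000u
 *    #define PAGE_SIZE 4096u
 *
 *    int main(void) {
 *        uint32_t tpte = 0x00800000u | PG_PS | PG_RW | PG_M;
 *        int dirty_pages = 0;
 *
 *        if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 *            dirty_pages = (tpte & PG_PS) ? (int)(NBPDR / PAGE_SIZE) : 1;
 *        printf("%d page(s) to mark dirty\n", dirty_pages);   // 1024
 *        return 0;
 *    }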
4430 */ 4431 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 4432 if ((tpte & PG_PS) != 0) { 4433 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4434 vm_page_dirty(mt); 4435 } else 4436 vm_page_dirty(m); 4437 } 4438 4439 /* Mark free */ 4440 PV_STAT(pv_entry_frees++); 4441 PV_STAT(pv_entry_spare++); 4442 pv_entry_count--; 4443 pc->pc_map[field] |= bitmask; 4444 if ((tpte & PG_PS) != 0) { 4445 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 4446 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 4447 TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); 4448 if (TAILQ_EMPTY(&pvh->pv_list)) { 4449 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4450 if (TAILQ_EMPTY(&mt->md.pv_list)) 4451 vm_page_aflag_clear(mt, PGA_WRITEABLE); 4452 } 4453 mpte = pmap_lookup_pt_page(pmap, pv->pv_va); 4454 if (mpte != NULL) { 4455 pmap_remove_pt_page(pmap, mpte); 4456 pmap->pm_stats.resident_count--; 4457 KASSERT(mpte->wire_count == NPTEPG, 4458 ("pmap_remove_pages: pte page wire count error")); 4459 mpte->wire_count = 0; 4460 pmap_add_delayed_free_list(mpte, &free, FALSE); 4461 atomic_subtract_int(&cnt.v_wire_count, 1); 4462 } 4463 } else { 4464 pmap->pm_stats.resident_count--; 4465 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 4466 if (TAILQ_EMPTY(&m->md.pv_list) && 4467 (m->flags & PG_FICTITIOUS) == 0) { 4468 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4469 if (TAILQ_EMPTY(&pvh->pv_list)) 4470 vm_page_aflag_clear(m, PGA_WRITEABLE); 4471 } 4472 pmap_unuse_pt(pmap, pv->pv_va, &free); 4473 } 4474 } 4475 } 4476 if (allfree) { 4477 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4478 free_pv_chunk(pc); 4479 } 4480 } 4481 sched_unpin(); 4482 pmap_invalidate_all(pmap); 4483 rw_wunlock(&pvh_global_lock); 4484 PMAP_UNLOCK(pmap); 4485 pmap_free_zero_pages(free); 4486} 4487 4488/* 4489 * pmap_is_modified: 4490 * 4491 * Return whether or not the specified physical page was modified 4492 * in any physical maps. 4493 */ 4494boolean_t 4495pmap_is_modified(vm_page_t m) 4496{ 4497 boolean_t rv; 4498 4499 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4500 ("pmap_is_modified: page %p is not managed", m)); 4501 4502 /* 4503 * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be 4504 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE 4505 * is clear, no PTEs can have PG_M set. 4506 */ 4507 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 4508 if ((m->oflags & VPO_BUSY) == 0 && 4509 (m->aflags & PGA_WRITEABLE) == 0) 4510 return (FALSE); 4511 rw_wlock(&pvh_global_lock); 4512 rv = pmap_is_modified_pvh(&m->md) || 4513 ((m->flags & PG_FICTITIOUS) == 0 && 4514 pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 4515 rw_wunlock(&pvh_global_lock); 4516 return (rv); 4517} 4518 4519/* 4520 * Returns TRUE if any of the given mappings were used to modify 4521 * physical memory. Otherwise, returns FALSE. Both page and 2mpage 4522 * mappings are supported. 4523 */ 4524static boolean_t 4525pmap_is_modified_pvh(struct md_page *pvh) 4526{ 4527 pv_entry_t pv; 4528 pt_entry_t *pte; 4529 pmap_t pmap; 4530 boolean_t rv; 4531 4532 rw_assert(&pvh_global_lock, RA_WLOCKED); 4533 rv = FALSE; 4534 sched_pin(); 4535 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { 4536 pmap = PV_PMAP(pv); 4537 PMAP_LOCK(pmap); 4538 pte = pmap_pte_quick(pmap, pv->pv_va); 4539 rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); 4540 PMAP_UNLOCK(pmap); 4541 if (rv) 4542 break; 4543 } 4544 sched_unpin(); 4545 return (rv); 4546} 4547 4548/* 4549 * pmap_is_prefaultable: 4550 * 4551 * Return whether or not the specified virtual address is elgible 4552 * for prefault. 
4553 */ 4554boolean_t 4555pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 4556{ 4557 pd_entry_t *pde; 4558 pt_entry_t *pte; 4559 boolean_t rv; 4560 4561 rv = FALSE; 4562 PMAP_LOCK(pmap); 4563 pde = pmap_pde(pmap, addr); 4564 if (*pde != 0 && (*pde & PG_PS) == 0) { 4565 pte = vtopte(addr); 4566 rv = *pte == 0; 4567 } 4568 PMAP_UNLOCK(pmap); 4569 return (rv); 4570} 4571 4572/* 4573 * pmap_is_referenced: 4574 * 4575 * Return whether or not the specified physical page was referenced 4576 * in any physical maps. 4577 */ 4578boolean_t 4579pmap_is_referenced(vm_page_t m) 4580{ 4581 boolean_t rv; 4582 4583 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4584 ("pmap_is_referenced: page %p is not managed", m)); 4585 rw_wlock(&pvh_global_lock); 4586 rv = pmap_is_referenced_pvh(&m->md) || 4587 ((m->flags & PG_FICTITIOUS) == 0 && 4588 pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 4589 rw_wunlock(&pvh_global_lock); 4590 return (rv); 4591} 4592 4593/* 4594 * Returns TRUE if any of the given mappings were referenced and FALSE 4595 * otherwise. Both page and 4mpage mappings are supported. 4596 */ 4597static boolean_t 4598pmap_is_referenced_pvh(struct md_page *pvh) 4599{ 4600 pv_entry_t pv; 4601 pt_entry_t *pte; 4602 pmap_t pmap; 4603 boolean_t rv; 4604 4605 rw_assert(&pvh_global_lock, RA_WLOCKED); 4606 rv = FALSE; 4607 sched_pin(); 4608 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { 4609 pmap = PV_PMAP(pv); 4610 PMAP_LOCK(pmap); 4611 pte = pmap_pte_quick(pmap, pv->pv_va); 4612 rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); 4613 PMAP_UNLOCK(pmap); 4614 if (rv) 4615 break; 4616 } 4617 sched_unpin(); 4618 return (rv); 4619} 4620 4621/* 4622 * Clear the write and modified bits in each of the given page's mappings. 4623 */ 4624void 4625pmap_remove_write(vm_page_t m) 4626{ 4627 struct md_page *pvh; 4628 pv_entry_t next_pv, pv; 4629 pmap_t pmap; 4630 pd_entry_t *pde; 4631 pt_entry_t oldpte, *pte; 4632 vm_offset_t va; 4633 4634 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4635 ("pmap_remove_write: page %p is not managed", m)); 4636 4637 /* 4638 * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be set by 4639 * another thread while the object is locked. Thus, if PGA_WRITEABLE 4640 * is clear, no page table entries need updating. 4641 */ 4642 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 4643 if ((m->oflags & VPO_BUSY) == 0 && 4644 (m->aflags & PGA_WRITEABLE) == 0) 4645 return; 4646 rw_wlock(&pvh_global_lock); 4647 sched_pin(); 4648 if ((m->flags & PG_FICTITIOUS) != 0) 4649 goto small_mappings; 4650 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4651 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { 4652 va = pv->pv_va; 4653 pmap = PV_PMAP(pv); 4654 PMAP_LOCK(pmap); 4655 pde = pmap_pde(pmap, va); 4656 if ((*pde & PG_RW) != 0) 4657 (void)pmap_demote_pde(pmap, pde, va); 4658 PMAP_UNLOCK(pmap); 4659 } 4660small_mappings: 4661 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 4662 pmap = PV_PMAP(pv); 4663 PMAP_LOCK(pmap); 4664 pde = pmap_pde(pmap, pv->pv_va); 4665 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" 4666 " a 4mpage in page %p's pv list", m)); 4667 pte = pmap_pte_quick(pmap, pv->pv_va); 4668retry: 4669 oldpte = *pte; 4670 if ((oldpte & PG_RW) != 0) { 4671 /* 4672 * Regardless of whether a pte is 32 or 64 bits 4673 * in size, PG_RW and PG_M are among the least 4674 * significant 32 bits. 
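 * Because other CPUs (and the MMU itself, when it sets PG_M) may update the
 * pte concurrently, the clearing below is done with a compare-and-swap
 * retry loop.  A user-space sketch of the same pattern with C11 atomics;
 * the pte value and helper name are made up, only the bit values are the
 * real x86 ones:
 *
 *    #include <stdatomic.h>
 *    #include <stdint.h>
 *    #include <stdio.h>
 *
 *    #define PG_RW 0x002u
 *    #define PG_M  0x040u
 *
 *    // Clear PG_RW and PG_M without losing a concurrent update: retry the
 *    // swap until the value the update was based on is still current.
 *    static uint32_t pte_clear_rw_m(_Atomic uint32_t *pte) {
 *        uint32_t old = atomic_load(pte);
 *        while (!atomic_compare_exchange_weak(pte, &old,
 *            old & ~(PG_RW | PG_M)))
 *            ;                            // "old" was refreshed; try again
 *        return old;
 *    }
 *
 *    int main(void) {
 *        _Atomic uint32_t pte = 0x00400000u | PG_RW | PG_M | 0x001u;
 *        uint32_t old = pte_clear_rw_m(&pte);
 *        if (old & PG_M)
 *            printf("was dirty: call vm_page_dirty()\n");
 *        printf("pte now %#x\n", (unsigned)atomic_load(&pte));
 *        return 0;
 *    }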
4675 */ 4676 if (!atomic_cmpset_int((u_int *)pte, oldpte, 4677 oldpte & ~(PG_RW | PG_M))) 4678 goto retry; 4679 if ((oldpte & PG_M) != 0) 4680 vm_page_dirty(m); 4681 pmap_invalidate_page(pmap, pv->pv_va); 4682 } 4683 PMAP_UNLOCK(pmap); 4684 } 4685 vm_page_aflag_clear(m, PGA_WRITEABLE); 4686 sched_unpin(); 4687 rw_wunlock(&pvh_global_lock); 4688} 4689 4690/* 4691 * pmap_ts_referenced: 4692 * 4693 * Return a count of reference bits for a page, clearing those bits. 4694 * It is not necessary for every reference bit to be cleared, but it 4695 * is necessary that 0 only be returned when there are truly no 4696 * reference bits set. 4697 * 4698 * XXX: The exact number of bits to check and clear is a matter that 4699 * should be tested and standardized at some point in the future for 4700 * optimal aging of shared pages. 4701 */ 4702int 4703pmap_ts_referenced(vm_page_t m) 4704{ 4705 struct md_page *pvh; 4706 pv_entry_t pv, pvf, pvn; 4707 pmap_t pmap; 4708 pd_entry_t oldpde, *pde; 4709 pt_entry_t *pte; 4710 vm_offset_t va; 4711 int rtval = 0; 4712 4713 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4714 ("pmap_ts_referenced: page %p is not managed", m)); 4715 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4716 rw_wlock(&pvh_global_lock); 4717 sched_pin(); 4718 if ((m->flags & PG_FICTITIOUS) != 0) 4719 goto small_mappings; 4720 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) { 4721 va = pv->pv_va; 4722 pmap = PV_PMAP(pv); 4723 PMAP_LOCK(pmap); 4724 pde = pmap_pde(pmap, va); 4725 oldpde = *pde; 4726 if ((oldpde & PG_A) != 0) { 4727 if (pmap_demote_pde(pmap, pde, va)) { 4728 if ((oldpde & PG_W) == 0) { 4729 /* 4730 * Remove the mapping to a single page 4731 * so that a subsequent access may 4732 * repromote. Since the underlying 4733 * page table page is fully populated, 4734 * this removal never frees a page 4735 * table page. 4736 */ 4737 va += VM_PAGE_TO_PHYS(m) - (oldpde & 4738 PG_PS_FRAME); 4739 pmap_remove_page(pmap, va, NULL); 4740 rtval++; 4741 if (rtval > 4) { 4742 PMAP_UNLOCK(pmap); 4743 goto out; 4744 } 4745 } 4746 } 4747 } 4748 PMAP_UNLOCK(pmap); 4749 } 4750small_mappings: 4751 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 4752 pvf = pv; 4753 do { 4754 pvn = TAILQ_NEXT(pv, pv_list); 4755 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 4756 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 4757 pmap = PV_PMAP(pv); 4758 PMAP_LOCK(pmap); 4759 pde = pmap_pde(pmap, pv->pv_va); 4760 KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:" 4761 " found a 4mpage in page %p's pv list", m)); 4762 pte = pmap_pte_quick(pmap, pv->pv_va); 4763 if ((*pte & PG_A) != 0) { 4764 atomic_clear_int((u_int *)pte, PG_A); 4765 pmap_invalidate_page(pmap, pv->pv_va); 4766 rtval++; 4767 if (rtval > 4) 4768 pvn = NULL; 4769 } 4770 PMAP_UNLOCK(pmap); 4771 } while ((pv = pvn) != NULL && pv != pvf); 4772 } 4773out: 4774 sched_unpin(); 4775 rw_wunlock(&pvh_global_lock); 4776 return (rtval); 4777} 4778 4779/* 4780 * Clear the modify bits on the specified physical page. 4781 */ 4782void 4783pmap_clear_modify(vm_page_t m) 4784{ 4785 struct md_page *pvh; 4786 pv_entry_t next_pv, pv; 4787 pmap_t pmap; 4788 pd_entry_t oldpde, *pde; 4789 pt_entry_t oldpte, *pte; 4790 vm_offset_t va; 4791 4792 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4793 ("pmap_clear_modify: page %p is not managed", m)); 4794 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 4795 KASSERT((m->oflags & VPO_BUSY) == 0, 4796 ("pmap_clear_modify: page %p is busy", m)); 4797 4798 /* 4799 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 
4800 * If the object containing the page is locked and the page is not 4801 * VPO_BUSY, then PGA_WRITEABLE cannot be concurrently set. 4802 */ 4803 if ((m->aflags & PGA_WRITEABLE) == 0) 4804 return; 4805 rw_wlock(&pvh_global_lock); 4806 sched_pin(); 4807 if ((m->flags & PG_FICTITIOUS) != 0) 4808 goto small_mappings; 4809 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4810 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { 4811 va = pv->pv_va; 4812 pmap = PV_PMAP(pv); 4813 PMAP_LOCK(pmap); 4814 pde = pmap_pde(pmap, va); 4815 oldpde = *pde; 4816 if ((oldpde & PG_RW) != 0) { 4817 if (pmap_demote_pde(pmap, pde, va)) { 4818 if ((oldpde & PG_W) == 0) { 4819 /* 4820 * Write protect the mapping to a 4821 * single page so that a subsequent 4822 * write access may repromote. 4823 */ 4824 va += VM_PAGE_TO_PHYS(m) - (oldpde & 4825 PG_PS_FRAME); 4826 pte = pmap_pte_quick(pmap, va); 4827 oldpte = *pte; 4828 if ((oldpte & PG_V) != 0) { 4829 /* 4830 * Regardless of whether a pte is 32 or 64 bits 4831 * in size, PG_RW and PG_M are among the least 4832 * significant 32 bits. 4833 */ 4834 while (!atomic_cmpset_int((u_int *)pte, 4835 oldpte, 4836 oldpte & ~(PG_M | PG_RW))) 4837 oldpte = *pte; 4838 vm_page_dirty(m); 4839 pmap_invalidate_page(pmap, va); 4840 } 4841 } 4842 } 4843 } 4844 PMAP_UNLOCK(pmap); 4845 } 4846small_mappings: 4847 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 4848 pmap = PV_PMAP(pv); 4849 PMAP_LOCK(pmap); 4850 pde = pmap_pde(pmap, pv->pv_va); 4851 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 4852 " a 4mpage in page %p's pv list", m)); 4853 pte = pmap_pte_quick(pmap, pv->pv_va); 4854 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 4855 /* 4856 * Regardless of whether a pte is 32 or 64 bits 4857 * in size, PG_M is among the least significant 4858 * 32 bits. 4859 */ 4860 atomic_clear_int((u_int *)pte, PG_M); 4861 pmap_invalidate_page(pmap, pv->pv_va); 4862 } 4863 PMAP_UNLOCK(pmap); 4864 } 4865 sched_unpin(); 4866 rw_wunlock(&pvh_global_lock); 4867} 4868 4869/* 4870 * pmap_clear_reference: 4871 * 4872 * Clear the reference bit on the specified physical page. 4873 */ 4874void 4875pmap_clear_reference(vm_page_t m) 4876{ 4877 struct md_page *pvh; 4878 pv_entry_t next_pv, pv; 4879 pmap_t pmap; 4880 pd_entry_t oldpde, *pde; 4881 pt_entry_t *pte; 4882 vm_offset_t va; 4883 4884 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4885 ("pmap_clear_reference: page %p is not managed", m)); 4886 rw_wlock(&pvh_global_lock); 4887 sched_pin(); 4888 if ((m->flags & PG_FICTITIOUS) != 0) 4889 goto small_mappings; 4890 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4891 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { 4892 va = pv->pv_va; 4893 pmap = PV_PMAP(pv); 4894 PMAP_LOCK(pmap); 4895 pde = pmap_pde(pmap, va); 4896 oldpde = *pde; 4897 if ((oldpde & PG_A) != 0) { 4898 if (pmap_demote_pde(pmap, pde, va)) { 4899 /* 4900 * Remove the mapping to a single page so 4901 * that a subsequent access may repromote. 4902 * Since the underlying page table page is 4903 * fully populated, this removal never frees 4904 * a page table page. 
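 * The va adjustment just below picks out the one 4 KB page of interest
 * inside the old 4 MB mapping: the pv entry's va is the superpage base,
 * and the physical offset of the page within the superpage frame gives
 * the matching virtual offset.  A sketch with made-up addresses (non-PAE
 * PG_PS_FRAME assumed):
 *
 *    #include <stdint.h>
 *    #include <stdio.h>
 *
 *    #define PG_PS_FRAME 0xffc00000u      // 4 MB frame bits, non-PAE i386
 *
 *    int main(void) {
 *        uint32_t va_4m   = 0x20000000u;  // base va of the old 4 MB mapping
 *        uint32_t oldpde  = 0x07c00000u | 0x080u | 0x001u;  // frame|PS|V
 *        uint32_t page_pa = 0x07c53000u;  // physical address of page m
 *
 *        uint32_t va = va_4m + (page_pa - (oldpde & PG_PS_FRAME));
 *        printf("touch only va %#x\n", (unsigned)va);       // 0x20053000
 *        return 0;
 *    }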
4905 */ 4906 va += VM_PAGE_TO_PHYS(m) - (oldpde & 4907 PG_PS_FRAME); 4908 pmap_remove_page(pmap, va, NULL); 4909 } 4910 } 4911 PMAP_UNLOCK(pmap); 4912 } 4913small_mappings: 4914 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 4915 pmap = PV_PMAP(pv); 4916 PMAP_LOCK(pmap); 4917 pde = pmap_pde(pmap, pv->pv_va); 4918 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found" 4919 " a 4mpage in page %p's pv list", m)); 4920 pte = pmap_pte_quick(pmap, pv->pv_va); 4921 if ((*pte & PG_A) != 0) { 4922 /* 4923 * Regardless of whether a pte is 32 or 64 bits 4924 * in size, PG_A is among the least significant 4925 * 32 bits. 4926 */ 4927 atomic_clear_int((u_int *)pte, PG_A); 4928 pmap_invalidate_page(pmap, pv->pv_va); 4929 } 4930 PMAP_UNLOCK(pmap); 4931 } 4932 sched_unpin(); 4933 rw_wunlock(&pvh_global_lock); 4934} 4935 4936/* 4937 * Miscellaneous support routines follow 4938 */ 4939 4940/* Adjust the cache mode for a 4KB page mapped via a PTE. */ 4941static __inline void 4942pmap_pte_attr(pt_entry_t *pte, int cache_bits) 4943{ 4944 u_int opte, npte; 4945 4946 /* 4947 * The cache mode bits are all in the low 32-bits of the 4948 * PTE, so we can just spin on updating the low 32-bits. 4949 */ 4950 do { 4951 opte = *(u_int *)pte; 4952 npte = opte & ~PG_PTE_CACHE; 4953 npte |= cache_bits; 4954 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); 4955} 4956 4957/* Adjust the cache mode for a 2/4MB page mapped via a PDE. */ 4958static __inline void 4959pmap_pde_attr(pd_entry_t *pde, int cache_bits) 4960{ 4961 u_int opde, npde; 4962 4963 /* 4964 * The cache mode bits are all in the low 32-bits of the 4965 * PDE, so we can just spin on updating the low 32-bits. 4966 */ 4967 do { 4968 opde = *(u_int *)pde; 4969 npde = opde & ~PG_PDE_CACHE; 4970 npde |= cache_bits; 4971 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); 4972} 4973 4974/* 4975 * Map a set of physical memory pages into the kernel virtual 4976 * address space. Return a pointer to where it is mapped. This 4977 * routine is intended to be used for mapping device memory, 4978 * NOT real memory. 4979 */ 4980void * 4981pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 4982{ 4983 vm_offset_t va, offset; 4984 vm_size_t tmpsize; 4985 4986 offset = pa & PAGE_MASK; 4987 size = roundup(offset + size, PAGE_SIZE); 4988 pa = pa & PG_FRAME; 4989 4990 if (pa < KERNLOAD && pa + size <= KERNLOAD) 4991 va = KERNBASE + pa; 4992 else 4993 va = kmem_alloc_nofault(kernel_map, size); 4994 if (!va) 4995 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 4996 4997 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 4998 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 4999 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 5000 pmap_invalidate_cache_range(va, va + size); 5001 return ((void *)(va + offset)); 5002} 5003 5004void * 5005pmap_mapdev(vm_paddr_t pa, vm_size_t size) 5006{ 5007 5008 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 5009} 5010 5011void * 5012pmap_mapbios(vm_paddr_t pa, vm_size_t size) 5013{ 5014 5015 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 5016} 5017 5018void 5019pmap_unmapdev(vm_offset_t va, vm_size_t size) 5020{ 5021 vm_offset_t base, offset; 5022 5023 if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) 5024 return; 5025 base = trunc_page(va); 5026 offset = va & PAGE_MASK; 5027 size = roundup(offset + size, PAGE_SIZE); 5028 kmem_free(kernel_map, base, size); 5029} 5030 5031/* 5032 * Sets the memory attribute for the specified page. 
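 * When the CPU cannot snoop its own caches, the routines below end up
 * flushing the page from the caches one line at a time with CLFLUSH,
 * bracketed by MFENCE because CLFLUSH is not ordered with ordinary stores.
 * A user-space sketch of that flush loop with the SSE2 intrinsics; the
 * 64-byte line size is an assumption (the kernel reads
 * cpu_clflush_line_size), and flush_page() is a made-up name:
 *
 *    #include <emmintrin.h>               // _mm_clflush, _mm_mfence (SSE2)
 *    #include <stddef.h>
 *    #include <stdio.h>
 *
 *    #define PAGE_SIZE 4096
 *
 *    static void flush_page(void *p, size_t line_size) {
 *        char *sva = p, *eva = sva + PAGE_SIZE;
 *
 *        _mm_mfence();
 *        for (; sva < eva; sva += line_size)
 *            _mm_clflush(sva);
 *        _mm_mfence();
 *    }
 *
 *    int main(void) {
 *        static char page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
 *
 *        page[0] = 1;
 *        flush_page(page, 64);
 *        printf("flushed %d bytes\n", PAGE_SIZE);
 *        return 0;
 *    }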
5033 */ 5034void 5035pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5036{ 5037 5038 m->md.pat_mode = ma; 5039 if ((m->flags & PG_FICTITIOUS) != 0) 5040 return; 5041 5042 /* 5043 * If "m" is a normal page, flush it from the cache. 5044 * See pmap_invalidate_cache_range(). 5045 * 5046 * First, try to find an existing mapping of the page by sf 5047 * buffer. sf_buf_invalidate_cache() modifies mapping and 5048 * flushes the cache. 5049 */ 5050 if (sf_buf_invalidate_cache(m)) 5051 return; 5052 5053 /* 5054 * If page is not mapped by sf buffer, but CPU does not 5055 * support self snoop, map the page transient and do 5056 * invalidation. In the worst case, whole cache is flushed by 5057 * pmap_invalidate_cache_range(). 5058 */ 5059 if ((cpu_feature & CPUID_SS) == 0) 5060 pmap_flush_page(m); 5061} 5062 5063static void 5064pmap_flush_page(vm_page_t m) 5065{ 5066 struct sysmaps *sysmaps; 5067 vm_offset_t sva, eva; 5068 5069 if ((cpu_feature & CPUID_CLFSH) != 0) { 5070 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 5071 mtx_lock(&sysmaps->lock); 5072 if (*sysmaps->CMAP2) 5073 panic("pmap_flush_page: CMAP2 busy"); 5074 sched_pin(); 5075 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | 5076 PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); 5077 invlcaddr(sysmaps->CADDR2); 5078 sva = (vm_offset_t)sysmaps->CADDR2; 5079 eva = sva + PAGE_SIZE; 5080 5081 /* 5082 * Use mfence despite the ordering implied by 5083 * mtx_{un,}lock() because clflush is not guaranteed 5084 * to be ordered by any other instruction. 5085 */ 5086 mfence(); 5087 for (; sva < eva; sva += cpu_clflush_line_size) 5088 clflush(sva); 5089 mfence(); 5090 *sysmaps->CMAP2 = 0; 5091 sched_unpin(); 5092 mtx_unlock(&sysmaps->lock); 5093 } else 5094 pmap_invalidate_cache(); 5095} 5096 5097/* 5098 * Changes the specified virtual address range's memory type to that given by 5099 * the parameter "mode". The specified virtual address range must be 5100 * completely contained within either the kernel map. 5101 * 5102 * Returns zero if the change completed successfully, and either EINVAL or 5103 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 5104 * of the virtual address range was not mapped, and ENOMEM is returned if 5105 * there was insufficient memory available to complete the change. 5106 */ 5107int 5108pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 5109{ 5110 vm_offset_t base, offset, tmpva; 5111 pd_entry_t *pde; 5112 pt_entry_t *pte; 5113 int cache_bits_pte, cache_bits_pde; 5114 boolean_t changed; 5115 5116 base = trunc_page(va); 5117 offset = va & PAGE_MASK; 5118 size = roundup(offset + size, PAGE_SIZE); 5119 5120 /* 5121 * Only supported on kernel virtual addresses above the recursive map. 5122 */ 5123 if (base < VM_MIN_KERNEL_ADDRESS) 5124 return (EINVAL); 5125 5126 cache_bits_pde = pmap_cache_bits(mode, 1); 5127 cache_bits_pte = pmap_cache_bits(mode, 0); 5128 changed = FALSE; 5129 5130 /* 5131 * Pages that aren't mapped aren't supported. Also break down 5132 * 2/4MB pages into 4KB pages if required. 5133 */ 5134 PMAP_LOCK(kernel_pmap); 5135 for (tmpva = base; tmpva < base + size; ) { 5136 pde = pmap_pde(kernel_pmap, tmpva); 5137 if (*pde == 0) { 5138 PMAP_UNLOCK(kernel_pmap); 5139 return (EINVAL); 5140 } 5141 if (*pde & PG_PS) { 5142 /* 5143 * If the current 2/4MB page already has 5144 * the required memory type, then we need not 5145 * demote this page. Just increment tmpva to 5146 * the next 2/4MB page frame. 
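 * A 2/4MB mapping is also left intact when the whole superpage lies inside
 * the range being changed, so its attribute can simply be rewritten in
 * place; only a partially covered superpage has to be demoted.  A sketch
 * of that range check (non-PAE 4 MB constants assumed, names made up):
 *
 *    #include <stdint.h>
 *    #include <stdio.h>
 *
 *    #define NBPDR   0x400000u            // 4 MB, non-PAE i386
 *    #define PDRMASK (NBPDR - 1)
 *
 *    static int whole_superpage_in_range(uint32_t tmpva, uint32_t base,
 *        uint32_t size) {
 *        return (tmpva & PDRMASK) == 0 && tmpva + PDRMASK < base + size;
 *    }
 *
 *    int main(void) {
 *        printf("%d\n", whole_superpage_in_range(0x00800000u, 0x00800000u,
 *            NBPDR));                     // 1: keep the superpage
 *        printf("%d\n", whole_superpage_in_range(0x00800000u, 0x00800000u,
 *            NBPDR / 2));                 // 0: demote it
 *        return 0;
 *    }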
5147 */ 5148 if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { 5149 tmpva = trunc_4mpage(tmpva) + NBPDR; 5150 continue; 5151 } 5152 5153 /* 5154 * If the current offset aligns with a 2/4MB 5155 * page frame and there is at least 2/4MB left 5156 * within the range, then we need not break 5157 * down this page into 4KB pages. 5158 */ 5159 if ((tmpva & PDRMASK) == 0 && 5160 tmpva + PDRMASK < base + size) { 5161 tmpva += NBPDR; 5162 continue; 5163 } 5164 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) { 5165 PMAP_UNLOCK(kernel_pmap); 5166 return (ENOMEM); 5167 } 5168 } 5169 pte = vtopte(tmpva); 5170 if (*pte == 0) { 5171 PMAP_UNLOCK(kernel_pmap); 5172 return (EINVAL); 5173 } 5174 tmpva += PAGE_SIZE; 5175 } 5176 PMAP_UNLOCK(kernel_pmap); 5177 5178 /* 5179 * Ok, all the pages exist, so run through them updating their 5180 * cache mode if required. 5181 */ 5182 for (tmpva = base; tmpva < base + size; ) { 5183 pde = pmap_pde(kernel_pmap, tmpva); 5184 if (*pde & PG_PS) { 5185 if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { 5186 pmap_pde_attr(pde, cache_bits_pde); 5187 changed = TRUE; 5188 } 5189 tmpva = trunc_4mpage(tmpva) + NBPDR; 5190 } else { 5191 pte = vtopte(tmpva); 5192 if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { 5193 pmap_pte_attr(pte, cache_bits_pte); 5194 changed = TRUE; 5195 } 5196 tmpva += PAGE_SIZE; 5197 } 5198 } 5199 5200 /* 5201 * Flush CPU caches to make sure any data isn't cached that 5202 * shouldn't be, etc. 5203 */ 5204 if (changed) { 5205 pmap_invalidate_range(kernel_pmap, base, tmpva); 5206 pmap_invalidate_cache_range(base, tmpva); 5207 } 5208 return (0); 5209} 5210 5211/* 5212 * perform the pmap work for mincore 5213 */ 5214int 5215pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 5216{ 5217 pd_entry_t *pdep; 5218 pt_entry_t *ptep, pte; 5219 vm_paddr_t pa; 5220 int val; 5221 5222 PMAP_LOCK(pmap); 5223retry: 5224 pdep = pmap_pde(pmap, addr); 5225 if (*pdep != 0) { 5226 if (*pdep & PG_PS) { 5227 pte = *pdep; 5228 /* Compute the physical address of the 4KB page. */ 5229 pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & 5230 PG_FRAME; 5231 val = MINCORE_SUPER; 5232 } else { 5233 ptep = pmap_pte(pmap, addr); 5234 pte = *ptep; 5235 pmap_pte_release(ptep); 5236 pa = pte & PG_FRAME; 5237 val = 0; 5238 } 5239 } else { 5240 pte = 0; 5241 pa = 0; 5242 val = 0; 5243 } 5244 if ((pte & PG_V) != 0) { 5245 val |= MINCORE_INCORE; 5246 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5247 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 5248 if ((pte & PG_A) != 0) 5249 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 5250 } 5251 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 5252 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 5253 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 5254 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. 
*/ 5255 if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) 5256 goto retry; 5257 } else 5258 PA_UNLOCK_COND(*locked_pa); 5259 PMAP_UNLOCK(pmap); 5260 return (val); 5261} 5262 5263void 5264pmap_activate(struct thread *td) 5265{ 5266 pmap_t pmap, oldpmap; 5267 u_int cpuid; 5268 u_int32_t cr3; 5269 5270 critical_enter(); 5271 pmap = vmspace_pmap(td->td_proc->p_vmspace); 5272 oldpmap = PCPU_GET(curpmap); 5273 cpuid = PCPU_GET(cpuid); 5274#if defined(SMP) 5275 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 5276 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 5277#else 5278 CPU_CLR(cpuid, &oldpmap->pm_active); 5279 CPU_SET(cpuid, &pmap->pm_active); 5280#endif 5281#ifdef PAE 5282 cr3 = vtophys(pmap->pm_pdpt); 5283#else 5284 cr3 = vtophys(pmap->pm_pdir); 5285#endif 5286 /* 5287 * pmap_activate is for the current thread on the current cpu 5288 */ 5289 td->td_pcb->pcb_cr3 = cr3; 5290 load_cr3(cr3); 5291 PCPU_SET(curpmap, pmap); 5292 critical_exit(); 5293} 5294 5295void 5296pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 5297{ 5298} 5299 5300/* 5301 * Increase the starting virtual address of the given mapping if a 5302 * different alignment might result in more superpage mappings. 5303 */ 5304void 5305pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 5306 vm_offset_t *addr, vm_size_t size) 5307{ 5308 vm_offset_t superpage_offset; 5309 5310 if (size < NBPDR) 5311 return; 5312 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 5313 offset += ptoa(object->pg_color); 5314 superpage_offset = offset & PDRMASK; 5315 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 5316 (*addr & PDRMASK) == superpage_offset) 5317 return; 5318 if ((*addr & PDRMASK) < superpage_offset) 5319 *addr = (*addr & ~PDRMASK) + superpage_offset; 5320 else 5321 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 5322} 5323 5324 5325#if defined(PMAP_DEBUG) 5326pmap_pid_dump(int pid) 5327{ 5328 pmap_t pmap; 5329 struct proc *p; 5330 int npte = 0; 5331 int index; 5332 5333 sx_slock(&allproc_lock); 5334 FOREACH_PROC_IN_SYSTEM(p) { 5335 if (p->p_pid != pid) 5336 continue; 5337 5338 if (p->p_vmspace) { 5339 int i,j; 5340 index = 0; 5341 pmap = vmspace_pmap(p->p_vmspace); 5342 for (i = 0; i < NPDEPTD; i++) { 5343 pd_entry_t *pde; 5344 pt_entry_t *pte; 5345 vm_offset_t base = i << PDRSHIFT; 5346 5347 pde = &pmap->pm_pdir[i]; 5348 if (pde && pmap_pde_v(pde)) { 5349 for (j = 0; j < NPTEPG; j++) { 5350 vm_offset_t va = base + (j << PAGE_SHIFT); 5351 if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { 5352 if (index) { 5353 index = 0; 5354 printf("\n"); 5355 } 5356 sx_sunlock(&allproc_lock); 5357 return (npte); 5358 } 5359 pte = pmap_pte(pmap, va); 5360 if (pte && pmap_pte_v(pte)) { 5361 pt_entry_t pa; 5362 vm_page_t m; 5363 pa = *pte; 5364 m = PHYS_TO_VM_PAGE(pa & PG_FRAME); 5365 printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", 5366 va, pa, m->hold_count, m->wire_count, m->flags); 5367 npte++; 5368 index++; 5369 if (index >= 2) { 5370 index = 0; 5371 printf("\n"); 5372 } else { 5373 printf(" "); 5374 } 5375 } 5376 } 5377 } 5378 } 5379 } 5380 } 5381 sx_sunlock(&allproc_lock); 5382 return (npte); 5383} 5384#endif 5385 5386#if defined(DEBUG) 5387 5388static void pads(pmap_t pm); 5389void pmap_pvdump(vm_paddr_t pa); 5390 5391/* print address space of pmap*/ 5392static void 5393pads(pmap_t pm) 5394{ 5395 int i, j; 5396 vm_paddr_t va; 5397 pt_entry_t *ptep; 5398 5399 if (pm == kernel_pmap) 5400 return; 5401 for (i = 0; i < NPDEPTD; i++) 5402 if (pm->pm_pdir[i]) 5403 for (j = 0; j < NPTEPG; j++) { 5404 va = (i << 
PDRSHIFT) + (j << PAGE_SHIFT); 5405 if (pm == kernel_pmap && va < KERNBASE) 5406 continue; 5407 if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) 5408 continue; 5409 ptep = pmap_pte(pm, va); 5410 if (pmap_pte_v(ptep)) 5411 printf("%x:%x ", va, *ptep); 5412 }; 5413 5414} 5415 5416void 5417pmap_pvdump(vm_paddr_t pa) 5418{ 5419 pv_entry_t pv; 5420 pmap_t pmap; 5421 vm_page_t m; 5422 5423 printf("pa %x", pa); 5424 m = PHYS_TO_VM_PAGE(pa); 5425 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 5426 pmap = PV_PMAP(pv); 5427 printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va); 5428 pads(pmap); 5429 } 5430 printf(" "); 5431} 5432#endif 5433
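/*
 * For reference, a minimal user-space sketch (not part of this file) of how
 * a non-PAE i386 virtual address splits into the page-directory index,
 * page-table index and page offset that pmap_pde() and vtopte() depend on.
 * The constants are the usual non-PAE values; under PAE the superpage is
 * 2 MB and a page-directory-pointer level is added.
 *
 *    #include <stdint.h>
 *    #include <stdio.h>
 *
 *    #define PAGE_SHIFT 12
 *    #define PDRSHIFT   22
 *    #define NPTEPG     1024              // PTEs per page table
 *
 *    int main(void) {
 *        uint32_t va = 0xbfbfe123u;
 *        unsigned pdi = va >> PDRSHIFT;                    // which PDE
 *        unsigned pti = (va >> PAGE_SHIFT) & (NPTEPG - 1); // which PTE
 *        unsigned off = va & ((1u << PAGE_SHIFT) - 1);     // byte in page
 *        printf("pde %u, pte %u, offset %#x\n", pdi, pti, off);
 *        return 0;
 *    }
 */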