pmap.c (head/sys/i386/i386/pmap.c), revision 216516
1/*- 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu> 9 * All rights reserved. 10 * 11 * This code is derived from software contributed to Berkeley by 12 * the Systems Programming Group of the University of Utah Computer 13 * Science Department and William Jolitz of UUNET Technologies Inc. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 3. All advertising materials mentioning features or use of this software 24 * must display the following acknowledgement: 25 * This product includes software developed by the University of 26 * California, Berkeley and its contributors. 27 * 4. Neither the name of the University nor the names of its contributors 28 * may be used to endorse or promote products derived from this software 29 * without specific prior written permission. 30 * 31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 41 * SUCH DAMAGE. 42 * 43 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 44 */ 45/*- 46 * Copyright (c) 2003 Networks Associates Technology, Inc. 47 * All rights reserved. 48 * 49 * This software was developed for the FreeBSD Project by Jake Burkholder, 50 * Safeport Network Services, and Network Associates Laboratories, the 51 * Security Research Division of Network Associates, Inc. under 52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 53 * CHATS research program. 54 * 55 * Redistribution and use in source and binary forms, with or without 56 * modification, are permitted provided that the following conditions 57 * are met: 58 * 1. Redistributions of source code must retain the above copyright 59 * notice, this list of conditions and the following disclaimer. 60 * 2. Redistributions in binary form must reproduce the above copyright 61 * notice, this list of conditions and the following disclaimer in the 62 * documentation and/or other materials provided with the distribution. 63 * 64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 67 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 74 * SUCH DAMAGE. 75 */ 76 77#include <sys/cdefs.h> 78__FBSDID("$FreeBSD: head/sys/i386/i386/pmap.c 216516 2010-12-18 11:31:32Z kib $"); 79 80/* 81 * Manages physical address maps. 82 * 83 * In addition to hardware address maps, this 84 * module is called upon to provide software-use-only 85 * maps which may or may not be stored in the same 86 * form as hardware maps. These pseudo-maps are 87 * used to store intermediate results from copy 88 * operations to and from address spaces. 89 * 90 * Since the information managed by this module is 91 * also stored by the logical address mapping module, 92 * this module may throw away valid virtual-to-physical 93 * mappings at almost any time. However, invalidations 94 * of virtual-to-physical mappings must be done as 95 * requested. 96 * 97 * In order to cope with hardware architectures which 98 * make virtual-to-physical map invalidates expensive, 99 * this module may delay invalidate or reduced protection 100 * operations until such time as they are actually 101 * necessary. This module is given full information as 102 * to which processors are currently using which maps, 103 * and to when physical maps must be made correct. 104 */ 105 106#include "opt_cpu.h" 107#include "opt_pmap.h" 108#include "opt_msgbuf.h" 109#include "opt_smp.h" 110#include "opt_xbox.h" 111 112#include <sys/param.h> 113#include <sys/systm.h> 114#include <sys/kernel.h> 115#include <sys/ktr.h> 116#include <sys/lock.h> 117#include <sys/malloc.h> 118#include <sys/mman.h> 119#include <sys/msgbuf.h> 120#include <sys/mutex.h> 121#include <sys/proc.h> 122#include <sys/sf_buf.h> 123#include <sys/sx.h> 124#include <sys/vmmeter.h> 125#include <sys/sched.h> 126#include <sys/sysctl.h> 127#ifdef SMP 128#include <sys/smp.h> 129#endif 130 131#include <vm/vm.h> 132#include <vm/vm_param.h> 133#include <vm/vm_kern.h> 134#include <vm/vm_page.h> 135#include <vm/vm_map.h> 136#include <vm/vm_object.h> 137#include <vm/vm_extern.h> 138#include <vm/vm_pageout.h> 139#include <vm/vm_pager.h> 140#include <vm/vm_reserv.h> 141#include <vm/uma.h> 142 143#include <machine/cpu.h> 144#include <machine/cputypes.h> 145#include <machine/md_var.h> 146#include <machine/pcb.h> 147#include <machine/specialreg.h> 148#ifdef SMP 149#include <machine/smp.h> 150#endif 151 152#ifdef XBOX 153#include <machine/xbox.h> 154#endif 155 156#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) 157#define CPU_ENABLE_SSE 158#endif 159 160#ifndef PMAP_SHPGPERPROC 161#define PMAP_SHPGPERPROC 200 162#endif 163 164#if !defined(DIAGNOSTIC) 165#ifdef __GNUC_GNU_INLINE__ 166#define PMAP_INLINE __attribute__((__gnu_inline__)) inline 167#else 168#define PMAP_INLINE extern inline 169#endif 170#else 171#define PMAP_INLINE 172#endif 173 174#define PV_STATS 175#ifdef PV_STATS 176#define PV_STAT(x) do { x ; } while (0) 177#else 178#define PV_STAT(x) do { } while (0) 179#endif 180 181#define pa_index(pa) ((pa) >> PDRSHIFT) 182#define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) 183 184/* 185 * Get PDEs and PTEs for user/kernel address space 
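 *
 * For illustration (a sketch; the constants assume the usual i386
 * configuration): pmap_pde() simply indexes the page directory by the
 * high-order bits of the virtual address.  On a non-PAE kernel, where
 * PDRSHIFT is 22, a lookup such as
 *
 *	pd_entry_t *pde = pmap_pde(kernel_pmap, (vm_offset_t)0xc0400000);
 *
 * selects page directory entry 0xc0400000 >> 22 == 0x301, i.e. the entry
 * covering the 4MB region containing that address.  Under PAE, PDRSHIFT is
 * 21 and the entries are 64 bits wide, but the indexing works the same way.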
186 */ 187#define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) 188#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) 189 190#define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) 191#define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) 192#define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) 193#define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) 194#define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) 195 196#define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ 197 atomic_clear_int((u_int *)(pte), PG_W)) 198#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) 199 200struct pmap kernel_pmap_store; 201LIST_HEAD(pmaplist, pmap); 202static struct pmaplist allpmaps; 203static struct mtx allpmaps_lock; 204 205vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 206vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 207int pgeflag = 0; /* PG_G or-in */ 208int pseflag = 0; /* PG_PS or-in */ 209 210static int nkpt = NKPT; 211vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR; 212extern u_int32_t KERNend; 213extern u_int32_t KPTphys; 214 215#ifdef PAE 216pt_entry_t pg_nx; 217static uma_zone_t pdptzone; 218#endif 219 220SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); 221 222static int pat_works = 1; 223SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1, 224 "Is page attribute table fully functional?"); 225 226static int pg_ps_enabled = 1; 227SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0, 228 "Are large page mappings enabled?"); 229 230#define PAT_INDEX_SIZE 8 231static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ 232 233/* 234 * Data for the pv entry allocation mechanism 235 */ 236static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; 237static struct md_page *pv_table; 238static int shpgperproc = PMAP_SHPGPERPROC; 239 240struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ 241int pv_maxchunks; /* How many chunks we have KVA for */ 242vm_offset_t pv_vafree; /* freelist stored in the PTE */ 243 244/* 245 * All those kernel PT submaps that BSD is so fond of 246 */ 247struct sysmaps { 248 struct mtx lock; 249 pt_entry_t *CMAP1; 250 pt_entry_t *CMAP2; 251 caddr_t CADDR1; 252 caddr_t CADDR2; 253}; 254static struct sysmaps sysmaps_pcpu[MAXCPU]; 255pt_entry_t *CMAP1 = 0; 256static pt_entry_t *CMAP3; 257static pd_entry_t *KPTD; 258caddr_t CADDR1 = 0, ptvmmap = 0; 259static caddr_t CADDR3; 260struct msgbuf *msgbufp = 0; 261 262/* 263 * Crashdump maps. 
264 */ 265static caddr_t crashdumpmap; 266 267static pt_entry_t *PMAP1 = 0, *PMAP2; 268static pt_entry_t *PADDR1 = 0, *PADDR2; 269#ifdef SMP 270static int PMAP1cpu; 271static int PMAP1changedcpu; 272SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, 273 &PMAP1changedcpu, 0, 274 "Number of times pmap_pte_quick changed CPU with same PMAP1"); 275#endif 276static int PMAP1changed; 277SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, 278 &PMAP1changed, 0, 279 "Number of times pmap_pte_quick changed PMAP1"); 280static int PMAP1unchanged; 281SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, 282 &PMAP1unchanged, 0, 283 "Number of times pmap_pte_quick didn't change PMAP1"); 284static struct mtx PMAP2mutex; 285 286static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 287static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try); 288static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); 289static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); 290static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); 291static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); 292static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, 293 vm_offset_t va); 294static int pmap_pvh_wired_mappings(struct md_page *pvh, int count); 295 296static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 297static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, 298 vm_prot_t prot); 299static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 300 vm_page_t m, vm_prot_t prot, vm_page_t mpte); 301static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); 302static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); 303static boolean_t pmap_is_modified_pvh(struct md_page *pvh); 304static boolean_t pmap_is_referenced_pvh(struct md_page *pvh); 305static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); 306static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde); 307static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va); 308static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); 309static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); 310static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, 311 vm_prot_t prot); 312static void pmap_pte_attr(pt_entry_t *pte, int cache_bits); 313static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 314 vm_page_t *free); 315static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, 316 vm_page_t *free); 317static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte); 318static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, 319 vm_page_t *free); 320static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, 321 vm_offset_t va); 322static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); 323static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 324 vm_page_t m); 325static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, 326 pd_entry_t newpde); 327static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); 328 329static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags); 330 331static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags); 332static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free); 333static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); 334static void 
pmap_pte_release(pt_entry_t *pte); 335static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *); 336#ifdef PAE 337static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait); 338#endif 339static void pmap_set_pg(void); 340 341CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); 342CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); 343 344/* 345 * If you get an error here, then you set KVA_PAGES wrong! See the 346 * description of KVA_PAGES in sys/i386/include/pmap.h. It must be 347 * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE. 348 */ 349CTASSERT(KERNBASE % (1 << 24) == 0); 350 351/* 352 * Bootstrap the system enough to run with virtual memory. 353 * 354 * On the i386 this is called after mapping has already been enabled 355 * and just syncs the pmap module with what has already been done. 356 * [We can't call it easily with mapping off since the kernel is not 357 * mapped with PA == VA, hence we would have to relocate every address 358 * from the linked base (virtual) address "KERNBASE" to the actual 359 * (physical) address starting relative to 0] 360 */ 361void 362pmap_bootstrap(vm_paddr_t firstaddr) 363{ 364 vm_offset_t va; 365 pt_entry_t *pte, *unused; 366 struct sysmaps *sysmaps; 367 int i; 368 369 /* 370 * Initialize the first available kernel virtual address. However, 371 * using "firstaddr" may waste a few pages of the kernel virtual 372 * address space, because locore may not have mapped every physical 373 * page that it allocated. Preferably, locore would provide a first 374 * unused virtual address in addition to "firstaddr". 375 */ 376 virtual_avail = (vm_offset_t) KERNBASE + firstaddr; 377 378 virtual_end = VM_MAX_KERNEL_ADDRESS; 379 380 /* 381 * Initialize the kernel pmap (which is statically allocated). 382 */ 383 PMAP_LOCK_INIT(kernel_pmap); 384 kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); 385#ifdef PAE 386 kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); 387#endif 388 kernel_pmap->pm_root = NULL; 389 kernel_pmap->pm_active = -1; /* don't allow deactivation */ 390 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 391 LIST_INIT(&allpmaps); 392 393 /* 394 * Request a spin mutex so that changes to allpmaps cannot be 395 * preempted by smp_rendezvous_cpus(). Otherwise, 396 * pmap_update_pde_kernel() could access allpmaps while it is 397 * being changed. 398 */ 399 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); 400 mtx_lock_spin(&allpmaps_lock); 401 LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); 402 mtx_unlock_spin(&allpmaps_lock); 403 404 /* 405 * Reserve some special page table entries/VA space for temporary 406 * mapping of pages. 407 */ 408#define SYSMAP(c, p, v, n) \ 409 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 410 411 va = virtual_avail; 412 pte = vtopte(va); 413 414 /* 415 * CMAP1/CMAP2 are used for zeroing and copying pages. 416 * CMAP3 is used for the idle process page zeroing. 417 */ 418 for (i = 0; i < MAXCPU; i++) { 419 sysmaps = &sysmaps_pcpu[i]; 420 mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF); 421 SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1) 422 SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1) 423 } 424 SYSMAP(caddr_t, CMAP1, CADDR1, 1) 425 SYSMAP(caddr_t, CMAP3, CADDR3, 1) 426 427 /* 428 * Crashdump maps. 429 */ 430 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) 431 432 /* 433 * ptvmmap is used for reading arbitrary physical pages via /dev/mem. 
434 */ 435 SYSMAP(caddr_t, unused, ptvmmap, 1) 436 437 /* 438 * msgbufp is used to map the system message buffer. 439 */ 440 SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE))) 441 442 /* 443 * KPTmap is used by pmap_kextract(). 444 * 445 * KPTmap is first initialized by locore. However, that initial 446 * KPTmap can only support NKPT page table pages. Here, a larger 447 * KPTmap is created that can support KVA_PAGES page table pages. 448 */ 449 SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES) 450 451 for (i = 0; i < NKPT; i++) 452 KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V; 453 454 /* 455 * Adjust the start of the KPTD and KPTmap so that the implementation 456 * of pmap_kextract() and pmap_growkernel() can be made simpler. 457 */ 458 KPTD -= KPTDI; 459 KPTmap -= i386_btop(KPTDI << PDRSHIFT); 460 461 /* 462 * ptemap is used for pmap_pte_quick 463 */ 464 SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1) 465 SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1) 466 467 mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); 468 469 virtual_avail = va; 470 471 /* 472 * Leave in place an identity mapping (virt == phys) for the low 1 MB 473 * physical memory region that is used by the ACPI wakeup code. This 474 * mapping must not have PG_G set. 475 */ 476#ifdef XBOX 477 /* FIXME: This is gross, but needed for the XBOX. Since we are in such 478 * an early stadium, we cannot yet neatly map video memory ... :-( 479 * Better fixes are very welcome! */ 480 if (!arch_i386_is_xbox) 481#endif 482 for (i = 1; i < NKPT; i++) 483 PTD[i] = 0; 484 485 /* Initialize the PAT MSR if present. */ 486 pmap_init_pat(); 487 488 /* Turn on PG_G on kernel page(s) */ 489 pmap_set_pg(); 490} 491 492/* 493 * Setup the PAT MSR. 494 */ 495void 496pmap_init_pat(void) 497{ 498 int pat_table[PAT_INDEX_SIZE]; 499 uint64_t pat_msr; 500 u_long cr0, cr4; 501 int i; 502 503 /* Set default PAT index table. */ 504 for (i = 0; i < PAT_INDEX_SIZE; i++) 505 pat_table[i] = -1; 506 pat_table[PAT_WRITE_BACK] = 0; 507 pat_table[PAT_WRITE_THROUGH] = 1; 508 pat_table[PAT_UNCACHEABLE] = 3; 509 pat_table[PAT_WRITE_COMBINING] = 3; 510 pat_table[PAT_WRITE_PROTECTED] = 3; 511 pat_table[PAT_UNCACHED] = 3; 512 513 /* Bail if this CPU doesn't implement PAT. */ 514 if ((cpu_feature & CPUID_PAT) == 0) { 515 for (i = 0; i < PAT_INDEX_SIZE; i++) 516 pat_index[i] = pat_table[i]; 517 pat_works = 0; 518 return; 519 } 520 521 /* 522 * Due to some Intel errata, we can only safely use the lower 4 523 * PAT entries. 524 * 525 * Intel Pentium III Processor Specification Update 526 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B 527 * or Mode C Paging) 528 * 529 * Intel Pentium IV Processor Specification Update 530 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) 531 */ 532 if (cpu_vendor_id == CPU_VENDOR_INTEL && 533 !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) 534 pat_works = 0; 535 536 /* Initialize default PAT entries. */ 537 pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | 538 PAT_VALUE(1, PAT_WRITE_THROUGH) | 539 PAT_VALUE(2, PAT_UNCACHED) | 540 PAT_VALUE(3, PAT_UNCACHEABLE) | 541 PAT_VALUE(4, PAT_WRITE_BACK) | 542 PAT_VALUE(5, PAT_WRITE_THROUGH) | 543 PAT_VALUE(6, PAT_UNCACHED) | 544 PAT_VALUE(7, PAT_UNCACHEABLE); 545 546 if (pat_works) { 547 /* 548 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. 549 * Program 5 and 6 as WP and WC. 550 * Leave 4 and 7 as WB and UC. 
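		 *
		 * For reference, the PAT layout programmed below is then:
		 *
		 *	index:	0   1   2    3   4   5   6   7
		 *	type:	WB  WT  UC-  UC  WB  WP  WC  UC
		 *
		 * so every caching mode used by pat_table[] maps onto one
		 * of the indices 0-3, 5, or 6.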
551 */ 552 pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); 553 pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | 554 PAT_VALUE(6, PAT_WRITE_COMBINING); 555 pat_table[PAT_UNCACHED] = 2; 556 pat_table[PAT_WRITE_PROTECTED] = 5; 557 pat_table[PAT_WRITE_COMBINING] = 6; 558 } else { 559 /* 560 * Just replace PAT Index 2 with WC instead of UC-. 561 */ 562 pat_msr &= ~PAT_MASK(2); 563 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); 564 pat_table[PAT_WRITE_COMBINING] = 2; 565 } 566 567 /* Disable PGE. */ 568 cr4 = rcr4(); 569 load_cr4(cr4 & ~CR4_PGE); 570 571 /* Disable caches (CD = 1, NW = 0). */ 572 cr0 = rcr0(); 573 load_cr0((cr0 & ~CR0_NW) | CR0_CD); 574 575 /* Flushes caches and TLBs. */ 576 wbinvd(); 577 invltlb(); 578 579 /* Update PAT and index table. */ 580 wrmsr(MSR_PAT, pat_msr); 581 for (i = 0; i < PAT_INDEX_SIZE; i++) 582 pat_index[i] = pat_table[i]; 583 584 /* Flush caches and TLBs again. */ 585 wbinvd(); 586 invltlb(); 587 588 /* Restore caches and PGE. */ 589 load_cr0(cr0); 590 load_cr4(cr4); 591} 592 593/* 594 * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. 595 */ 596static void 597pmap_set_pg(void) 598{ 599 pt_entry_t *pte; 600 vm_offset_t va, endva; 601 602 if (pgeflag == 0) 603 return; 604 605 endva = KERNBASE + KERNend; 606 607 if (pseflag) { 608 va = KERNBASE + KERNLOAD; 609 while (va < endva) { 610 pdir_pde(PTD, va) |= pgeflag; 611 invltlb(); /* Play it safe, invltlb() every time */ 612 va += NBPDR; 613 } 614 } else { 615 va = (vm_offset_t)btext; 616 while (va < endva) { 617 pte = vtopte(va); 618 if (*pte) 619 *pte |= pgeflag; 620 invltlb(); /* Play it safe, invltlb() every time */ 621 va += PAGE_SIZE; 622 } 623 } 624} 625 626/* 627 * Initialize a vm_page's machine-dependent fields. 628 */ 629void 630pmap_page_init(vm_page_t m) 631{ 632 633 TAILQ_INIT(&m->md.pv_list); 634 m->md.pat_mode = PAT_WRITE_BACK; 635} 636 637#ifdef PAE 638static void * 639pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) 640{ 641 642 /* Inform UMA that this allocator uses kernel_map/object. */ 643 *flags = UMA_SLAB_KERNEL; 644 return ((void *)kmem_alloc_contig(kernel_map, bytes, wait, 0x0ULL, 645 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); 646} 647#endif 648 649/* 650 * ABuse the pte nodes for unmapped kva to thread a kva freelist through. 651 * Requirements: 652 * - Must deal with pages in order to ensure that none of the PG_* bits 653 * are ever set, PG_V in particular. 654 * - Assumes we can write to ptes without pte_store() atomic ops, even 655 * on PAE systems. This should be ok. 656 * - Assumes nothing will ever test these addresses for 0 to indicate 657 * no mapping instead of correctly checking PG_V. 658 * - Assumes a vm_offset_t will fit in a pte (true for i386). 659 * Because PG_V is never set, there can be no mappings to invalidate. 660 */ 661static vm_offset_t 662pmap_ptelist_alloc(vm_offset_t *head) 663{ 664 pt_entry_t *pte; 665 vm_offset_t va; 666 667 va = *head; 668 if (va == 0) 669 return (va); /* Out of memory */ 670 pte = vtopte(va); 671 *head = *pte; 672 if (*head & PG_V) 673 panic("pmap_ptelist_alloc: va with PG_V set!"); 674 *pte = 0; 675 return (va); 676} 677 678static void 679pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) 680{ 681 pt_entry_t *pte; 682 683 if (va & PG_V) 684 panic("pmap_ptelist_free: freeing va with PG_V set!"); 685 pte = vtopte(va); 686 *pte = *head; /* virtual! 
PG_V is 0 though */ 687 *head = va; 688} 689 690static void 691pmap_ptelist_init(vm_offset_t *head, void *base, int npages) 692{ 693 int i; 694 vm_offset_t va; 695 696 *head = 0; 697 for (i = npages - 1; i >= 0; i--) { 698 va = (vm_offset_t)base + i * PAGE_SIZE; 699 pmap_ptelist_free(head, va); 700 } 701} 702 703 704/* 705 * Initialize the pmap module. 706 * Called by vm_init, to initialize any structures that the pmap 707 * system needs to map virtual memory. 708 */ 709void 710pmap_init(void) 711{ 712 vm_page_t mpte; 713 vm_size_t s; 714 int i, pv_npg; 715 716 /* 717 * Initialize the vm page array entries for the kernel pmap's 718 * page table pages. 719 */ 720 for (i = 0; i < NKPT; i++) { 721 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); 722 KASSERT(mpte >= vm_page_array && 723 mpte < &vm_page_array[vm_page_array_size], 724 ("pmap_init: page table page is out of range")); 725 mpte->pindex = i + KPTDI; 726 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); 727 } 728 729 /* 730 * Initialize the address space (zone) for the pv entries. Set a 731 * high water mark so that the system can recover from excessive 732 * numbers of pv entries. 733 */ 734 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 735 pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; 736 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 737 pv_entry_max = roundup(pv_entry_max, _NPCPV); 738 pv_entry_high_water = 9 * (pv_entry_max / 10); 739 740 /* 741 * If the kernel is running in a virtual machine on an AMD Family 10h 742 * processor, then it must assume that MCA is enabled by the virtual 743 * machine monitor. 744 */ 745 if (vm_guest == VM_GUEST_VM && cpu_vendor_id == CPU_VENDOR_AMD && 746 CPUID_TO_FAMILY(cpu_id) == 0x10) 747 workaround_erratum383 = 1; 748 749 /* 750 * Are large page mappings supported and enabled? 751 */ 752 TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); 753 if (pseflag == 0) 754 pg_ps_enabled = 0; 755 else if (pg_ps_enabled) { 756 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, 757 ("pmap_init: can't assign to pagesizes[1]")); 758 pagesizes[1] = NBPDR; 759 } 760 761 /* 762 * Calculate the size of the pv head table for superpages. 763 */ 764 for (i = 0; phys_avail[i + 1]; i += 2); 765 pv_npg = round_4mpage(phys_avail[(i - 2) + 1]) / NBPDR; 766 767 /* 768 * Allocate memory for the pv head table for superpages. 
769 */ 770 s = (vm_size_t)(pv_npg * sizeof(struct md_page)); 771 s = round_page(s); 772 pv_table = (struct md_page *)kmem_alloc(kernel_map, s); 773 for (i = 0; i < pv_npg; i++) 774 TAILQ_INIT(&pv_table[i].pv_list); 775 776 pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); 777 pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map, 778 PAGE_SIZE * pv_maxchunks); 779 if (pv_chunkbase == NULL) 780 panic("pmap_init: not enough kvm for pv chunks"); 781 pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); 782#ifdef PAE 783 pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, 784 NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, 785 UMA_ZONE_VM | UMA_ZONE_NOFREE); 786 uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); 787#endif 788} 789 790 791SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, 792 "Max number of PV entries"); 793SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, 794 "Page share factor per proc"); 795 796SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, 797 "2/4MB page mapping counters"); 798 799static u_long pmap_pde_demotions; 800SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, 801 &pmap_pde_demotions, 0, "2/4MB page demotions"); 802 803static u_long pmap_pde_mappings; 804SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, 805 &pmap_pde_mappings, 0, "2/4MB page mappings"); 806 807static u_long pmap_pde_p_failures; 808SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, 809 &pmap_pde_p_failures, 0, "2/4MB page promotion failures"); 810 811static u_long pmap_pde_promotions; 812SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, 813 &pmap_pde_promotions, 0, "2/4MB page promotions"); 814 815/*************************************************** 816 * Low level helper routines..... 817 ***************************************************/ 818 819/* 820 * Determine the appropriate bits to set in a PTE or PDE for a specified 821 * caching mode. 822 */ 823int 824pmap_cache_bits(int mode, boolean_t is_pde) 825{ 826 int cache_bits, pat_flag, pat_idx; 827 828 if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) 829 panic("Unknown caching mode %d\n", mode); 830 831 /* The PAT bit is different for PTE's and PDE's. */ 832 pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; 833 834 /* Map the caching mode to a PAT index. */ 835 pat_idx = pat_index[mode]; 836 837 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ 838 cache_bits = 0; 839 if (pat_idx & 0x4) 840 cache_bits |= pat_flag; 841 if (pat_idx & 0x2) 842 cache_bits |= PG_NC_PCD; 843 if (pat_idx & 0x1) 844 cache_bits |= PG_NC_PWT; 845 return (cache_bits); 846} 847 848/* 849 * The caller is responsible for maintaining TLB consistency. 850 */ 851static void 852pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde) 853{ 854 pd_entry_t *pde; 855 pmap_t pmap; 856 boolean_t PTD_updated; 857 858 PTD_updated = FALSE; 859 mtx_lock_spin(&allpmaps_lock); 860 LIST_FOREACH(pmap, &allpmaps, pm_list) { 861 if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & 862 PG_FRAME)) 863 PTD_updated = TRUE; 864 pde = pmap_pde(pmap, va); 865 pde_store(pde, newpde); 866 } 867 mtx_unlock_spin(&allpmaps_lock); 868 KASSERT(PTD_updated, 869 ("pmap_kenter_pde: current page table is not in allpmaps")); 870} 871 872/* 873 * After changing the page size for the specified virtual address in the page 874 * table, flush the corresponding entries from the processor's TLB. Only the 875 * calling processor's TLB is affected. 
876 * 877 * The calling thread must be pinned to a processor. 878 */ 879static void 880pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) 881{ 882 u_long cr4; 883 884 if ((newpde & PG_PS) == 0) 885 /* Demotion: flush a specific 2MB page mapping. */ 886 invlpg(va); 887 else if ((newpde & PG_G) == 0) 888 /* 889 * Promotion: flush every 4KB page mapping from the TLB 890 * because there are too many to flush individually. 891 */ 892 invltlb(); 893 else { 894 /* 895 * Promotion: flush every 4KB page mapping from the TLB, 896 * including any global (PG_G) mappings. 897 */ 898 cr4 = rcr4(); 899 load_cr4(cr4 & ~CR4_PGE); 900 /* 901 * Although preemption at this point could be detrimental to 902 * performance, it would not lead to an error. PG_G is simply 903 * ignored if CR4.PGE is clear. Moreover, in case this block 904 * is re-entered, the load_cr4() either above or below will 905 * modify CR4.PGE flushing the TLB. 906 */ 907 load_cr4(cr4 | CR4_PGE); 908 } 909} 910#ifdef SMP 911/* 912 * For SMP, these functions have to use the IPI mechanism for coherence. 913 * 914 * N.B.: Before calling any of the following TLB invalidation functions, 915 * the calling processor must ensure that all stores updating a non- 916 * kernel page table are globally performed. Otherwise, another 917 * processor could cache an old, pre-update entry without being 918 * invalidated. This can happen one of two ways: (1) The pmap becomes 919 * active on another processor after its pm_active field is checked by 920 * one of the following functions but before a store updating the page 921 * table is globally performed. (2) The pmap becomes active on another 922 * processor before its pm_active field is checked but due to 923 * speculative loads one of the following functions stills reads the 924 * pmap as inactive on the other processor. 925 * 926 * The kernel page table is exempt because its pm_active field is 927 * immutable. The kernel page table is always active on every 928 * processor. 
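 *
 * Put differently, as a simplified sketch of the intended calling pattern
 * (not an additional requirement): the store updating the page table is
 * issued, and globally performed, before the invalidation call, e.g.
 *
 *	pte_store(pte, newpte);
 *	pmap_invalidate_page(pmap, va);
 *
 * rather than the reverse order.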
929 */ 930void 931pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 932{ 933 cpumask_t cpumask, other_cpus; 934 935 sched_pin(); 936 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { 937 invlpg(va); 938 smp_invlpg(va); 939 } else { 940 cpumask = PCPU_GET(cpumask); 941 other_cpus = PCPU_GET(other_cpus); 942 if (pmap->pm_active & cpumask) 943 invlpg(va); 944 if (pmap->pm_active & other_cpus) 945 smp_masked_invlpg(pmap->pm_active & other_cpus, va); 946 } 947 sched_unpin(); 948} 949 950void 951pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 952{ 953 cpumask_t cpumask, other_cpus; 954 vm_offset_t addr; 955 956 sched_pin(); 957 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { 958 for (addr = sva; addr < eva; addr += PAGE_SIZE) 959 invlpg(addr); 960 smp_invlpg_range(sva, eva); 961 } else { 962 cpumask = PCPU_GET(cpumask); 963 other_cpus = PCPU_GET(other_cpus); 964 if (pmap->pm_active & cpumask) 965 for (addr = sva; addr < eva; addr += PAGE_SIZE) 966 invlpg(addr); 967 if (pmap->pm_active & other_cpus) 968 smp_masked_invlpg_range(pmap->pm_active & other_cpus, 969 sva, eva); 970 } 971 sched_unpin(); 972} 973 974void 975pmap_invalidate_all(pmap_t pmap) 976{ 977 cpumask_t cpumask, other_cpus; 978 979 sched_pin(); 980 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { 981 invltlb(); 982 smp_invltlb(); 983 } else { 984 cpumask = PCPU_GET(cpumask); 985 other_cpus = PCPU_GET(other_cpus); 986 if (pmap->pm_active & cpumask) 987 invltlb(); 988 if (pmap->pm_active & other_cpus) 989 smp_masked_invltlb(pmap->pm_active & other_cpus); 990 } 991 sched_unpin(); 992} 993 994void 995pmap_invalidate_cache(void) 996{ 997 998 sched_pin(); 999 wbinvd(); 1000 smp_cache_flush(); 1001 sched_unpin(); 1002} 1003 1004struct pde_action { 1005 cpumask_t store; /* processor that updates the PDE */ 1006 cpumask_t invalidate; /* processors that invalidate their TLB */ 1007 vm_offset_t va; 1008 pd_entry_t *pde; 1009 pd_entry_t newpde; 1010}; 1011 1012static void 1013pmap_update_pde_kernel(void *arg) 1014{ 1015 struct pde_action *act = arg; 1016 pd_entry_t *pde; 1017 pmap_t pmap; 1018 1019 if (act->store == PCPU_GET(cpumask)) 1020 /* 1021 * Elsewhere, this operation requires allpmaps_lock for 1022 * synchronization. Here, it does not because it is being 1023 * performed in the context of an all_cpus rendezvous. 1024 */ 1025 LIST_FOREACH(pmap, &allpmaps, pm_list) { 1026 pde = pmap_pde(pmap, act->va); 1027 pde_store(pde, act->newpde); 1028 } 1029} 1030 1031static void 1032pmap_update_pde_user(void *arg) 1033{ 1034 struct pde_action *act = arg; 1035 1036 if (act->store == PCPU_GET(cpumask)) 1037 pde_store(act->pde, act->newpde); 1038} 1039 1040static void 1041pmap_update_pde_teardown(void *arg) 1042{ 1043 struct pde_action *act = arg; 1044 1045 if ((act->invalidate & PCPU_GET(cpumask)) != 0) 1046 pmap_update_pde_invalidate(act->va, act->newpde); 1047} 1048 1049/* 1050 * Change the page size for the specified virtual address in a way that 1051 * prevents any possibility of the TLB ever having two entries that map the 1052 * same virtual address using different page sizes. This is the recommended 1053 * workaround for Erratum 383 on AMD Family 10h processors. It prevents a 1054 * machine check exception for a TLB state that is improperly diagnosed as a 1055 * hardware error. 
1056 */ 1057static void 1058pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1059{ 1060 struct pde_action act; 1061 cpumask_t active, cpumask; 1062 1063 sched_pin(); 1064 cpumask = PCPU_GET(cpumask); 1065 if (pmap == kernel_pmap) 1066 active = all_cpus; 1067 else 1068 active = pmap->pm_active; 1069 if ((active & PCPU_GET(other_cpus)) != 0) { 1070 act.store = cpumask; 1071 act.invalidate = active; 1072 act.va = va; 1073 act.pde = pde; 1074 act.newpde = newpde; 1075 smp_rendezvous_cpus(cpumask | active, 1076 smp_no_rendevous_barrier, pmap == kernel_pmap ? 1077 pmap_update_pde_kernel : pmap_update_pde_user, 1078 pmap_update_pde_teardown, &act); 1079 } else { 1080 if (pmap == kernel_pmap) 1081 pmap_kenter_pde(va, newpde); 1082 else 1083 pde_store(pde, newpde); 1084 if ((active & cpumask) != 0) 1085 pmap_update_pde_invalidate(va, newpde); 1086 } 1087 sched_unpin(); 1088} 1089#else /* !SMP */ 1090/* 1091 * Normal, non-SMP, 486+ invalidation functions. 1092 * We inline these within pmap.c for speed. 1093 */ 1094PMAP_INLINE void 1095pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 1096{ 1097 1098 if (pmap == kernel_pmap || pmap->pm_active) 1099 invlpg(va); 1100} 1101 1102PMAP_INLINE void 1103pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1104{ 1105 vm_offset_t addr; 1106 1107 if (pmap == kernel_pmap || pmap->pm_active) 1108 for (addr = sva; addr < eva; addr += PAGE_SIZE) 1109 invlpg(addr); 1110} 1111 1112PMAP_INLINE void 1113pmap_invalidate_all(pmap_t pmap) 1114{ 1115 1116 if (pmap == kernel_pmap || pmap->pm_active) 1117 invltlb(); 1118} 1119 1120PMAP_INLINE void 1121pmap_invalidate_cache(void) 1122{ 1123 1124 wbinvd(); 1125} 1126 1127static void 1128pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) 1129{ 1130 1131 if (pmap == kernel_pmap) 1132 pmap_kenter_pde(va, newpde); 1133 else 1134 pde_store(pde, newpde); 1135 if (pmap == kernel_pmap || pmap->pm_active) 1136 pmap_update_pde_invalidate(va, newpde); 1137} 1138#endif /* !SMP */ 1139 1140void 1141pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) 1142{ 1143 1144 KASSERT((sva & PAGE_MASK) == 0, 1145 ("pmap_invalidate_cache_range: sva not page-aligned")); 1146 KASSERT((eva & PAGE_MASK) == 0, 1147 ("pmap_invalidate_cache_range: eva not page-aligned")); 1148 1149 if (cpu_feature & CPUID_SS) 1150 ; /* If "Self Snoop" is supported, do nothing. */ 1151 else if ((cpu_feature & CPUID_CLFSH) != 0 && 1152 eva - sva < 2 * 1024 * 1024) { 1153 1154 /* 1155 * Otherwise, do per-cache line flush. Use the mfence 1156 * instruction to insure that previous stores are 1157 * included in the write-back. The processor 1158 * propagates flush to other processors in the cache 1159 * coherence domain. 1160 */ 1161 mfence(); 1162 for (; sva < eva; sva += cpu_clflush_line_size) 1163 clflush(sva); 1164 mfence(); 1165 } else { 1166 1167 /* 1168 * No targeted cache flush methods are supported by CPU, 1169 * or the supplied range is bigger than 2MB. 1170 * Globally invalidate cache. 1171 */ 1172 pmap_invalidate_cache(); 1173 } 1174} 1175 1176/* 1177 * Are we current address space or kernel? N.B. We return FALSE when 1178 * a pmap's page table is in use because a kernel thread is borrowing 1179 * it. The borrowed page table can change spontaneously, making any 1180 * dependence on its continued use subject to a race condition. 
1181 */ 1182static __inline int 1183pmap_is_current(pmap_t pmap) 1184{ 1185 1186 return (pmap == kernel_pmap || 1187 (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) && 1188 (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME))); 1189} 1190 1191/* 1192 * If the given pmap is not the current or kernel pmap, the returned pte must 1193 * be released by passing it to pmap_pte_release(). 1194 */ 1195pt_entry_t * 1196pmap_pte(pmap_t pmap, vm_offset_t va) 1197{ 1198 pd_entry_t newpf; 1199 pd_entry_t *pde; 1200 1201 pde = pmap_pde(pmap, va); 1202 if (*pde & PG_PS) 1203 return (pde); 1204 if (*pde != 0) { 1205 /* are we current address space or kernel? */ 1206 if (pmap_is_current(pmap)) 1207 return (vtopte(va)); 1208 mtx_lock(&PMAP2mutex); 1209 newpf = *pde & PG_FRAME; 1210 if ((*PMAP2 & PG_FRAME) != newpf) { 1211 *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; 1212 pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); 1213 } 1214 return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); 1215 } 1216 return (NULL); 1217} 1218 1219/* 1220 * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte 1221 * being NULL. 1222 */ 1223static __inline void 1224pmap_pte_release(pt_entry_t *pte) 1225{ 1226 1227 if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) 1228 mtx_unlock(&PMAP2mutex); 1229} 1230 1231static __inline void 1232invlcaddr(void *caddr) 1233{ 1234 1235 invlpg((u_int)caddr); 1236} 1237 1238/* 1239 * Super fast pmap_pte routine best used when scanning 1240 * the pv lists. This eliminates many coarse-grained 1241 * invltlb calls. Note that many of the pv list 1242 * scans are across different pmaps. It is very wasteful 1243 * to do an entire invltlb for checking a single mapping. 1244 * 1245 * If the given pmap is not the current pmap, vm_page_queue_mtx 1246 * must be held and curthread pinned to a CPU. 1247 */ 1248static pt_entry_t * 1249pmap_pte_quick(pmap_t pmap, vm_offset_t va) 1250{ 1251 pd_entry_t newpf; 1252 pd_entry_t *pde; 1253 1254 pde = pmap_pde(pmap, va); 1255 if (*pde & PG_PS) 1256 return (pde); 1257 if (*pde != 0) { 1258 /* are we current address space or kernel? */ 1259 if (pmap_is_current(pmap)) 1260 return (vtopte(va)); 1261 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1262 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 1263 newpf = *pde & PG_FRAME; 1264 if ((*PMAP1 & PG_FRAME) != newpf) { 1265 *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; 1266#ifdef SMP 1267 PMAP1cpu = PCPU_GET(cpuid); 1268#endif 1269 invlcaddr(PADDR1); 1270 PMAP1changed++; 1271 } else 1272#ifdef SMP 1273 if (PMAP1cpu != PCPU_GET(cpuid)) { 1274 PMAP1cpu = PCPU_GET(cpuid); 1275 invlcaddr(PADDR1); 1276 PMAP1changedcpu++; 1277 } else 1278#endif 1279 PMAP1unchanged++; 1280 return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); 1281 } 1282 return (0); 1283} 1284 1285/* 1286 * Routine: pmap_extract 1287 * Function: 1288 * Extract the physical page address associated 1289 * with the given map/virtual_address pair. 1290 */ 1291vm_paddr_t 1292pmap_extract(pmap_t pmap, vm_offset_t va) 1293{ 1294 pt_entry_t pte, *ptep; 1295 vm_paddr_t rtval; 1296 1297 rtval = 0; 1298 PMAP_LOCK(pmap); 1299 ptep = pmap_pte(pmap, va); 1300 pte = (ptep != NULL) ? 
*ptep : 0; 1301 pmap_pte_release(ptep); 1302 PMAP_UNLOCK(pmap); 1303 if ((pte & PG_V) != 0) { 1304 if ((pte & PG_PS) != 0) 1305 rtval = (pte & PG_PS_FRAME) | (va & PDRMASK); 1306 else 1307 rtval = (pte & PG_FRAME) | (va & PAGE_MASK); 1308 } 1309 return (rtval); 1310} 1311 1312/* 1313 * Routine: pmap_extract_and_hold 1314 * Function: 1315 * Atomically extract and hold the physical page 1316 * with the given pmap and virtual address pair 1317 * if that mapping permits the given protection. 1318 */ 1319vm_page_t 1320pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1321{ 1322 pt_entry_t pte, *ptep; 1323 vm_paddr_t locked_pa, pa; 1324 vm_page_t m; 1325 1326 locked_pa = 0; 1327 m = NULL; 1328 PMAP_LOCK(pmap); 1329retry: 1330 ptep = pmap_pte(pmap, va); 1331 pte = (ptep != NULL) ? *ptep : 0; 1332 pmap_pte_release(ptep); 1333 if ((pte & PG_V) != 0 && 1334 ((pte & PG_RW) != 0 || (prot & VM_PROT_WRITE) == 0)) { 1335 if ((pte & PG_PS) != 0) { 1336 /* Compute the physical address of the 4KB page. */ 1337 pa = (pte & PG_PS_FRAME) | (va & PG_FRAME & PDRMASK); 1338 } else 1339 pa = pte & PG_FRAME; 1340 if (vm_page_pa_tryrelock(pmap, pa, &locked_pa)) 1341 goto retry; 1342 m = PHYS_TO_VM_PAGE(pa); 1343 vm_page_hold(m); 1344 PA_UNLOCK(locked_pa); 1345 } 1346 PMAP_UNLOCK(pmap); 1347 return (m); 1348} 1349 1350/*************************************************** 1351 * Low level mapping routines..... 1352 ***************************************************/ 1353 1354/* 1355 * Add a wired page to the kva. 1356 * Note: not SMP coherent. 1357 * 1358 * This function may be used before pmap_bootstrap() is called. 1359 */ 1360PMAP_INLINE void 1361pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1362{ 1363 pt_entry_t *pte; 1364 1365 pte = vtopte(va); 1366 pte_store(pte, pa | PG_RW | PG_V | pgeflag); 1367} 1368 1369static __inline void 1370pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 1371{ 1372 pt_entry_t *pte; 1373 1374 pte = vtopte(va); 1375 pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); 1376} 1377 1378/* 1379 * Remove a page from the kernel pagetables. 1380 * Note: not SMP coherent. 1381 * 1382 * This function may be used before pmap_bootstrap() is called. 1383 */ 1384PMAP_INLINE void 1385pmap_kremove(vm_offset_t va) 1386{ 1387 pt_entry_t *pte; 1388 1389 pte = vtopte(va); 1390 pte_clear(pte); 1391} 1392 1393/* 1394 * Used to map a range of physical addresses into kernel 1395 * virtual address space. 1396 * 1397 * The value passed in '*virt' is a suggested virtual address for 1398 * the mapping. Architectures which can support a direct-mapped 1399 * physical to virtual region can return the appropriate address 1400 * within that region, leaving '*virt' unchanged. Other 1401 * architectures should map the pages starting at '*virt' and 1402 * update '*virt' with the first usable address after the mapped 1403 * region. 1404 */ 1405vm_offset_t 1406pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1407{ 1408 vm_offset_t va, sva; 1409 1410 va = sva = *virt; 1411 while (start < end) { 1412 pmap_kenter(va, start); 1413 va += PAGE_SIZE; 1414 start += PAGE_SIZE; 1415 } 1416 pmap_invalidate_range(kernel_pmap, sva, va); 1417 *virt = va; 1418 return (sva); 1419} 1420 1421 1422/* 1423 * Add a list of wired pages to the kva 1424 * this routine is only used for temporary 1425 * kernel mappings that do not need to have 1426 * page modification or references recorded. 1427 * Note that old mappings are simply written 1428 * over. The page *must* be wired. 
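 *
 * A typical usage sketch (names are illustrative only): the caller reserves
 * page-aligned KVA, e.g. with kmem_alloc_nofault(), windows a set of wired
 * pages into it, and later tears the window down:
 *
 *	pmap_qenter(sva, ma, npages);
 *	... access the pages through sva ...
 *	pmap_qremove(sva, npages);
 *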
1429 * Note: SMP coherent. Uses a ranged shootdown IPI. 1430 */ 1431void 1432pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1433{ 1434 pt_entry_t *endpte, oldpte, pa, *pte; 1435 vm_page_t m; 1436 1437 oldpte = 0; 1438 pte = vtopte(sva); 1439 endpte = pte + count; 1440 while (pte < endpte) { 1441 m = *ma++; 1442 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); 1443 if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { 1444 oldpte |= *pte; 1445 pte_store(pte, pa | pgeflag | PG_RW | PG_V); 1446 } 1447 pte++; 1448 } 1449 if (__predict_false((oldpte & PG_V) != 0)) 1450 pmap_invalidate_range(kernel_pmap, sva, sva + count * 1451 PAGE_SIZE); 1452} 1453 1454/* 1455 * This routine tears out page mappings from the 1456 * kernel -- it is meant only for temporary mappings. 1457 * Note: SMP coherent. Uses a ranged shootdown IPI. 1458 */ 1459void 1460pmap_qremove(vm_offset_t sva, int count) 1461{ 1462 vm_offset_t va; 1463 1464 va = sva; 1465 while (count-- > 0) { 1466 pmap_kremove(va); 1467 va += PAGE_SIZE; 1468 } 1469 pmap_invalidate_range(kernel_pmap, sva, va); 1470} 1471 1472/*************************************************** 1473 * Page table page management routines..... 1474 ***************************************************/ 1475static __inline void 1476pmap_free_zero_pages(vm_page_t free) 1477{ 1478 vm_page_t m; 1479 1480 while (free != NULL) { 1481 m = free; 1482 free = m->right; 1483 /* Preserve the page's PG_ZERO setting. */ 1484 vm_page_free_toq(m); 1485 } 1486} 1487 1488/* 1489 * Schedule the specified unused page table page to be freed. Specifically, 1490 * add the page to the specified list of pages that will be released to the 1491 * physical memory manager after the TLB has been updated. 1492 */ 1493static __inline void 1494pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO) 1495{ 1496 1497 if (set_PG_ZERO) 1498 m->flags |= PG_ZERO; 1499 else 1500 m->flags &= ~PG_ZERO; 1501 m->right = *free; 1502 *free = m; 1503} 1504 1505/* 1506 * Inserts the specified page table page into the specified pmap's collection 1507 * of idle page table pages. Each of a pmap's page table pages is responsible 1508 * for mapping a distinct range of virtual addresses. The pmap's collection is 1509 * ordered by this virtual address range. 1510 */ 1511static void 1512pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) 1513{ 1514 vm_page_t root; 1515 1516 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1517 root = pmap->pm_root; 1518 if (root == NULL) { 1519 mpte->left = NULL; 1520 mpte->right = NULL; 1521 } else { 1522 root = vm_page_splay(mpte->pindex, root); 1523 if (mpte->pindex < root->pindex) { 1524 mpte->left = root->left; 1525 mpte->right = root; 1526 root->left = NULL; 1527 } else if (mpte->pindex == root->pindex) 1528 panic("pmap_insert_pt_page: pindex already inserted"); 1529 else { 1530 mpte->right = root->right; 1531 mpte->left = root; 1532 root->right = NULL; 1533 } 1534 } 1535 pmap->pm_root = mpte; 1536} 1537 1538/* 1539 * Looks for a page table page mapping the specified virtual address in the 1540 * specified pmap's collection of idle page table pages. Returns NULL if there 1541 * is no page table page corresponding to the specified virtual address. 
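 *
 * (Implementation note: the collection is kept as a splay tree of vm_page
 * structures keyed by pindex, i.e. va >> PDRSHIFT, so the lookup below
 * splays the tree on that index and then checks the new root.)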
1542 */ 1543static vm_page_t 1544pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va) 1545{ 1546 vm_page_t mpte; 1547 vm_pindex_t pindex = va >> PDRSHIFT; 1548 1549 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1550 if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) { 1551 mpte = vm_page_splay(pindex, mpte); 1552 if ((pmap->pm_root = mpte)->pindex != pindex) 1553 mpte = NULL; 1554 } 1555 return (mpte); 1556} 1557 1558/* 1559 * Removes the specified page table page from the specified pmap's collection 1560 * of idle page table pages. The specified page table page must be a member of 1561 * the pmap's collection. 1562 */ 1563static void 1564pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte) 1565{ 1566 vm_page_t root; 1567 1568 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1569 if (mpte != pmap->pm_root) 1570 vm_page_splay(mpte->pindex, pmap->pm_root); 1571 if (mpte->left == NULL) 1572 root = mpte->right; 1573 else { 1574 root = vm_page_splay(mpte->pindex, mpte->left); 1575 root->right = mpte->right; 1576 } 1577 pmap->pm_root = root; 1578} 1579 1580/* 1581 * This routine unholds page table pages, and if the hold count 1582 * drops to zero, then it decrements the wire count. 1583 */ 1584static __inline int 1585pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free) 1586{ 1587 1588 --m->wire_count; 1589 if (m->wire_count == 0) 1590 return (_pmap_unwire_pte_hold(pmap, m, free)); 1591 else 1592 return (0); 1593} 1594 1595static int 1596_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free) 1597{ 1598 vm_offset_t pteva; 1599 1600 /* 1601 * unmap the page table page 1602 */ 1603 pmap->pm_pdir[m->pindex] = 0; 1604 --pmap->pm_stats.resident_count; 1605 1606 /* 1607 * This is a release store so that the ordinary store unmapping 1608 * the page table page is globally performed before TLB shoot- 1609 * down is begun. 1610 */ 1611 atomic_subtract_rel_int(&cnt.v_wire_count, 1); 1612 1613 /* 1614 * Do an invltlb to make the invalidated mapping 1615 * take effect immediately. 1616 */ 1617 pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); 1618 pmap_invalidate_page(pmap, pteva); 1619 1620 /* 1621 * Put page on a list so that it is released after 1622 * *ALL* TLB shootdown is done 1623 */ 1624 pmap_add_delayed_free_list(m, free, TRUE); 1625 1626 return (1); 1627} 1628 1629/* 1630 * After removing a page table entry, this routine is used to 1631 * conditionally free the page, and manage the hold/wire counts. 1632 */ 1633static int 1634pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free) 1635{ 1636 pd_entry_t ptepde; 1637 vm_page_t mpte; 1638 1639 if (va >= VM_MAXUSER_ADDRESS) 1640 return (0); 1641 ptepde = *pmap_pde(pmap, va); 1642 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 1643 return (pmap_unwire_pte_hold(pmap, mpte, free)); 1644} 1645 1646/* 1647 * Initialize the pmap for the swapper process. 1648 */ 1649void 1650pmap_pinit0(pmap_t pmap) 1651{ 1652 1653 PMAP_LOCK_INIT(pmap); 1654 /* 1655 * Since the page table directory is shared with the kernel pmap, 1656 * which is already included in the list "allpmaps", this pmap does 1657 * not need to be inserted into that list. 
1658 */ 1659 pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); 1660#ifdef PAE 1661 pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); 1662#endif 1663 pmap->pm_root = NULL; 1664 pmap->pm_active = 0; 1665 PCPU_SET(curpmap, pmap); 1666 TAILQ_INIT(&pmap->pm_pvchunk); 1667 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1668} 1669 1670/* 1671 * Initialize a preallocated and zeroed pmap structure, 1672 * such as one in a vmspace structure. 1673 */ 1674int 1675pmap_pinit(pmap_t pmap) 1676{ 1677 vm_page_t m, ptdpg[NPGPTD]; 1678 vm_paddr_t pa; 1679 static int color; 1680 int i; 1681 1682 PMAP_LOCK_INIT(pmap); 1683 1684 /* 1685 * No need to allocate page table space yet but we do need a valid 1686 * page directory table. 1687 */ 1688 if (pmap->pm_pdir == NULL) { 1689 pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map, 1690 NBPTD); 1691 1692 if (pmap->pm_pdir == NULL) { 1693 PMAP_LOCK_DESTROY(pmap); 1694 return (0); 1695 } 1696#ifdef PAE 1697 pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); 1698 KASSERT(((vm_offset_t)pmap->pm_pdpt & 1699 ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, 1700 ("pmap_pinit: pdpt misaligned")); 1701 KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), 1702 ("pmap_pinit: pdpt above 4g")); 1703#endif 1704 pmap->pm_root = NULL; 1705 } 1706 KASSERT(pmap->pm_root == NULL, 1707 ("pmap_pinit: pmap has reserved page table page(s)")); 1708 1709 /* 1710 * allocate the page directory page(s) 1711 */ 1712 for (i = 0; i < NPGPTD;) { 1713 m = vm_page_alloc(NULL, color++, 1714 VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 1715 VM_ALLOC_ZERO); 1716 if (m == NULL) 1717 VM_WAIT; 1718 else { 1719 ptdpg[i++] = m; 1720 } 1721 } 1722 1723 pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); 1724 1725 for (i = 0; i < NPGPTD; i++) { 1726 if ((ptdpg[i]->flags & PG_ZERO) == 0) 1727 bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE); 1728 } 1729 1730 mtx_lock_spin(&allpmaps_lock); 1731 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 1732 /* Copy the kernel page table directory entries. */ 1733 bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); 1734 mtx_unlock_spin(&allpmaps_lock); 1735 1736 /* install self-referential address mapping entry(s) */ 1737 for (i = 0; i < NPGPTD; i++) { 1738 pa = VM_PAGE_TO_PHYS(ptdpg[i]); 1739 pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; 1740#ifdef PAE 1741 pmap->pm_pdpt[i] = pa | PG_V; 1742#endif 1743 } 1744 1745 pmap->pm_active = 0; 1746 TAILQ_INIT(&pmap->pm_pvchunk); 1747 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1748 1749 return (1); 1750} 1751 1752/* 1753 * this routine is called if the page table page is not 1754 * mapped correctly. 1755 */ 1756static vm_page_t 1757_pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags) 1758{ 1759 vm_paddr_t ptepa; 1760 vm_page_t m; 1761 1762 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || 1763 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, 1764 ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); 1765 1766 /* 1767 * Allocate a page table page. 1768 */ 1769 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 1770 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 1771 if (flags & M_WAITOK) { 1772 PMAP_UNLOCK(pmap); 1773 vm_page_unlock_queues(); 1774 VM_WAIT; 1775 vm_page_lock_queues(); 1776 PMAP_LOCK(pmap); 1777 } 1778 1779 /* 1780 * Indicate the need to retry. While waiting, the page table 1781 * page may have been allocated. 
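		 * (The caller, pmap_allocpte(), re-reads the page directory
		 * entry and retries the allocation when M_WAITOK was passed;
		 * see the "goto retry" there.)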
1782 */ 1783 return (NULL); 1784 } 1785 if ((m->flags & PG_ZERO) == 0) 1786 pmap_zero_page(m); 1787 1788 /* 1789 * Map the pagetable page into the process address space, if 1790 * it isn't already there. 1791 */ 1792 1793 pmap->pm_stats.resident_count++; 1794 1795 ptepa = VM_PAGE_TO_PHYS(m); 1796 pmap->pm_pdir[ptepindex] = 1797 (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); 1798 1799 return (m); 1800} 1801 1802static vm_page_t 1803pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags) 1804{ 1805 unsigned ptepindex; 1806 pd_entry_t ptepa; 1807 vm_page_t m; 1808 1809 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || 1810 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, 1811 ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); 1812 1813 /* 1814 * Calculate pagetable page index 1815 */ 1816 ptepindex = va >> PDRSHIFT; 1817retry: 1818 /* 1819 * Get the page directory entry 1820 */ 1821 ptepa = pmap->pm_pdir[ptepindex]; 1822 1823 /* 1824 * This supports switching from a 4MB page to a 1825 * normal 4K page. 1826 */ 1827 if (ptepa & PG_PS) { 1828 (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); 1829 ptepa = pmap->pm_pdir[ptepindex]; 1830 } 1831 1832 /* 1833 * If the page table page is mapped, we just increment the 1834 * hold count, and activate it. 1835 */ 1836 if (ptepa) { 1837 m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); 1838 m->wire_count++; 1839 } else { 1840 /* 1841 * Here if the pte page isn't mapped, or if it has 1842 * been deallocated. 1843 */ 1844 m = _pmap_allocpte(pmap, ptepindex, flags); 1845 if (m == NULL && (flags & M_WAITOK)) 1846 goto retry; 1847 } 1848 return (m); 1849} 1850 1851 1852/*************************************************** 1853* Pmap allocation/deallocation routines. 1854 ***************************************************/ 1855 1856#ifdef SMP 1857/* 1858 * Deal with a SMP shootdown of other users of the pmap that we are 1859 * trying to dispose of. This can be a bit hairy. 
1860 */ 1861static cpumask_t *lazymask; 1862static u_int lazyptd; 1863static volatile u_int lazywait; 1864 1865void pmap_lazyfix_action(void); 1866 1867void 1868pmap_lazyfix_action(void) 1869{ 1870 cpumask_t mymask = PCPU_GET(cpumask); 1871 1872#ifdef COUNT_IPIS 1873 (*ipi_lazypmap_counts[PCPU_GET(cpuid)])++; 1874#endif 1875 if (rcr3() == lazyptd) 1876 load_cr3(PCPU_GET(curpcb)->pcb_cr3); 1877 atomic_clear_int(lazymask, mymask); 1878 atomic_store_rel_int(&lazywait, 1); 1879} 1880 1881static void 1882pmap_lazyfix_self(cpumask_t mymask) 1883{ 1884 1885 if (rcr3() == lazyptd) 1886 load_cr3(PCPU_GET(curpcb)->pcb_cr3); 1887 atomic_clear_int(lazymask, mymask); 1888} 1889 1890 1891static void 1892pmap_lazyfix(pmap_t pmap) 1893{ 1894 cpumask_t mymask, mask; 1895 u_int spins; 1896 1897 while ((mask = pmap->pm_active) != 0) { 1898 spins = 50000000; 1899 mask = mask & -mask; /* Find least significant set bit */ 1900 mtx_lock_spin(&smp_ipi_mtx); 1901#ifdef PAE 1902 lazyptd = vtophys(pmap->pm_pdpt); 1903#else 1904 lazyptd = vtophys(pmap->pm_pdir); 1905#endif 1906 mymask = PCPU_GET(cpumask); 1907 if (mask == mymask) { 1908 lazymask = &pmap->pm_active; 1909 pmap_lazyfix_self(mymask); 1910 } else { 1911 atomic_store_rel_int((u_int *)&lazymask, 1912 (u_int)&pmap->pm_active); 1913 atomic_store_rel_int(&lazywait, 0); 1914 ipi_selected(mask, IPI_LAZYPMAP); 1915 while (lazywait == 0) { 1916 ia32_pause(); 1917 if (--spins == 0) 1918 break; 1919 } 1920 } 1921 mtx_unlock_spin(&smp_ipi_mtx); 1922 if (spins == 0) 1923 printf("pmap_lazyfix: spun for 50000000\n"); 1924 } 1925} 1926 1927#else /* SMP */ 1928 1929/* 1930 * Cleaning up on uniprocessor is easy. For various reasons, we're 1931 * unlikely to have to even execute this code, including the fact 1932 * that the cleanup is deferred until the parent does a wait(2), which 1933 * means that another userland process has run. 1934 */ 1935static void 1936pmap_lazyfix(pmap_t pmap) 1937{ 1938 u_int cr3; 1939 1940 cr3 = vtophys(pmap->pm_pdir); 1941 if (cr3 == rcr3()) { 1942 load_cr3(PCPU_GET(curpcb)->pcb_cr3); 1943 pmap->pm_active &= ~(PCPU_GET(cpumask)); 1944 } 1945} 1946#endif /* SMP */ 1947 1948/* 1949 * Release any resources held by the given physical map. 1950 * Called when a pmap initialized by pmap_pinit is being released. 1951 * Should only be called if the map contains no valid mappings. 
1952 */ 1953void 1954pmap_release(pmap_t pmap) 1955{ 1956 vm_page_t m, ptdpg[NPGPTD]; 1957 int i; 1958 1959 KASSERT(pmap->pm_stats.resident_count == 0, 1960 ("pmap_release: pmap resident count %ld != 0", 1961 pmap->pm_stats.resident_count)); 1962 KASSERT(pmap->pm_root == NULL, 1963 ("pmap_release: pmap has reserved page table page(s)")); 1964 1965 pmap_lazyfix(pmap); 1966 mtx_lock_spin(&allpmaps_lock); 1967 LIST_REMOVE(pmap, pm_list); 1968 mtx_unlock_spin(&allpmaps_lock); 1969 1970 for (i = 0; i < NPGPTD; i++) 1971 ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] & 1972 PG_FRAME); 1973 1974 bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * 1975 sizeof(*pmap->pm_pdir)); 1976 1977 pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); 1978 1979 for (i = 0; i < NPGPTD; i++) { 1980 m = ptdpg[i]; 1981#ifdef PAE 1982 KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), 1983 ("pmap_release: got wrong ptd page")); 1984#endif 1985 m->wire_count--; 1986 atomic_subtract_int(&cnt.v_wire_count, 1); 1987 vm_page_free_zero(m); 1988 } 1989 PMAP_LOCK_DESTROY(pmap); 1990} 1991 1992static int 1993kvm_size(SYSCTL_HANDLER_ARGS) 1994{ 1995 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; 1996 1997 return (sysctl_handle_long(oidp, &ksize, 0, req)); 1998} 1999SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 2000 0, 0, kvm_size, "IU", "Size of KVM"); 2001 2002static int 2003kvm_free(SYSCTL_HANDLER_ARGS) 2004{ 2005 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 2006 2007 return (sysctl_handle_long(oidp, &kfree, 0, req)); 2008} 2009SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 2010 0, 0, kvm_free, "IU", "Amount of KVM free"); 2011 2012/* 2013 * grow the number of kernel page table entries, if needed 2014 */ 2015void 2016pmap_growkernel(vm_offset_t addr) 2017{ 2018 vm_paddr_t ptppaddr; 2019 vm_page_t nkpg; 2020 pd_entry_t newpdir; 2021 2022 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2023 addr = roundup2(addr, NBPDR); 2024 if (addr - 1 >= kernel_map->max_offset) 2025 addr = kernel_map->max_offset; 2026 while (kernel_vm_end < addr) { 2027 if (pdir_pde(PTD, kernel_vm_end)) { 2028 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2029 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2030 kernel_vm_end = kernel_map->max_offset; 2031 break; 2032 } 2033 continue; 2034 } 2035 2036 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, 2037 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 2038 VM_ALLOC_ZERO); 2039 if (nkpg == NULL) 2040 panic("pmap_growkernel: no memory to grow kernel"); 2041 2042 nkpt++; 2043 2044 if ((nkpg->flags & PG_ZERO) == 0) 2045 pmap_zero_page(nkpg); 2046 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 2047 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); 2048 pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir; 2049 2050 pmap_kenter_pde(kernel_vm_end, newpdir); 2051 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2052 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2053 kernel_vm_end = kernel_map->max_offset; 2054 break; 2055 } 2056 } 2057} 2058 2059 2060/*************************************************** 2061 * page management routines. 
2062 ***************************************************/ 2063 2064CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 2065CTASSERT(_NPCM == 11); 2066 2067static __inline struct pv_chunk * 2068pv_to_chunk(pv_entry_t pv) 2069{ 2070 2071 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 2072} 2073 2074#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2075 2076#define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ 2077#define PC_FREE10 0x0000fffful /* Free values for index 10 */ 2078 2079static uint32_t pc_freemask[11] = { 2080 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2081 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2082 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2083 PC_FREE0_9, PC_FREE10 2084}; 2085 2086SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2087 "Current number of pv entries"); 2088 2089#ifdef PV_STATS 2090static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2091 2092SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2093 "Current number of pv entry chunks"); 2094SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2095 "Current number of pv entry chunks allocated"); 2096SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2097 "Current number of pv entry chunks frees"); 2098SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2099 "Number of times tried to get a chunk page but failed."); 2100 2101static long pv_entry_frees, pv_entry_allocs; 2102static int pv_entry_spare; 2103 2104SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2105 "Current number of pv entry frees"); 2106SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2107 "Current number of pv entry allocs"); 2108SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2109 "Current number of spare pv entries"); 2110 2111static int pmap_collect_inactive, pmap_collect_active; 2112 2113SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0, 2114 "Current number times pmap_collect called on inactive queue"); 2115SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0, 2116 "Current number times pmap_collect called on active queue"); 2117#endif 2118 2119/* 2120 * We are in a serious low memory condition. Resort to 2121 * drastic measures to free some pages so we can allocate 2122 * another pv entry chunk. This is normally called to 2123 * unmap inactive pages, and if necessary, active pages. 2124 */ 2125static void 2126pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq) 2127{ 2128 pd_entry_t *pde; 2129 pmap_t pmap; 2130 pt_entry_t *pte, tpte; 2131 pv_entry_t next_pv, pv; 2132 vm_offset_t va; 2133 vm_page_t m, free; 2134 2135 sched_pin(); 2136 TAILQ_FOREACH(m, &vpq->pl, pageq) { 2137 if (m->hold_count || m->busy) 2138 continue; 2139 TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) { 2140 va = pv->pv_va; 2141 pmap = PV_PMAP(pv); 2142 /* Avoid deadlock and lock recursion. 
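 * Locks are taken in a global order given by the pmap addresses: a
 * pmap that sorts after the lock already held may be locked
 * unconditionally, while any other pmap is only trylocked and skipped
 * on failure.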
*/ 2143 if (pmap > locked_pmap) 2144 PMAP_LOCK(pmap); 2145 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) 2146 continue; 2147 pmap->pm_stats.resident_count--; 2148 pde = pmap_pde(pmap, va); 2149 KASSERT((*pde & PG_PS) == 0, ("pmap_collect: found" 2150 " a 4mpage in page %p's pv list", m)); 2151 pte = pmap_pte_quick(pmap, va); 2152 tpte = pte_load_clear(pte); 2153 KASSERT((tpte & PG_W) == 0, 2154 ("pmap_collect: wired pte %#jx", (uintmax_t)tpte)); 2155 if (tpte & PG_A) 2156 vm_page_flag_set(m, PG_REFERENCED); 2157 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2158 vm_page_dirty(m); 2159 free = NULL; 2160 pmap_unuse_pt(pmap, va, &free); 2161 pmap_invalidate_page(pmap, va); 2162 pmap_free_zero_pages(free); 2163 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2164 free_pv_entry(pmap, pv); 2165 if (pmap != locked_pmap) 2166 PMAP_UNLOCK(pmap); 2167 } 2168 if (TAILQ_EMPTY(&m->md.pv_list) && 2169 TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)) 2170 vm_page_flag_clear(m, PG_WRITEABLE); 2171 } 2172 sched_unpin(); 2173} 2174 2175 2176/* 2177 * free the pv_entry back to the free list 2178 */ 2179static void 2180free_pv_entry(pmap_t pmap, pv_entry_t pv) 2181{ 2182 vm_page_t m; 2183 struct pv_chunk *pc; 2184 int idx, field, bit; 2185 2186 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2187 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2188 PV_STAT(pv_entry_frees++); 2189 PV_STAT(pv_entry_spare++); 2190 pv_entry_count--; 2191 pc = pv_to_chunk(pv); 2192 idx = pv - &pc->pc_pventry[0]; 2193 field = idx / 32; 2194 bit = idx % 32; 2195 pc->pc_map[field] |= 1ul << bit; 2196 /* move to head of list */ 2197 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2198 for (idx = 0; idx < _NPCM; idx++) 2199 if (pc->pc_map[idx] != pc_freemask[idx]) { 2200 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2201 return; 2202 } 2203 PV_STAT(pv_entry_spare -= _NPCPV); 2204 PV_STAT(pc_chunk_count--); 2205 PV_STAT(pc_chunk_frees++); 2206 /* entire chunk is free, return it */ 2207 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2208 pmap_qremove((vm_offset_t)pc, 1); 2209 vm_page_unwire(m, 0); 2210 vm_page_free(m); 2211 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 2212} 2213 2214/* 2215 * get a new pv_entry, allocating a block from the system 2216 * when needed. 
2217 */ 2218static pv_entry_t 2219get_pv_entry(pmap_t pmap, int try) 2220{ 2221 static const struct timeval printinterval = { 60, 0 }; 2222 static struct timeval lastprint; 2223 static vm_pindex_t colour; 2224 struct vpgqueues *pq; 2225 int bit, field; 2226 pv_entry_t pv; 2227 struct pv_chunk *pc; 2228 vm_page_t m; 2229 2230 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2231 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2232 PV_STAT(pv_entry_allocs++); 2233 pv_entry_count++; 2234 if (pv_entry_count > pv_entry_high_water) 2235 if (ratecheck(&lastprint, &printinterval)) 2236 printf("Approaching the limit on PV entries, consider " 2237 "increasing either the vm.pmap.shpgperproc or the " 2238 "vm.pmap.pv_entry_max tunable.\n"); 2239 pq = NULL; 2240retry: 2241 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2242 if (pc != NULL) { 2243 for (field = 0; field < _NPCM; field++) { 2244 if (pc->pc_map[field]) { 2245 bit = bsfl(pc->pc_map[field]); 2246 break; 2247 } 2248 } 2249 if (field < _NPCM) { 2250 pv = &pc->pc_pventry[field * 32 + bit]; 2251 pc->pc_map[field] &= ~(1ul << bit); 2252 /* If this was the last item, move it to tail */ 2253 for (field = 0; field < _NPCM; field++) 2254 if (pc->pc_map[field] != 0) { 2255 PV_STAT(pv_entry_spare--); 2256 return (pv); /* not full, return */ 2257 } 2258 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2259 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2260 PV_STAT(pv_entry_spare--); 2261 return (pv); 2262 } 2263 } 2264 /* 2265 * Access to the ptelist "pv_vafree" is synchronized by the page 2266 * queues lock. If "pv_vafree" is currently non-empty, it will 2267 * remain non-empty until pmap_ptelist_alloc() completes. 2268 */ 2269 if (pv_vafree == 0 || (m = vm_page_alloc(NULL, colour, (pq == 2270 &vm_page_queues[PQ_ACTIVE] ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | 2271 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 2272 if (try) { 2273 pv_entry_count--; 2274 PV_STAT(pc_chunk_tryfail++); 2275 return (NULL); 2276 } 2277 /* 2278 * Reclaim pv entries: At first, destroy mappings to 2279 * inactive pages. After that, if a pv chunk entry 2280 * is still needed, destroy mappings to active pages. 
2281 */ 2282 if (pq == NULL) { 2283 PV_STAT(pmap_collect_inactive++); 2284 pq = &vm_page_queues[PQ_INACTIVE]; 2285 } else if (pq == &vm_page_queues[PQ_INACTIVE]) { 2286 PV_STAT(pmap_collect_active++); 2287 pq = &vm_page_queues[PQ_ACTIVE]; 2288 } else 2289 panic("get_pv_entry: increase vm.pmap.shpgperproc"); 2290 pmap_collect(pmap, pq); 2291 goto retry; 2292 } 2293 PV_STAT(pc_chunk_count++); 2294 PV_STAT(pc_chunk_allocs++); 2295 colour++; 2296 pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); 2297 pmap_qenter((vm_offset_t)pc, &m, 1); 2298 pc->pc_pmap = pmap; 2299 pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ 2300 for (field = 1; field < _NPCM; field++) 2301 pc->pc_map[field] = pc_freemask[field]; 2302 pv = &pc->pc_pventry[0]; 2303 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2304 PV_STAT(pv_entry_spare += _NPCPV - 1); 2305 return (pv); 2306} 2307 2308static __inline pv_entry_t 2309pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2310{ 2311 pv_entry_t pv; 2312 2313 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2314 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { 2315 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 2316 TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); 2317 break; 2318 } 2319 } 2320 return (pv); 2321} 2322 2323static void 2324pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2325{ 2326 struct md_page *pvh; 2327 pv_entry_t pv; 2328 vm_offset_t va_last; 2329 vm_page_t m; 2330 2331 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2332 KASSERT((pa & PDRMASK) == 0, 2333 ("pmap_pv_demote_pde: pa is not 4mpage aligned")); 2334 2335 /* 2336 * Transfer the 4mpage's pv entry for this mapping to the first 2337 * page's pv list. 2338 */ 2339 pvh = pa_to_pvh(pa); 2340 va = trunc_4mpage(va); 2341 pv = pmap_pvh_remove(pvh, pmap, va); 2342 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 2343 m = PHYS_TO_VM_PAGE(pa); 2344 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2345 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 2346 va_last = va + NBPDR - PAGE_SIZE; 2347 do { 2348 m++; 2349 KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0, 2350 ("pmap_pv_demote_pde: page %p is not managed", m)); 2351 va += PAGE_SIZE; 2352 pmap_insert_entry(pmap, va, m); 2353 } while (va < va_last); 2354} 2355 2356static void 2357pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2358{ 2359 struct md_page *pvh; 2360 pv_entry_t pv; 2361 vm_offset_t va_last; 2362 vm_page_t m; 2363 2364 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2365 KASSERT((pa & PDRMASK) == 0, 2366 ("pmap_pv_promote_pde: pa is not 4mpage aligned")); 2367 2368 /* 2369 * Transfer the first page's pv entry for this mapping to the 2370 * 4mpage's pv list. Aside from avoiding the cost of a call 2371 * to get_pv_entry(), a transfer avoids the possibility that 2372 * get_pv_entry() calls pmap_collect() and that pmap_collect() 2373 * removes one of the mappings that is being promoted. 2374 */ 2375 m = PHYS_TO_VM_PAGE(pa); 2376 va = trunc_4mpage(va); 2377 pv = pmap_pvh_remove(&m->md, pmap, va); 2378 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 2379 pvh = pa_to_pvh(pa); 2380 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); 2381 /* Free the remaining NPTEPG - 1 pv entries. 
*/ 2382 va_last = va + NBPDR - PAGE_SIZE; 2383 do { 2384 m++; 2385 va += PAGE_SIZE; 2386 pmap_pvh_free(&m->md, pmap, va); 2387 } while (va < va_last); 2388} 2389 2390static void 2391pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2392{ 2393 pv_entry_t pv; 2394 2395 pv = pmap_pvh_remove(pvh, pmap, va); 2396 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 2397 free_pv_entry(pmap, pv); 2398} 2399 2400static void 2401pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 2402{ 2403 struct md_page *pvh; 2404 2405 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2406 pmap_pvh_free(&m->md, pmap, va); 2407 if (TAILQ_EMPTY(&m->md.pv_list)) { 2408 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2409 if (TAILQ_EMPTY(&pvh->pv_list)) 2410 vm_page_flag_clear(m, PG_WRITEABLE); 2411 } 2412} 2413 2414/* 2415 * Create a pv entry for page at pa for 2416 * (pmap, va). 2417 */ 2418static void 2419pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2420{ 2421 pv_entry_t pv; 2422 2423 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2424 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2425 pv = get_pv_entry(pmap, FALSE); 2426 pv->pv_va = va; 2427 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2428} 2429 2430/* 2431 * Conditionally create a pv entry. 2432 */ 2433static boolean_t 2434pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2435{ 2436 pv_entry_t pv; 2437 2438 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2439 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2440 if (pv_entry_count < pv_entry_high_water && 2441 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2442 pv->pv_va = va; 2443 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2444 return (TRUE); 2445 } else 2446 return (FALSE); 2447} 2448 2449/* 2450 * Create the pv entries for each of the pages within a superpage. 2451 */ 2452static boolean_t 2453pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2454{ 2455 struct md_page *pvh; 2456 pv_entry_t pv; 2457 2458 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2459 if (pv_entry_count < pv_entry_high_water && 2460 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2461 pv->pv_va = va; 2462 pvh = pa_to_pvh(pa); 2463 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); 2464 return (TRUE); 2465 } else 2466 return (FALSE); 2467} 2468 2469/* 2470 * Fills a page table page with mappings to consecutive physical pages. 2471 */ 2472static void 2473pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 2474{ 2475 pt_entry_t *pte; 2476 2477 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 2478 *pte = newpte; 2479 newpte += PAGE_SIZE; 2480 } 2481} 2482 2483/* 2484 * Tries to demote a 2- or 4MB page mapping. If demotion fails, the 2485 * 2- or 4MB page mapping is invalidated. 2486 */ 2487static boolean_t 2488pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2489{ 2490 pd_entry_t newpde, oldpde; 2491 pt_entry_t *firstpte, newpte; 2492 vm_paddr_t mptepa; 2493 vm_page_t free, mpte; 2494 2495 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2496 oldpde = *pde; 2497 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 2498 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 2499 mpte = pmap_lookup_pt_page(pmap, va); 2500 if (mpte != NULL) 2501 pmap_remove_pt_page(pmap, mpte); 2502 else { 2503 KASSERT((oldpde & PG_W) == 0, 2504 ("pmap_demote_pde: page table page for a wired mapping" 2505 " is missing")); 2506 2507 /* 2508 * Invalidate the 2- or 4MB page mapping and return 2509 * "failure" if the mapping was never accessed or the 2510 * allocation of the new page table page fails. 
2511 */ 2512 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 2513 va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | 2514 VM_ALLOC_WIRED)) == NULL) { 2515 free = NULL; 2516 pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free); 2517 pmap_invalidate_page(pmap, trunc_4mpage(va)); 2518 pmap_free_zero_pages(free); 2519 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" 2520 " in pmap %p", va, pmap); 2521 return (FALSE); 2522 } 2523 if (va < VM_MAXUSER_ADDRESS) 2524 pmap->pm_stats.resident_count++; 2525 } 2526 mptepa = VM_PAGE_TO_PHYS(mpte); 2527 2528 /* 2529 * If the page mapping is in the kernel's address space, then the 2530 * KPTmap can provide access to the page table page. Otherwise, 2531 * temporarily map the page table page (mpte) into the kernel's 2532 * address space at either PADDR1 or PADDR2. 2533 */ 2534 if (va >= KERNBASE) 2535 firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; 2536 else if (curthread->td_pinned > 0 && mtx_owned(&vm_page_queue_mtx)) { 2537 if ((*PMAP1 & PG_FRAME) != mptepa) { 2538 *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; 2539#ifdef SMP 2540 PMAP1cpu = PCPU_GET(cpuid); 2541#endif 2542 invlcaddr(PADDR1); 2543 PMAP1changed++; 2544 } else 2545#ifdef SMP 2546 if (PMAP1cpu != PCPU_GET(cpuid)) { 2547 PMAP1cpu = PCPU_GET(cpuid); 2548 invlcaddr(PADDR1); 2549 PMAP1changedcpu++; 2550 } else 2551#endif 2552 PMAP1unchanged++; 2553 firstpte = PADDR1; 2554 } else { 2555 mtx_lock(&PMAP2mutex); 2556 if ((*PMAP2 & PG_FRAME) != mptepa) { 2557 *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; 2558 pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); 2559 } 2560 firstpte = PADDR2; 2561 } 2562 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 2563 KASSERT((oldpde & PG_A) != 0, 2564 ("pmap_demote_pde: oldpde is missing PG_A")); 2565 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 2566 ("pmap_demote_pde: oldpde is missing PG_M")); 2567 newpte = oldpde & ~PG_PS; 2568 if ((newpte & PG_PDE_PAT) != 0) 2569 newpte ^= PG_PDE_PAT | PG_PTE_PAT; 2570 2571 /* 2572 * If the page table page is new, initialize it. 2573 */ 2574 if (mpte->wire_count == 1) { 2575 mpte->wire_count = NPTEPG; 2576 pmap_fill_ptp(firstpte, newpte); 2577 } 2578 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 2579 ("pmap_demote_pde: firstpte and newpte map different physical" 2580 " addresses")); 2581 2582 /* 2583 * If the mapping has changed attributes, update the page table 2584 * entries. 2585 */ 2586 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 2587 pmap_fill_ptp(firstpte, newpte); 2588 2589 /* 2590 * Demote the mapping. This pmap is locked. The old PDE has 2591 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 2592 * set. Thus, there is no danger of a race with another 2593 * processor changing the setting of PG_A and/or PG_M between 2594 * the read above and the store below. 2595 */ 2596 if (workaround_erratum383) 2597 pmap_update_pde(pmap, va, pde, newpde); 2598 else if (pmap == kernel_pmap) 2599 pmap_kenter_pde(va, newpde); 2600 else 2601 pde_store(pde, newpde); 2602 if (firstpte == PADDR2) 2603 mtx_unlock(&PMAP2mutex); 2604 2605 /* 2606 * Invalidate the recursive mapping of the page table page. 2607 */ 2608 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 2609 2610 /* 2611 * Demote the pv entry. This depends on the earlier demotion 2612 * of the mapping. 
Specifically, the (re)creation of a per- 2613 * page pv entry might trigger the execution of pmap_collect(), 2614 * which might reclaim a newly (re)created per-page pv entry 2615 * and destroy the associated mapping. In order to destroy 2616 * the mapping, the PDE must have already changed from mapping 2617 * the 2mpage to referencing the page table page. 2618 */ 2619 if ((oldpde & PG_MANAGED) != 0) 2620 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); 2621 2622 pmap_pde_demotions++; 2623 CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" 2624 " in pmap %p", va, pmap); 2625 return (TRUE); 2626} 2627 2628/* 2629 * pmap_remove_pde: do the things to unmap a superpage in a process 2630 */ 2631static void 2632pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, 2633 vm_page_t *free) 2634{ 2635 struct md_page *pvh; 2636 pd_entry_t oldpde; 2637 vm_offset_t eva, va; 2638 vm_page_t m, mpte; 2639 2640 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2641 KASSERT((sva & PDRMASK) == 0, 2642 ("pmap_remove_pde: sva is not 4mpage aligned")); 2643 oldpde = pte_load_clear(pdq); 2644 if (oldpde & PG_W) 2645 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; 2646 2647 /* 2648 * Machines that don't support invlpg, also don't support 2649 * PG_G. 2650 */ 2651 if (oldpde & PG_G) 2652 pmap_invalidate_page(kernel_pmap, sva); 2653 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2654 if (oldpde & PG_MANAGED) { 2655 pvh = pa_to_pvh(oldpde & PG_PS_FRAME); 2656 pmap_pvh_free(pvh, pmap, sva); 2657 eva = sva + NBPDR; 2658 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 2659 va < eva; va += PAGE_SIZE, m++) { 2660 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2661 vm_page_dirty(m); 2662 if (oldpde & PG_A) 2663 vm_page_flag_set(m, PG_REFERENCED); 2664 if (TAILQ_EMPTY(&m->md.pv_list) && 2665 TAILQ_EMPTY(&pvh->pv_list)) 2666 vm_page_flag_clear(m, PG_WRITEABLE); 2667 } 2668 } 2669 if (pmap == kernel_pmap) { 2670 if (!pmap_demote_pde(pmap, pdq, sva)) 2671 panic("pmap_remove_pde: failed demotion"); 2672 } else { 2673 mpte = pmap_lookup_pt_page(pmap, sva); 2674 if (mpte != NULL) { 2675 pmap_remove_pt_page(pmap, mpte); 2676 pmap->pm_stats.resident_count--; 2677 KASSERT(mpte->wire_count == NPTEPG, 2678 ("pmap_remove_pde: pte page wire count error")); 2679 mpte->wire_count = 0; 2680 pmap_add_delayed_free_list(mpte, free, FALSE); 2681 atomic_subtract_int(&cnt.v_wire_count, 1); 2682 } 2683 } 2684} 2685 2686/* 2687 * pmap_remove_pte: do the things to unmap a page in a process 2688 */ 2689static int 2690pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free) 2691{ 2692 pt_entry_t oldpte; 2693 vm_page_t m; 2694 2695 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2696 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2697 oldpte = pte_load_clear(ptq); 2698 if (oldpte & PG_W) 2699 pmap->pm_stats.wired_count -= 1; 2700 /* 2701 * Machines that don't support invlpg, also don't support 2702 * PG_G. 
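 * Hence a PG_G mapping can safely be flushed with the single
 * pmap_invalidate_page() on the kernel pmap that is issued below.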
2703 */ 2704 if (oldpte & PG_G) 2705 pmap_invalidate_page(kernel_pmap, va); 2706 pmap->pm_stats.resident_count -= 1; 2707 if (oldpte & PG_MANAGED) { 2708 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 2709 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2710 vm_page_dirty(m); 2711 if (oldpte & PG_A) 2712 vm_page_flag_set(m, PG_REFERENCED); 2713 pmap_remove_entry(pmap, m, va); 2714 } 2715 return (pmap_unuse_pt(pmap, va, free)); 2716} 2717 2718/* 2719 * Remove a single page from a process address space 2720 */ 2721static void 2722pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free) 2723{ 2724 pt_entry_t *pte; 2725 2726 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2727 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 2728 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2729 if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) 2730 return; 2731 pmap_remove_pte(pmap, pte, va, free); 2732 pmap_invalidate_page(pmap, va); 2733} 2734 2735/* 2736 * Remove the given range of addresses from the specified map. 2737 * 2738 * It is assumed that the start and end are properly 2739 * rounded to the page size. 2740 */ 2741void 2742pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2743{ 2744 vm_offset_t pdnxt; 2745 pd_entry_t ptpaddr; 2746 pt_entry_t *pte; 2747 vm_page_t free = NULL; 2748 int anyvalid; 2749 2750 /* 2751 * Perform an unsynchronized read. This is, however, safe. 2752 */ 2753 if (pmap->pm_stats.resident_count == 0) 2754 return; 2755 2756 anyvalid = 0; 2757 2758 vm_page_lock_queues(); 2759 sched_pin(); 2760 PMAP_LOCK(pmap); 2761 2762 /* 2763 * special handling of removing one page. a very 2764 * common operation and easy to short circuit some 2765 * code. 2766 */ 2767 if ((sva + PAGE_SIZE == eva) && 2768 ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { 2769 pmap_remove_page(pmap, sva, &free); 2770 goto out; 2771 } 2772 2773 for (; sva < eva; sva = pdnxt) { 2774 unsigned pdirindex; 2775 2776 /* 2777 * Calculate index for next page table. 2778 */ 2779 pdnxt = (sva + NBPDR) & ~PDRMASK; 2780 if (pdnxt < sva) 2781 pdnxt = eva; 2782 if (pmap->pm_stats.resident_count == 0) 2783 break; 2784 2785 pdirindex = sva >> PDRSHIFT; 2786 ptpaddr = pmap->pm_pdir[pdirindex]; 2787 2788 /* 2789 * Weed out invalid mappings. Note: we assume that the page 2790 * directory table is always allocated, and in kernel virtual. 2791 */ 2792 if (ptpaddr == 0) 2793 continue; 2794 2795 /* 2796 * Check for large page. 2797 */ 2798 if ((ptpaddr & PG_PS) != 0) { 2799 /* 2800 * Are we removing the entire large page? If not, 2801 * demote the mapping and fall through. 2802 */ 2803 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 2804 /* 2805 * The TLB entry for a PG_G mapping is 2806 * invalidated by pmap_remove_pde(). 2807 */ 2808 if ((ptpaddr & PG_G) == 0) 2809 anyvalid = 1; 2810 pmap_remove_pde(pmap, 2811 &pmap->pm_pdir[pdirindex], sva, &free); 2812 continue; 2813 } else if (!pmap_demote_pde(pmap, 2814 &pmap->pm_pdir[pdirindex], sva)) { 2815 /* The large page mapping was destroyed. */ 2816 continue; 2817 } 2818 } 2819 2820 /* 2821 * Limit our scan to either the end of the va represented 2822 * by the current page table page, or to the end of the 2823 * range being removed. 2824 */ 2825 if (pdnxt > eva) 2826 pdnxt = eva; 2827 2828 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 2829 sva += PAGE_SIZE) { 2830 if (*pte == 0) 2831 continue; 2832 2833 /* 2834 * The TLB entry for a PG_G mapping is invalidated 2835 * by pmap_remove_pte(). 
2836 */ 2837 if ((*pte & PG_G) == 0) 2838 anyvalid = 1; 2839 if (pmap_remove_pte(pmap, pte, sva, &free)) 2840 break; 2841 } 2842 } 2843out: 2844 sched_unpin(); 2845 if (anyvalid) 2846 pmap_invalidate_all(pmap); 2847 vm_page_unlock_queues(); 2848 PMAP_UNLOCK(pmap); 2849 pmap_free_zero_pages(free); 2850} 2851 2852/* 2853 * Routine: pmap_remove_all 2854 * Function: 2855 * Removes this physical page from 2856 * all physical maps in which it resides. 2857 * Reflects back modify bits to the pager. 2858 * 2859 * Notes: 2860 * Original versions of this routine were very 2861 * inefficient because they iteratively called 2862 * pmap_remove (slow...) 2863 */ 2864 2865void 2866pmap_remove_all(vm_page_t m) 2867{ 2868 struct md_page *pvh; 2869 pv_entry_t pv; 2870 pmap_t pmap; 2871 pt_entry_t *pte, tpte; 2872 pd_entry_t *pde; 2873 vm_offset_t va; 2874 vm_page_t free; 2875 2876 KASSERT((m->flags & PG_FICTITIOUS) == 0, 2877 ("pmap_remove_all: page %p is fictitious", m)); 2878 free = NULL; 2879 vm_page_lock_queues(); 2880 sched_pin(); 2881 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2882 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 2883 va = pv->pv_va; 2884 pmap = PV_PMAP(pv); 2885 PMAP_LOCK(pmap); 2886 pde = pmap_pde(pmap, va); 2887 (void)pmap_demote_pde(pmap, pde, va); 2888 PMAP_UNLOCK(pmap); 2889 } 2890 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2891 pmap = PV_PMAP(pv); 2892 PMAP_LOCK(pmap); 2893 pmap->pm_stats.resident_count--; 2894 pde = pmap_pde(pmap, pv->pv_va); 2895 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 2896 " a 4mpage in page %p's pv list", m)); 2897 pte = pmap_pte_quick(pmap, pv->pv_va); 2898 tpte = pte_load_clear(pte); 2899 if (tpte & PG_W) 2900 pmap->pm_stats.wired_count--; 2901 if (tpte & PG_A) 2902 vm_page_flag_set(m, PG_REFERENCED); 2903 2904 /* 2905 * Update the vm_page_t clean and reference bits. 2906 */ 2907 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2908 vm_page_dirty(m); 2909 pmap_unuse_pt(pmap, pv->pv_va, &free); 2910 pmap_invalidate_page(pmap, pv->pv_va); 2911 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2912 free_pv_entry(pmap, pv); 2913 PMAP_UNLOCK(pmap); 2914 } 2915 vm_page_flag_clear(m, PG_WRITEABLE); 2916 sched_unpin(); 2917 vm_page_unlock_queues(); 2918 pmap_free_zero_pages(free); 2919} 2920 2921/* 2922 * pmap_protect_pde: do the things to protect a 4mpage in a process 2923 */ 2924static boolean_t 2925pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 2926{ 2927 pd_entry_t newpde, oldpde; 2928 vm_offset_t eva, va; 2929 vm_page_t m; 2930 boolean_t anychanged; 2931 2932 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2933 KASSERT((sva & PDRMASK) == 0, 2934 ("pmap_protect_pde: sva is not 4mpage aligned")); 2935 anychanged = FALSE; 2936retry: 2937 oldpde = newpde = *pde; 2938 if (oldpde & PG_MANAGED) { 2939 eva = sva + NBPDR; 2940 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 2941 va < eva; va += PAGE_SIZE, m++) 2942 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2943 vm_page_dirty(m); 2944 } 2945 if ((prot & VM_PROT_WRITE) == 0) 2946 newpde &= ~(PG_RW | PG_M); 2947#ifdef PAE 2948 if ((prot & VM_PROT_EXECUTE) == 0) 2949 newpde |= pg_nx; 2950#endif 2951 if (newpde != oldpde) { 2952 if (!pde_cmpset(pde, oldpde, newpde)) 2953 goto retry; 2954 if (oldpde & PG_G) 2955 pmap_invalidate_page(pmap, sva); 2956 else 2957 anychanged = TRUE; 2958 } 2959 return (anychanged); 2960} 2961 2962/* 2963 * Set the physical protection on the 2964 * specified range of this map as requested. 
2965 */ 2966void 2967pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2968{ 2969 vm_offset_t pdnxt; 2970 pd_entry_t ptpaddr; 2971 pt_entry_t *pte; 2972 int anychanged; 2973 2974 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2975 pmap_remove(pmap, sva, eva); 2976 return; 2977 } 2978 2979#ifdef PAE 2980 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 2981 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 2982 return; 2983#else 2984 if (prot & VM_PROT_WRITE) 2985 return; 2986#endif 2987 2988 anychanged = 0; 2989 2990 vm_page_lock_queues(); 2991 sched_pin(); 2992 PMAP_LOCK(pmap); 2993 for (; sva < eva; sva = pdnxt) { 2994 pt_entry_t obits, pbits; 2995 unsigned pdirindex; 2996 2997 pdnxt = (sva + NBPDR) & ~PDRMASK; 2998 if (pdnxt < sva) 2999 pdnxt = eva; 3000 3001 pdirindex = sva >> PDRSHIFT; 3002 ptpaddr = pmap->pm_pdir[pdirindex]; 3003 3004 /* 3005 * Weed out invalid mappings. Note: we assume that the page 3006 * directory table is always allocated, and in kernel virtual. 3007 */ 3008 if (ptpaddr == 0) 3009 continue; 3010 3011 /* 3012 * Check for large page. 3013 */ 3014 if ((ptpaddr & PG_PS) != 0) { 3015 /* 3016 * Are we protecting the entire large page? If not, 3017 * demote the mapping and fall through. 3018 */ 3019 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 3020 /* 3021 * The TLB entry for a PG_G mapping is 3022 * invalidated by pmap_protect_pde(). 3023 */ 3024 if (pmap_protect_pde(pmap, 3025 &pmap->pm_pdir[pdirindex], sva, prot)) 3026 anychanged = 1; 3027 continue; 3028 } else if (!pmap_demote_pde(pmap, 3029 &pmap->pm_pdir[pdirindex], sva)) { 3030 /* The large page mapping was destroyed. */ 3031 continue; 3032 } 3033 } 3034 3035 if (pdnxt > eva) 3036 pdnxt = eva; 3037 3038 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 3039 sva += PAGE_SIZE) { 3040 vm_page_t m; 3041 3042retry: 3043 /* 3044 * Regardless of whether a pte is 32 or 64 bits in 3045 * size, PG_RW, PG_A, and PG_M are among the least 3046 * significant 32 bits. 3047 */ 3048 obits = pbits = *pte; 3049 if ((pbits & PG_V) == 0) 3050 continue; 3051 3052 if ((prot & VM_PROT_WRITE) == 0) { 3053 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 3054 (PG_MANAGED | PG_M | PG_RW)) { 3055 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 3056 vm_page_dirty(m); 3057 } 3058 pbits &= ~(PG_RW | PG_M); 3059 } 3060#ifdef PAE 3061 if ((prot & VM_PROT_EXECUTE) == 0) 3062 pbits |= pg_nx; 3063#endif 3064 3065 if (pbits != obits) { 3066#ifdef PAE 3067 if (!atomic_cmpset_64(pte, obits, pbits)) 3068 goto retry; 3069#else 3070 if (!atomic_cmpset_int((u_int *)pte, obits, 3071 pbits)) 3072 goto retry; 3073#endif 3074 if (obits & PG_G) 3075 pmap_invalidate_page(pmap, sva); 3076 else 3077 anychanged = 1; 3078 } 3079 } 3080 } 3081 sched_unpin(); 3082 if (anychanged) 3083 pmap_invalidate_all(pmap); 3084 vm_page_unlock_queues(); 3085 PMAP_UNLOCK(pmap); 3086} 3087 3088/* 3089 * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are 3090 * within a single page table page (PTP) to a single 2- or 4MB page mapping. 3091 * For promotion to occur, two conditions must be met: (1) the 4KB page 3092 * mappings must map aligned, contiguous physical memory and (2) the 4KB page 3093 * mappings must have identical characteristics. 3094 * 3095 * Managed (PG_MANAGED) mappings within the kernel address space are not 3096 * promoted. The reason is that kernel PDEs are replicated in each pmap but 3097 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel 3098 * pmap. 
3099 */ 3100static void 3101pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 3102{ 3103 pd_entry_t newpde; 3104 pt_entry_t *firstpte, oldpte, pa, *pte; 3105 vm_offset_t oldpteva; 3106 vm_page_t mpte; 3107 3108 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3109 3110 /* 3111 * Examine the first PTE in the specified PTP. Abort if this PTE is 3112 * either invalid, unused, or does not map the first 4KB physical page 3113 * within a 2- or 4MB page. 3114 */ 3115 firstpte = pmap_pte_quick(pmap, trunc_4mpage(va)); 3116setpde: 3117 newpde = *firstpte; 3118 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 3119 pmap_pde_p_failures++; 3120 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3121 " in pmap %p", va, pmap); 3122 return; 3123 } 3124 if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { 3125 pmap_pde_p_failures++; 3126 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3127 " in pmap %p", va, pmap); 3128 return; 3129 } 3130 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 3131 /* 3132 * When PG_M is already clear, PG_RW can be cleared without 3133 * a TLB invalidation. 3134 */ 3135 if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & 3136 ~PG_RW)) 3137 goto setpde; 3138 newpde &= ~PG_RW; 3139 } 3140 3141 /* 3142 * Examine each of the other PTEs in the specified PTP. Abort if this 3143 * PTE maps an unexpected 4KB physical page or does not have identical 3144 * characteristics to the first PTE. 3145 */ 3146 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; 3147 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 3148setpte: 3149 oldpte = *pte; 3150 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 3151 pmap_pde_p_failures++; 3152 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3153 " in pmap %p", va, pmap); 3154 return; 3155 } 3156 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 3157 /* 3158 * When PG_M is already clear, PG_RW can be cleared 3159 * without a TLB invalidation. 3160 */ 3161 if (!atomic_cmpset_int((u_int *)pte, oldpte, 3162 oldpte & ~PG_RW)) 3163 goto setpte; 3164 oldpte &= ~PG_RW; 3165 oldpteva = (oldpte & PG_FRAME & PDRMASK) | 3166 (va & ~PDRMASK); 3167 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" 3168 " in pmap %p", oldpteva, pmap); 3169 } 3170 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 3171 pmap_pde_p_failures++; 3172 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3173 " in pmap %p", va, pmap); 3174 return; 3175 } 3176 pa -= PAGE_SIZE; 3177 } 3178 3179 /* 3180 * Save the page table page in its current state until the PDE 3181 * mapping the superpage is demoted by pmap_demote_pde() or 3182 * destroyed by pmap_remove_pde(). 3183 */ 3184 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 3185 KASSERT(mpte >= vm_page_array && 3186 mpte < &vm_page_array[vm_page_array_size], 3187 ("pmap_promote_pde: page table page is out of range")); 3188 KASSERT(mpte->pindex == va >> PDRSHIFT, 3189 ("pmap_promote_pde: page table page's pindex is wrong")); 3190 pmap_insert_pt_page(pmap, mpte); 3191 3192 /* 3193 * Promote the pv entries. 3194 */ 3195 if ((newpde & PG_MANAGED) != 0) 3196 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); 3197 3198 /* 3199 * Propagate the PAT index to its proper position. 3200 */ 3201 if ((newpde & PG_PTE_PAT) != 0) 3202 newpde ^= PG_PDE_PAT | PG_PTE_PAT; 3203 3204 /* 3205 * Map the superpage. 
3206 */ 3207 if (workaround_erratum383) 3208 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 3209 else if (pmap == kernel_pmap) 3210 pmap_kenter_pde(va, PG_PS | newpde); 3211 else 3212 pde_store(pde, PG_PS | newpde); 3213 3214 pmap_pde_promotions++; 3215 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" 3216 " in pmap %p", va, pmap); 3217} 3218 3219/* 3220 * Insert the given physical page (p) at 3221 * the specified virtual address (v) in the 3222 * target physical map with the protection requested. 3223 * 3224 * If specified, the page will be wired down, meaning 3225 * that the related pte can not be reclaimed. 3226 * 3227 * NB: This is the only routine which MAY NOT lazy-evaluate 3228 * or lose information. That is, this routine must actually 3229 * insert this page into the given map NOW. 3230 */ 3231void 3232pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m, 3233 vm_prot_t prot, boolean_t wired) 3234{ 3235 pd_entry_t *pde; 3236 pt_entry_t *pte; 3237 pt_entry_t newpte, origpte; 3238 pv_entry_t pv; 3239 vm_paddr_t opa, pa; 3240 vm_page_t mpte, om; 3241 boolean_t invlva; 3242 3243 va = trunc_page(va); 3244 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 3245 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 3246 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", 3247 va)); 3248 KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0 || 3249 (m->oflags & VPO_BUSY) != 0, 3250 ("pmap_enter: page %p is not busy", m)); 3251 3252 mpte = NULL; 3253 3254 vm_page_lock_queues(); 3255 PMAP_LOCK(pmap); 3256 sched_pin(); 3257 3258 /* 3259 * In the case that a page table page is not 3260 * resident, we are creating it here. 3261 */ 3262 if (va < VM_MAXUSER_ADDRESS) { 3263 mpte = pmap_allocpte(pmap, va, M_WAITOK); 3264 } 3265 3266 pde = pmap_pde(pmap, va); 3267 if ((*pde & PG_PS) != 0) 3268 panic("pmap_enter: attempted pmap_enter on 4MB page"); 3269 pte = pmap_pte_quick(pmap, va); 3270 3271 /* 3272 * Page Directory table entry not valid, we need a new PT page 3273 */ 3274 if (pte == NULL) { 3275 panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", 3276 (uintmax_t)pmap->pm_pdir[PTDPTDI], va); 3277 } 3278 3279 pa = VM_PAGE_TO_PHYS(m); 3280 om = NULL; 3281 origpte = *pte; 3282 opa = origpte & PG_FRAME; 3283 3284 /* 3285 * Mapping has not changed, must be protection or wiring change. 3286 */ 3287 if (origpte && (opa == pa)) { 3288 /* 3289 * Wiring change, just update stats. We don't worry about 3290 * wiring PT pages as they remain resident as long as there 3291 * are valid mappings in them. Hence, if a user page is wired, 3292 * the PT page will be also. 3293 */ 3294 if (wired && ((origpte & PG_W) == 0)) 3295 pmap->pm_stats.wired_count++; 3296 else if (!wired && (origpte & PG_W)) 3297 pmap->pm_stats.wired_count--; 3298 3299 /* 3300 * Remove extra pte reference 3301 */ 3302 if (mpte) 3303 mpte->wire_count--; 3304 3305 if (origpte & PG_MANAGED) { 3306 om = m; 3307 pa |= PG_MANAGED; 3308 } 3309 goto validate; 3310 } 3311 3312 pv = NULL; 3313 3314 /* 3315 * Mapping has changed, invalidate old range and fall through to 3316 * handle validating new mapping. 
3317 */ 3318 if (opa) { 3319 if (origpte & PG_W) 3320 pmap->pm_stats.wired_count--; 3321 if (origpte & PG_MANAGED) { 3322 om = PHYS_TO_VM_PAGE(opa); 3323 pv = pmap_pvh_remove(&om->md, pmap, va); 3324 } 3325 if (mpte != NULL) { 3326 mpte->wire_count--; 3327 KASSERT(mpte->wire_count > 0, 3328 ("pmap_enter: missing reference to page table page," 3329 " va: 0x%x", va)); 3330 } 3331 } else 3332 pmap->pm_stats.resident_count++; 3333 3334 /* 3335 * Enter on the PV list if part of our managed memory. 3336 */ 3337 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { 3338 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, 3339 ("pmap_enter: managed mapping within the clean submap")); 3340 if (pv == NULL) 3341 pv = get_pv_entry(pmap, FALSE); 3342 pv->pv_va = va; 3343 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 3344 pa |= PG_MANAGED; 3345 } else if (pv != NULL) 3346 free_pv_entry(pmap, pv); 3347 3348 /* 3349 * Increment counters 3350 */ 3351 if (wired) 3352 pmap->pm_stats.wired_count++; 3353 3354validate: 3355 /* 3356 * Now validate mapping with desired protection/wiring. 3357 */ 3358 newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V); 3359 if ((prot & VM_PROT_WRITE) != 0) { 3360 newpte |= PG_RW; 3361 if ((newpte & PG_MANAGED) != 0) 3362 vm_page_flag_set(m, PG_WRITEABLE); 3363 } 3364#ifdef PAE 3365 if ((prot & VM_PROT_EXECUTE) == 0) 3366 newpte |= pg_nx; 3367#endif 3368 if (wired) 3369 newpte |= PG_W; 3370 if (va < VM_MAXUSER_ADDRESS) 3371 newpte |= PG_U; 3372 if (pmap == kernel_pmap) 3373 newpte |= pgeflag; 3374 3375 /* 3376 * if the mapping or permission bits are different, we need 3377 * to update the pte. 3378 */ 3379 if ((origpte & ~(PG_M|PG_A)) != newpte) { 3380 newpte |= PG_A; 3381 if ((access & VM_PROT_WRITE) != 0) 3382 newpte |= PG_M; 3383 if (origpte & PG_V) { 3384 invlva = FALSE; 3385 origpte = pte_load_store(pte, newpte); 3386 if (origpte & PG_A) { 3387 if (origpte & PG_MANAGED) 3388 vm_page_flag_set(om, PG_REFERENCED); 3389 if (opa != VM_PAGE_TO_PHYS(m)) 3390 invlva = TRUE; 3391#ifdef PAE 3392 if ((origpte & PG_NX) == 0 && 3393 (newpte & PG_NX) != 0) 3394 invlva = TRUE; 3395#endif 3396 } 3397 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 3398 if ((origpte & PG_MANAGED) != 0) 3399 vm_page_dirty(om); 3400 if ((prot & VM_PROT_WRITE) == 0) 3401 invlva = TRUE; 3402 } 3403 if ((origpte & PG_MANAGED) != 0 && 3404 TAILQ_EMPTY(&om->md.pv_list) && 3405 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)) 3406 vm_page_flag_clear(om, PG_WRITEABLE); 3407 if (invlva) 3408 pmap_invalidate_page(pmap, va); 3409 } else 3410 pte_store(pte, newpte); 3411 } 3412 3413 /* 3414 * If both the page table page and the reservation are fully 3415 * populated, then attempt promotion. 3416 */ 3417 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 3418 pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0) 3419 pmap_promote_pde(pmap, pde, va); 3420 3421 sched_unpin(); 3422 vm_page_unlock_queues(); 3423 PMAP_UNLOCK(pmap); 3424} 3425 3426/* 3427 * Tries to create a 2- or 4MB page mapping. Returns TRUE if successful and 3428 * FALSE otherwise. Fails if (1) a page table page cannot be allocated without 3429 * blocking, (2) a mapping already exists at the specified virtual address, or 3430 * (3) a pv entry cannot be allocated without reclaiming another pv entry. 
3431 */ 3432static boolean_t 3433pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3434{ 3435 pd_entry_t *pde, newpde; 3436 3437 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3438 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3439 pde = pmap_pde(pmap, va); 3440 if (*pde != 0) { 3441 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3442 " in pmap %p", va, pmap); 3443 return (FALSE); 3444 } 3445 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) | 3446 PG_PS | PG_V; 3447 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { 3448 newpde |= PG_MANAGED; 3449 3450 /* 3451 * Abort this mapping if its PV entry could not be created. 3452 */ 3453 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) { 3454 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3455 " in pmap %p", va, pmap); 3456 return (FALSE); 3457 } 3458 } 3459#ifdef PAE 3460 if ((prot & VM_PROT_EXECUTE) == 0) 3461 newpde |= pg_nx; 3462#endif 3463 if (va < VM_MAXUSER_ADDRESS) 3464 newpde |= PG_U; 3465 3466 /* 3467 * Increment counters. 3468 */ 3469 pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; 3470 3471 /* 3472 * Map the superpage. 3473 */ 3474 pde_store(pde, newpde); 3475 3476 pmap_pde_mappings++; 3477 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 3478 " in pmap %p", va, pmap); 3479 return (TRUE); 3480} 3481 3482/* 3483 * Maps a sequence of resident pages belonging to the same object. 3484 * The sequence begins with the given page m_start. This page is 3485 * mapped at the given virtual address start. Each subsequent page is 3486 * mapped at a virtual address that is offset from start by the same 3487 * amount as the page is offset from m_start within the object. The 3488 * last page in the sequence is the page with the largest offset from 3489 * m_start that can be mapped at a virtual address less than the given 3490 * virtual address end. Not every virtual page between start and end 3491 * is mapped; only those for which a resident page exists with the 3492 * corresponding offset from m_start are mapped. 3493 */ 3494void 3495pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3496 vm_page_t m_start, vm_prot_t prot) 3497{ 3498 vm_offset_t va; 3499 vm_page_t m, mpte; 3500 vm_pindex_t diff, psize; 3501 3502 VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED); 3503 psize = atop(end - start); 3504 mpte = NULL; 3505 m = m_start; 3506 vm_page_lock_queues(); 3507 PMAP_LOCK(pmap); 3508 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3509 va = start + ptoa(diff); 3510 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 3511 (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 && 3512 pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 && 3513 pmap_enter_pde(pmap, va, m, prot)) 3514 m = &m[NBPDR / PAGE_SIZE - 1]; 3515 else 3516 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 3517 mpte); 3518 m = TAILQ_NEXT(m, listq); 3519 } 3520 vm_page_unlock_queues(); 3521 PMAP_UNLOCK(pmap); 3522} 3523 3524/* 3525 * this code makes some *MAJOR* assumptions: 3526 * 1. Current pmap & pmap exists. 3527 * 2. Not wired. 3528 * 3. Read access. 3529 * 4. No page table pages. 3530 * but is *MUCH* faster than pmap_enter... 
3531 */ 3532 3533void 3534pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3535{ 3536 3537 vm_page_lock_queues(); 3538 PMAP_LOCK(pmap); 3539 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); 3540 vm_page_unlock_queues(); 3541 PMAP_UNLOCK(pmap); 3542} 3543 3544static vm_page_t 3545pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3546 vm_prot_t prot, vm_page_t mpte) 3547{ 3548 pt_entry_t *pte; 3549 vm_paddr_t pa; 3550 vm_page_t free; 3551 3552 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 3553 (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0, 3554 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3555 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3556 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3557 3558 /* 3559 * In the case that a page table page is not 3560 * resident, we are creating it here. 3561 */ 3562 if (va < VM_MAXUSER_ADDRESS) { 3563 unsigned ptepindex; 3564 pd_entry_t ptepa; 3565 3566 /* 3567 * Calculate pagetable page index 3568 */ 3569 ptepindex = va >> PDRSHIFT; 3570 if (mpte && (mpte->pindex == ptepindex)) { 3571 mpte->wire_count++; 3572 } else { 3573 /* 3574 * Get the page directory entry 3575 */ 3576 ptepa = pmap->pm_pdir[ptepindex]; 3577 3578 /* 3579 * If the page table page is mapped, we just increment 3580 * the hold count, and activate it. 3581 */ 3582 if (ptepa) { 3583 if (ptepa & PG_PS) 3584 return (NULL); 3585 mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); 3586 mpte->wire_count++; 3587 } else { 3588 mpte = _pmap_allocpte(pmap, ptepindex, 3589 M_NOWAIT); 3590 if (mpte == NULL) 3591 return (mpte); 3592 } 3593 } 3594 } else { 3595 mpte = NULL; 3596 } 3597 3598 /* 3599 * This call to vtopte makes the assumption that we are 3600 * entering the page into the current pmap. In order to support 3601 * quick entry into any pmap, one would likely use pmap_pte_quick. 3602 * But that isn't as quick as vtopte. 3603 */ 3604 pte = vtopte(va); 3605 if (*pte) { 3606 if (mpte != NULL) { 3607 mpte->wire_count--; 3608 mpte = NULL; 3609 } 3610 return (mpte); 3611 } 3612 3613 /* 3614 * Enter on the PV list if part of our managed memory. 3615 */ 3616 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 && 3617 !pmap_try_insert_pv_entry(pmap, va, m)) { 3618 if (mpte != NULL) { 3619 free = NULL; 3620 if (pmap_unwire_pte_hold(pmap, mpte, &free)) { 3621 pmap_invalidate_page(pmap, va); 3622 pmap_free_zero_pages(free); 3623 } 3624 3625 mpte = NULL; 3626 } 3627 return (mpte); 3628 } 3629 3630 /* 3631 * Increment counters 3632 */ 3633 pmap->pm_stats.resident_count++; 3634 3635 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); 3636#ifdef PAE 3637 if ((prot & VM_PROT_EXECUTE) == 0) 3638 pa |= pg_nx; 3639#endif 3640 3641 /* 3642 * Now validate mapping with RO protection 3643 */ 3644 if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) 3645 pte_store(pte, pa | PG_V | PG_U); 3646 else 3647 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); 3648 return (mpte); 3649} 3650 3651/* 3652 * Make a temporary mapping for a physical address. This is only intended 3653 * to be used for panic dumps. 3654 */ 3655void * 3656pmap_kenter_temporary(vm_paddr_t pa, int i) 3657{ 3658 vm_offset_t va; 3659 3660 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 3661 pmap_kenter(va, pa); 3662 invlpg(va); 3663 return ((void *)crashdumpmap); 3664} 3665 3666/* 3667 * This code maps large physical mmap regions into the 3668 * processor address space. Note that some shortcuts 3669 * are taken, but the code works. 
3670 */ 3671void 3672pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3673 vm_pindex_t pindex, vm_size_t size) 3674{ 3675 pd_entry_t *pde; 3676 vm_paddr_t pa, ptepa; 3677 vm_page_t p; 3678 int pat_mode; 3679 3680 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 3681 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3682 ("pmap_object_init_pt: non-device object")); 3683 if (pseflag && 3684 (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 3685 if (!vm_object_populate(object, pindex, pindex + atop(size))) 3686 return; 3687 p = vm_page_lookup(object, pindex); 3688 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3689 ("pmap_object_init_pt: invalid page %p", p)); 3690 pat_mode = p->md.pat_mode; 3691 3692 /* 3693 * Abort the mapping if the first page is not physically 3694 * aligned to a 2/4MB page boundary. 3695 */ 3696 ptepa = VM_PAGE_TO_PHYS(p); 3697 if (ptepa & (NBPDR - 1)) 3698 return; 3699 3700 /* 3701 * Skip the first page. Abort the mapping if the rest of 3702 * the pages are not physically contiguous or have differing 3703 * memory attributes. 3704 */ 3705 p = TAILQ_NEXT(p, listq); 3706 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 3707 pa += PAGE_SIZE) { 3708 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3709 ("pmap_object_init_pt: invalid page %p", p)); 3710 if (pa != VM_PAGE_TO_PHYS(p) || 3711 pat_mode != p->md.pat_mode) 3712 return; 3713 p = TAILQ_NEXT(p, listq); 3714 } 3715 3716 /* 3717 * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and 3718 * "size" is a multiple of 2/4M, adding the PAT setting to 3719 * "pa" will not affect the termination of this loop. 3720 */ 3721 PMAP_LOCK(pmap); 3722 for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa + 3723 size; pa += NBPDR) { 3724 pde = pmap_pde(pmap, addr); 3725 if (*pde == 0) { 3726 pde_store(pde, pa | PG_PS | PG_M | PG_A | 3727 PG_U | PG_RW | PG_V); 3728 pmap->pm_stats.resident_count += NBPDR / 3729 PAGE_SIZE; 3730 pmap_pde_mappings++; 3731 } 3732 /* Else continue on if the PDE is already valid. */ 3733 addr += NBPDR; 3734 } 3735 PMAP_UNLOCK(pmap); 3736 } 3737} 3738 3739/* 3740 * Routine: pmap_change_wiring 3741 * Function: Change the wiring attribute for a map/virtual-address 3742 * pair. 3743 * In/out conditions: 3744 * The mapping must already exist in the pmap. 3745 */ 3746void 3747pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) 3748{ 3749 pd_entry_t *pde; 3750 pt_entry_t *pte; 3751 boolean_t are_queues_locked; 3752 3753 are_queues_locked = FALSE; 3754retry: 3755 PMAP_LOCK(pmap); 3756 pde = pmap_pde(pmap, va); 3757 if ((*pde & PG_PS) != 0) { 3758 if (!wired != ((*pde & PG_W) == 0)) { 3759 if (!are_queues_locked) { 3760 are_queues_locked = TRUE; 3761 if (!mtx_trylock(&vm_page_queue_mtx)) { 3762 PMAP_UNLOCK(pmap); 3763 vm_page_lock_queues(); 3764 goto retry; 3765 } 3766 } 3767 if (!pmap_demote_pde(pmap, pde, va)) 3768 panic("pmap_change_wiring: demotion failed"); 3769 } else 3770 goto out; 3771 } 3772 pte = pmap_pte(pmap, va); 3773 3774 if (wired && !pmap_pte_w(pte)) 3775 pmap->pm_stats.wired_count++; 3776 else if (!wired && pmap_pte_w(pte)) 3777 pmap->pm_stats.wired_count--; 3778 3779 /* 3780 * Wiring is not a hardware characteristic so there is no need to 3781 * invalidate TLB. 
3782 */ 3783 pmap_pte_set_w(pte, wired); 3784 pmap_pte_release(pte); 3785out: 3786 if (are_queues_locked) 3787 vm_page_unlock_queues(); 3788 PMAP_UNLOCK(pmap); 3789} 3790 3791 3792 3793/* 3794 * Copy the range specified by src_addr/len 3795 * from the source map to the range dst_addr/len 3796 * in the destination map. 3797 * 3798 * This routine is only advisory and need not do anything. 3799 */ 3800 3801void 3802pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 3803 vm_offset_t src_addr) 3804{ 3805 vm_page_t free; 3806 vm_offset_t addr; 3807 vm_offset_t end_addr = src_addr + len; 3808 vm_offset_t pdnxt; 3809 3810 if (dst_addr != src_addr) 3811 return; 3812 3813 if (!pmap_is_current(src_pmap)) 3814 return; 3815 3816 vm_page_lock_queues(); 3817 if (dst_pmap < src_pmap) { 3818 PMAP_LOCK(dst_pmap); 3819 PMAP_LOCK(src_pmap); 3820 } else { 3821 PMAP_LOCK(src_pmap); 3822 PMAP_LOCK(dst_pmap); 3823 } 3824 sched_pin(); 3825 for (addr = src_addr; addr < end_addr; addr = pdnxt) { 3826 pt_entry_t *src_pte, *dst_pte; 3827 vm_page_t dstmpte, srcmpte; 3828 pd_entry_t srcptepaddr; 3829 unsigned ptepindex; 3830 3831 KASSERT(addr < UPT_MIN_ADDRESS, 3832 ("pmap_copy: invalid to pmap_copy page tables")); 3833 3834 pdnxt = (addr + NBPDR) & ~PDRMASK; 3835 if (pdnxt < addr) 3836 pdnxt = end_addr; 3837 ptepindex = addr >> PDRSHIFT; 3838 3839 srcptepaddr = src_pmap->pm_pdir[ptepindex]; 3840 if (srcptepaddr == 0) 3841 continue; 3842 3843 if (srcptepaddr & PG_PS) { 3844 if (dst_pmap->pm_pdir[ptepindex] == 0 && 3845 ((srcptepaddr & PG_MANAGED) == 0 || 3846 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & 3847 PG_PS_FRAME))) { 3848 dst_pmap->pm_pdir[ptepindex] = srcptepaddr & 3849 ~PG_W; 3850 dst_pmap->pm_stats.resident_count += 3851 NBPDR / PAGE_SIZE; 3852 } 3853 continue; 3854 } 3855 3856 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); 3857 KASSERT(srcmpte->wire_count > 0, 3858 ("pmap_copy: source page table page is unused")); 3859 3860 if (pdnxt > end_addr) 3861 pdnxt = end_addr; 3862 3863 src_pte = vtopte(addr); 3864 while (addr < pdnxt) { 3865 pt_entry_t ptetemp; 3866 ptetemp = *src_pte; 3867 /* 3868 * we only virtual copy managed pages 3869 */ 3870 if ((ptetemp & PG_MANAGED) != 0) { 3871 dstmpte = pmap_allocpte(dst_pmap, addr, 3872 M_NOWAIT); 3873 if (dstmpte == NULL) 3874 goto out; 3875 dst_pte = pmap_pte_quick(dst_pmap, addr); 3876 if (*dst_pte == 0 && 3877 pmap_try_insert_pv_entry(dst_pmap, addr, 3878 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { 3879 /* 3880 * Clear the wired, modified, and 3881 * accessed (referenced) bits 3882 * during the copy. 
3883 */ 3884 *dst_pte = ptetemp & ~(PG_W | PG_M | 3885 PG_A); 3886 dst_pmap->pm_stats.resident_count++; 3887 } else { 3888 free = NULL; 3889 if (pmap_unwire_pte_hold(dst_pmap, 3890 dstmpte, &free)) { 3891 pmap_invalidate_page(dst_pmap, 3892 addr); 3893 pmap_free_zero_pages(free); 3894 } 3895 goto out; 3896 } 3897 if (dstmpte->wire_count >= srcmpte->wire_count) 3898 break; 3899 } 3900 addr += PAGE_SIZE; 3901 src_pte++; 3902 } 3903 } 3904out: 3905 sched_unpin(); 3906 vm_page_unlock_queues(); 3907 PMAP_UNLOCK(src_pmap); 3908 PMAP_UNLOCK(dst_pmap); 3909} 3910 3911static __inline void 3912pagezero(void *page) 3913{ 3914#if defined(I686_CPU) 3915 if (cpu_class == CPUCLASS_686) { 3916#if defined(CPU_ENABLE_SSE) 3917 if (cpu_feature & CPUID_SSE2) 3918 sse2_pagezero(page); 3919 else 3920#endif 3921 i686_pagezero(page); 3922 } else 3923#endif 3924 bzero(page, PAGE_SIZE); 3925} 3926 3927/* 3928 * pmap_zero_page zeros the specified hardware page by mapping 3929 * the page into KVM and using bzero to clear its contents. 3930 */ 3931void 3932pmap_zero_page(vm_page_t m) 3933{ 3934 struct sysmaps *sysmaps; 3935 3936 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 3937 mtx_lock(&sysmaps->lock); 3938 if (*sysmaps->CMAP2) 3939 panic("pmap_zero_page: CMAP2 busy"); 3940 sched_pin(); 3941 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 3942 pmap_cache_bits(m->md.pat_mode, 0); 3943 invlcaddr(sysmaps->CADDR2); 3944 pagezero(sysmaps->CADDR2); 3945 *sysmaps->CMAP2 = 0; 3946 sched_unpin(); 3947 mtx_unlock(&sysmaps->lock); 3948} 3949 3950/* 3951 * pmap_zero_page_area zeros the specified hardware page by mapping 3952 * the page into KVM and using bzero to clear its contents. 3953 * 3954 * off and size may not cover an area beyond a single hardware page. 3955 */ 3956void 3957pmap_zero_page_area(vm_page_t m, int off, int size) 3958{ 3959 struct sysmaps *sysmaps; 3960 3961 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 3962 mtx_lock(&sysmaps->lock); 3963 if (*sysmaps->CMAP2) 3964 panic("pmap_zero_page_area: CMAP2 busy"); 3965 sched_pin(); 3966 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 3967 pmap_cache_bits(m->md.pat_mode, 0); 3968 invlcaddr(sysmaps->CADDR2); 3969 if (off == 0 && size == PAGE_SIZE) 3970 pagezero(sysmaps->CADDR2); 3971 else 3972 bzero((char *)sysmaps->CADDR2 + off, size); 3973 *sysmaps->CMAP2 = 0; 3974 sched_unpin(); 3975 mtx_unlock(&sysmaps->lock); 3976} 3977 3978/* 3979 * pmap_zero_page_idle zeros the specified hardware page by mapping 3980 * the page into KVM and using bzero to clear its contents. This 3981 * is intended to be called from the vm_pagezero process only and 3982 * outside of Giant. 3983 */ 3984void 3985pmap_zero_page_idle(vm_page_t m) 3986{ 3987 3988 if (*CMAP3) 3989 panic("pmap_zero_page_idle: CMAP3 busy"); 3990 sched_pin(); 3991 *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 3992 pmap_cache_bits(m->md.pat_mode, 0); 3993 invlcaddr(CADDR3); 3994 pagezero(CADDR3); 3995 *CMAP3 = 0; 3996 sched_unpin(); 3997} 3998 3999/* 4000 * pmap_copy_page copies the specified (machine independent) 4001 * page by mapping the page into virtual memory and using 4002 * bcopy to copy the page, one machine dependent page at a 4003 * time. 
4004 */ 4005void 4006pmap_copy_page(vm_page_t src, vm_page_t dst) 4007{ 4008 struct sysmaps *sysmaps; 4009 4010 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 4011 mtx_lock(&sysmaps->lock); 4012 if (*sysmaps->CMAP1) 4013 panic("pmap_copy_page: CMAP1 busy"); 4014 if (*sysmaps->CMAP2) 4015 panic("pmap_copy_page: CMAP2 busy"); 4016 sched_pin(); 4017 invlpg((u_int)sysmaps->CADDR1); 4018 invlpg((u_int)sysmaps->CADDR2); 4019 *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A | 4020 pmap_cache_bits(src->md.pat_mode, 0); 4021 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M | 4022 pmap_cache_bits(dst->md.pat_mode, 0); 4023 bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE); 4024 *sysmaps->CMAP1 = 0; 4025 *sysmaps->CMAP2 = 0; 4026 sched_unpin(); 4027 mtx_unlock(&sysmaps->lock); 4028} 4029 4030/* 4031 * Returns true if the pmap's pv is one of the first 4032 * 16 pvs linked to from this page. This count may 4033 * be changed upwards or downwards in the future; it 4034 * is only necessary that true be returned for a small 4035 * subset of pmaps for proper page aging. 4036 */ 4037boolean_t 4038pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 4039{ 4040 struct md_page *pvh; 4041 pv_entry_t pv; 4042 int loops = 0; 4043 boolean_t rv; 4044 4045 KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0, 4046 ("pmap_page_exists_quick: page %p is not managed", m)); 4047 rv = FALSE; 4048 vm_page_lock_queues(); 4049 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 4050 if (PV_PMAP(pv) == pmap) { 4051 rv = TRUE; 4052 break; 4053 } 4054 loops++; 4055 if (loops >= 16) 4056 break; 4057 } 4058 if (!rv && loops < 16) { 4059 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4060 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { 4061 if (PV_PMAP(pv) == pmap) { 4062 rv = TRUE; 4063 break; 4064 } 4065 loops++; 4066 if (loops >= 16) 4067 break; 4068 } 4069 } 4070 vm_page_unlock_queues(); 4071 return (rv); 4072} 4073 4074/* 4075 * pmap_page_wired_mappings: 4076 * 4077 * Return the number of managed mappings to the given physical page 4078 * that are wired. 4079 */ 4080int 4081pmap_page_wired_mappings(vm_page_t m) 4082{ 4083 int count; 4084 4085 count = 0; 4086 if ((m->flags & PG_FICTITIOUS) != 0) 4087 return (count); 4088 vm_page_lock_queues(); 4089 count = pmap_pvh_wired_mappings(&m->md, count); 4090 count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count); 4091 vm_page_unlock_queues(); 4092 return (count); 4093} 4094 4095/* 4096 * pmap_pvh_wired_mappings: 4097 * 4098 * Return the updated number "count" of managed mappings that are wired. 4099 */ 4100static int 4101pmap_pvh_wired_mappings(struct md_page *pvh, int count) 4102{ 4103 pmap_t pmap; 4104 pt_entry_t *pte; 4105 pv_entry_t pv; 4106 4107 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 4108 sched_pin(); 4109 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { 4110 pmap = PV_PMAP(pv); 4111 PMAP_LOCK(pmap); 4112 pte = pmap_pte_quick(pmap, pv->pv_va); 4113 if ((*pte & PG_W) != 0) 4114 count++; 4115 PMAP_UNLOCK(pmap); 4116 } 4117 sched_unpin(); 4118 return (count); 4119} 4120 4121/* 4122 * Returns TRUE if the given page is mapped individually or as part of 4123 * a 4mpage. Otherwise, returns FALSE. 
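 * Fictitious and unmanaged pages are not tracked by pv entries and are
 * always reported as unmapped.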
4124 */
4125boolean_t
4126pmap_page_is_mapped(vm_page_t m)
4127{
4128 boolean_t rv;
4129
4130 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
4131 return (FALSE);
4132 vm_page_lock_queues();
4133 rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4134 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list);
4135 vm_page_unlock_queues();
4136 return (rv);
4137}
4138
4139/*
4140 * Remove all pages from the specified address space;
4141 * this aids process exit speeds. Also, this code
4142 * is special cased for the current process only, but
4143 * can have the more generic (and slightly slower)
4144 * mode enabled. This is much faster than pmap_remove
4145 * in the case of running down an entire address space.
4146 */
4147void
4148pmap_remove_pages(pmap_t pmap)
4149{
4150 pt_entry_t *pte, tpte;
4151 vm_page_t free = NULL;
4152 vm_page_t m, mpte, mt;
4153 pv_entry_t pv;
4154 struct md_page *pvh;
4155 struct pv_chunk *pc, *npc;
4156 int field, idx;
4157 int32_t bit;
4158 uint32_t inuse, bitmask;
4159 int allfree;
4160
4161 if (pmap != PCPU_GET(curpmap)) {
4162 printf("warning: pmap_remove_pages called with non-current pmap\n");
4163 return;
4164 }
4165 vm_page_lock_queues();
4166 PMAP_LOCK(pmap);
4167 sched_pin();
4168 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
4169 allfree = 1;
4170 for (field = 0; field < _NPCM; field++) {
4171 inuse = (~(pc->pc_map[field])) & pc_freemask[field];
4172 while (inuse != 0) {
4173 bit = bsfl(inuse);
4174 bitmask = 1UL << bit;
4175 idx = field * 32 + bit;
4176 pv = &pc->pc_pventry[idx];
4177 inuse &= ~bitmask;
4178
4179 pte = pmap_pde(pmap, pv->pv_va);
4180 tpte = *pte;
4181 if ((tpte & PG_PS) == 0) {
4182 pte = vtopte(pv->pv_va);
4183 tpte = *pte & ~PG_PTE_PAT;
4184 }
4185
4186 if (tpte == 0) {
4187 printf(
4188 "TPTE at %p IS ZERO @ VA %08x\n",
4189 pte, pv->pv_va);
4190 panic("bad pte");
4191 }
4192
4193/*
4194 * We cannot remove wired pages from a process' mapping at this time
4195 */
4196 if (tpte & PG_W) {
4197 allfree = 0;
4198 continue;
4199 }
4200
4201 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
4202 KASSERT(m->phys_addr == (tpte & PG_FRAME),
4203 ("vm_page_t %p phys_addr mismatch %016jx %016jx",
4204 m, (uintmax_t)m->phys_addr,
4205 (uintmax_t)tpte));
4206
4207 KASSERT(m < &vm_page_array[vm_page_array_size],
4208 ("pmap_remove_pages: bad tpte %#jx",
4209 (uintmax_t)tpte));
4210
4211 pte_clear(pte);
4212
4213 /*
4214 * Update the vm_page_t clean/reference bits.
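 * A mapping that is writable and has PG_M set dirties the page; for a
 * 2/4MB mapping, every 4KB page within the superpage is dirtied.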
4215 */ 4216 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 4217 if ((tpte & PG_PS) != 0) { 4218 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4219 vm_page_dirty(mt); 4220 } else 4221 vm_page_dirty(m); 4222 } 4223 4224 /* Mark free */ 4225 PV_STAT(pv_entry_frees++); 4226 PV_STAT(pv_entry_spare++); 4227 pv_entry_count--; 4228 pc->pc_map[field] |= bitmask; 4229 if ((tpte & PG_PS) != 0) { 4230 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 4231 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 4232 TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); 4233 if (TAILQ_EMPTY(&pvh->pv_list)) { 4234 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4235 if (TAILQ_EMPTY(&mt->md.pv_list)) 4236 vm_page_flag_clear(mt, PG_WRITEABLE); 4237 } 4238 mpte = pmap_lookup_pt_page(pmap, pv->pv_va); 4239 if (mpte != NULL) { 4240 pmap_remove_pt_page(pmap, mpte); 4241 pmap->pm_stats.resident_count--; 4242 KASSERT(mpte->wire_count == NPTEPG, 4243 ("pmap_remove_pages: pte page wire count error")); 4244 mpte->wire_count = 0; 4245 pmap_add_delayed_free_list(mpte, &free, FALSE); 4246 atomic_subtract_int(&cnt.v_wire_count, 1); 4247 } 4248 } else { 4249 pmap->pm_stats.resident_count--; 4250 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 4251 if (TAILQ_EMPTY(&m->md.pv_list)) { 4252 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4253 if (TAILQ_EMPTY(&pvh->pv_list)) 4254 vm_page_flag_clear(m, PG_WRITEABLE); 4255 } 4256 pmap_unuse_pt(pmap, pv->pv_va, &free); 4257 } 4258 } 4259 } 4260 if (allfree) { 4261 PV_STAT(pv_entry_spare -= _NPCPV); 4262 PV_STAT(pc_chunk_count--); 4263 PV_STAT(pc_chunk_frees++); 4264 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4265 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 4266 pmap_qremove((vm_offset_t)pc, 1); 4267 vm_page_unwire(m, 0); 4268 vm_page_free(m); 4269 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 4270 } 4271 } 4272 sched_unpin(); 4273 pmap_invalidate_all(pmap); 4274 vm_page_unlock_queues(); 4275 PMAP_UNLOCK(pmap); 4276 pmap_free_zero_pages(free); 4277} 4278 4279/* 4280 * pmap_is_modified: 4281 * 4282 * Return whether or not the specified physical page was modified 4283 * in any physical maps. 4284 */ 4285boolean_t 4286pmap_is_modified(vm_page_t m) 4287{ 4288 boolean_t rv; 4289 4290 KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0, 4291 ("pmap_is_modified: page %p is not managed", m)); 4292 4293 /* 4294 * If the page is not VPO_BUSY, then PG_WRITEABLE cannot be 4295 * concurrently set while the object is locked. Thus, if PG_WRITEABLE 4296 * is clear, no PTEs can have PG_M set. 4297 */ 4298 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 4299 if ((m->oflags & VPO_BUSY) == 0 && 4300 (m->flags & PG_WRITEABLE) == 0) 4301 return (FALSE); 4302 vm_page_lock_queues(); 4303 rv = pmap_is_modified_pvh(&m->md) || 4304 pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m))); 4305 vm_page_unlock_queues(); 4306 return (rv); 4307} 4308 4309/* 4310 * Returns TRUE if any of the given mappings were used to modify 4311 * physical memory. Otherwise, returns FALSE. Both page and 2mpage 4312 * mappings are supported. 
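 * A mapping is only considered to have modified the page when both PG_M
 * and PG_RW are set in its pte; PG_M alone is not treated as a
 * modification.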
4313 */
4314static boolean_t
4315pmap_is_modified_pvh(struct md_page *pvh)
4316{
4317 pv_entry_t pv;
4318 pt_entry_t *pte;
4319 pmap_t pmap;
4320 boolean_t rv;
4321
4322 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4323 rv = FALSE;
4324 sched_pin();
4325 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4326 pmap = PV_PMAP(pv);
4327 PMAP_LOCK(pmap);
4328 pte = pmap_pte_quick(pmap, pv->pv_va);
4329 rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW);
4330 PMAP_UNLOCK(pmap);
4331 if (rv)
4332 break;
4333 }
4334 sched_unpin();
4335 return (rv);
4336}
4337
4338/*
4339 * pmap_is_prefaultable:
4340 *
4341 * Return whether or not the specified virtual address is eligible
4342 * for prefault.
4343 */
4344boolean_t
4345pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4346{
4347 pd_entry_t *pde;
4348 pt_entry_t *pte;
4349 boolean_t rv;
4350
4351 rv = FALSE;
4352 PMAP_LOCK(pmap);
4353 pde = pmap_pde(pmap, addr);
4354 if (*pde != 0 && (*pde & PG_PS) == 0) {
4355 pte = vtopte(addr);
4356 rv = *pte == 0;
4357 }
4358 PMAP_UNLOCK(pmap);
4359 return (rv);
4360}
4361
4362/*
4363 * pmap_is_referenced:
4364 *
4365 * Return whether or not the specified physical page was referenced
4366 * in any physical maps.
4367 */
4368boolean_t
4369pmap_is_referenced(vm_page_t m)
4370{
4371 boolean_t rv;
4372
4373 KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
4374 ("pmap_is_referenced: page %p is not managed", m));
4375 vm_page_lock_queues();
4376 rv = pmap_is_referenced_pvh(&m->md) ||
4377 pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)));
4378 vm_page_unlock_queues();
4379 return (rv);
4380}
4381
4382/*
4383 * Returns TRUE if any of the given mappings were referenced and FALSE
4384 * otherwise. Both page and 4mpage mappings are supported.
4385 */
4386static boolean_t
4387pmap_is_referenced_pvh(struct md_page *pvh)
4388{
4389 pv_entry_t pv;
4390 pt_entry_t *pte;
4391 pmap_t pmap;
4392 boolean_t rv;
4393
4394 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
4395 rv = FALSE;
4396 sched_pin();
4397 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) {
4398 pmap = PV_PMAP(pv);
4399 PMAP_LOCK(pmap);
4400 pte = pmap_pte_quick(pmap, pv->pv_va);
4401 rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V);
4402 PMAP_UNLOCK(pmap);
4403 if (rv)
4404 break;
4405 }
4406 sched_unpin();
4407 return (rv);
4408}
4409
4410/*
4411 * Clear the write and modified bits in each of the given page's mappings.
4412 */
4413void
4414pmap_remove_write(vm_page_t m)
4415{
4416 struct md_page *pvh;
4417 pv_entry_t next_pv, pv;
4418 pmap_t pmap;
4419 pd_entry_t *pde;
4420 pt_entry_t oldpte, *pte;
4421 vm_offset_t va;
4422
4423 KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
4424 ("pmap_remove_write: page %p is not managed", m));
4425
4426 /*
4427 * If the page is not VPO_BUSY, then PG_WRITEABLE cannot be set by
4428 * another thread while the object is locked. Thus, if PG_WRITEABLE
4429 * is clear, no page table entries need updating.
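 * Otherwise, any writable 2/4MB mapping of the page is first demoted so
 * that write access can then be removed from the individual 4KB mappings.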
4430 */
4431 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
4432 if ((m->oflags & VPO_BUSY) == 0 &&
4433 (m->flags & PG_WRITEABLE) == 0)
4434 return;
4435 vm_page_lock_queues();
4436 sched_pin();
4437 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4438 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) {
4439 va = pv->pv_va;
4440 pmap = PV_PMAP(pv);
4441 PMAP_LOCK(pmap);
4442 pde = pmap_pde(pmap, va);
4443 if ((*pde & PG_RW) != 0)
4444 (void)pmap_demote_pde(pmap, pde, va);
4445 PMAP_UNLOCK(pmap);
4446 }
4447 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
4448 pmap = PV_PMAP(pv);
4449 PMAP_LOCK(pmap);
4450 pde = pmap_pde(pmap, pv->pv_va);
4451 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
4452 " a 4mpage in page %p's pv list", m));
4453 pte = pmap_pte_quick(pmap, pv->pv_va);
4454retry:
4455 oldpte = *pte;
4456 if ((oldpte & PG_RW) != 0) {
4457 /*
4458 * Regardless of whether a pte is 32 or 64 bits
4459 * in size, PG_RW and PG_M are among the least
4460 * significant 32 bits.
4461 */
4462 if (!atomic_cmpset_int((u_int *)pte, oldpte,
4463 oldpte & ~(PG_RW | PG_M)))
4464 goto retry;
4465 if ((oldpte & PG_M) != 0)
4466 vm_page_dirty(m);
4467 pmap_invalidate_page(pmap, pv->pv_va);
4468 }
4469 PMAP_UNLOCK(pmap);
4470 }
4471 vm_page_flag_clear(m, PG_WRITEABLE);
4472 sched_unpin();
4473 vm_page_unlock_queues();
4474}
4475
4476/*
4477 * pmap_ts_referenced:
4478 *
4479 * Return a count of reference bits for a page, clearing those bits.
4480 * It is not necessary for every reference bit to be cleared, but it
4481 * is necessary that 0 only be returned when there are truly no
4482 * reference bits set.
4483 *
4484 * XXX: The exact number of bits to check and clear is a matter that
4485 * should be tested and standardized at some point in the future for
4486 * optimal aging of shared pages.
4487 */
4488int
4489pmap_ts_referenced(vm_page_t m)
4490{
4491 struct md_page *pvh;
4492 pv_entry_t pv, pvf, pvn;
4493 pmap_t pmap;
4494 pd_entry_t oldpde, *pde;
4495 pt_entry_t *pte;
4496 vm_offset_t va;
4497 int rtval = 0;
4498
4499 KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0,
4500 ("pmap_ts_referenced: page %p is not managed", m));
4501 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4502 vm_page_lock_queues();
4503 sched_pin();
4504 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) {
4505 va = pv->pv_va;
4506 pmap = PV_PMAP(pv);
4507 PMAP_LOCK(pmap);
4508 pde = pmap_pde(pmap, va);
4509 oldpde = *pde;
4510 if ((oldpde & PG_A) != 0) {
4511 if (pmap_demote_pde(pmap, pde, va)) {
4512 if ((oldpde & PG_W) == 0) {
4513 /*
4514 * Remove the mapping to a single page
4515 * so that a subsequent access may
4516 * repromote. Since the underlying
4517 * page table page is fully populated,
4518 * this removal never frees a page
4519 * table page.
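 * Only the 4KB page that corresponds to "m" is removed; its
 * address is derived from the page's offset within the
 * 2/4MB frame.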
4520 */ 4521 va += VM_PAGE_TO_PHYS(m) - (oldpde & 4522 PG_PS_FRAME); 4523 pmap_remove_page(pmap, va, NULL); 4524 rtval++; 4525 if (rtval > 4) { 4526 PMAP_UNLOCK(pmap); 4527 goto out; 4528 } 4529 } 4530 } 4531 } 4532 PMAP_UNLOCK(pmap); 4533 } 4534 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 4535 pvf = pv; 4536 do { 4537 pvn = TAILQ_NEXT(pv, pv_list); 4538 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 4539 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 4540 pmap = PV_PMAP(pv); 4541 PMAP_LOCK(pmap); 4542 pde = pmap_pde(pmap, pv->pv_va); 4543 KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:" 4544 " found a 4mpage in page %p's pv list", m)); 4545 pte = pmap_pte_quick(pmap, pv->pv_va); 4546 if ((*pte & PG_A) != 0) { 4547 atomic_clear_int((u_int *)pte, PG_A); 4548 pmap_invalidate_page(pmap, pv->pv_va); 4549 rtval++; 4550 if (rtval > 4) 4551 pvn = NULL; 4552 } 4553 PMAP_UNLOCK(pmap); 4554 } while ((pv = pvn) != NULL && pv != pvf); 4555 } 4556out: 4557 sched_unpin(); 4558 vm_page_unlock_queues(); 4559 return (rtval); 4560} 4561 4562/* 4563 * Clear the modify bits on the specified physical page. 4564 */ 4565void 4566pmap_clear_modify(vm_page_t m) 4567{ 4568 struct md_page *pvh; 4569 pv_entry_t next_pv, pv; 4570 pmap_t pmap; 4571 pd_entry_t oldpde, *pde; 4572 pt_entry_t oldpte, *pte; 4573 vm_offset_t va; 4574 4575 KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0, 4576 ("pmap_clear_modify: page %p is not managed", m)); 4577 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 4578 KASSERT((m->oflags & VPO_BUSY) == 0, 4579 ("pmap_clear_modify: page %p is busy", m)); 4580 4581 /* 4582 * If the page is not PG_WRITEABLE, then no PTEs can have PG_M set. 4583 * If the object containing the page is locked and the page is not 4584 * VPO_BUSY, then PG_WRITEABLE cannot be concurrently set. 4585 */ 4586 if ((m->flags & PG_WRITEABLE) == 0) 4587 return; 4588 vm_page_lock_queues(); 4589 sched_pin(); 4590 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4591 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { 4592 va = pv->pv_va; 4593 pmap = PV_PMAP(pv); 4594 PMAP_LOCK(pmap); 4595 pde = pmap_pde(pmap, va); 4596 oldpde = *pde; 4597 if ((oldpde & PG_RW) != 0) { 4598 if (pmap_demote_pde(pmap, pde, va)) { 4599 if ((oldpde & PG_W) == 0) { 4600 /* 4601 * Write protect the mapping to a 4602 * single page so that a subsequent 4603 * write access may repromote. 4604 */ 4605 va += VM_PAGE_TO_PHYS(m) - (oldpde & 4606 PG_PS_FRAME); 4607 pte = pmap_pte_quick(pmap, va); 4608 oldpte = *pte; 4609 if ((oldpte & PG_V) != 0) { 4610 /* 4611 * Regardless of whether a pte is 32 or 64 bits 4612 * in size, PG_RW and PG_M are among the least 4613 * significant 32 bits. 4614 */ 4615 while (!atomic_cmpset_int((u_int *)pte, 4616 oldpte, 4617 oldpte & ~(PG_M | PG_RW))) 4618 oldpte = *pte; 4619 vm_page_dirty(m); 4620 pmap_invalidate_page(pmap, va); 4621 } 4622 } 4623 } 4624 } 4625 PMAP_UNLOCK(pmap); 4626 } 4627 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 4628 pmap = PV_PMAP(pv); 4629 PMAP_LOCK(pmap); 4630 pde = pmap_pde(pmap, pv->pv_va); 4631 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 4632 " a 4mpage in page %p's pv list", m)); 4633 pte = pmap_pte_quick(pmap, pv->pv_va); 4634 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 4635 /* 4636 * Regardless of whether a pte is 32 or 64 bits 4637 * in size, PG_M is among the least significant 4638 * 32 bits. 
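 * Consequently, the 32-bit atomic_clear_int() below is
 * sufficient even under PAE, where a pte is 64 bits wide.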
4639 */ 4640 atomic_clear_int((u_int *)pte, PG_M); 4641 pmap_invalidate_page(pmap, pv->pv_va); 4642 } 4643 PMAP_UNLOCK(pmap); 4644 } 4645 sched_unpin(); 4646 vm_page_unlock_queues(); 4647} 4648 4649/* 4650 * pmap_clear_reference: 4651 * 4652 * Clear the reference bit on the specified physical page. 4653 */ 4654void 4655pmap_clear_reference(vm_page_t m) 4656{ 4657 struct md_page *pvh; 4658 pv_entry_t next_pv, pv; 4659 pmap_t pmap; 4660 pd_entry_t oldpde, *pde; 4661 pt_entry_t *pte; 4662 vm_offset_t va; 4663 4664 KASSERT((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0, 4665 ("pmap_clear_reference: page %p is not managed", m)); 4666 vm_page_lock_queues(); 4667 sched_pin(); 4668 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4669 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { 4670 va = pv->pv_va; 4671 pmap = PV_PMAP(pv); 4672 PMAP_LOCK(pmap); 4673 pde = pmap_pde(pmap, va); 4674 oldpde = *pde; 4675 if ((oldpde & PG_A) != 0) { 4676 if (pmap_demote_pde(pmap, pde, va)) { 4677 /* 4678 * Remove the mapping to a single page so 4679 * that a subsequent access may repromote. 4680 * Since the underlying page table page is 4681 * fully populated, this removal never frees 4682 * a page table page. 4683 */ 4684 va += VM_PAGE_TO_PHYS(m) - (oldpde & 4685 PG_PS_FRAME); 4686 pmap_remove_page(pmap, va, NULL); 4687 } 4688 } 4689 PMAP_UNLOCK(pmap); 4690 } 4691 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 4692 pmap = PV_PMAP(pv); 4693 PMAP_LOCK(pmap); 4694 pde = pmap_pde(pmap, pv->pv_va); 4695 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found" 4696 " a 4mpage in page %p's pv list", m)); 4697 pte = pmap_pte_quick(pmap, pv->pv_va); 4698 if ((*pte & PG_A) != 0) { 4699 /* 4700 * Regardless of whether a pte is 32 or 64 bits 4701 * in size, PG_A is among the least significant 4702 * 32 bits. 4703 */ 4704 atomic_clear_int((u_int *)pte, PG_A); 4705 pmap_invalidate_page(pmap, pv->pv_va); 4706 } 4707 PMAP_UNLOCK(pmap); 4708 } 4709 sched_unpin(); 4710 vm_page_unlock_queues(); 4711} 4712 4713/* 4714 * Miscellaneous support routines follow 4715 */ 4716 4717/* Adjust the cache mode for a 4KB page mapped via a PTE. */ 4718static __inline void 4719pmap_pte_attr(pt_entry_t *pte, int cache_bits) 4720{ 4721 u_int opte, npte; 4722 4723 /* 4724 * The cache mode bits are all in the low 32-bits of the 4725 * PTE, so we can just spin on updating the low 32-bits. 4726 */ 4727 do { 4728 opte = *(u_int *)pte; 4729 npte = opte & ~PG_PTE_CACHE; 4730 npte |= cache_bits; 4731 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); 4732} 4733 4734/* Adjust the cache mode for a 2/4MB page mapped via a PDE. */ 4735static __inline void 4736pmap_pde_attr(pd_entry_t *pde, int cache_bits) 4737{ 4738 u_int opde, npde; 4739 4740 /* 4741 * The cache mode bits are all in the low 32-bits of the 4742 * PDE, so we can just spin on updating the low 32-bits. 4743 */ 4744 do { 4745 opde = *(u_int *)pde; 4746 npde = opde & ~PG_PDE_CACHE; 4747 npde |= cache_bits; 4748 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); 4749} 4750 4751/* 4752 * Map a set of physical memory pages into the kernel virtual 4753 * address space. Return a pointer to where it is mapped. This 4754 * routine is intended to be used for mapping device memory, 4755 * NOT real memory. 
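 *
 * A minimal usage sketch (illustrative only; "bar_pa" and "bar_len" stand
 * in for a device's register window and are not names used elsewhere):
 *
 *	void *regs = pmap_mapdev_attr(bar_pa, bar_len, PAT_UNCACHEABLE);
 *	... access the device registers through "regs" ...
 *	pmap_unmapdev((vm_offset_t)regs, bar_len);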
4756 */
4757void *
4758pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
4759{
4760 vm_offset_t va, offset;
4761 vm_size_t tmpsize;
4762
4763 offset = pa & PAGE_MASK;
4764 size = roundup(offset + size, PAGE_SIZE);
4765 pa = pa & PG_FRAME;
4766
4767 if (pa < KERNLOAD && pa + size <= KERNLOAD)
4768 va = KERNBASE + pa;
4769 else
4770 va = kmem_alloc_nofault(kernel_map, size);
4771 if (!va)
4772 panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
4773
4774 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
4775 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
4776 pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
4777 pmap_invalidate_cache_range(va, va + size);
4778 return ((void *)(va + offset));
4779}
4780
4781void *
4782pmap_mapdev(vm_paddr_t pa, vm_size_t size)
4783{
4784
4785 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
4786}
4787
4788void *
4789pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4790{
4791
4792 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
4793}
4794
4795void
4796pmap_unmapdev(vm_offset_t va, vm_size_t size)
4797{
4798 vm_offset_t base, offset, tmpva;
4799
4800 if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
4801 return;
4802 base = trunc_page(va);
4803 offset = va & PAGE_MASK;
4804 size = roundup(offset + size, PAGE_SIZE);
4805 for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
4806 pmap_kremove(tmpva);
4807 pmap_invalidate_range(kernel_pmap, va, tmpva);
4808 kmem_free(kernel_map, base, size);
4809}
4810
4811/*
4812 * Sets the memory attribute for the specified page.
4813 */
4814void
4815pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
4816{
4817 struct sysmaps *sysmaps;
4818 vm_offset_t sva, eva;
4819
4820 m->md.pat_mode = ma;
4821 if ((m->flags & PG_FICTITIOUS) != 0)
4822 return;
4823
4824 /*
4825 * If "m" is a normal page, flush it from the cache.
4826 * See pmap_invalidate_cache_range().
4827 *
4828 * First, try to find an existing mapping of the page by sf
4829 * buffer. sf_buf_invalidate_cache() modifies mapping and
4830 * flushes the cache.
4831 */
4832 if (sf_buf_invalidate_cache(m))
4833 return;
4834
4835 /*
4836 * If the page is not mapped by an sf buffer, but the CPU does not
4837 * support self snoop, map the page transiently and do the
4838 * invalidation. In the worst case, the whole cache is flushed by
4839 * pmap_invalidate_cache_range().
4840 */
4841 if ((cpu_feature & (CPUID_SS|CPUID_CLFSH)) == CPUID_CLFSH) {
4842 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
4843 mtx_lock(&sysmaps->lock);
4844 if (*sysmaps->CMAP2)
4845 panic("pmap_page_set_memattr: CMAP2 busy");
4846 sched_pin();
4847 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
4848 PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0);
4849 invlcaddr(sysmaps->CADDR2);
4850 sva = (vm_offset_t)sysmaps->CADDR2;
4851 eva = sva + PAGE_SIZE;
4852 } else
4853 sva = eva = 0; /* gcc */
4854 pmap_invalidate_cache_range(sva, eva);
4855 if (sva != 0) {
4856 *sysmaps->CMAP2 = 0;
4857 sched_unpin();
4858 mtx_unlock(&sysmaps->lock);
4859 }
4860}
4861
4862/*
4863 * Changes the specified virtual address range's memory type to that given by
4864 * the parameter "mode". The specified virtual address range must be
4865 * completely contained within the kernel map.
4866 *
4867 * Returns zero if the change completed successfully, and either EINVAL or
4868 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part
4869 * of the virtual address range was not mapped, and ENOMEM is returned if
4870 * there was insufficient memory available to complete the change.
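 *
 * For example, a caller could switch an existing kernel mapping to
 * write-combining (sketch only; "fb_va" and "fb_size" are illustrative
 * names, not identifiers used in this file):
 *
 *	error = pmap_change_attr(fb_va, fb_size, PAT_WRITE_COMBINING);
 *	if (error != 0)
 *		... keep the existing memory type ...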
4871 */ 4872int 4873pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 4874{ 4875 vm_offset_t base, offset, tmpva; 4876 pd_entry_t *pde; 4877 pt_entry_t *pte; 4878 int cache_bits_pte, cache_bits_pde; 4879 boolean_t changed; 4880 4881 base = trunc_page(va); 4882 offset = va & PAGE_MASK; 4883 size = roundup(offset + size, PAGE_SIZE); 4884 4885 /* 4886 * Only supported on kernel virtual addresses above the recursive map. 4887 */ 4888 if (base < VM_MIN_KERNEL_ADDRESS) 4889 return (EINVAL); 4890 4891 cache_bits_pde = pmap_cache_bits(mode, 1); 4892 cache_bits_pte = pmap_cache_bits(mode, 0); 4893 changed = FALSE; 4894 4895 /* 4896 * Pages that aren't mapped aren't supported. Also break down 4897 * 2/4MB pages into 4KB pages if required. 4898 */ 4899 PMAP_LOCK(kernel_pmap); 4900 for (tmpva = base; tmpva < base + size; ) { 4901 pde = pmap_pde(kernel_pmap, tmpva); 4902 if (*pde == 0) { 4903 PMAP_UNLOCK(kernel_pmap); 4904 return (EINVAL); 4905 } 4906 if (*pde & PG_PS) { 4907 /* 4908 * If the current 2/4MB page already has 4909 * the required memory type, then we need not 4910 * demote this page. Just increment tmpva to 4911 * the next 2/4MB page frame. 4912 */ 4913 if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { 4914 tmpva = trunc_4mpage(tmpva) + NBPDR; 4915 continue; 4916 } 4917 4918 /* 4919 * If the current offset aligns with a 2/4MB 4920 * page frame and there is at least 2/4MB left 4921 * within the range, then we need not break 4922 * down this page into 4KB pages. 4923 */ 4924 if ((tmpva & PDRMASK) == 0 && 4925 tmpva + PDRMASK < base + size) { 4926 tmpva += NBPDR; 4927 continue; 4928 } 4929 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) { 4930 PMAP_UNLOCK(kernel_pmap); 4931 return (ENOMEM); 4932 } 4933 } 4934 pte = vtopte(tmpva); 4935 if (*pte == 0) { 4936 PMAP_UNLOCK(kernel_pmap); 4937 return (EINVAL); 4938 } 4939 tmpva += PAGE_SIZE; 4940 } 4941 PMAP_UNLOCK(kernel_pmap); 4942 4943 /* 4944 * Ok, all the pages exist, so run through them updating their 4945 * cache mode if required. 4946 */ 4947 for (tmpva = base; tmpva < base + size; ) { 4948 pde = pmap_pde(kernel_pmap, tmpva); 4949 if (*pde & PG_PS) { 4950 if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { 4951 pmap_pde_attr(pde, cache_bits_pde); 4952 changed = TRUE; 4953 } 4954 tmpva = trunc_4mpage(tmpva) + NBPDR; 4955 } else { 4956 pte = vtopte(tmpva); 4957 if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { 4958 pmap_pte_attr(pte, cache_bits_pte); 4959 changed = TRUE; 4960 } 4961 tmpva += PAGE_SIZE; 4962 } 4963 } 4964 4965 /* 4966 * Flush CPU caches to make sure any data isn't cached that 4967 * shouldn't be, etc. 4968 */ 4969 if (changed) { 4970 pmap_invalidate_range(kernel_pmap, base, tmpva); 4971 pmap_invalidate_cache_range(base, tmpva); 4972 } 4973 return (0); 4974} 4975 4976/* 4977 * perform the pmap work for mincore 4978 */ 4979int 4980pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 4981{ 4982 pt_entry_t *ptep, pte; 4983 vm_paddr_t pa; 4984 int val; 4985 4986 PMAP_LOCK(pmap); 4987retry: 4988 ptep = pmap_pte(pmap, addr); 4989 pte = (ptep != NULL) ? *ptep : 0; 4990 pmap_pte_release(ptep); 4991 if ((pte & PG_V) != 0) { 4992 val = MINCORE_INCORE; 4993 if ((pte & PG_PS) != 0) { 4994 val |= MINCORE_SUPER; 4995 /* Compute the physical address of the 4KB page. 
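 * The 2/4MB frame from the pde supplies the high bits and the
 * virtual address supplies the 4KB page's offset within that frame.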
*/ 4996 pa = (pte & PG_PS_FRAME) | (addr & PG_FRAME & PDRMASK); 4997 } else 4998 pa = pte & PG_FRAME; 4999 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5000 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 5001 if ((pte & PG_A) != 0) 5002 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 5003 } else { 5004 val = 0; 5005 pa = 0; 5006 } 5007 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 5008 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 5009 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 5010 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ 5011 if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) 5012 goto retry; 5013 } else 5014 PA_UNLOCK_COND(*locked_pa); 5015 PMAP_UNLOCK(pmap); 5016 return (val); 5017} 5018 5019void 5020pmap_activate(struct thread *td) 5021{ 5022 pmap_t pmap, oldpmap; 5023 u_int32_t cr3; 5024 5025 critical_enter(); 5026 pmap = vmspace_pmap(td->td_proc->p_vmspace); 5027 oldpmap = PCPU_GET(curpmap); 5028#if defined(SMP) 5029 atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask)); 5030 atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); 5031#else 5032 oldpmap->pm_active &= ~1; 5033 pmap->pm_active |= 1; 5034#endif 5035#ifdef PAE 5036 cr3 = vtophys(pmap->pm_pdpt); 5037#else 5038 cr3 = vtophys(pmap->pm_pdir); 5039#endif 5040 /* 5041 * pmap_activate is for the current thread on the current cpu 5042 */ 5043 td->td_pcb->pcb_cr3 = cr3; 5044 load_cr3(cr3); 5045 PCPU_SET(curpmap, pmap); 5046 critical_exit(); 5047} 5048 5049void 5050pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 5051{ 5052} 5053 5054/* 5055 * Increase the starting virtual address of the given mapping if a 5056 * different alignment might result in more superpage mappings. 5057 */ 5058void 5059pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 5060 vm_offset_t *addr, vm_size_t size) 5061{ 5062 vm_offset_t superpage_offset; 5063 5064 if (size < NBPDR) 5065 return; 5066 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 5067 offset += ptoa(object->pg_color); 5068 superpage_offset = offset & PDRMASK; 5069 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 5070 (*addr & PDRMASK) == superpage_offset) 5071 return; 5072 if ((*addr & PDRMASK) < superpage_offset) 5073 *addr = (*addr & ~PDRMASK) + superpage_offset; 5074 else 5075 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 5076} 5077 5078 5079#if defined(PMAP_DEBUG) 5080pmap_pid_dump(int pid) 5081{ 5082 pmap_t pmap; 5083 struct proc *p; 5084 int npte = 0; 5085 int index; 5086 5087 sx_slock(&allproc_lock); 5088 FOREACH_PROC_IN_SYSTEM(p) { 5089 if (p->p_pid != pid) 5090 continue; 5091 5092 if (p->p_vmspace) { 5093 int i,j; 5094 index = 0; 5095 pmap = vmspace_pmap(p->p_vmspace); 5096 for (i = 0; i < NPDEPTD; i++) { 5097 pd_entry_t *pde; 5098 pt_entry_t *pte; 5099 vm_offset_t base = i << PDRSHIFT; 5100 5101 pde = &pmap->pm_pdir[i]; 5102 if (pde && pmap_pde_v(pde)) { 5103 for (j = 0; j < NPTEPG; j++) { 5104 vm_offset_t va = base + (j << PAGE_SHIFT); 5105 if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { 5106 if (index) { 5107 index = 0; 5108 printf("\n"); 5109 } 5110 sx_sunlock(&allproc_lock); 5111 return (npte); 5112 } 5113 pte = pmap_pte(pmap, va); 5114 if (pte && pmap_pte_v(pte)) { 5115 pt_entry_t pa; 5116 vm_page_t m; 5117 pa = *pte; 5118 m = PHYS_TO_VM_PAGE(pa & PG_FRAME); 5119 printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", 5120 va, pa, m->hold_count, m->wire_count, m->flags); 5121 npte++; 5122 index++; 5123 if (index >= 2) { 5124 index = 0; 5125 printf("\n"); 5126 
} else { 5127 printf(" "); 5128 } 5129 } 5130 } 5131 } 5132 } 5133 } 5134 } 5135 sx_sunlock(&allproc_lock); 5136 return (npte); 5137} 5138#endif 5139 5140#if defined(DEBUG) 5141 5142static void pads(pmap_t pm); 5143void pmap_pvdump(vm_offset_t pa); 5144 5145/* print address space of pmap*/ 5146static void 5147pads(pmap_t pm) 5148{ 5149 int i, j; 5150 vm_paddr_t va; 5151 pt_entry_t *ptep; 5152 5153 if (pm == kernel_pmap) 5154 return; 5155 for (i = 0; i < NPDEPTD; i++) 5156 if (pm->pm_pdir[i]) 5157 for (j = 0; j < NPTEPG; j++) { 5158 va = (i << PDRSHIFT) + (j << PAGE_SHIFT); 5159 if (pm == kernel_pmap && va < KERNBASE) 5160 continue; 5161 if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) 5162 continue; 5163 ptep = pmap_pte(pm, va); 5164 if (pmap_pte_v(ptep)) 5165 printf("%x:%x ", va, *ptep); 5166 }; 5167 5168} 5169 5170void 5171pmap_pvdump(vm_paddr_t pa) 5172{ 5173 pv_entry_t pv; 5174 pmap_t pmap; 5175 vm_page_t m; 5176 5177 printf("pa %x", pa); 5178 m = PHYS_TO_VM_PAGE(pa); 5179 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 5180 pmap = PV_PMAP(pv); 5181 printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va); 5182 pads(pmap); 5183 } 5184 printf(" "); 5185} 5186#endif 5187