/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * Manages physical address maps.
 *
 * In addition to hardware address maps, this
 * module is called upon to provide software-use-only
 * maps which may or may not be stored in the same
 * form as hardware maps.  These pseudo-maps are
 * used to store intermediate results from copy
 * operations to and from address spaces.
 *
 * Since the information managed by this module is
 * also stored by the logical address mapping module,
 * this module may throw away valid virtual-to-physical
 * mappings at almost any time.  However, invalidations
 * of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures which
 * make virtual-to-physical map invalidates expensive,
 * this module may delay invalidate or reduced protection
 * operations until such time as they are actually
 * necessary.  This module is given full information as
 * to which processors are currently using which maps,
 * and to when physical maps must be made correct.
 */

#include "opt_apic.h"
#include "opt_cpu.h"
#include "opt_pmap.h"
#include "opt_smp.h"
#include "opt_xbox.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sf_buf.h>
#include <sys/sx.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#ifdef SMP
#include <sys/smp.h>
#else
#include <sys/cpuset.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_reserv.h>
#include <vm/uma.h>

#ifdef DEV_APIC
#include <sys/bus.h>
#include <machine/intr_machdep.h>
#include <machine/apicvar.h>
#endif
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#ifdef SMP
#include <machine/smp.h>
#endif

#ifdef XBOX
#include <machine/xbox.h>
#endif

#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
#define CPU_ENABLE_SSE
#endif

#ifndef PMAP_SHPGPERPROC
#define PMAP_SHPGPERPROC 200
#endif

#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
#else
#define PMAP_INLINE	extern inline
#endif
#else
#define PMAP_INLINE
#endif

#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif

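/*
 * Translate a physical address to the pv list head (in pv_table) for the
 * 2/4MB superpage that contains it.
 */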
#define	pa_index(pa)	((pa) >> PDRSHIFT)
#define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])

/*
 * Get PDEs and PTEs for user/kernel address space
 */
#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])

#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)

#define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
    atomic_clear_int((u_int *)(pte), PG_W))
#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))

struct pmap kernel_pmap_store;
LIST_HEAD(pmaplist, pmap);
static struct pmaplist allpmaps;
static struct mtx allpmaps_lock;

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
int pgeflag = 0;		/* PG_G or-in */
int pseflag = 0;		/* PG_PS or-in */

static int nkpt = NKPT;
vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR;
extern u_int32_t KERNend;
extern u_int32_t KPTphys;

#ifdef PAE
pt_entry_t pg_nx;
static uma_zone_t pdptzone;
#endif

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");

static int pat_works = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
    "Is page attribute table fully functional?");

static int pg_ps_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN, &pg_ps_enabled, 0,
    "Are large page mappings enabled?");

#define	PAT_INDEX_SIZE	8
static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */

/*
 * Isolate the global pv list lock from data and other locks to prevent false
 * sharing within the cache.
 */
static struct {
	struct rwlock	lock;
	char		padding[CACHE_LINE_SIZE - sizeof(struct rwlock)];
} pvh_global __aligned(CACHE_LINE_SIZE);

#define	pvh_global_lock	pvh_global.lock

/*
 * Data for the pv entry allocation mechanism
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
static struct md_page *pv_table;
static int shpgperproc = PMAP_SHPGPERPROC;

struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
int pv_maxchunks;			/* How many chunks we have KVA for */
vm_offset_t pv_vafree;			/* freelist stored in the PTE */

/*
 * All those kernel PT submaps that BSD is so fond of
 */
struct sysmaps {
	struct	mtx lock;
	pt_entry_t *CMAP1;
	pt_entry_t *CMAP2;
	caddr_t	CADDR1;
	caddr_t	CADDR2;
};
static struct sysmaps sysmaps_pcpu[MAXCPU];
pt_entry_t *CMAP1 = 0;
static pt_entry_t *CMAP3;
static pd_entry_t *KPTD;
caddr_t CADDR1 = 0, ptvmmap = 0;
static caddr_t CADDR3;
struct msgbuf *msgbufp = 0;

/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

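/*
 * PMAP1/PADDR1 provide a scratch PTE mapping for pmap_pte_quick(), which
 * requires the calling thread to be pinned; PMAP2/PADDR2 serve pmap_pte()
 * and are protected by PMAP2mutex.
 */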
static pt_entry_t *PMAP1 = 0, *PMAP2;
static pt_entry_t *PADDR1 = 0, *PADDR2;
#ifdef SMP
static int PMAP1cpu;
static int PMAP1changedcpu;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, 
	   &PMAP1changedcpu, 0,
	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
#endif
static int PMAP1changed;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, 
	   &PMAP1changed, 0,
	   "Number of times pmap_pte_quick changed PMAP1");
static int PMAP1unchanged;
SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, 
	   &PMAP1unchanged, 0,
	   "Number of times pmap_pte_quick didn't change PMAP1");
static struct mtx PMAP2mutex;

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);
static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);

static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte);
static void pmap_flush_page(vm_page_t m);
static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
static vm_page_t pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va);
static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
    vm_prot_t prot);
static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    vm_page_t *free);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
    vm_page_t *free);
static void pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
    vm_page_t *free);
static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
					vm_offset_t va);
static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m);
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    pd_entry_t newpde);
static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);

static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);

static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, int flags);
static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free);
static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
static void pmap_pte_release(pt_entry_t *pte);
static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *);
#ifdef PAE
static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
#endif
static void pmap_set_pg(void);

static __inline void pagezero(void *page);

CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));

/*
 * If you get an error here, then you set KVA_PAGES wrong! See the
 * description of KVA_PAGES in sys/i386/include/pmap.h. It must be a
 * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE.
 */
CTASSERT(KERNBASE % (1 << 24) == 0);

/*
 *	Bootstrap the system enough to run with virtual memory.
 *
 *	On the i386 this is called after mapping has already been enabled
 *	and just syncs the pmap module with what has already been done.
 *	[We can't call it easily with mapping off since the kernel is not
 *	mapped with PA == VA, hence we would have to relocate every address
 *	from the linked base (virtual) address "KERNBASE" to the actual
 *	(physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t firstaddr)
{
	vm_offset_t va;
	pt_entry_t *pte, *unused;
	struct sysmaps *sysmaps;
	int i;

	/*
	 * Initialize the first available kernel virtual address.  However,
	 * using "firstaddr" may waste a few pages of the kernel virtual
	 * address space, because locore may not have mapped every physical
	 * page that it allocated.  Preferably, locore would provide a first
	 * unused virtual address in addition to "firstaddr".
	 */
	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;

	virtual_end = VM_MAX_KERNEL_ADDRESS;

	/*
	 * Initialize the kernel pmap (which is statically allocated).
	 */
	PMAP_LOCK_INIT(kernel_pmap);
	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
#ifdef PAE
	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
#endif
	kernel_pmap->pm_root = NULL;
	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
	TAILQ_INIT(&kernel_pmap->pm_pvchunk);

	/*
 	 * Initialize the global pv list lock.
	 */
	rw_init(&pvh_global_lock, "pmap pv global");

	LIST_INIT(&allpmaps);

	/*
	 * Request a spin mutex so that changes to allpmaps cannot be
	 * preempted by smp_rendezvous_cpus().  Otherwise,
	 * pmap_update_pde_kernel() could access allpmaps while it is
	 * being changed.
	 */
	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
	mtx_lock_spin(&allpmaps_lock);
	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
	mtx_unlock_spin(&allpmaps_lock);

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

	va = virtual_avail;
	pte = vtopte(va);

	/*
	 * CMAP1/CMAP2 are used for zeroing and copying pages.
	 * CMAP3 is used for the idle process page zeroing.
	 */
	for (i = 0; i < MAXCPU; i++) {
		sysmaps = &sysmaps_pcpu[i];
		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
	}
	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
	SYSMAP(caddr_t, CMAP3, CADDR3, 1)

	/*
	 * Crashdump maps.
	 */
	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)

	/*
	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
	 */
	SYSMAP(caddr_t, unused, ptvmmap, 1)

	/*
	 * msgbufp is used to map the system message buffer.
	 */
	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))

	/*
	 * KPTmap is used by pmap_kextract().
	 *
	 * KPTmap is first initialized by locore.  However, that initial
	 * KPTmap can only support NKPT page table pages.  Here, a larger
	 * KPTmap is created that can support KVA_PAGES page table pages.
	 */
	SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)

	for (i = 0; i < NKPT; i++)
		KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V;

	/*
	 * Adjust the start of the KPTD and KPTmap so that the implementation
	 * of pmap_kextract() and pmap_growkernel() can be made simpler.
	 */
	KPTD -= KPTDI;
	KPTmap -= i386_btop(KPTDI << PDRSHIFT);

	/*
	 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
	 * respectively.
	 */
	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)

	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);

	virtual_avail = va;

	/*
	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
	 * physical memory region that is used by the ACPI wakeup code.  This
	 * mapping must not have PG_G set.
	 */
#ifdef XBOX
	/* FIXME: This is gross, but needed for the XBOX. Since we are in such
	 * an early stadium, we cannot yet neatly map video memory ... :-(
	 * Better fixes are very welcome! */
	if (!arch_i386_is_xbox)
#endif
	for (i = 1; i < NKPT; i++)
		PTD[i] = 0;

	/* Initialize the PAT MSR if present. */
	pmap_init_pat();

	/* Turn on PG_G on kernel page(s) */
	pmap_set_pg();
}

/*
 * Setup the PAT MSR.
 */
void
pmap_init_pat(void)
{
	int pat_table[PAT_INDEX_SIZE];
	uint64_t pat_msr;
	u_long cr0, cr4;
	int i;

	/* Set default PAT index table. */
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_table[i] = -1;
	pat_table[PAT_WRITE_BACK] = 0;
	pat_table[PAT_WRITE_THROUGH] = 1;
	pat_table[PAT_UNCACHEABLE] = 3;
	pat_table[PAT_WRITE_COMBINING] = 3;
	pat_table[PAT_WRITE_PROTECTED] = 3;
	pat_table[PAT_UNCACHED] = 3;

	/* Bail if this CPU doesn't implement PAT. */
	if ((cpu_feature & CPUID_PAT) == 0) {
		for (i = 0; i < PAT_INDEX_SIZE; i++)
			pat_index[i] = pat_table[i];
		pat_works = 0;
		return;
	}

	/*
	 * Due to some Intel errata, we can only safely use the lower 4
	 * PAT entries.
	 *
	 *   Intel Pentium III Processor Specification Update
	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
	 * or Mode C Paging)
	 *
	 *   Intel Pentium IV Processor Specification Update
	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
	 */
	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
	    !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
		pat_works = 0;

	/* Initialize default PAT entries. */
	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
	    PAT_VALUE(2, PAT_UNCACHED) |
	    PAT_VALUE(3, PAT_UNCACHEABLE) |
	    PAT_VALUE(4, PAT_WRITE_BACK) |
	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
	    PAT_VALUE(6, PAT_UNCACHED) |
	    PAT_VALUE(7, PAT_UNCACHEABLE);

	if (pat_works) {
		/*
		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
		 * Program 5 and 6 as WP and WC.
		 * Leave 4 and 7 as WB and UC.
		 */
		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
		    PAT_VALUE(6, PAT_WRITE_COMBINING);
		pat_table[PAT_UNCACHED] = 2;
		pat_table[PAT_WRITE_PROTECTED] = 5;
		pat_table[PAT_WRITE_COMBINING] = 6;
	} else {
		/*
		 * Just replace PAT Index 2 with WC instead of UC-.
		 */
		pat_msr &= ~PAT_MASK(2);
		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
		pat_table[PAT_WRITE_COMBINING] = 2;
	}

	/* Disable PGE. */
	cr4 = rcr4();
	load_cr4(cr4 & ~CR4_PGE);

	/* Disable caches (CD = 1, NW = 0). */
	cr0 = rcr0();
	load_cr0((cr0 & ~CR0_NW) | CR0_CD);

	/* Flushes caches and TLBs. */
	wbinvd();
	invltlb();

	/* Update PAT and index table. */
	wrmsr(MSR_PAT, pat_msr);
	for (i = 0; i < PAT_INDEX_SIZE; i++)
		pat_index[i] = pat_table[i];

	/* Flush caches and TLBs again. */
	wbinvd();
	invltlb();

	/* Restore caches and PGE. */
	load_cr0(cr0);
	load_cr4(cr4);
}

/*
 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
 */
static void
pmap_set_pg(void)
{
	pt_entry_t *pte;
	vm_offset_t va, endva;

	if (pgeflag == 0)
		return;

	endva = KERNBASE + KERNend;

	if (pseflag) {
		va = KERNBASE + KERNLOAD;
		while (va  < endva) {
			pdir_pde(PTD, va) |= pgeflag;
			invltlb();	/* Play it safe, invltlb() every time */
			va += NBPDR;
		}
	} else {
		va = (vm_offset_t)btext;
		while (va < endva) {
			pte = vtopte(va);
			if (*pte)
				*pte |= pgeflag;
			invltlb();	/* Play it safe, invltlb() every time */
			va += PAGE_SIZE;
		}
	}
}

/*
 * Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pat_mode = PAT_WRITE_BACK;
}

#ifdef PAE
static void *
pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
{

	/* Inform UMA that this allocator uses kernel_map/object. */
	*flags = UMA_SLAB_KERNEL;
	return ((void *)kmem_alloc_contig(kernel_map, bytes, wait, 0x0ULL,
	    0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
}
#endif

/*
 * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
 * Requirements:
 *  - Must deal with pages in order to ensure that none of the PG_* bits
 *    are ever set, PG_V in particular.
 *  - Assumes we can write to ptes without pte_store() atomic ops, even
 *    on PAE systems.  This should be ok.
 *  - Assumes nothing will ever test these addresses for 0 to indicate
 *    no mapping instead of correctly checking PG_V.
 *  - Assumes a vm_offset_t will fit in a pte (true for i386).
 * Because PG_V is never set, there can be no mappings to invalidate.
 */
static vm_offset_t
pmap_ptelist_alloc(vm_offset_t *head)
{
	pt_entry_t *pte;
	vm_offset_t va;

	va = *head;
	if (va == 0)
		return (va);	/* Out of memory */
	pte = vtopte(va);
	*head = *pte;
	if (*head & PG_V)
		panic("pmap_ptelist_alloc: va with PG_V set!");
	*pte = 0;
	return (va);
}

static void
pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
{
	pt_entry_t *pte;

	if (va & PG_V)
		panic("pmap_ptelist_free: freeing va with PG_V set!");
	pte = vtopte(va);
	*pte = *head;		/* virtual! PG_V is 0 though */
	*head = va;
}
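/*
 * Build the initial freelist by pushing each page of the KVA block, highest
 * address first, so that subsequent allocations return ascending addresses.
 */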
static void
pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
{
	int i;
	vm_offset_t va;

	*head = 0;
	for (i = npages - 1; i >= 0; i--) {
		va = (vm_offset_t)base + i * PAGE_SIZE;
		pmap_ptelist_free(head, va);
	}
}


/*
 *	Initialize the pmap module.
 *	Called by vm_init, to initialize any structures that the pmap
 *	system needs to map virtual memory.
 */
void
pmap_init(void)
{
	vm_page_t mpte;
	vm_size_t s;
	int i, pv_npg;

	/*
	 * Initialize the vm page array entries for the kernel pmap's
	 * page table pages.
	 */ 
	for (i = 0; i < NKPT; i++) {
		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
		KASSERT(mpte >= vm_page_array &&
		    mpte < &vm_page_array[vm_page_array_size],
		    ("pmap_init: page table page is out of range"));
		mpte->pindex = i + KPTDI;
		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
	}

	/*
	 * Initialize the address space (zone) for the pv entries.  Set a
	 * high water mark so that the system can recover from excessive
	 * numbers of pv entries.
	 */
	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
	pv_entry_max = roundup(pv_entry_max, _NPCPV);
	pv_entry_high_water = 9 * (pv_entry_max / 10);

	/*
	 * If the kernel is running on a virtual machine, then it must assume
	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
	 * be prepared for the hypervisor changing the vendor and family that
	 * are reported by CPUID.  Consequently, the workaround for AMD Family
	 * 10h Erratum 383 is enabled if the processor's feature set does not
	 * include at least one feature that is only supported by older Intel
	 * or newer AMD processors.
	 */
	if (vm_guest == VM_GUEST_VM && (cpu_feature & CPUID_SS) == 0 &&
	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
	    AMDID2_FMA4)) == 0)
		workaround_erratum383 = 1;

	/*
	 * Are large page mappings supported and enabled?
	 */
	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
	if (pseflag == 0)
		pg_ps_enabled = 0;
	else if (pg_ps_enabled) {
		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
		    ("pmap_init: can't assign to pagesizes[1]"));
		pagesizes[1] = NBPDR;
	}

	/*
	 * Calculate the size of the pv head table for superpages.
	 */
	for (i = 0; phys_avail[i + 1]; i += 2);
	pv_npg = round_4mpage(phys_avail[(i - 2) + 1]) / NBPDR;

	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_alloc(kernel_map, s);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);

	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
	pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map,
	    PAGE_SIZE * pv_maxchunks);
	if (pv_chunkbase == NULL)
		panic("pmap_init: not enough kvm for pv chunks");
	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
#ifdef PAE
	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
#endif
}


SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
	"Max number of PV entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
	"Page share factor per proc");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
    "2/4MB page mapping counters");

static u_long pmap_pde_demotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pde_demotions, 0, "2/4MB page demotions");

static u_long pmap_pde_mappings;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_pde_mappings, 0, "2/4MB page mappings");

static u_long pmap_pde_p_failures;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_pde_p_failures, 0, "2/4MB page promotion failures");

static u_long pmap_pde_promotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_pde_promotions, 0, "2/4MB page promotions");

/***************************************************
 * Low level helper routines.....
 ***************************************************/

/*
 * Determine the appropriate bits to set in a PTE or PDE for a specified
 * caching mode.
 */
int
pmap_cache_bits(int mode, boolean_t is_pde)
{
	int cache_bits, pat_flag, pat_idx;

	if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0)
		panic("Unknown caching mode %d\n", mode);

	/* The PAT bit is different for PTE's and PDE's. */
	pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;

	/* Map the caching mode to a PAT index. */
	pat_idx = pat_index[mode];

	/* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
	cache_bits = 0;
	if (pat_idx & 0x4)
		cache_bits |= pat_flag;
	if (pat_idx & 0x2)
		cache_bits |= PG_NC_PCD;
	if (pat_idx & 0x1)
		cache_bits |= PG_NC_PWT;
	return (cache_bits);
}

/*
 * The caller is responsible for maintaining TLB consistency.
 */
static void
pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
{
	pd_entry_t *pde;
	pmap_t pmap;
	boolean_t PTD_updated;

	PTD_updated = FALSE;
	mtx_lock_spin(&allpmaps_lock);
	LIST_FOREACH(pmap, &allpmaps, pm_list) {
		if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] &
		    PG_FRAME))
			PTD_updated = TRUE;
		pde = pmap_pde(pmap, va);
		pde_store(pde, newpde);
	}
	mtx_unlock_spin(&allpmaps_lock);
	KASSERT(PTD_updated,
	    ("pmap_kenter_pde: current page table is not in allpmaps"));
}

/*
 * After changing the page size for the specified virtual address in the page
 * table, flush the corresponding entries from the processor's TLB.  Only the
 * calling processor's TLB is affected.
 *
 * The calling thread must be pinned to a processor.
 */
static void
pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
{
	u_long cr4;

	if ((newpde & PG_PS) == 0)
		/* Demotion: flush a specific 2MB page mapping. */
		invlpg(va);
	else if ((newpde & PG_G) == 0)
		/*
		 * Promotion: flush every 4KB page mapping from the TLB
		 * because there are too many to flush individually.
		 */
		invltlb();
	else {
		/*
		 * Promotion: flush every 4KB page mapping from the TLB,
		 * including any global (PG_G) mappings.
		 */
		cr4 = rcr4();
		load_cr4(cr4 & ~CR4_PGE);
		/*
		 * Although preemption at this point could be detrimental to
		 * performance, it would not lead to an error.  PG_G is simply
		 * ignored if CR4.PGE is clear.  Moreover, in case this block
		 * is re-entered, the load_cr4() either above or below will
		 * modify CR4.PGE flushing the TLB.
		 */
		load_cr4(cr4 | CR4_PGE);
	}
}
#ifdef SMP
/*
 * For SMP, these functions have to use the IPI mechanism for coherence.
 *
 * N.B.: Before calling any of the following TLB invalidation functions,
 * the calling processor must ensure that all stores updating a non-
 * kernel page table are globally performed.  Otherwise, another
 * processor could cache an old, pre-update entry without being
 * invalidated.  This can happen one of two ways: (1) The pmap becomes
 * active on another processor after its pm_active field is checked by
 * one of the following functions but before a store updating the page
 * table is globally performed. (2) The pmap becomes active on another
 * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions still reads the
 * pmap as inactive on the other processor.
 *
 * The kernel page table is exempt because its pm_active field is
 * immutable.  The kernel page table is always active on every
 * processor.
 */
void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	cpuset_t other_cpus;
	u_int cpuid;

	sched_pin();
	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
		invlpg(va);
		smp_invlpg(va);
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		if (CPU_ISSET(cpuid, &pmap->pm_active))
			invlpg(va);
		CPU_AND(&other_cpus, &pmap->pm_active);
		if (!CPU_EMPTY(&other_cpus))
			smp_masked_invlpg(other_cpus, va);
	}
	sched_unpin();
}

void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	cpuset_t other_cpus;
	vm_offset_t addr;
	u_int cpuid;

	sched_pin();
	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
		smp_invlpg_range(sva, eva);
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		if (CPU_ISSET(cpuid, &pmap->pm_active))
			for (addr = sva; addr < eva; addr += PAGE_SIZE)
				invlpg(addr);
		CPU_AND(&other_cpus, &pmap->pm_active);
		if (!CPU_EMPTY(&other_cpus))
			smp_masked_invlpg_range(other_cpus, sva, eva);
	}
	sched_unpin();
}

void
pmap_invalidate_all(pmap_t pmap)
{
	cpuset_t other_cpus;
	u_int cpuid;

	sched_pin();
	if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) {
		invltlb();
		smp_invltlb();
	} else {
		cpuid = PCPU_GET(cpuid);
		other_cpus = all_cpus;
		CPU_CLR(cpuid, &other_cpus);
		if (CPU_ISSET(cpuid, &pmap->pm_active))
			invltlb();
		CPU_AND(&other_cpus, &pmap->pm_active);
		if (!CPU_EMPTY(&other_cpus))
			smp_masked_invltlb(other_cpus);
	}
	sched_unpin();
}

void
pmap_invalidate_cache(void)
{

	sched_pin();
	wbinvd();
	smp_cache_flush();
	sched_unpin();
}

struct pde_action {
	cpuset_t invalidate;	/* processors that invalidate their TLB */
	vm_offset_t va;
	pd_entry_t *pde;
	pd_entry_t newpde;
	u_int store;		/* processor that updates the PDE */
};

static void
pmap_update_pde_kernel(void *arg)
{
	struct pde_action *act = arg;
	pd_entry_t *pde;
	pmap_t pmap;

	if (act->store == PCPU_GET(cpuid)) {

		/*
		 * Elsewhere, this operation requires allpmaps_lock for
		 * synchronization.  Here, it does not because it is being
		 * performed in the context of an all_cpus rendezvous.
		 */
		LIST_FOREACH(pmap, &allpmaps, pm_list) {
			pde = pmap_pde(pmap, act->va);
			pde_store(pde, act->newpde);
		}
	}
}

static void
pmap_update_pde_user(void *arg)
{
	struct pde_action *act = arg;

	if (act->store == PCPU_GET(cpuid))
		pde_store(act->pde, act->newpde);
}

static void
pmap_update_pde_teardown(void *arg)
{
	struct pde_action *act = arg;

	if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
		pmap_update_pde_invalidate(act->va, act->newpde);
}

/*
 * Change the page size for the specified virtual address in a way that
 * prevents any possibility of the TLB ever having two entries that map the
 * same virtual address using different page sizes.  This is the recommended
 * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
 * machine check exception for a TLB state that is improperly diagnosed as a
 * hardware error.
 */
static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{
	struct pde_action act;
	cpuset_t active, other_cpus;
	u_int cpuid;

	sched_pin();
	cpuid = PCPU_GET(cpuid);
	other_cpus = all_cpus;
	CPU_CLR(cpuid, &other_cpus);
	if (pmap == kernel_pmap)
		active = all_cpus;
	else
		active = pmap->pm_active;
	if (CPU_OVERLAP(&active, &other_cpus)) {
		act.store = cpuid;
		act.invalidate = active;
		act.va = va;
		act.pde = pde;
		act.newpde = newpde;
		CPU_SET(cpuid, &active);
		smp_rendezvous_cpus(active,
		    smp_no_rendevous_barrier, pmap == kernel_pmap ?
		    pmap_update_pde_kernel : pmap_update_pde_user,
		    pmap_update_pde_teardown, &act);
	} else {
		if (pmap == kernel_pmap)
			pmap_kenter_pde(va, newpde);
		else
			pde_store(pde, newpde);
		if (CPU_ISSET(cpuid, &active))
			pmap_update_pde_invalidate(va, newpde);
	}
	sched_unpin();
}
#else /* !SMP */
/*
 * Normal, non-SMP, 486+ invalidation functions.
 * We inline these within pmap.c for speed.
 */
PMAP_INLINE void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{

	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		invlpg(va);
}

PMAP_INLINE void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t addr;

	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
}

PMAP_INLINE void
pmap_invalidate_all(pmap_t pmap)
{

	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		invltlb();
}

PMAP_INLINE void
pmap_invalidate_cache(void)
{

	wbinvd();
}

static void
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
{

	if (pmap == kernel_pmap)
		pmap_kenter_pde(va, newpde);
	else
		pde_store(pde, newpde);
	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
		pmap_update_pde_invalidate(va, newpde);
}
#endif /* !SMP */

#define	PMAP_CLFLUSH_THRESHOLD	(2 * 1024 * 1024)

void
pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
{

	KASSERT((sva & PAGE_MASK) == 0,
	    ("pmap_invalidate_cache_range: sva not page-aligned"));
	KASSERT((eva & PAGE_MASK) == 0,
	    ("pmap_invalidate_cache_range: eva not page-aligned"));

	if (cpu_feature & CPUID_SS)
		; /* If "Self Snoop" is supported, do nothing. */
	else if ((cpu_feature & CPUID_CLFSH) != 0 &&
	    eva - sva < PMAP_CLFLUSH_THRESHOLD) {

#ifdef DEV_APIC
		/*
		 * XXX: Some CPUs fault, hang, or trash the local APIC
		 * registers if we use CLFLUSH on the local APIC
		 * range.  The local APIC is always uncached, so we
		 * don't need to flush for that range anyway.
		 */
		if (pmap_kextract(sva) == lapic_paddr)
			return;
#endif
		/*
		 * Otherwise, do per-cache line flush.  Use the mfence
		 * instruction to ensure that previous stores are
		 * included in the write-back.  The processor
		 * propagates flush to other processors in the cache
		 * coherence domain.
		 */
		mfence();
		for (; sva < eva; sva += cpu_clflush_line_size)
			clflush(sva);
		mfence();
	} else {

		/*
		 * No targeted cache flush methods are supported by CPU,
		 * or the supplied range is bigger than 2MB.
		 * Globally invalidate cache.
		 */
		pmap_invalidate_cache();
	}
}

void
pmap_invalidate_cache_pages(vm_page_t *pages, int count)
{
	int i;

	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
	    (cpu_feature & CPUID_CLFSH) == 0) {
		pmap_invalidate_cache();
	} else {
		for (i = 0; i < count; i++)
			pmap_flush_page(pages[i]);
	}
}

/*
 * Are we current address space or kernel?  N.B. We return FALSE when
 * a pmap's page table is in use because a kernel thread is borrowing
 * it.  The borrowed page table can change spontaneously, making any
 * dependence on its continued use subject to a race condition.
 */
static __inline int
pmap_is_current(pmap_t pmap)
{

	return (pmap == kernel_pmap ||
	    (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
}

/*
 * If the given pmap is not the current or kernel pmap, the returned pte must
 * be released by passing it to pmap_pte_release().
 */
pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t newpf;
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (*pde & PG_PS)
		return (pde);
	if (*pde != 0) {
		/* are we current address space or kernel? */
		if (pmap_is_current(pmap))
			return (vtopte(va));
		mtx_lock(&PMAP2mutex);
		newpf = *pde & PG_FRAME;
		if ((*PMAP2 & PG_FRAME) != newpf) {
			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
		}
		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
	}
	return (NULL);
}

/*
 * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
 * being NULL.
 */
static __inline void
pmap_pte_release(pt_entry_t *pte)
{

	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
		mtx_unlock(&PMAP2mutex);
}

static __inline void
invlcaddr(void *caddr)
{

	invlpg((u_int)caddr);
}

/*
 * Super fast pmap_pte routine best used when scanning
 * the pv lists.  This eliminates many coarse-grained
 * invltlb calls.  Note that many of the pv list
 * scans are across different pmaps.  It is very wasteful
 * to do an entire invltlb for checking a single mapping.
 *
 * If the given pmap is not the current pmap, pvh_global_lock
 * must be held and curthread pinned to a CPU.
 */
static pt_entry_t *
pmap_pte_quick(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t newpf;
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (*pde & PG_PS)
		return (pde);
	if (*pde != 0) {
		/* are we current address space or kernel? */
		if (pmap_is_current(pmap))
			return (vtopte(va));
		rw_assert(&pvh_global_lock, RA_WLOCKED);
		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
		newpf = *pde & PG_FRAME;
		if ((*PMAP1 & PG_FRAME) != newpf) {
			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
#ifdef SMP
			PMAP1cpu = PCPU_GET(cpuid);
#endif
			invlcaddr(PADDR1);
			PMAP1changed++;
		} else
#ifdef SMP
		if (PMAP1cpu != PCPU_GET(cpuid)) {
			PMAP1cpu = PCPU_GET(cpuid);
			invlcaddr(PADDR1);
			PMAP1changedcpu++;
		} else
#endif
			PMAP1unchanged++;
		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
	}
	return (0);
}

/*
 *	Routine:	pmap_extract
 *	Function:
 *		Extract the physical page address associated
 *		with the given map/virtual_address pair.
 */
vm_paddr_t 
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	vm_paddr_t rtval;
	pt_entry_t *pte;
	pd_entry_t pde;

	rtval = 0;
	PMAP_LOCK(pmap);
	pde = pmap->pm_pdir[va >> PDRSHIFT];
	if (pde != 0) {
		if ((pde & PG_PS) != 0)
			rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
		else {
			pte = pmap_pte(pmap, va);
			rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
			pmap_pte_release(pte);
		}
	}
	PMAP_UNLOCK(pmap);
	return (rtval);
}

/*
 *	Routine:	pmap_extract_and_hold
 *	Function:
 *		Atomically extract and hold the physical page
 *		with the given pmap and virtual address pair
 *		if that mapping permits the given protection.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pd_entry_t pde;
	pt_entry_t pte, *ptep;
	vm_page_t m;
	vm_paddr_t pa;

	pa = 0;
	m = NULL;
	PMAP_LOCK(pmap);
retry:
	pde = *pmap_pde(pmap, va);
	if (pde != 0) {
		if (pde & PG_PS) {
			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
				if (vm_page_pa_tryrelock(pmap, (pde &
				    PG_PS_FRAME) | (va & PDRMASK), &pa))
					goto retry;
				m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
				    (va & PDRMASK));
				vm_page_hold(m);
			}
		} else {
			ptep = pmap_pte(pmap, va);
			pte = *ptep;
			pmap_pte_release(ptep);
			if (pte != 0 &&
			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
				    &pa))
					goto retry;
				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
				vm_page_hold(m);
			}
		}
	}
	PA_UNLOCK_COND(pa);
	PMAP_UNLOCK(pmap);
	return (m);
}

/***************************************************
 * Low level mapping routines.....
 ***************************************************/

/*
 * Add a wired page to the kva.
 * Note: not SMP coherent.
 *
 * This function may be used before pmap_bootstrap() is called.
 */
PMAP_INLINE void 
pmap_kenter(vm_offset_t va, vm_paddr_t pa)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
}

static __inline void
pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
}

/*
 * Remove a page from the kernel pagetables.
 * Note: not SMP coherent.
 *
 * This function may be used before pmap_bootstrap() is called.
 */
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_clear(pte);
}

/*
 *	Used to map a range of physical addresses into kernel
 *	virtual address space.
 *
 *	The value passed in '*virt' is a suggested virtual address for
 *	the mapping. Architectures which can support a direct-mapped
 *	physical to virtual region can return the appropriate address
 *	within that region, leaving '*virt' unchanged. Other
 *	architectures should map the pages starting at '*virt' and
 *	update '*virt' with the first usable address after the mapped
 *	region.
 */
vm_offset_t
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{
	vm_offset_t va, sva;
	vm_paddr_t superpage_offset;
	pd_entry_t newpde;

	va = *virt;
	/*
	 * Does the physical address range's size and alignment permit at
	 * least one superpage mapping to be created?
	 */ 
	superpage_offset = start & PDRMASK;
	if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) {
		/*
		 * Increase the starting virtual address so that its alignment
		 * does not preclude the use of superpage mappings.
		 */
		if ((va & PDRMASK) < superpage_offset)
			va = (va & ~PDRMASK) + superpage_offset;
		else if ((va & PDRMASK) > superpage_offset)
			va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset;
	}
	sva = va;
	while (start < end) {
		if ((start & PDRMASK) == 0 && end - start >= NBPDR &&
		    pseflag) {
			KASSERT((va & PDRMASK) == 0,
			    ("pmap_map: misaligned va %#x", va));
			newpde = start | PG_PS | pgeflag | PG_RW | PG_V;
			pmap_kenter_pde(va, newpde);
			va += NBPDR;
			start += NBPDR;
		} else {
			pmap_kenter(va, start);
			va += PAGE_SIZE;
			start += PAGE_SIZE;
		}
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
	*virt = va;
	return (sva);
}


/*
 * Add a list of wired pages to the kva
 * this routine is only used for temporary
 * kernel mappings that do not need to have
 * page modification or references recorded.
 * Note that old mappings are simply written
 * over.  The page *must* be wired.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
{
	pt_entry_t *endpte, oldpte, pa, *pte;
	vm_page_t m;

	oldpte = 0;
	pte = vtopte(sva);
	endpte = pte + count;
	while (pte < endpte) {
		m = *ma++;
		pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0);
		if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
			oldpte |= *pte;
			pte_store(pte, pa | pgeflag | PG_RW | PG_V);
		}
		pte++;
	}
	if (__predict_false((oldpte & PG_V) != 0))
		pmap_invalidate_range(kernel_pmap, sva, sva + count *
		    PAGE_SIZE);
}

/*
 * This routine tears out page mappings from the
 * kernel -- it is meant only for temporary mappings.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qremove(vm_offset_t sva, int count)
{
	vm_offset_t va;

	va = sva;
	while (count-- > 0) {
		pmap_kremove(va);
		va += PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

/***************************************************
 * Page table page management routines.....
 ***************************************************/
static __inline void
pmap_free_zero_pages(vm_page_t free)
{
	vm_page_t m;

	while (free != NULL) {
		m = free;
		free = m->right;
		/* Preserve the page's PG_ZERO setting. */
		vm_page_free_toq(m);
	}
}

/*
 * Schedule the specified unused page table page to be freed.  Specifically,
 * add the page to the specified list of pages that will be released to the
 * physical memory manager after the TLB has been updated.
 */
static __inline void
pmap_add_delayed_free_list(vm_page_t m, vm_page_t *free, boolean_t set_PG_ZERO)
{

	if (set_PG_ZERO)
		m->flags |= PG_ZERO;
	else
		m->flags &= ~PG_ZERO;
	m->right = *free;
	*free = m;
}

/*
 * Inserts the specified page table page into the specified pmap's collection
 * of idle page table pages.  Each of a pmap's page table pages is responsible
 * for mapping a distinct range of virtual addresses.  The pmap's collection is
 * ordered by this virtual address range.
 */
static void
pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
{
	vm_page_t root;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	root = pmap->pm_root;
	if (root == NULL) {
		mpte->left = NULL;
		mpte->right = NULL;
	} else {
		root = vm_page_splay(mpte->pindex, root);
		if (mpte->pindex < root->pindex) {
			mpte->left = root->left;
			mpte->right = root;
			root->left = NULL;
		} else if (mpte->pindex == root->pindex)
			panic("pmap_insert_pt_page: pindex already inserted");
		else {
			mpte->right = root->right;
			mpte->left = root;
			root->right = NULL;
		}
	}
	pmap->pm_root = mpte;
}

/*
 * Looks for a page table page mapping the specified virtual address in the
 * specified pmap's collection of idle page table pages.  Returns NULL if there
 * is no page table page corresponding to the specified virtual address.
 */
static vm_page_t
pmap_lookup_pt_page(pmap_t pmap, vm_offset_t va)
{
	vm_page_t mpte;
	vm_pindex_t pindex = va >> PDRSHIFT;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if ((mpte = pmap->pm_root) != NULL && mpte->pindex != pindex) {
		mpte = vm_page_splay(pindex, mpte);
		if ((pmap->pm_root = mpte)->pindex != pindex)
			mpte = NULL;
	}
	return (mpte);
}

/*
 * Removes the specified page table page from the specified pmap's collection
 * of idle page table pages.  The specified page table page must be a member of
 * the pmap's collection.
 */
static void
pmap_remove_pt_page(pmap_t pmap, vm_page_t mpte)
{
	vm_page_t root;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (mpte != pmap->pm_root)
		vm_page_splay(mpte->pindex, pmap->pm_root);
	if (mpte->left == NULL)
		root = mpte->right;
	else {
		root = vm_page_splay(mpte->pindex, mpte->left);
		root->right = mpte->right;
	}
	pmap->pm_root = root;
}

/*
 * Decrements a page table page's wire count, which is used to record the
 * number of valid page table entries within the page.  If the wire count
 * drops to zero, then the page table page is unmapped.  Returns TRUE if the
 * page table page was unmapped and FALSE otherwise.
 */
static inline boolean_t
pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free)
{

	--m->wire_count;
	if (m->wire_count == 0) {
		_pmap_unwire_ptp(pmap, m, free);
		return (TRUE);
	} else
		return (FALSE);
}

static void
_pmap_unwire_ptp(pmap_t pmap, vm_page_t m, vm_page_t *free)
{
	vm_offset_t pteva;

	/*
	 * unmap the page table page
	 */
	pmap->pm_pdir[m->pindex] = 0;
	--pmap->pm_stats.resident_count;

	/*
	 * This is a release store so that the ordinary store unmapping
	 * the page table page is globally performed before TLB shoot-
	 * down is begun.
	 */
	atomic_subtract_rel_int(&cnt.v_wire_count, 1);

	/*
	 * Do an invltlb to make the invalidated mapping
	 * take effect immediately.
	 */
	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
	pmap_invalidate_page(pmap, pteva);

	/* 
	 * Put page on a list so that it is released after
	 * *ALL* TLB shootdown is done
	 */
	pmap_add_delayed_free_list(m, free, TRUE);
}

/*
 * After removing a page table entry, this routine is used to
 * conditionally free the page, and manage the hold/wire counts.
 */
static int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free)
{
	pd_entry_t ptepde;
	vm_page_t mpte;

	if (va >= VM_MAXUSER_ADDRESS)
		return (0);
	ptepde = *pmap_pde(pmap, va);
	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
	return (pmap_unwire_ptp(pmap, mpte, free));
}

/*
 * Initialize the pmap for the swapper process.
 */
void
pmap_pinit0(pmap_t pmap)
{

	PMAP_LOCK_INIT(pmap);
	/*
	 * Since the page table directory is shared with the kernel pmap,
	 * which is already included in the list "allpmaps", this pmap does
	 * not need to be inserted into that list.
	 */
	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
#ifdef PAE
	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
#endif
	pmap->pm_root = NULL;
	CPU_ZERO(&pmap->pm_active);
	PCPU_SET(curpmap, pmap);
	TAILQ_INIT(&pmap->pm_pvchunk);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
}

/*
 * Initialize a preallocated and zeroed pmap structure,
 * such as one in a vmspace structure.
 */
int
pmap_pinit(pmap_t pmap)
{
	vm_page_t m, ptdpg[NPGPTD];
	vm_paddr_t pa;
	int i;

	PMAP_LOCK_INIT(pmap);

	/*
	 * No need to allocate page table space yet but we do need a valid
	 * page directory table.
	 */
	if (pmap->pm_pdir == NULL) {
		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
		    NBPTD);
		if (pmap->pm_pdir == NULL) {
			PMAP_LOCK_DESTROY(pmap);
			return (0);
		}
#ifdef PAE
		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
		KASSERT(((vm_offset_t)pmap->pm_pdpt &
		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
		    ("pmap_pinit: pdpt misaligned"));
		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
		    ("pmap_pinit: pdpt above 4g"));
#endif
		pmap->pm_root = NULL;
	}
	KASSERT(pmap->pm_root == NULL,
	    ("pmap_pinit: pmap has reserved page table page(s)"));

	/*
	 * allocate the page directory page(s)
	 */
	for (i = 0; i < NPGPTD;) {
		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
		    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
		if (m == NULL)
			VM_WAIT;
		else {
			ptdpg[i++] = m;
		}
	}

	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);

	for (i = 0; i < NPGPTD; i++)
		if ((ptdpg[i]->flags & PG_ZERO) == 0)
			pagezero(pmap->pm_pdir + (i * NPDEPG));

	mtx_lock_spin(&allpmaps_lock);
	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
	/* Copy the kernel page table directory entries. */
	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
	mtx_unlock_spin(&allpmaps_lock);

	/* install self-referential address mapping entry(s) */
	for (i = 0; i < NPGPTD; i++) {
		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
#ifdef PAE
		pmap->pm_pdpt[i] = pa | PG_V;
#endif
	}

	CPU_ZERO(&pmap->pm_active);
	TAILQ_INIT(&pmap->pm_pvchunk);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);

	return (1);
}

/*
 * this routine is called if the page table page is not
 * mapped correctly.
 */
static vm_page_t
_pmap_allocpte(pmap_t pmap, u_int ptepindex, int flags)
{
	vm_paddr_t ptepa;
	vm_page_t m;

	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));

	/*
	 * Allocate a page table page.
	 */
	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
		if (flags & M_WAITOK) {
			PMAP_UNLOCK(pmap);
			rw_wunlock(&pvh_global_lock);
			VM_WAIT;
			rw_wlock(&pvh_global_lock);
			PMAP_LOCK(pmap);
		}

		/*
		 * Indicate the need to retry.  While waiting, the page table
		 * page may have been allocated.
		 */
		return (NULL);
	}
	if ((m->flags & PG_ZERO) == 0)
		pmap_zero_page(m);

	/*
	 * Map the pagetable page into the process address space, if
	 * it isn't already there.
1905 */ 1906 1907 pmap->pm_stats.resident_count++; 1908 1909 ptepa = VM_PAGE_TO_PHYS(m); 1910 pmap->pm_pdir[ptepindex] = 1911 (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); 1912 1913 return (m); 1914} 1915 1916static vm_page_t 1917pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags) 1918{ 1919 u_int ptepindex; 1920 pd_entry_t ptepa; 1921 vm_page_t m; 1922 1923 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || 1924 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, 1925 ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); 1926 1927 /* 1928 * Calculate pagetable page index 1929 */ 1930 ptepindex = va >> PDRSHIFT; 1931retry: 1932 /* 1933 * Get the page directory entry 1934 */ 1935 ptepa = pmap->pm_pdir[ptepindex]; 1936 1937 /* 1938 * This supports switching from a 4MB page to a 1939 * normal 4K page. 1940 */ 1941 if (ptepa & PG_PS) { 1942 (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); 1943 ptepa = pmap->pm_pdir[ptepindex]; 1944 } 1945 1946 /* 1947 * If the page table page is mapped, we just increment the 1948 * hold count, and activate it. 1949 */ 1950 if (ptepa) { 1951 m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); 1952 m->wire_count++; 1953 } else { 1954 /* 1955 * Here if the pte page isn't mapped, or if it has 1956 * been deallocated. 1957 */ 1958 m = _pmap_allocpte(pmap, ptepindex, flags); 1959 if (m == NULL && (flags & M_WAITOK)) 1960 goto retry; 1961 } 1962 return (m); 1963} 1964 1965 1966/*************************************************** 1967* Pmap allocation/deallocation routines. 1968 ***************************************************/ 1969 1970#ifdef SMP 1971/* 1972 * Deal with a SMP shootdown of other users of the pmap that we are 1973 * trying to dispose of. This can be a bit hairy. 1974 */ 1975static cpuset_t *lazymask; 1976static u_int lazyptd; 1977static volatile u_int lazywait; 1978 1979void pmap_lazyfix_action(void); 1980 1981void 1982pmap_lazyfix_action(void) 1983{ 1984 1985#ifdef COUNT_IPIS 1986 (*ipi_lazypmap_counts[PCPU_GET(cpuid)])++; 1987#endif 1988 if (rcr3() == lazyptd) 1989 load_cr3(curpcb->pcb_cr3); 1990 CPU_CLR_ATOMIC(PCPU_GET(cpuid), lazymask); 1991 atomic_store_rel_int(&lazywait, 1); 1992} 1993 1994static void 1995pmap_lazyfix_self(u_int cpuid) 1996{ 1997 1998 if (rcr3() == lazyptd) 1999 load_cr3(curpcb->pcb_cr3); 2000 CPU_CLR_ATOMIC(cpuid, lazymask); 2001} 2002 2003 2004static void 2005pmap_lazyfix(pmap_t pmap) 2006{ 2007 cpuset_t mymask, mask; 2008 u_int cpuid, spins; 2009 int lsb; 2010 2011 mask = pmap->pm_active; 2012 while (!CPU_EMPTY(&mask)) { 2013 spins = 50000000; 2014 2015 /* Find least significant set bit. */ 2016 lsb = CPU_FFS(&mask); 2017 MPASS(lsb != 0); 2018 lsb--; 2019 CPU_SETOF(lsb, &mask); 2020 mtx_lock_spin(&smp_ipi_mtx); 2021#ifdef PAE 2022 lazyptd = vtophys(pmap->pm_pdpt); 2023#else 2024 lazyptd = vtophys(pmap->pm_pdir); 2025#endif 2026 cpuid = PCPU_GET(cpuid); 2027 2028 /* Use a cpuset just for having an easy check. 
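	 * mymask contains only the current CPU: when the CPU picked from
	 * pm_active is the current one, the stale %cr3 is fixed locally;
	 * otherwise an IPI_LAZYPMAP is sent to that CPU and we spin
	 * waiting for it to acknowledge.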
*/ 2029 CPU_SETOF(cpuid, &mymask); 2030 if (!CPU_CMP(&mask, &mymask)) { 2031 lazymask = &pmap->pm_active; 2032 pmap_lazyfix_self(cpuid); 2033 } else { 2034 atomic_store_rel_int((u_int *)&lazymask, 2035 (u_int)&pmap->pm_active); 2036 atomic_store_rel_int(&lazywait, 0); 2037 ipi_selected(mask, IPI_LAZYPMAP); 2038 while (lazywait == 0) { 2039 ia32_pause(); 2040 if (--spins == 0) 2041 break; 2042 } 2043 } 2044 mtx_unlock_spin(&smp_ipi_mtx); 2045 if (spins == 0) 2046 printf("pmap_lazyfix: spun for 50000000\n"); 2047 mask = pmap->pm_active; 2048 } 2049} 2050 2051#else /* SMP */ 2052 2053/* 2054 * Cleaning up on uniprocessor is easy. For various reasons, we're 2055 * unlikely to have to even execute this code, including the fact 2056 * that the cleanup is deferred until the parent does a wait(2), which 2057 * means that another userland process has run. 2058 */ 2059static void 2060pmap_lazyfix(pmap_t pmap) 2061{ 2062 u_int cr3; 2063 2064 cr3 = vtophys(pmap->pm_pdir); 2065 if (cr3 == rcr3()) { 2066 load_cr3(curpcb->pcb_cr3); 2067 CPU_CLR(PCPU_GET(cpuid), &pmap->pm_active); 2068 } 2069} 2070#endif /* SMP */ 2071 2072/* 2073 * Release any resources held by the given physical map. 2074 * Called when a pmap initialized by pmap_pinit is being released. 2075 * Should only be called if the map contains no valid mappings. 2076 */ 2077void 2078pmap_release(pmap_t pmap) 2079{ 2080 vm_page_t m, ptdpg[NPGPTD]; 2081 int i; 2082 2083 KASSERT(pmap->pm_stats.resident_count == 0, 2084 ("pmap_release: pmap resident count %ld != 0", 2085 pmap->pm_stats.resident_count)); 2086 KASSERT(pmap->pm_root == NULL, 2087 ("pmap_release: pmap has reserved page table page(s)")); 2088 2089 pmap_lazyfix(pmap); 2090 mtx_lock_spin(&allpmaps_lock); 2091 LIST_REMOVE(pmap, pm_list); 2092 mtx_unlock_spin(&allpmaps_lock); 2093 2094 for (i = 0; i < NPGPTD; i++) 2095 ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] & 2096 PG_FRAME); 2097 2098 bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * 2099 sizeof(*pmap->pm_pdir)); 2100 2101 pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); 2102 2103 for (i = 0; i < NPGPTD; i++) { 2104 m = ptdpg[i]; 2105#ifdef PAE 2106 KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), 2107 ("pmap_release: got wrong ptd page")); 2108#endif 2109 m->wire_count--; 2110 atomic_subtract_int(&cnt.v_wire_count, 1); 2111 vm_page_free_zero(m); 2112 } 2113 PMAP_LOCK_DESTROY(pmap); 2114} 2115 2116static int 2117kvm_size(SYSCTL_HANDLER_ARGS) 2118{ 2119 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; 2120 2121 return (sysctl_handle_long(oidp, &ksize, 0, req)); 2122} 2123SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 2124 0, 0, kvm_size, "IU", "Size of KVM"); 2125 2126static int 2127kvm_free(SYSCTL_HANDLER_ARGS) 2128{ 2129 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 2130 2131 return (sysctl_handle_long(oidp, &kfree, 0, req)); 2132} 2133SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 2134 0, 0, kvm_free, "IU", "Amount of KVM free"); 2135 2136/* 2137 * grow the number of kernel page table entries, if needed 2138 */ 2139void 2140pmap_growkernel(vm_offset_t addr) 2141{ 2142 vm_paddr_t ptppaddr; 2143 vm_page_t nkpg; 2144 pd_entry_t newpdir; 2145 2146 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 2147 addr = roundup2(addr, NBPDR); 2148 if (addr - 1 >= kernel_map->max_offset) 2149 addr = kernel_map->max_offset; 2150 while (kernel_vm_end < addr) { 2151 if (pdir_pde(PTD, kernel_vm_end)) { 2152 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2153 if (kernel_vm_end - 1 
>= kernel_map->max_offset) { 2154 kernel_vm_end = kernel_map->max_offset; 2155 break; 2156 } 2157 continue; 2158 } 2159 2160 nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, 2161 VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 2162 VM_ALLOC_ZERO); 2163 if (nkpg == NULL) 2164 panic("pmap_growkernel: no memory to grow kernel"); 2165 2166 nkpt++; 2167 2168 if ((nkpg->flags & PG_ZERO) == 0) 2169 pmap_zero_page(nkpg); 2170 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 2171 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); 2172 pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir; 2173 2174 pmap_kenter_pde(kernel_vm_end, newpdir); 2175 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; 2176 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 2177 kernel_vm_end = kernel_map->max_offset; 2178 break; 2179 } 2180 } 2181} 2182 2183 2184/*************************************************** 2185 * page management routines. 2186 ***************************************************/ 2187 2188CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 2189CTASSERT(_NPCM == 11); 2190CTASSERT(_NPCPV == 336); 2191 2192static __inline struct pv_chunk * 2193pv_to_chunk(pv_entry_t pv) 2194{ 2195 2196 return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); 2197} 2198 2199#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 2200 2201#define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ 2202#define PC_FREE10 0x0000fffful /* Free values for index 10 */ 2203 2204static const uint32_t pc_freemask[_NPCM] = { 2205 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2206 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2207 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 2208 PC_FREE0_9, PC_FREE10 2209}; 2210 2211SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 2212 "Current number of pv entries"); 2213 2214#ifdef PV_STATS 2215static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 2216 2217SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 2218 "Current number of pv entry chunks"); 2219SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 2220 "Current number of pv entry chunks allocated"); 2221SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 2222 "Current number of pv entry chunks frees"); 2223SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 2224 "Number of times tried to get a chunk page but failed."); 2225 2226static long pv_entry_frees, pv_entry_allocs; 2227static int pv_entry_spare; 2228 2229SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 2230 "Current number of pv entry frees"); 2231SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 2232 "Current number of pv entry allocs"); 2233SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 2234 "Current number of spare pv entries"); 2235#endif 2236 2237/* 2238 * We are in a serious low memory condition. Resort to 2239 * drastic measures to free some pages so we can allocate 2240 * another pv entry chunk. 
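 * Chunks on the global pv_chunks list are scanned in LRU order, and
 * every non-wired 4KB managed mapping described by a chunk is torn
 * down.  The page returned is either the page backing an emptied
 * chunk or a page table page freed as a side effect; the caller uses
 * it to back a new chunk.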
2241 */ 2242static vm_page_t 2243pmap_pv_reclaim(pmap_t locked_pmap) 2244{ 2245 struct pch newtail; 2246 struct pv_chunk *pc; 2247 struct md_page *pvh; 2248 pd_entry_t *pde; 2249 pmap_t pmap; 2250 pt_entry_t *pte, tpte; 2251 pv_entry_t pv; 2252 vm_offset_t va; 2253 vm_page_t free, m, m_pc; 2254 uint32_t inuse; 2255 int bit, field, freed; 2256 2257 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); 2258 pmap = NULL; 2259 free = m_pc = NULL; 2260 TAILQ_INIT(&newtail); 2261 while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || 2262 free == NULL)) { 2263 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2264 if (pmap != pc->pc_pmap) { 2265 if (pmap != NULL) { 2266 pmap_invalidate_all(pmap); 2267 if (pmap != locked_pmap) 2268 PMAP_UNLOCK(pmap); 2269 } 2270 pmap = pc->pc_pmap; 2271 /* Avoid deadlock and lock recursion. */ 2272 if (pmap > locked_pmap) 2273 PMAP_LOCK(pmap); 2274 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { 2275 pmap = NULL; 2276 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2277 continue; 2278 } 2279 } 2280 2281 /* 2282 * Destroy every non-wired, 4 KB page mapping in the chunk. 2283 */ 2284 freed = 0; 2285 for (field = 0; field < _NPCM; field++) { 2286 for (inuse = ~pc->pc_map[field] & pc_freemask[field]; 2287 inuse != 0; inuse &= ~(1UL << bit)) { 2288 bit = bsfl(inuse); 2289 pv = &pc->pc_pventry[field * 32 + bit]; 2290 va = pv->pv_va; 2291 pde = pmap_pde(pmap, va); 2292 if ((*pde & PG_PS) != 0) 2293 continue; 2294 pte = pmap_pte(pmap, va); 2295 tpte = *pte; 2296 if ((tpte & PG_W) == 0) 2297 tpte = pte_load_clear(pte); 2298 pmap_pte_release(pte); 2299 if ((tpte & PG_W) != 0) 2300 continue; 2301 KASSERT(tpte != 0, 2302 ("pmap_pv_reclaim: pmap %p va %x zero pte", 2303 pmap, va)); 2304 if ((tpte & PG_G) != 0) 2305 pmap_invalidate_page(pmap, va); 2306 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 2307 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2308 vm_page_dirty(m); 2309 if ((tpte & PG_A) != 0) 2310 vm_page_aflag_set(m, PGA_REFERENCED); 2311 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2312 if (TAILQ_EMPTY(&m->md.pv_list) && 2313 (m->flags & PG_FICTITIOUS) == 0) { 2314 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2315 if (TAILQ_EMPTY(&pvh->pv_list)) { 2316 vm_page_aflag_clear(m, 2317 PGA_WRITEABLE); 2318 } 2319 } 2320 pc->pc_map[field] |= 1UL << bit; 2321 pmap_unuse_pt(pmap, va, &free); 2322 freed++; 2323 } 2324 } 2325 if (freed == 0) { 2326 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2327 continue; 2328 } 2329 /* Every freed mapping is for a 4 KB page. */ 2330 pmap->pm_stats.resident_count -= freed; 2331 PV_STAT(pv_entry_frees += freed); 2332 PV_STAT(pv_entry_spare += freed); 2333 pv_entry_count -= freed; 2334 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2335 for (field = 0; field < _NPCM; field++) 2336 if (pc->pc_map[field] != pc_freemask[field]) { 2337 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2338 pc_list); 2339 TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); 2340 2341 /* 2342 * One freed pv entry in locked_pmap is 2343 * sufficient. 2344 */ 2345 if (pmap == locked_pmap) 2346 goto out; 2347 break; 2348 } 2349 if (field == _NPCM) { 2350 PV_STAT(pv_entry_spare -= _NPCPV); 2351 PV_STAT(pc_chunk_count--); 2352 PV_STAT(pc_chunk_frees++); 2353 /* Entire chunk is free; return it. 
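			 * Its KVA is handed back to pv_vafree, and the
			 * physical page backing it becomes this function's
			 * return value.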
*/ 2354 m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2355 pmap_qremove((vm_offset_t)pc, 1); 2356 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 2357 break; 2358 } 2359 } 2360out: 2361 TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); 2362 if (pmap != NULL) { 2363 pmap_invalidate_all(pmap); 2364 if (pmap != locked_pmap) 2365 PMAP_UNLOCK(pmap); 2366 } 2367 if (m_pc == NULL && pv_vafree != 0 && free != NULL) { 2368 m_pc = free; 2369 free = m_pc->right; 2370 /* Recycle a freed page table page. */ 2371 m_pc->wire_count = 1; 2372 atomic_add_int(&cnt.v_wire_count, 1); 2373 } 2374 pmap_free_zero_pages(free); 2375 return (m_pc); 2376} 2377 2378/* 2379 * free the pv_entry back to the free list 2380 */ 2381static void 2382free_pv_entry(pmap_t pmap, pv_entry_t pv) 2383{ 2384 struct pv_chunk *pc; 2385 int idx, field, bit; 2386 2387 rw_assert(&pvh_global_lock, RA_WLOCKED); 2388 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2389 PV_STAT(pv_entry_frees++); 2390 PV_STAT(pv_entry_spare++); 2391 pv_entry_count--; 2392 pc = pv_to_chunk(pv); 2393 idx = pv - &pc->pc_pventry[0]; 2394 field = idx / 32; 2395 bit = idx % 32; 2396 pc->pc_map[field] |= 1ul << bit; 2397 for (idx = 0; idx < _NPCM; idx++) 2398 if (pc->pc_map[idx] != pc_freemask[idx]) { 2399 /* 2400 * 98% of the time, pc is already at the head of the 2401 * list. If it isn't already, move it to the head. 2402 */ 2403 if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != 2404 pc)) { 2405 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2406 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, 2407 pc_list); 2408 } 2409 return; 2410 } 2411 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2412 free_pv_chunk(pc); 2413} 2414 2415static void 2416free_pv_chunk(struct pv_chunk *pc) 2417{ 2418 vm_page_t m; 2419 2420 TAILQ_REMOVE(&pv_chunks, pc, pc_lru); 2421 PV_STAT(pv_entry_spare -= _NPCPV); 2422 PV_STAT(pc_chunk_count--); 2423 PV_STAT(pc_chunk_frees++); 2424 /* entire chunk is free, return it */ 2425 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 2426 pmap_qremove((vm_offset_t)pc, 1); 2427 vm_page_unwire(m, 0); 2428 vm_page_free(m); 2429 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 2430} 2431 2432/* 2433 * get a new pv_entry, allocating a block from the system 2434 * when needed. 
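 * pv entries are carved out of page-sized chunks of _NPCPV (336)
 * entries each.  Free entries within a chunk are tracked by an
 * _NPCM (11) word bitmap: ten fully used 32-bit words plus the low
 * 16 bits of the eleventh, 10 * 32 + 16 == 336, which is what the
 * PC_FREE0_9 and PC_FREE10 masks above encode.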
2435 */ 2436static pv_entry_t 2437get_pv_entry(pmap_t pmap, boolean_t try) 2438{ 2439 static const struct timeval printinterval = { 60, 0 }; 2440 static struct timeval lastprint; 2441 int bit, field; 2442 pv_entry_t pv; 2443 struct pv_chunk *pc; 2444 vm_page_t m; 2445 2446 rw_assert(&pvh_global_lock, RA_WLOCKED); 2447 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2448 PV_STAT(pv_entry_allocs++); 2449 pv_entry_count++; 2450 if (pv_entry_count > pv_entry_high_water) 2451 if (ratecheck(&lastprint, &printinterval)) 2452 printf("Approaching the limit on PV entries, consider " 2453 "increasing either the vm.pmap.shpgperproc or the " 2454 "vm.pmap.pv_entry_max tunable.\n"); 2455retry: 2456 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 2457 if (pc != NULL) { 2458 for (field = 0; field < _NPCM; field++) { 2459 if (pc->pc_map[field]) { 2460 bit = bsfl(pc->pc_map[field]); 2461 break; 2462 } 2463 } 2464 if (field < _NPCM) { 2465 pv = &pc->pc_pventry[field * 32 + bit]; 2466 pc->pc_map[field] &= ~(1ul << bit); 2467 /* If this was the last item, move it to tail */ 2468 for (field = 0; field < _NPCM; field++) 2469 if (pc->pc_map[field] != 0) { 2470 PV_STAT(pv_entry_spare--); 2471 return (pv); /* not full, return */ 2472 } 2473 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 2474 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 2475 PV_STAT(pv_entry_spare--); 2476 return (pv); 2477 } 2478 } 2479 /* 2480 * Access to the ptelist "pv_vafree" is synchronized by the pvh 2481 * global lock. If "pv_vafree" is currently non-empty, it will 2482 * remain non-empty until pmap_ptelist_alloc() completes. 2483 */ 2484 if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | 2485 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { 2486 if (try) { 2487 pv_entry_count--; 2488 PV_STAT(pc_chunk_tryfail++); 2489 return (NULL); 2490 } 2491 m = pmap_pv_reclaim(pmap); 2492 if (m == NULL) 2493 goto retry; 2494 } 2495 PV_STAT(pc_chunk_count++); 2496 PV_STAT(pc_chunk_allocs++); 2497 pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); 2498 pmap_qenter((vm_offset_t)pc, &m, 1); 2499 pc->pc_pmap = pmap; 2500 pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ 2501 for (field = 1; field < _NPCM; field++) 2502 pc->pc_map[field] = pc_freemask[field]; 2503 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); 2504 pv = &pc->pc_pventry[0]; 2505 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 2506 PV_STAT(pv_entry_spare += _NPCPV - 1); 2507 return (pv); 2508} 2509 2510static __inline pv_entry_t 2511pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2512{ 2513 pv_entry_t pv; 2514 2515 rw_assert(&pvh_global_lock, RA_WLOCKED); 2516 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { 2517 if (pmap == PV_PMAP(pv) && va == pv->pv_va) { 2518 TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); 2519 break; 2520 } 2521 } 2522 return (pv); 2523} 2524 2525static void 2526pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2527{ 2528 struct md_page *pvh; 2529 pv_entry_t pv; 2530 vm_offset_t va_last; 2531 vm_page_t m; 2532 2533 rw_assert(&pvh_global_lock, RA_WLOCKED); 2534 KASSERT((pa & PDRMASK) == 0, 2535 ("pmap_pv_demote_pde: pa is not 4mpage aligned")); 2536 2537 /* 2538 * Transfer the 4mpage's pv entry for this mapping to the first 2539 * page's pv list. 
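	 * Reusing the existing entry saves one get_pv_entry() call; the
	 * remaining NPTEPG - 1 entries are then created below with
	 * pmap_insert_entry().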
2540 */ 2541 pvh = pa_to_pvh(pa); 2542 va = trunc_4mpage(va); 2543 pv = pmap_pvh_remove(pvh, pmap, va); 2544 KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); 2545 m = PHYS_TO_VM_PAGE(pa); 2546 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2547 /* Instantiate the remaining NPTEPG - 1 pv entries. */ 2548 va_last = va + NBPDR - PAGE_SIZE; 2549 do { 2550 m++; 2551 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 2552 ("pmap_pv_demote_pde: page %p is not managed", m)); 2553 va += PAGE_SIZE; 2554 pmap_insert_entry(pmap, va, m); 2555 } while (va < va_last); 2556} 2557 2558static void 2559pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2560{ 2561 struct md_page *pvh; 2562 pv_entry_t pv; 2563 vm_offset_t va_last; 2564 vm_page_t m; 2565 2566 rw_assert(&pvh_global_lock, RA_WLOCKED); 2567 KASSERT((pa & PDRMASK) == 0, 2568 ("pmap_pv_promote_pde: pa is not 4mpage aligned")); 2569 2570 /* 2571 * Transfer the first page's pv entry for this mapping to the 2572 * 4mpage's pv list. Aside from avoiding the cost of a call 2573 * to get_pv_entry(), a transfer avoids the possibility that 2574 * get_pv_entry() calls pmap_collect() and that pmap_collect() 2575 * removes one of the mappings that is being promoted. 2576 */ 2577 m = PHYS_TO_VM_PAGE(pa); 2578 va = trunc_4mpage(va); 2579 pv = pmap_pvh_remove(&m->md, pmap, va); 2580 KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); 2581 pvh = pa_to_pvh(pa); 2582 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); 2583 /* Free the remaining NPTEPG - 1 pv entries. */ 2584 va_last = va + NBPDR - PAGE_SIZE; 2585 do { 2586 m++; 2587 va += PAGE_SIZE; 2588 pmap_pvh_free(&m->md, pmap, va); 2589 } while (va < va_last); 2590} 2591 2592static void 2593pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) 2594{ 2595 pv_entry_t pv; 2596 2597 pv = pmap_pvh_remove(pvh, pmap, va); 2598 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); 2599 free_pv_entry(pmap, pv); 2600} 2601 2602static void 2603pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 2604{ 2605 struct md_page *pvh; 2606 2607 rw_assert(&pvh_global_lock, RA_WLOCKED); 2608 pmap_pvh_free(&m->md, pmap, va); 2609 if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { 2610 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 2611 if (TAILQ_EMPTY(&pvh->pv_list)) 2612 vm_page_aflag_clear(m, PGA_WRITEABLE); 2613 } 2614} 2615 2616/* 2617 * Create a pv entry for page at pa for 2618 * (pmap, va). 2619 */ 2620static void 2621pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2622{ 2623 pv_entry_t pv; 2624 2625 rw_assert(&pvh_global_lock, RA_WLOCKED); 2626 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2627 pv = get_pv_entry(pmap, FALSE); 2628 pv->pv_va = va; 2629 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2630} 2631 2632/* 2633 * Conditionally create a pv entry. 2634 */ 2635static boolean_t 2636pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 2637{ 2638 pv_entry_t pv; 2639 2640 rw_assert(&pvh_global_lock, RA_WLOCKED); 2641 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2642 if (pv_entry_count < pv_entry_high_water && 2643 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2644 pv->pv_va = va; 2645 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2646 return (TRUE); 2647 } else 2648 return (FALSE); 2649} 2650 2651/* 2652 * Create the pv entries for each of the pages within a superpage. 
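 * Only one pv entry is used for the whole 2- or 4MB mapping; it is
 * allocated with the "try" variant of get_pv_entry(), so FALSE is
 * returned rather than sleeping or reclaiming when none is available.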
2653 */ 2654static boolean_t 2655pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) 2656{ 2657 struct md_page *pvh; 2658 pv_entry_t pv; 2659 2660 rw_assert(&pvh_global_lock, RA_WLOCKED); 2661 if (pv_entry_count < pv_entry_high_water && 2662 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 2663 pv->pv_va = va; 2664 pvh = pa_to_pvh(pa); 2665 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_list); 2666 return (TRUE); 2667 } else 2668 return (FALSE); 2669} 2670 2671/* 2672 * Fills a page table page with mappings to consecutive physical pages. 2673 */ 2674static void 2675pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) 2676{ 2677 pt_entry_t *pte; 2678 2679 for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { 2680 *pte = newpte; 2681 newpte += PAGE_SIZE; 2682 } 2683} 2684 2685/* 2686 * Tries to demote a 2- or 4MB page mapping. If demotion fails, the 2687 * 2- or 4MB page mapping is invalidated. 2688 */ 2689static boolean_t 2690pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 2691{ 2692 pd_entry_t newpde, oldpde; 2693 pt_entry_t *firstpte, newpte; 2694 vm_paddr_t mptepa; 2695 vm_page_t free, mpte; 2696 2697 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2698 oldpde = *pde; 2699 KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), 2700 ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); 2701 mpte = pmap_lookup_pt_page(pmap, va); 2702 if (mpte != NULL) 2703 pmap_remove_pt_page(pmap, mpte); 2704 else { 2705 KASSERT((oldpde & PG_W) == 0, 2706 ("pmap_demote_pde: page table page for a wired mapping" 2707 " is missing")); 2708 2709 /* 2710 * Invalidate the 2- or 4MB page mapping and return 2711 * "failure" if the mapping was never accessed or the 2712 * allocation of the new page table page fails. 2713 */ 2714 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, 2715 va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | 2716 VM_ALLOC_WIRED)) == NULL) { 2717 free = NULL; 2718 pmap_remove_pde(pmap, pde, trunc_4mpage(va), &free); 2719 pmap_invalidate_page(pmap, trunc_4mpage(va)); 2720 pmap_free_zero_pages(free); 2721 CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" 2722 " in pmap %p", va, pmap); 2723 return (FALSE); 2724 } 2725 if (va < VM_MAXUSER_ADDRESS) 2726 pmap->pm_stats.resident_count++; 2727 } 2728 mptepa = VM_PAGE_TO_PHYS(mpte); 2729 2730 /* 2731 * If the page mapping is in the kernel's address space, then the 2732 * KPTmap can provide access to the page table page. Otherwise, 2733 * temporarily map the page table page (mpte) into the kernel's 2734 * address space at either PADDR1 or PADDR2. 
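	 * PADDR1 may be used only while the thread is pinned to its CPU
	 * and the pvh global lock is write-held; otherwise PADDR2, which
	 * is serialized by PMAP2mutex, is used instead.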
2735 */ 2736 if (va >= KERNBASE) 2737 firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; 2738 else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { 2739 if ((*PMAP1 & PG_FRAME) != mptepa) { 2740 *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; 2741#ifdef SMP 2742 PMAP1cpu = PCPU_GET(cpuid); 2743#endif 2744 invlcaddr(PADDR1); 2745 PMAP1changed++; 2746 } else 2747#ifdef SMP 2748 if (PMAP1cpu != PCPU_GET(cpuid)) { 2749 PMAP1cpu = PCPU_GET(cpuid); 2750 invlcaddr(PADDR1); 2751 PMAP1changedcpu++; 2752 } else 2753#endif 2754 PMAP1unchanged++; 2755 firstpte = PADDR1; 2756 } else { 2757 mtx_lock(&PMAP2mutex); 2758 if ((*PMAP2 & PG_FRAME) != mptepa) { 2759 *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; 2760 pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); 2761 } 2762 firstpte = PADDR2; 2763 } 2764 newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; 2765 KASSERT((oldpde & PG_A) != 0, 2766 ("pmap_demote_pde: oldpde is missing PG_A")); 2767 KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, 2768 ("pmap_demote_pde: oldpde is missing PG_M")); 2769 newpte = oldpde & ~PG_PS; 2770 if ((newpte & PG_PDE_PAT) != 0) 2771 newpte ^= PG_PDE_PAT | PG_PTE_PAT; 2772 2773 /* 2774 * If the page table page is new, initialize it. 2775 */ 2776 if (mpte->wire_count == 1) { 2777 mpte->wire_count = NPTEPG; 2778 pmap_fill_ptp(firstpte, newpte); 2779 } 2780 KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), 2781 ("pmap_demote_pde: firstpte and newpte map different physical" 2782 " addresses")); 2783 2784 /* 2785 * If the mapping has changed attributes, update the page table 2786 * entries. 2787 */ 2788 if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) 2789 pmap_fill_ptp(firstpte, newpte); 2790 2791 /* 2792 * Demote the mapping. This pmap is locked. The old PDE has 2793 * PG_A set. If the old PDE has PG_RW set, it also has PG_M 2794 * set. Thus, there is no danger of a race with another 2795 * processor changing the setting of PG_A and/or PG_M between 2796 * the read above and the store below. 2797 */ 2798 if (workaround_erratum383) 2799 pmap_update_pde(pmap, va, pde, newpde); 2800 else if (pmap == kernel_pmap) 2801 pmap_kenter_pde(va, newpde); 2802 else 2803 pde_store(pde, newpde); 2804 if (firstpte == PADDR2) 2805 mtx_unlock(&PMAP2mutex); 2806 2807 /* 2808 * Invalidate the recursive mapping of the page table page. 2809 */ 2810 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); 2811 2812 /* 2813 * Demote the pv entry. This depends on the earlier demotion 2814 * of the mapping. Specifically, the (re)creation of a per- 2815 * page pv entry might trigger the execution of pmap_collect(), 2816 * which might reclaim a newly (re)created per-page pv entry 2817 * and destroy the associated mapping. In order to destroy 2818 * the mapping, the PDE must have already changed from mapping 2819 * the 2mpage to referencing the page table page. 
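	 * (In this file, that reclamation is performed by
	 * pmap_pv_reclaim().)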
	 */
	if ((oldpde & PG_MANAGED) != 0)
		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);

	pmap_pde_demotions++;
	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
	    " in pmap %p", va, pmap);
	return (TRUE);
}

/*
 * pmap_remove_pde: remove a 2- or 4MB page ("superpage") mapping from a pmap
 */
static void
pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    vm_page_t *free)
{
	struct md_page *pvh;
	pd_entry_t oldpde;
	vm_offset_t eva, va;
	vm_page_t m, mpte;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT((sva & PDRMASK) == 0,
	    ("pmap_remove_pde: sva is not 4mpage aligned"));
	oldpde = pte_load_clear(pdq);
	if (oldpde & PG_W)
		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;

	/*
	 * Machines that don't support invlpg also don't support
	 * PG_G.
	 */
	if (oldpde & PG_G)
		pmap_invalidate_page(kernel_pmap, sva);
	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
	if (oldpde & PG_MANAGED) {
		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
		pmap_pvh_free(pvh, pmap, sva);
		eva = sva + NBPDR;
		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
		    va < eva; va += PAGE_SIZE, m++) {
			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
				vm_page_dirty(m);
			if (oldpde & PG_A)
				vm_page_aflag_set(m, PGA_REFERENCED);
			if (TAILQ_EMPTY(&m->md.pv_list) &&
			    TAILQ_EMPTY(&pvh->pv_list))
				vm_page_aflag_clear(m, PGA_WRITEABLE);
		}
	}
	if (pmap == kernel_pmap) {
		if (!pmap_demote_pde(pmap, pdq, sva))
			panic("pmap_remove_pde: failed demotion");
	} else {
		mpte = pmap_lookup_pt_page(pmap, sva);
		if (mpte != NULL) {
			pmap_remove_pt_page(pmap, mpte);
			pmap->pm_stats.resident_count--;
			KASSERT(mpte->wire_count == NPTEPG,
			    ("pmap_remove_pde: pte page wire count error"));
			mpte->wire_count = 0;
			pmap_add_delayed_free_list(mpte, free, FALSE);
			atomic_subtract_int(&cnt.v_wire_count, 1);
		}
	}
}

/*
 * pmap_remove_pte: remove a single 4KB page mapping from a pmap
 */
static int
pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free)
{
	pt_entry_t oldpte;
	vm_page_t m;

	rw_assert(&pvh_global_lock, RA_WLOCKED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	oldpte = pte_load_clear(ptq);
	KASSERT(oldpte != 0,
	    ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va));
	if (oldpte & PG_W)
		pmap->pm_stats.wired_count -= 1;
	/*
	 * Machines that don't support invlpg also don't support
	 * PG_G.
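	 * A PG_G mapping therefore has its TLB entry invalidated here
	 * against the kernel pmap; non-global entries are left to the
	 * caller to invalidate.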
2907 */ 2908 if (oldpte & PG_G) 2909 pmap_invalidate_page(kernel_pmap, va); 2910 pmap->pm_stats.resident_count -= 1; 2911 if (oldpte & PG_MANAGED) { 2912 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); 2913 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 2914 vm_page_dirty(m); 2915 if (oldpte & PG_A) 2916 vm_page_aflag_set(m, PGA_REFERENCED); 2917 pmap_remove_entry(pmap, m, va); 2918 } 2919 return (pmap_unuse_pt(pmap, va, free)); 2920} 2921 2922/* 2923 * Remove a single page from a process address space 2924 */ 2925static void 2926pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free) 2927{ 2928 pt_entry_t *pte; 2929 2930 rw_assert(&pvh_global_lock, RA_WLOCKED); 2931 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 2932 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2933 if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) 2934 return; 2935 pmap_remove_pte(pmap, pte, va, free); 2936 pmap_invalidate_page(pmap, va); 2937} 2938 2939/* 2940 * Remove the given range of addresses from the specified map. 2941 * 2942 * It is assumed that the start and end are properly 2943 * rounded to the page size. 2944 */ 2945void 2946pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2947{ 2948 vm_offset_t pdnxt; 2949 pd_entry_t ptpaddr; 2950 pt_entry_t *pte; 2951 vm_page_t free = NULL; 2952 int anyvalid; 2953 2954 /* 2955 * Perform an unsynchronized read. This is, however, safe. 2956 */ 2957 if (pmap->pm_stats.resident_count == 0) 2958 return; 2959 2960 anyvalid = 0; 2961 2962 rw_wlock(&pvh_global_lock); 2963 sched_pin(); 2964 PMAP_LOCK(pmap); 2965 2966 /* 2967 * special handling of removing one page. a very 2968 * common operation and easy to short circuit some 2969 * code. 2970 */ 2971 if ((sva + PAGE_SIZE == eva) && 2972 ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { 2973 pmap_remove_page(pmap, sva, &free); 2974 goto out; 2975 } 2976 2977 for (; sva < eva; sva = pdnxt) { 2978 u_int pdirindex; 2979 2980 /* 2981 * Calculate index for next page table. 2982 */ 2983 pdnxt = (sva + NBPDR) & ~PDRMASK; 2984 if (pdnxt < sva) 2985 pdnxt = eva; 2986 if (pmap->pm_stats.resident_count == 0) 2987 break; 2988 2989 pdirindex = sva >> PDRSHIFT; 2990 ptpaddr = pmap->pm_pdir[pdirindex]; 2991 2992 /* 2993 * Weed out invalid mappings. Note: we assume that the page 2994 * directory table is always allocated, and in kernel virtual. 2995 */ 2996 if (ptpaddr == 0) 2997 continue; 2998 2999 /* 3000 * Check for large page. 3001 */ 3002 if ((ptpaddr & PG_PS) != 0) { 3003 /* 3004 * Are we removing the entire large page? If not, 3005 * demote the mapping and fall through. 3006 */ 3007 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 3008 /* 3009 * The TLB entry for a PG_G mapping is 3010 * invalidated by pmap_remove_pde(). 3011 */ 3012 if ((ptpaddr & PG_G) == 0) 3013 anyvalid = 1; 3014 pmap_remove_pde(pmap, 3015 &pmap->pm_pdir[pdirindex], sva, &free); 3016 continue; 3017 } else if (!pmap_demote_pde(pmap, 3018 &pmap->pm_pdir[pdirindex], sva)) { 3019 /* The large page mapping was destroyed. */ 3020 continue; 3021 } 3022 } 3023 3024 /* 3025 * Limit our scan to either the end of the va represented 3026 * by the current page table page, or to the end of the 3027 * range being removed. 3028 */ 3029 if (pdnxt > eva) 3030 pdnxt = eva; 3031 3032 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 3033 sva += PAGE_SIZE) { 3034 if (*pte == 0) 3035 continue; 3036 3037 /* 3038 * The TLB entry for a PG_G mapping is invalidated 3039 * by pmap_remove_pte(). 
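			 * Only non-global mappings therefore need the
			 * batched pmap_invalidate_all() issued at the end
			 * of this function.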
3040 */ 3041 if ((*pte & PG_G) == 0) 3042 anyvalid = 1; 3043 if (pmap_remove_pte(pmap, pte, sva, &free)) 3044 break; 3045 } 3046 } 3047out: 3048 sched_unpin(); 3049 if (anyvalid) 3050 pmap_invalidate_all(pmap); 3051 rw_wunlock(&pvh_global_lock); 3052 PMAP_UNLOCK(pmap); 3053 pmap_free_zero_pages(free); 3054} 3055 3056/* 3057 * Routine: pmap_remove_all 3058 * Function: 3059 * Removes this physical page from 3060 * all physical maps in which it resides. 3061 * Reflects back modify bits to the pager. 3062 * 3063 * Notes: 3064 * Original versions of this routine were very 3065 * inefficient because they iteratively called 3066 * pmap_remove (slow...) 3067 */ 3068 3069void 3070pmap_remove_all(vm_page_t m) 3071{ 3072 struct md_page *pvh; 3073 pv_entry_t pv; 3074 pmap_t pmap; 3075 pt_entry_t *pte, tpte; 3076 pd_entry_t *pde; 3077 vm_offset_t va; 3078 vm_page_t free; 3079 3080 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 3081 ("pmap_remove_all: page %p is not managed", m)); 3082 free = NULL; 3083 rw_wlock(&pvh_global_lock); 3084 sched_pin(); 3085 if ((m->flags & PG_FICTITIOUS) != 0) 3086 goto small_mappings; 3087 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 3088 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { 3089 va = pv->pv_va; 3090 pmap = PV_PMAP(pv); 3091 PMAP_LOCK(pmap); 3092 pde = pmap_pde(pmap, va); 3093 (void)pmap_demote_pde(pmap, pde, va); 3094 PMAP_UNLOCK(pmap); 3095 } 3096small_mappings: 3097 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3098 pmap = PV_PMAP(pv); 3099 PMAP_LOCK(pmap); 3100 pmap->pm_stats.resident_count--; 3101 pde = pmap_pde(pmap, pv->pv_va); 3102 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" 3103 " a 4mpage in page %p's pv list", m)); 3104 pte = pmap_pte_quick(pmap, pv->pv_va); 3105 tpte = pte_load_clear(pte); 3106 KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", 3107 pmap, pv->pv_va)); 3108 if (tpte & PG_W) 3109 pmap->pm_stats.wired_count--; 3110 if (tpte & PG_A) 3111 vm_page_aflag_set(m, PGA_REFERENCED); 3112 3113 /* 3114 * Update the vm_page_t clean and reference bits. 
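		 * A mapping that was both writable and modified dirties
		 * the page before the mapping is destroyed, so pending
		 * modifications are not lost.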
3115 */ 3116 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3117 vm_page_dirty(m); 3118 pmap_unuse_pt(pmap, pv->pv_va, &free); 3119 pmap_invalidate_page(pmap, pv->pv_va); 3120 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 3121 free_pv_entry(pmap, pv); 3122 PMAP_UNLOCK(pmap); 3123 } 3124 vm_page_aflag_clear(m, PGA_WRITEABLE); 3125 sched_unpin(); 3126 rw_wunlock(&pvh_global_lock); 3127 pmap_free_zero_pages(free); 3128} 3129 3130/* 3131 * pmap_protect_pde: do the things to protect a 4mpage in a process 3132 */ 3133static boolean_t 3134pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) 3135{ 3136 pd_entry_t newpde, oldpde; 3137 vm_offset_t eva, va; 3138 vm_page_t m; 3139 boolean_t anychanged; 3140 3141 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3142 KASSERT((sva & PDRMASK) == 0, 3143 ("pmap_protect_pde: sva is not 4mpage aligned")); 3144 anychanged = FALSE; 3145retry: 3146 oldpde = newpde = *pde; 3147 if (oldpde & PG_MANAGED) { 3148 eva = sva + NBPDR; 3149 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); 3150 va < eva; va += PAGE_SIZE, m++) 3151 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) 3152 vm_page_dirty(m); 3153 } 3154 if ((prot & VM_PROT_WRITE) == 0) 3155 newpde &= ~(PG_RW | PG_M); 3156#ifdef PAE 3157 if ((prot & VM_PROT_EXECUTE) == 0) 3158 newpde |= pg_nx; 3159#endif 3160 if (newpde != oldpde) { 3161 if (!pde_cmpset(pde, oldpde, newpde)) 3162 goto retry; 3163 if (oldpde & PG_G) 3164 pmap_invalidate_page(pmap, sva); 3165 else 3166 anychanged = TRUE; 3167 } 3168 return (anychanged); 3169} 3170 3171/* 3172 * Set the physical protection on the 3173 * specified range of this map as requested. 3174 */ 3175void 3176pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 3177{ 3178 vm_offset_t pdnxt; 3179 pd_entry_t ptpaddr; 3180 pt_entry_t *pte; 3181 boolean_t anychanged, pv_lists_locked; 3182 3183 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 3184 pmap_remove(pmap, sva, eva); 3185 return; 3186 } 3187 3188#ifdef PAE 3189 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 3190 (VM_PROT_WRITE|VM_PROT_EXECUTE)) 3191 return; 3192#else 3193 if (prot & VM_PROT_WRITE) 3194 return; 3195#endif 3196 3197 if (pmap_is_current(pmap)) 3198 pv_lists_locked = FALSE; 3199 else { 3200 pv_lists_locked = TRUE; 3201resume: 3202 rw_wlock(&pvh_global_lock); 3203 sched_pin(); 3204 } 3205 anychanged = FALSE; 3206 3207 PMAP_LOCK(pmap); 3208 for (; sva < eva; sva = pdnxt) { 3209 pt_entry_t obits, pbits; 3210 u_int pdirindex; 3211 3212 pdnxt = (sva + NBPDR) & ~PDRMASK; 3213 if (pdnxt < sva) 3214 pdnxt = eva; 3215 3216 pdirindex = sva >> PDRSHIFT; 3217 ptpaddr = pmap->pm_pdir[pdirindex]; 3218 3219 /* 3220 * Weed out invalid mappings. Note: we assume that the page 3221 * directory table is always allocated, and in kernel virtual. 3222 */ 3223 if (ptpaddr == 0) 3224 continue; 3225 3226 /* 3227 * Check for large page. 3228 */ 3229 if ((ptpaddr & PG_PS) != 0) { 3230 /* 3231 * Are we protecting the entire large page? If not, 3232 * demote the mapping and fall through. 3233 */ 3234 if (sva + NBPDR == pdnxt && eva >= pdnxt) { 3235 /* 3236 * The TLB entry for a PG_G mapping is 3237 * invalidated by pmap_protect_pde(). 
3238 */ 3239 if (pmap_protect_pde(pmap, 3240 &pmap->pm_pdir[pdirindex], sva, prot)) 3241 anychanged = TRUE; 3242 continue; 3243 } else { 3244 if (!pv_lists_locked) { 3245 pv_lists_locked = TRUE; 3246 if (!rw_try_wlock(&pvh_global_lock)) { 3247 if (anychanged) 3248 pmap_invalidate_all( 3249 pmap); 3250 PMAP_UNLOCK(pmap); 3251 goto resume; 3252 } 3253 sched_pin(); 3254 } 3255 if (!pmap_demote_pde(pmap, 3256 &pmap->pm_pdir[pdirindex], sva)) { 3257 /* 3258 * The large page mapping was 3259 * destroyed. 3260 */ 3261 continue; 3262 } 3263 } 3264 } 3265 3266 if (pdnxt > eva) 3267 pdnxt = eva; 3268 3269 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 3270 sva += PAGE_SIZE) { 3271 vm_page_t m; 3272 3273retry: 3274 /* 3275 * Regardless of whether a pte is 32 or 64 bits in 3276 * size, PG_RW, PG_A, and PG_M are among the least 3277 * significant 32 bits. 3278 */ 3279 obits = pbits = *pte; 3280 if ((pbits & PG_V) == 0) 3281 continue; 3282 3283 if ((prot & VM_PROT_WRITE) == 0) { 3284 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == 3285 (PG_MANAGED | PG_M | PG_RW)) { 3286 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); 3287 vm_page_dirty(m); 3288 } 3289 pbits &= ~(PG_RW | PG_M); 3290 } 3291#ifdef PAE 3292 if ((prot & VM_PROT_EXECUTE) == 0) 3293 pbits |= pg_nx; 3294#endif 3295 3296 if (pbits != obits) { 3297#ifdef PAE 3298 if (!atomic_cmpset_64(pte, obits, pbits)) 3299 goto retry; 3300#else 3301 if (!atomic_cmpset_int((u_int *)pte, obits, 3302 pbits)) 3303 goto retry; 3304#endif 3305 if (obits & PG_G) 3306 pmap_invalidate_page(pmap, sva); 3307 else 3308 anychanged = TRUE; 3309 } 3310 } 3311 } 3312 if (anychanged) 3313 pmap_invalidate_all(pmap); 3314 if (pv_lists_locked) { 3315 sched_unpin(); 3316 rw_wunlock(&pvh_global_lock); 3317 } 3318 PMAP_UNLOCK(pmap); 3319} 3320 3321/* 3322 * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are 3323 * within a single page table page (PTP) to a single 2- or 4MB page mapping. 3324 * For promotion to occur, two conditions must be met: (1) the 4KB page 3325 * mappings must map aligned, contiguous physical memory and (2) the 4KB page 3326 * mappings must have identical characteristics. 3327 * 3328 * Managed (PG_MANAGED) mappings within the kernel address space are not 3329 * promoted. The reason is that kernel PDEs are replicated in each pmap but 3330 * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel 3331 * pmap. 3332 */ 3333static void 3334pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) 3335{ 3336 pd_entry_t newpde; 3337 pt_entry_t *firstpte, oldpte, pa, *pte; 3338 vm_offset_t oldpteva; 3339 vm_page_t mpte; 3340 3341 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3342 3343 /* 3344 * Examine the first PTE in the specified PTP. Abort if this PTE is 3345 * either invalid, unused, or does not map the first 4KB physical page 3346 * within a 2- or 4MB page. 3347 */ 3348 firstpte = pmap_pte_quick(pmap, trunc_4mpage(va)); 3349setpde: 3350 newpde = *firstpte; 3351 if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { 3352 pmap_pde_p_failures++; 3353 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3354 " in pmap %p", va, pmap); 3355 return; 3356 } 3357 if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { 3358 pmap_pde_p_failures++; 3359 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3360 " in pmap %p", va, pmap); 3361 return; 3362 } 3363 if ((newpde & (PG_M | PG_RW)) == PG_RW) { 3364 /* 3365 * When PG_M is already clear, PG_RW can be cleared without 3366 * a TLB invalidation. 
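		 * The compare-and-set guards against a racing write access
		 * setting PG_M after this PTE was read; if it fails, the
		 * PTE is simply re-read and re-examined.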
3367 */ 3368 if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & 3369 ~PG_RW)) 3370 goto setpde; 3371 newpde &= ~PG_RW; 3372 } 3373 3374 /* 3375 * Examine each of the other PTEs in the specified PTP. Abort if this 3376 * PTE maps an unexpected 4KB physical page or does not have identical 3377 * characteristics to the first PTE. 3378 */ 3379 pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; 3380 for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { 3381setpte: 3382 oldpte = *pte; 3383 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { 3384 pmap_pde_p_failures++; 3385 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3386 " in pmap %p", va, pmap); 3387 return; 3388 } 3389 if ((oldpte & (PG_M | PG_RW)) == PG_RW) { 3390 /* 3391 * When PG_M is already clear, PG_RW can be cleared 3392 * without a TLB invalidation. 3393 */ 3394 if (!atomic_cmpset_int((u_int *)pte, oldpte, 3395 oldpte & ~PG_RW)) 3396 goto setpte; 3397 oldpte &= ~PG_RW; 3398 oldpteva = (oldpte & PG_FRAME & PDRMASK) | 3399 (va & ~PDRMASK); 3400 CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" 3401 " in pmap %p", oldpteva, pmap); 3402 } 3403 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { 3404 pmap_pde_p_failures++; 3405 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" 3406 " in pmap %p", va, pmap); 3407 return; 3408 } 3409 pa -= PAGE_SIZE; 3410 } 3411 3412 /* 3413 * Save the page table page in its current state until the PDE 3414 * mapping the superpage is demoted by pmap_demote_pde() or 3415 * destroyed by pmap_remove_pde(). 3416 */ 3417 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); 3418 KASSERT(mpte >= vm_page_array && 3419 mpte < &vm_page_array[vm_page_array_size], 3420 ("pmap_promote_pde: page table page is out of range")); 3421 KASSERT(mpte->pindex == va >> PDRSHIFT, 3422 ("pmap_promote_pde: page table page's pindex is wrong")); 3423 pmap_insert_pt_page(pmap, mpte); 3424 3425 /* 3426 * Promote the pv entries. 3427 */ 3428 if ((newpde & PG_MANAGED) != 0) 3429 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); 3430 3431 /* 3432 * Propagate the PAT index to its proper position. 3433 */ 3434 if ((newpde & PG_PTE_PAT) != 0) 3435 newpde ^= PG_PDE_PAT | PG_PTE_PAT; 3436 3437 /* 3438 * Map the superpage. 3439 */ 3440 if (workaround_erratum383) 3441 pmap_update_pde(pmap, va, pde, PG_PS | newpde); 3442 else if (pmap == kernel_pmap) 3443 pmap_kenter_pde(va, PG_PS | newpde); 3444 else 3445 pde_store(pde, PG_PS | newpde); 3446 3447 pmap_pde_promotions++; 3448 CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" 3449 " in pmap %p", va, pmap); 3450} 3451 3452/* 3453 * Insert the given physical page (p) at 3454 * the specified virtual address (v) in the 3455 * target physical map with the protection requested. 3456 * 3457 * If specified, the page will be wired down, meaning 3458 * that the related pte can not be reclaimed. 3459 * 3460 * NB: This is the only routine which MAY NOT lazy-evaluate 3461 * or lose information. That is, this routine must actually 3462 * insert this page into the given map NOW. 
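 * Unless the page is unmanaged, the caller must have busied it or
 * hold its object's lock.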
3463 */ 3464void 3465pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m, 3466 vm_prot_t prot, boolean_t wired) 3467{ 3468 pd_entry_t *pde; 3469 pt_entry_t *pte; 3470 pt_entry_t newpte, origpte; 3471 pv_entry_t pv; 3472 vm_paddr_t opa, pa; 3473 vm_page_t mpte, om; 3474 boolean_t invlva; 3475 3476 va = trunc_page(va); 3477 KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); 3478 KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, 3479 ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", 3480 va)); 3481 KASSERT((m->oflags & (VPO_UNMANAGED | VPO_BUSY)) != 0 || 3482 VM_OBJECT_LOCKED(m->object), 3483 ("pmap_enter: page %p is not busy", m)); 3484 3485 mpte = NULL; 3486 3487 rw_wlock(&pvh_global_lock); 3488 PMAP_LOCK(pmap); 3489 sched_pin(); 3490 3491 /* 3492 * In the case that a page table page is not 3493 * resident, we are creating it here. 3494 */ 3495 if (va < VM_MAXUSER_ADDRESS) { 3496 mpte = pmap_allocpte(pmap, va, M_WAITOK); 3497 } 3498 3499 pde = pmap_pde(pmap, va); 3500 if ((*pde & PG_PS) != 0) 3501 panic("pmap_enter: attempted pmap_enter on 4MB page"); 3502 pte = pmap_pte_quick(pmap, va); 3503 3504 /* 3505 * Page Directory table entry not valid, we need a new PT page 3506 */ 3507 if (pte == NULL) { 3508 panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", 3509 (uintmax_t)pmap->pm_pdir[PTDPTDI], va); 3510 } 3511 3512 pa = VM_PAGE_TO_PHYS(m); 3513 om = NULL; 3514 origpte = *pte; 3515 opa = origpte & PG_FRAME; 3516 3517 /* 3518 * Mapping has not changed, must be protection or wiring change. 3519 */ 3520 if (origpte && (opa == pa)) { 3521 /* 3522 * Wiring change, just update stats. We don't worry about 3523 * wiring PT pages as they remain resident as long as there 3524 * are valid mappings in them. Hence, if a user page is wired, 3525 * the PT page will be also. 3526 */ 3527 if (wired && ((origpte & PG_W) == 0)) 3528 pmap->pm_stats.wired_count++; 3529 else if (!wired && (origpte & PG_W)) 3530 pmap->pm_stats.wired_count--; 3531 3532 /* 3533 * Remove extra pte reference 3534 */ 3535 if (mpte) 3536 mpte->wire_count--; 3537 3538 if (origpte & PG_MANAGED) { 3539 om = m; 3540 pa |= PG_MANAGED; 3541 } 3542 goto validate; 3543 } 3544 3545 pv = NULL; 3546 3547 /* 3548 * Mapping has changed, invalidate old range and fall through to 3549 * handle validating new mapping. 3550 */ 3551 if (opa) { 3552 if (origpte & PG_W) 3553 pmap->pm_stats.wired_count--; 3554 if (origpte & PG_MANAGED) { 3555 om = PHYS_TO_VM_PAGE(opa); 3556 pv = pmap_pvh_remove(&om->md, pmap, va); 3557 } 3558 if (mpte != NULL) { 3559 mpte->wire_count--; 3560 KASSERT(mpte->wire_count > 0, 3561 ("pmap_enter: missing reference to page table page," 3562 " va: 0x%x", va)); 3563 } 3564 } else 3565 pmap->pm_stats.resident_count++; 3566 3567 /* 3568 * Enter on the PV list if part of our managed memory. 3569 */ 3570 if ((m->oflags & VPO_UNMANAGED) == 0) { 3571 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, 3572 ("pmap_enter: managed mapping within the clean submap")); 3573 if (pv == NULL) 3574 pv = get_pv_entry(pmap, FALSE); 3575 pv->pv_va = va; 3576 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 3577 pa |= PG_MANAGED; 3578 } else if (pv != NULL) 3579 free_pv_entry(pmap, pv); 3580 3581 /* 3582 * Increment counters 3583 */ 3584 if (wired) 3585 pmap->pm_stats.wired_count++; 3586 3587validate: 3588 /* 3589 * Now validate mapping with desired protection/wiring. 
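	 * The new PTE combines the physical address, the page's PAT cache
	 * bits, and the requested protection.  Kernel mappings also get
	 * pgeflag (PG_G on CPUs with global-page support) so that they
	 * survive a %cr3 reload.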
3590 */ 3591 newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V); 3592 if ((prot & VM_PROT_WRITE) != 0) { 3593 newpte |= PG_RW; 3594 if ((newpte & PG_MANAGED) != 0) 3595 vm_page_aflag_set(m, PGA_WRITEABLE); 3596 } 3597#ifdef PAE 3598 if ((prot & VM_PROT_EXECUTE) == 0) 3599 newpte |= pg_nx; 3600#endif 3601 if (wired) 3602 newpte |= PG_W; 3603 if (va < VM_MAXUSER_ADDRESS) 3604 newpte |= PG_U; 3605 if (pmap == kernel_pmap) 3606 newpte |= pgeflag; 3607 3608 /* 3609 * if the mapping or permission bits are different, we need 3610 * to update the pte. 3611 */ 3612 if ((origpte & ~(PG_M|PG_A)) != newpte) { 3613 newpte |= PG_A; 3614 if ((access & VM_PROT_WRITE) != 0) 3615 newpte |= PG_M; 3616 if (origpte & PG_V) { 3617 invlva = FALSE; 3618 origpte = pte_load_store(pte, newpte); 3619 if (origpte & PG_A) { 3620 if (origpte & PG_MANAGED) 3621 vm_page_aflag_set(om, PGA_REFERENCED); 3622 if (opa != VM_PAGE_TO_PHYS(m)) 3623 invlva = TRUE; 3624#ifdef PAE 3625 if ((origpte & PG_NX) == 0 && 3626 (newpte & PG_NX) != 0) 3627 invlva = TRUE; 3628#endif 3629 } 3630 if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 3631 if ((origpte & PG_MANAGED) != 0) 3632 vm_page_dirty(om); 3633 if ((prot & VM_PROT_WRITE) == 0) 3634 invlva = TRUE; 3635 } 3636 if ((origpte & PG_MANAGED) != 0 && 3637 TAILQ_EMPTY(&om->md.pv_list) && 3638 ((om->flags & PG_FICTITIOUS) != 0 || 3639 TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) 3640 vm_page_aflag_clear(om, PGA_WRITEABLE); 3641 if (invlva) 3642 pmap_invalidate_page(pmap, va); 3643 } else 3644 pte_store(pte, newpte); 3645 } 3646 3647 /* 3648 * If both the page table page and the reservation are fully 3649 * populated, then attempt promotion. 3650 */ 3651 if ((mpte == NULL || mpte->wire_count == NPTEPG) && 3652 pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && 3653 vm_reserv_level_iffullpop(m) == 0) 3654 pmap_promote_pde(pmap, pde, va); 3655 3656 sched_unpin(); 3657 rw_wunlock(&pvh_global_lock); 3658 PMAP_UNLOCK(pmap); 3659} 3660 3661/* 3662 * Tries to create a 2- or 4MB page mapping. Returns TRUE if successful and 3663 * FALSE otherwise. Fails if (1) a page table page cannot be allocated without 3664 * blocking, (2) a mapping already exists at the specified virtual address, or 3665 * (3) a pv entry cannot be allocated without reclaiming another pv entry. 3666 */ 3667static boolean_t 3668pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3669{ 3670 pd_entry_t *pde, newpde; 3671 3672 rw_assert(&pvh_global_lock, RA_WLOCKED); 3673 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3674 pde = pmap_pde(pmap, va); 3675 if (*pde != 0) { 3676 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3677 " in pmap %p", va, pmap); 3678 return (FALSE); 3679 } 3680 newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) | 3681 PG_PS | PG_V; 3682 if ((m->oflags & VPO_UNMANAGED) == 0) { 3683 newpde |= PG_MANAGED; 3684 3685 /* 3686 * Abort this mapping if its PV entry could not be created. 3687 */ 3688 if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) { 3689 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" 3690 " in pmap %p", va, pmap); 3691 return (FALSE); 3692 } 3693 } 3694#ifdef PAE 3695 if ((prot & VM_PROT_EXECUTE) == 0) 3696 newpde |= pg_nx; 3697#endif 3698 if (va < VM_MAXUSER_ADDRESS) 3699 newpde |= PG_U; 3700 3701 /* 3702 * Increment counters. 3703 */ 3704 pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; 3705 3706 /* 3707 * Map the superpage. 
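	 * A single PDE maps the entire 2- or 4MB range; no page table
	 * page is allocated or referenced for this mapping.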
3708 */ 3709 pde_store(pde, newpde); 3710 3711 pmap_pde_mappings++; 3712 CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" 3713 " in pmap %p", va, pmap); 3714 return (TRUE); 3715} 3716 3717/* 3718 * Maps a sequence of resident pages belonging to the same object. 3719 * The sequence begins with the given page m_start. This page is 3720 * mapped at the given virtual address start. Each subsequent page is 3721 * mapped at a virtual address that is offset from start by the same 3722 * amount as the page is offset from m_start within the object. The 3723 * last page in the sequence is the page with the largest offset from 3724 * m_start that can be mapped at a virtual address less than the given 3725 * virtual address end. Not every virtual page between start and end 3726 * is mapped; only those for which a resident page exists with the 3727 * corresponding offset from m_start are mapped. 3728 */ 3729void 3730pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 3731 vm_page_t m_start, vm_prot_t prot) 3732{ 3733 vm_offset_t va; 3734 vm_page_t m, mpte; 3735 vm_pindex_t diff, psize; 3736 3737 VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED); 3738 psize = atop(end - start); 3739 mpte = NULL; 3740 m = m_start; 3741 rw_wlock(&pvh_global_lock); 3742 PMAP_LOCK(pmap); 3743 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 3744 va = start + ptoa(diff); 3745 if ((va & PDRMASK) == 0 && va + NBPDR <= end && 3746 (VM_PAGE_TO_PHYS(m) & PDRMASK) == 0 && 3747 pg_ps_enabled && vm_reserv_level_iffullpop(m) == 0 && 3748 pmap_enter_pde(pmap, va, m, prot)) 3749 m = &m[NBPDR / PAGE_SIZE - 1]; 3750 else 3751 mpte = pmap_enter_quick_locked(pmap, va, m, prot, 3752 mpte); 3753 m = TAILQ_NEXT(m, listq); 3754 } 3755 rw_wunlock(&pvh_global_lock); 3756 PMAP_UNLOCK(pmap); 3757} 3758 3759/* 3760 * this code makes some *MAJOR* assumptions: 3761 * 1. Current pmap & pmap exists. 3762 * 2. Not wired. 3763 * 3. Read access. 3764 * 4. No page table pages. 3765 * but is *MUCH* faster than pmap_enter... 3766 */ 3767 3768void 3769pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 3770{ 3771 3772 rw_wlock(&pvh_global_lock); 3773 PMAP_LOCK(pmap); 3774 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); 3775 rw_wunlock(&pvh_global_lock); 3776 PMAP_UNLOCK(pmap); 3777} 3778 3779static vm_page_t 3780pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 3781 vm_prot_t prot, vm_page_t mpte) 3782{ 3783 pt_entry_t *pte; 3784 vm_paddr_t pa; 3785 vm_page_t free; 3786 3787 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 3788 (m->oflags & VPO_UNMANAGED) != 0, 3789 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 3790 rw_assert(&pvh_global_lock, RA_WLOCKED); 3791 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 3792 3793 /* 3794 * In the case that a page table page is not 3795 * resident, we are creating it here. 3796 */ 3797 if (va < VM_MAXUSER_ADDRESS) { 3798 u_int ptepindex; 3799 pd_entry_t ptepa; 3800 3801 /* 3802 * Calculate pagetable page index 3803 */ 3804 ptepindex = va >> PDRSHIFT; 3805 if (mpte && (mpte->pindex == ptepindex)) { 3806 mpte->wire_count++; 3807 } else { 3808 /* 3809 * Get the page directory entry 3810 */ 3811 ptepa = pmap->pm_pdir[ptepindex]; 3812 3813 /* 3814 * If the page table page is mapped, we just increment 3815 * the hold count, and activate it. 
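			 * If the PDE instead maps a 2- or 4MB page, no 4KB
			 * mapping is entered and NULL is returned.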
3816 */ 3817 if (ptepa) { 3818 if (ptepa & PG_PS) 3819 return (NULL); 3820 mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); 3821 mpte->wire_count++; 3822 } else { 3823 mpte = _pmap_allocpte(pmap, ptepindex, 3824 M_NOWAIT); 3825 if (mpte == NULL) 3826 return (mpte); 3827 } 3828 } 3829 } else { 3830 mpte = NULL; 3831 } 3832 3833 /* 3834 * This call to vtopte makes the assumption that we are 3835 * entering the page into the current pmap. In order to support 3836 * quick entry into any pmap, one would likely use pmap_pte_quick. 3837 * But that isn't as quick as vtopte. 3838 */ 3839 pte = vtopte(va); 3840 if (*pte) { 3841 if (mpte != NULL) { 3842 mpte->wire_count--; 3843 mpte = NULL; 3844 } 3845 return (mpte); 3846 } 3847 3848 /* 3849 * Enter on the PV list if part of our managed memory. 3850 */ 3851 if ((m->oflags & VPO_UNMANAGED) == 0 && 3852 !pmap_try_insert_pv_entry(pmap, va, m)) { 3853 if (mpte != NULL) { 3854 free = NULL; 3855 if (pmap_unwire_ptp(pmap, mpte, &free)) { 3856 pmap_invalidate_page(pmap, va); 3857 pmap_free_zero_pages(free); 3858 } 3859 3860 mpte = NULL; 3861 } 3862 return (mpte); 3863 } 3864 3865 /* 3866 * Increment counters 3867 */ 3868 pmap->pm_stats.resident_count++; 3869 3870 pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); 3871#ifdef PAE 3872 if ((prot & VM_PROT_EXECUTE) == 0) 3873 pa |= pg_nx; 3874#endif 3875 3876 /* 3877 * Now validate mapping with RO protection 3878 */ 3879 if ((m->oflags & VPO_UNMANAGED) != 0) 3880 pte_store(pte, pa | PG_V | PG_U); 3881 else 3882 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); 3883 return (mpte); 3884} 3885 3886/* 3887 * Make a temporary mapping for a physical address. This is only intended 3888 * to be used for panic dumps. 3889 */ 3890void * 3891pmap_kenter_temporary(vm_paddr_t pa, int i) 3892{ 3893 vm_offset_t va; 3894 3895 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 3896 pmap_kenter(va, pa); 3897 invlpg(va); 3898 return ((void *)crashdumpmap); 3899} 3900 3901/* 3902 * This code maps large physical mmap regions into the 3903 * processor address space. Note that some shortcuts 3904 * are taken, but the code works. 3905 */ 3906void 3907pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 3908 vm_pindex_t pindex, vm_size_t size) 3909{ 3910 pd_entry_t *pde; 3911 vm_paddr_t pa, ptepa; 3912 vm_page_t p; 3913 int pat_mode; 3914 3915 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 3916 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, 3917 ("pmap_object_init_pt: non-device object")); 3918 if (pseflag && 3919 (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { 3920 if (!vm_object_populate(object, pindex, pindex + atop(size))) 3921 return; 3922 p = vm_page_lookup(object, pindex); 3923 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3924 ("pmap_object_init_pt: invalid page %p", p)); 3925 pat_mode = p->md.pat_mode; 3926 3927 /* 3928 * Abort the mapping if the first page is not physically 3929 * aligned to a 2/4MB page boundary. 3930 */ 3931 ptepa = VM_PAGE_TO_PHYS(p); 3932 if (ptepa & (NBPDR - 1)) 3933 return; 3934 3935 /* 3936 * Skip the first page. Abort the mapping if the rest of 3937 * the pages are not physically contiguous or have differing 3938 * memory attributes. 
3939 */ 3940 p = TAILQ_NEXT(p, listq); 3941 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; 3942 pa += PAGE_SIZE) { 3943 KASSERT(p->valid == VM_PAGE_BITS_ALL, 3944 ("pmap_object_init_pt: invalid page %p", p)); 3945 if (pa != VM_PAGE_TO_PHYS(p) || 3946 pat_mode != p->md.pat_mode) 3947 return; 3948 p = TAILQ_NEXT(p, listq); 3949 } 3950 3951 /* 3952 * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and 3953 * "size" is a multiple of 2/4M, adding the PAT setting to 3954 * "pa" will not affect the termination of this loop. 3955 */ 3956 PMAP_LOCK(pmap); 3957 for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa + 3958 size; pa += NBPDR) { 3959 pde = pmap_pde(pmap, addr); 3960 if (*pde == 0) { 3961 pde_store(pde, pa | PG_PS | PG_M | PG_A | 3962 PG_U | PG_RW | PG_V); 3963 pmap->pm_stats.resident_count += NBPDR / 3964 PAGE_SIZE; 3965 pmap_pde_mappings++; 3966 } 3967 /* Else continue on if the PDE is already valid. */ 3968 addr += NBPDR; 3969 } 3970 PMAP_UNLOCK(pmap); 3971 } 3972} 3973 3974/* 3975 * Routine: pmap_change_wiring 3976 * Function: Change the wiring attribute for a map/virtual-address 3977 * pair. 3978 * In/out conditions: 3979 * The mapping must already exist in the pmap. 3980 */ 3981void 3982pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) 3983{ 3984 pd_entry_t *pde; 3985 pt_entry_t *pte; 3986 boolean_t are_queues_locked; 3987 3988 are_queues_locked = FALSE; 3989retry: 3990 PMAP_LOCK(pmap); 3991 pde = pmap_pde(pmap, va); 3992 if ((*pde & PG_PS) != 0) { 3993 if (!wired != ((*pde & PG_W) == 0)) { 3994 if (!are_queues_locked) { 3995 are_queues_locked = TRUE; 3996 if (!rw_try_wlock(&pvh_global_lock)) { 3997 PMAP_UNLOCK(pmap); 3998 rw_wlock(&pvh_global_lock); 3999 goto retry; 4000 } 4001 } 4002 if (!pmap_demote_pde(pmap, pde, va)) 4003 panic("pmap_change_wiring: demotion failed"); 4004 } else 4005 goto out; 4006 } 4007 pte = pmap_pte(pmap, va); 4008 4009 if (wired && !pmap_pte_w(pte)) 4010 pmap->pm_stats.wired_count++; 4011 else if (!wired && pmap_pte_w(pte)) 4012 pmap->pm_stats.wired_count--; 4013 4014 /* 4015 * Wiring is not a hardware characteristic so there is no need to 4016 * invalidate TLB. 4017 */ 4018 pmap_pte_set_w(pte, wired); 4019 pmap_pte_release(pte); 4020out: 4021 if (are_queues_locked) 4022 rw_wunlock(&pvh_global_lock); 4023 PMAP_UNLOCK(pmap); 4024} 4025 4026 4027 4028/* 4029 * Copy the range specified by src_addr/len 4030 * from the source map to the range dst_addr/len 4031 * in the destination map. 4032 * 4033 * This routine is only advisory and need not do anything. 
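 *
 * The implementation below copies mappings only when the source and
 * destination addresses coincide and the source pmap is the current
 * one. 2/4MB mappings are copied wholesale with the wired bit
 * cleared; for 4KB mappings the wired, modified, and accessed bits
 * are all cleared during the copy.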
4034 */ 4035 4036void 4037pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 4038 vm_offset_t src_addr) 4039{ 4040 vm_page_t free; 4041 vm_offset_t addr; 4042 vm_offset_t end_addr = src_addr + len; 4043 vm_offset_t pdnxt; 4044 4045 if (dst_addr != src_addr) 4046 return; 4047 4048 if (!pmap_is_current(src_pmap)) 4049 return; 4050 4051 rw_wlock(&pvh_global_lock); 4052 if (dst_pmap < src_pmap) { 4053 PMAP_LOCK(dst_pmap); 4054 PMAP_LOCK(src_pmap); 4055 } else { 4056 PMAP_LOCK(src_pmap); 4057 PMAP_LOCK(dst_pmap); 4058 } 4059 sched_pin(); 4060 for (addr = src_addr; addr < end_addr; addr = pdnxt) { 4061 pt_entry_t *src_pte, *dst_pte; 4062 vm_page_t dstmpte, srcmpte; 4063 pd_entry_t srcptepaddr; 4064 u_int ptepindex; 4065 4066 KASSERT(addr < UPT_MIN_ADDRESS, 4067 ("pmap_copy: invalid to pmap_copy page tables")); 4068 4069 pdnxt = (addr + NBPDR) & ~PDRMASK; 4070 if (pdnxt < addr) 4071 pdnxt = end_addr; 4072 ptepindex = addr >> PDRSHIFT; 4073 4074 srcptepaddr = src_pmap->pm_pdir[ptepindex]; 4075 if (srcptepaddr == 0) 4076 continue; 4077 4078 if (srcptepaddr & PG_PS) { 4079 if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) 4080 continue; 4081 if (dst_pmap->pm_pdir[ptepindex] == 0 && 4082 ((srcptepaddr & PG_MANAGED) == 0 || 4083 pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & 4084 PG_PS_FRAME))) { 4085 dst_pmap->pm_pdir[ptepindex] = srcptepaddr & 4086 ~PG_W; 4087 dst_pmap->pm_stats.resident_count += 4088 NBPDR / PAGE_SIZE; 4089 } 4090 continue; 4091 } 4092 4093 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); 4094 KASSERT(srcmpte->wire_count > 0, 4095 ("pmap_copy: source page table page is unused")); 4096 4097 if (pdnxt > end_addr) 4098 pdnxt = end_addr; 4099 4100 src_pte = vtopte(addr); 4101 while (addr < pdnxt) { 4102 pt_entry_t ptetemp; 4103 ptetemp = *src_pte; 4104 /* 4105 * we only virtual copy managed pages 4106 */ 4107 if ((ptetemp & PG_MANAGED) != 0) { 4108 dstmpte = pmap_allocpte(dst_pmap, addr, 4109 M_NOWAIT); 4110 if (dstmpte == NULL) 4111 goto out; 4112 dst_pte = pmap_pte_quick(dst_pmap, addr); 4113 if (*dst_pte == 0 && 4114 pmap_try_insert_pv_entry(dst_pmap, addr, 4115 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { 4116 /* 4117 * Clear the wired, modified, and 4118 * accessed (referenced) bits 4119 * during the copy. 4120 */ 4121 *dst_pte = ptetemp & ~(PG_W | PG_M | 4122 PG_A); 4123 dst_pmap->pm_stats.resident_count++; 4124 } else { 4125 free = NULL; 4126 if (pmap_unwire_ptp(dst_pmap, dstmpte, 4127 &free)) { 4128 pmap_invalidate_page(dst_pmap, 4129 addr); 4130 pmap_free_zero_pages(free); 4131 } 4132 goto out; 4133 } 4134 if (dstmpte->wire_count >= srcmpte->wire_count) 4135 break; 4136 } 4137 addr += PAGE_SIZE; 4138 src_pte++; 4139 } 4140 } 4141out: 4142 sched_unpin(); 4143 rw_wunlock(&pvh_global_lock); 4144 PMAP_UNLOCK(src_pmap); 4145 PMAP_UNLOCK(dst_pmap); 4146} 4147 4148static __inline void 4149pagezero(void *page) 4150{ 4151#if defined(I686_CPU) 4152 if (cpu_class == CPUCLASS_686) { 4153#if defined(CPU_ENABLE_SSE) 4154 if (cpu_feature & CPUID_SSE2) 4155 sse2_pagezero(page); 4156 else 4157#endif 4158 i686_pagezero(page); 4159 } else 4160#endif 4161 bzero(page, PAGE_SIZE); 4162} 4163 4164/* 4165 * pmap_zero_page zeros the specified hardware page by mapping 4166 * the page into KVM and using bzero to clear its contents. 
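 *
 * A per-CPU mapping window (sysmaps->CMAP2/CADDR2) is used, so the
 * thread is pinned to its CPU for the duration of the operation and
 * the window's mutex is held to exclude other users.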
4167 */ 4168void 4169pmap_zero_page(vm_page_t m) 4170{ 4171 struct sysmaps *sysmaps; 4172 4173 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 4174 mtx_lock(&sysmaps->lock); 4175 if (*sysmaps->CMAP2) 4176 panic("pmap_zero_page: CMAP2 busy"); 4177 sched_pin(); 4178 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4179 pmap_cache_bits(m->md.pat_mode, 0); 4180 invlcaddr(sysmaps->CADDR2); 4181 pagezero(sysmaps->CADDR2); 4182 *sysmaps->CMAP2 = 0; 4183 sched_unpin(); 4184 mtx_unlock(&sysmaps->lock); 4185} 4186 4187/* 4188 * pmap_zero_page_area zeros the specified hardware page by mapping 4189 * the page into KVM and using bzero to clear its contents. 4190 * 4191 * off and size may not cover an area beyond a single hardware page. 4192 */ 4193void 4194pmap_zero_page_area(vm_page_t m, int off, int size) 4195{ 4196 struct sysmaps *sysmaps; 4197 4198 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 4199 mtx_lock(&sysmaps->lock); 4200 if (*sysmaps->CMAP2) 4201 panic("pmap_zero_page_area: CMAP2 busy"); 4202 sched_pin(); 4203 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4204 pmap_cache_bits(m->md.pat_mode, 0); 4205 invlcaddr(sysmaps->CADDR2); 4206 if (off == 0 && size == PAGE_SIZE) 4207 pagezero(sysmaps->CADDR2); 4208 else 4209 bzero((char *)sysmaps->CADDR2 + off, size); 4210 *sysmaps->CMAP2 = 0; 4211 sched_unpin(); 4212 mtx_unlock(&sysmaps->lock); 4213} 4214 4215/* 4216 * pmap_zero_page_idle zeros the specified hardware page by mapping 4217 * the page into KVM and using bzero to clear its contents. This 4218 * is intended to be called from the vm_pagezero process only and 4219 * outside of Giant. 4220 */ 4221void 4222pmap_zero_page_idle(vm_page_t m) 4223{ 4224 4225 if (*CMAP3) 4226 panic("pmap_zero_page_idle: CMAP3 busy"); 4227 sched_pin(); 4228 *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | 4229 pmap_cache_bits(m->md.pat_mode, 0); 4230 invlcaddr(CADDR3); 4231 pagezero(CADDR3); 4232 *CMAP3 = 0; 4233 sched_unpin(); 4234} 4235 4236/* 4237 * pmap_copy_page copies the specified (machine independent) 4238 * page by mapping the page into virtual memory and using 4239 * bcopy to copy the page, one machine dependent page at a 4240 * time. 
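 *
 * The source page is mapped read-only through the per-CPU CADDR1
 * window and the destination read/write through CADDR2; both
 * temporary mappings are torn down before the per-CPU window lock
 * is released.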
4241 */ 4242void 4243pmap_copy_page(vm_page_t src, vm_page_t dst) 4244{ 4245 struct sysmaps *sysmaps; 4246 4247 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 4248 mtx_lock(&sysmaps->lock); 4249 if (*sysmaps->CMAP1) 4250 panic("pmap_copy_page: CMAP1 busy"); 4251 if (*sysmaps->CMAP2) 4252 panic("pmap_copy_page: CMAP2 busy"); 4253 sched_pin(); 4254 invlpg((u_int)sysmaps->CADDR1); 4255 invlpg((u_int)sysmaps->CADDR2); 4256 *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A | 4257 pmap_cache_bits(src->md.pat_mode, 0); 4258 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M | 4259 pmap_cache_bits(dst->md.pat_mode, 0); 4260 bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE); 4261 *sysmaps->CMAP1 = 0; 4262 *sysmaps->CMAP2 = 0; 4263 sched_unpin(); 4264 mtx_unlock(&sysmaps->lock); 4265} 4266 4267int unmapped_buf_allowed = 1; 4268 4269void 4270pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], 4271 vm_offset_t b_offset, int xfersize) 4272{ 4273 struct sysmaps *sysmaps; 4274 vm_page_t a_pg, b_pg; 4275 char *a_cp, *b_cp; 4276 vm_offset_t a_pg_offset, b_pg_offset; 4277 int cnt; 4278 4279 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 4280 mtx_lock(&sysmaps->lock); 4281 if (*sysmaps->CMAP1 != 0) 4282 panic("pmap_copy_pages: CMAP1 busy"); 4283 if (*sysmaps->CMAP2 != 0) 4284 panic("pmap_copy_pages: CMAP2 busy"); 4285 sched_pin(); 4286 while (xfersize > 0) { 4287 invlpg((u_int)sysmaps->CADDR1); 4288 invlpg((u_int)sysmaps->CADDR2); 4289 a_pg = ma[a_offset >> PAGE_SHIFT]; 4290 a_pg_offset = a_offset & PAGE_MASK; 4291 cnt = min(xfersize, PAGE_SIZE - a_pg_offset); 4292 b_pg = mb[b_offset >> PAGE_SHIFT]; 4293 b_pg_offset = b_offset & PAGE_MASK; 4294 cnt = min(cnt, PAGE_SIZE - b_pg_offset); 4295 *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A | 4296 pmap_cache_bits(a_pg->md.pat_mode, 0); 4297 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A | 4298 PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0); 4299 a_cp = sysmaps->CADDR1 + a_pg_offset; 4300 b_cp = sysmaps->CADDR2 + b_pg_offset; 4301 bcopy(a_cp, b_cp, cnt); 4302 a_offset += cnt; 4303 b_offset += cnt; 4304 xfersize -= cnt; 4305 } 4306 *sysmaps->CMAP1 = 0; 4307 *sysmaps->CMAP2 = 0; 4308 sched_unpin(); 4309 mtx_unlock(&sysmaps->lock); 4310} 4311 4312/* 4313 * Returns true if the pmap's pv is one of the first 4314 * 16 pvs linked to from this page. This count may 4315 * be changed upwards or downwards in the future; it 4316 * is only necessary that true be returned for a small 4317 * subset of pmaps for proper page aging. 4318 */ 4319boolean_t 4320pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 4321{ 4322 struct md_page *pvh; 4323 pv_entry_t pv; 4324 int loops = 0; 4325 boolean_t rv; 4326 4327 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4328 ("pmap_page_exists_quick: page %p is not managed", m)); 4329 rv = FALSE; 4330 rw_wlock(&pvh_global_lock); 4331 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 4332 if (PV_PMAP(pv) == pmap) { 4333 rv = TRUE; 4334 break; 4335 } 4336 loops++; 4337 if (loops >= 16) 4338 break; 4339 } 4340 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { 4341 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4342 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { 4343 if (PV_PMAP(pv) == pmap) { 4344 rv = TRUE; 4345 break; 4346 } 4347 loops++; 4348 if (loops >= 16) 4349 break; 4350 } 4351 } 4352 rw_wunlock(&pvh_global_lock); 4353 return (rv); 4354} 4355 4356/* 4357 * pmap_page_wired_mappings: 4358 * 4359 * Return the number of managed mappings to the given physical page 4360 * that are wired.
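 *
 * Both the page's own pv list (4KB mappings) and, for non-fictitious
 * pages, the pv list of the containing 2/4MB page (pa_to_pvh()) are
 * scanned, so wired superpage mappings are counted as well.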
4361 */ 4362int 4363pmap_page_wired_mappings(vm_page_t m) 4364{ 4365 int count; 4366 4367 count = 0; 4368 if ((m->oflags & VPO_UNMANAGED) != 0) 4369 return (count); 4370 rw_wlock(&pvh_global_lock); 4371 count = pmap_pvh_wired_mappings(&m->md, count); 4372 if ((m->flags & PG_FICTITIOUS) == 0) { 4373 count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), 4374 count); 4375 } 4376 rw_wunlock(&pvh_global_lock); 4377 return (count); 4378} 4379 4380/* 4381 * pmap_pvh_wired_mappings: 4382 * 4383 * Return the updated number "count" of managed mappings that are wired. 4384 */ 4385static int 4386pmap_pvh_wired_mappings(struct md_page *pvh, int count) 4387{ 4388 pmap_t pmap; 4389 pt_entry_t *pte; 4390 pv_entry_t pv; 4391 4392 rw_assert(&pvh_global_lock, RA_WLOCKED); 4393 sched_pin(); 4394 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { 4395 pmap = PV_PMAP(pv); 4396 PMAP_LOCK(pmap); 4397 pte = pmap_pte_quick(pmap, pv->pv_va); 4398 if ((*pte & PG_W) != 0) 4399 count++; 4400 PMAP_UNLOCK(pmap); 4401 } 4402 sched_unpin(); 4403 return (count); 4404} 4405 4406/* 4407 * Returns TRUE if the given page is mapped individually or as part of 4408 * a 4mpage. Otherwise, returns FALSE. 4409 */ 4410boolean_t 4411pmap_page_is_mapped(vm_page_t m) 4412{ 4413 boolean_t rv; 4414 4415 if ((m->oflags & VPO_UNMANAGED) != 0) 4416 return (FALSE); 4417 rw_wlock(&pvh_global_lock); 4418 rv = !TAILQ_EMPTY(&m->md.pv_list) || 4419 ((m->flags & PG_FICTITIOUS) == 0 && 4420 !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); 4421 rw_wunlock(&pvh_global_lock); 4422 return (rv); 4423} 4424 4425/* 4426 * Remove all pages from specified address space 4427 * this aids process exit speeds. Also, this code 4428 * is special cased for current process only, but 4429 * can have the more generic (and slightly slower) 4430 * mode enabled. This is much faster than pmap_remove 4431 * in the case of running down an entire address space. 
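 *
 * Rather than walking the page tables, the function below iterates
 * over the pmap's pv chunks; each chunk's pc_map bitmap records
 * which pv entries are free, so its complement identifies the
 * managed mappings to tear down. Wired mappings are skipped and
 * left intact.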
4432 */ 4433void 4434pmap_remove_pages(pmap_t pmap) 4435{ 4436 pt_entry_t *pte, tpte; 4437 vm_page_t free = NULL; 4438 vm_page_t m, mpte, mt; 4439 pv_entry_t pv; 4440 struct md_page *pvh; 4441 struct pv_chunk *pc, *npc; 4442 int field, idx; 4443 int32_t bit; 4444 uint32_t inuse, bitmask; 4445 int allfree; 4446 4447 if (pmap != PCPU_GET(curpmap)) { 4448 printf("warning: pmap_remove_pages called with non-current pmap\n"); 4449 return; 4450 } 4451 rw_wlock(&pvh_global_lock); 4452 PMAP_LOCK(pmap); 4453 sched_pin(); 4454 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 4455 KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, 4456 pc->pc_pmap)); 4457 allfree = 1; 4458 for (field = 0; field < _NPCM; field++) { 4459 inuse = ~pc->pc_map[field] & pc_freemask[field]; 4460 while (inuse != 0) { 4461 bit = bsfl(inuse); 4462 bitmask = 1UL << bit; 4463 idx = field * 32 + bit; 4464 pv = &pc->pc_pventry[idx]; 4465 inuse &= ~bitmask; 4466 4467 pte = pmap_pde(pmap, pv->pv_va); 4468 tpte = *pte; 4469 if ((tpte & PG_PS) == 0) { 4470 pte = vtopte(pv->pv_va); 4471 tpte = *pte & ~PG_PTE_PAT; 4472 } 4473 4474 if (tpte == 0) { 4475 printf( 4476 "TPTE at %p IS ZERO @ VA %08x\n", 4477 pte, pv->pv_va); 4478 panic("bad pte"); 4479 } 4480 4481/* 4482 * We cannot remove wired pages from a process' mapping at this time 4483 */ 4484 if (tpte & PG_W) { 4485 allfree = 0; 4486 continue; 4487 } 4488 4489 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); 4490 KASSERT(m->phys_addr == (tpte & PG_FRAME), 4491 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 4492 m, (uintmax_t)m->phys_addr, 4493 (uintmax_t)tpte)); 4494 4495 KASSERT((m->flags & PG_FICTITIOUS) != 0 || 4496 m < &vm_page_array[vm_page_array_size], 4497 ("pmap_remove_pages: bad tpte %#jx", 4498 (uintmax_t)tpte)); 4499 4500 pte_clear(pte); 4501 4502 /* 4503 * Update the vm_page_t clean/reference bits. 
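 *
 * A mapping is treated as having dirtied the page only if both PG_M
 * and PG_RW are set in the entry; for a 2/4MB mapping, every 4KB
 * page that it covers is marked dirty.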
4504 */ 4505 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 4506 if ((tpte & PG_PS) != 0) { 4507 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4508 vm_page_dirty(mt); 4509 } else 4510 vm_page_dirty(m); 4511 } 4512 4513 /* Mark free */ 4514 PV_STAT(pv_entry_frees++); 4515 PV_STAT(pv_entry_spare++); 4516 pv_entry_count--; 4517 pc->pc_map[field] |= bitmask; 4518 if ((tpte & PG_PS) != 0) { 4519 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 4520 pvh = pa_to_pvh(tpte & PG_PS_FRAME); 4521 TAILQ_REMOVE(&pvh->pv_list, pv, pv_list); 4522 if (TAILQ_EMPTY(&pvh->pv_list)) { 4523 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) 4524 if (TAILQ_EMPTY(&mt->md.pv_list)) 4525 vm_page_aflag_clear(mt, PGA_WRITEABLE); 4526 } 4527 mpte = pmap_lookup_pt_page(pmap, pv->pv_va); 4528 if (mpte != NULL) { 4529 pmap_remove_pt_page(pmap, mpte); 4530 pmap->pm_stats.resident_count--; 4531 KASSERT(mpte->wire_count == NPTEPG, 4532 ("pmap_remove_pages: pte page wire count error")); 4533 mpte->wire_count = 0; 4534 pmap_add_delayed_free_list(mpte, &free, FALSE); 4535 atomic_subtract_int(&cnt.v_wire_count, 1); 4536 } 4537 } else { 4538 pmap->pm_stats.resident_count--; 4539 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 4540 if (TAILQ_EMPTY(&m->md.pv_list) && 4541 (m->flags & PG_FICTITIOUS) == 0) { 4542 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4543 if (TAILQ_EMPTY(&pvh->pv_list)) 4544 vm_page_aflag_clear(m, PGA_WRITEABLE); 4545 } 4546 pmap_unuse_pt(pmap, pv->pv_va, &free); 4547 } 4548 } 4549 } 4550 if (allfree) { 4551 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 4552 free_pv_chunk(pc); 4553 } 4554 } 4555 sched_unpin(); 4556 pmap_invalidate_all(pmap); 4557 rw_wunlock(&pvh_global_lock); 4558 PMAP_UNLOCK(pmap); 4559 pmap_free_zero_pages(free); 4560} 4561 4562/* 4563 * pmap_is_modified: 4564 * 4565 * Return whether or not the specified physical page was modified 4566 * in any physical maps. 4567 */ 4568boolean_t 4569pmap_is_modified(vm_page_t m) 4570{ 4571 boolean_t rv; 4572 4573 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4574 ("pmap_is_modified: page %p is not managed", m)); 4575 4576 /* 4577 * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be 4578 * concurrently set while the object is locked. Thus, if PGA_WRITEABLE 4579 * is clear, no PTEs can have PG_M set. 4580 */ 4581 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 4582 if ((m->oflags & VPO_BUSY) == 0 && 4583 (m->aflags & PGA_WRITEABLE) == 0) 4584 return (FALSE); 4585 rw_wlock(&pvh_global_lock); 4586 rv = pmap_is_modified_pvh(&m->md) || 4587 ((m->flags & PG_FICTITIOUS) == 0 && 4588 pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 4589 rw_wunlock(&pvh_global_lock); 4590 return (rv); 4591} 4592 4593/* 4594 * Returns TRUE if any of the given mappings were used to modify 4595 * physical memory. Otherwise, returns FALSE. Both page and 2mpage 4596 * mappings are supported. 4597 */ 4598static boolean_t 4599pmap_is_modified_pvh(struct md_page *pvh) 4600{ 4601 pv_entry_t pv; 4602 pt_entry_t *pte; 4603 pmap_t pmap; 4604 boolean_t rv; 4605 4606 rw_assert(&pvh_global_lock, RA_WLOCKED); 4607 rv = FALSE; 4608 sched_pin(); 4609 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { 4610 pmap = PV_PMAP(pv); 4611 PMAP_LOCK(pmap); 4612 pte = pmap_pte_quick(pmap, pv->pv_va); 4613 rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); 4614 PMAP_UNLOCK(pmap); 4615 if (rv) 4616 break; 4617 } 4618 sched_unpin(); 4619 return (rv); 4620} 4621 4622/* 4623 * pmap_is_prefaultable: 4624 * 4625 * Return whether or not the specified virtual address is elgible 4626 * for prefault. 
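 *
 * A virtual address is considered prefaultable when its page
 * directory entry is valid, is not a 2/4MB mapping, and the
 * corresponding page table entry is still empty.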
4627 */ 4628boolean_t 4629pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 4630{ 4631 pd_entry_t *pde; 4632 pt_entry_t *pte; 4633 boolean_t rv; 4634 4635 rv = FALSE; 4636 PMAP_LOCK(pmap); 4637 pde = pmap_pde(pmap, addr); 4638 if (*pde != 0 && (*pde & PG_PS) == 0) { 4639 pte = vtopte(addr); 4640 rv = *pte == 0; 4641 } 4642 PMAP_UNLOCK(pmap); 4643 return (rv); 4644} 4645 4646/* 4647 * pmap_is_referenced: 4648 * 4649 * Return whether or not the specified physical page was referenced 4650 * in any physical maps. 4651 */ 4652boolean_t 4653pmap_is_referenced(vm_page_t m) 4654{ 4655 boolean_t rv; 4656 4657 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4658 ("pmap_is_referenced: page %p is not managed", m)); 4659 rw_wlock(&pvh_global_lock); 4660 rv = pmap_is_referenced_pvh(&m->md) || 4661 ((m->flags & PG_FICTITIOUS) == 0 && 4662 pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); 4663 rw_wunlock(&pvh_global_lock); 4664 return (rv); 4665} 4666 4667/* 4668 * Returns TRUE if any of the given mappings were referenced and FALSE 4669 * otherwise. Both page and 4mpage mappings are supported. 4670 */ 4671static boolean_t 4672pmap_is_referenced_pvh(struct md_page *pvh) 4673{ 4674 pv_entry_t pv; 4675 pt_entry_t *pte; 4676 pmap_t pmap; 4677 boolean_t rv; 4678 4679 rw_assert(&pvh_global_lock, RA_WLOCKED); 4680 rv = FALSE; 4681 sched_pin(); 4682 TAILQ_FOREACH(pv, &pvh->pv_list, pv_list) { 4683 pmap = PV_PMAP(pv); 4684 PMAP_LOCK(pmap); 4685 pte = pmap_pte_quick(pmap, pv->pv_va); 4686 rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); 4687 PMAP_UNLOCK(pmap); 4688 if (rv) 4689 break; 4690 } 4691 sched_unpin(); 4692 return (rv); 4693} 4694 4695/* 4696 * Clear the write and modified bits in each of the given page's mappings. 4697 */ 4698void 4699pmap_remove_write(vm_page_t m) 4700{ 4701 struct md_page *pvh; 4702 pv_entry_t next_pv, pv; 4703 pmap_t pmap; 4704 pd_entry_t *pde; 4705 pt_entry_t oldpte, *pte; 4706 vm_offset_t va; 4707 4708 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4709 ("pmap_remove_write: page %p is not managed", m)); 4710 4711 /* 4712 * If the page is not VPO_BUSY, then PGA_WRITEABLE cannot be set by 4713 * another thread while the object is locked. Thus, if PGA_WRITEABLE 4714 * is clear, no page table entries need updating. 4715 */ 4716 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 4717 if ((m->oflags & VPO_BUSY) == 0 && 4718 (m->aflags & PGA_WRITEABLE) == 0) 4719 return; 4720 rw_wlock(&pvh_global_lock); 4721 sched_pin(); 4722 if ((m->flags & PG_FICTITIOUS) != 0) 4723 goto small_mappings; 4724 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4725 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { 4726 va = pv->pv_va; 4727 pmap = PV_PMAP(pv); 4728 PMAP_LOCK(pmap); 4729 pde = pmap_pde(pmap, va); 4730 if ((*pde & PG_RW) != 0) 4731 (void)pmap_demote_pde(pmap, pde, va); 4732 PMAP_UNLOCK(pmap); 4733 } 4734small_mappings: 4735 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 4736 pmap = PV_PMAP(pv); 4737 PMAP_LOCK(pmap); 4738 pde = pmap_pde(pmap, pv->pv_va); 4739 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" 4740 " a 4mpage in page %p's pv list", m)); 4741 pte = pmap_pte_quick(pmap, pv->pv_va); 4742retry: 4743 oldpte = *pte; 4744 if ((oldpte & PG_RW) != 0) { 4745 /* 4746 * Regardless of whether a pte is 32 or 64 bits 4747 * in size, PG_RW and PG_M are among the least 4748 * significant 32 bits. 
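 *
 * Consequently a 32-bit compare-and-set on the low word is enough
 * to clear them atomically, even under PAE. The old value of PG_M
 * is examined afterwards so that a dirty mapping transfers its
 * modified state to the vm_page via vm_page_dirty() before write
 * access is revoked.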
4749 */ 4750 if (!atomic_cmpset_int((u_int *)pte, oldpte, 4751 oldpte & ~(PG_RW | PG_M))) 4752 goto retry; 4753 if ((oldpte & PG_M) != 0) 4754 vm_page_dirty(m); 4755 pmap_invalidate_page(pmap, pv->pv_va); 4756 } 4757 PMAP_UNLOCK(pmap); 4758 } 4759 vm_page_aflag_clear(m, PGA_WRITEABLE); 4760 sched_unpin(); 4761 rw_wunlock(&pvh_global_lock); 4762} 4763 4764/* 4765 * pmap_ts_referenced: 4766 * 4767 * Return a count of reference bits for a page, clearing those bits. 4768 * It is not necessary for every reference bit to be cleared, but it 4769 * is necessary that 0 only be returned when there are truly no 4770 * reference bits set. 4771 * 4772 * XXX: The exact number of bits to check and clear is a matter that 4773 * should be tested and standardized at some point in the future for 4774 * optimal aging of shared pages. 4775 */ 4776int 4777pmap_ts_referenced(vm_page_t m) 4778{ 4779 struct md_page *pvh; 4780 pv_entry_t pv, pvf, pvn; 4781 pmap_t pmap; 4782 pd_entry_t oldpde, *pde; 4783 pt_entry_t *pte; 4784 vm_offset_t va; 4785 int rtval = 0; 4786 4787 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4788 ("pmap_ts_referenced: page %p is not managed", m)); 4789 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4790 rw_wlock(&pvh_global_lock); 4791 sched_pin(); 4792 if ((m->flags & PG_FICTITIOUS) != 0) 4793 goto small_mappings; 4794 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, pvn) { 4795 va = pv->pv_va; 4796 pmap = PV_PMAP(pv); 4797 PMAP_LOCK(pmap); 4798 pde = pmap_pde(pmap, va); 4799 oldpde = *pde; 4800 if ((oldpde & PG_A) != 0) { 4801 if (pmap_demote_pde(pmap, pde, va)) { 4802 if ((oldpde & PG_W) == 0) { 4803 /* 4804 * Remove the mapping to a single page 4805 * so that a subsequent access may 4806 * repromote. Since the underlying 4807 * page table page is fully populated, 4808 * this removal never frees a page 4809 * table page. 4810 */ 4811 va += VM_PAGE_TO_PHYS(m) - (oldpde & 4812 PG_PS_FRAME); 4813 pmap_remove_page(pmap, va, NULL); 4814 rtval++; 4815 if (rtval > 4) { 4816 PMAP_UNLOCK(pmap); 4817 goto out; 4818 } 4819 } 4820 } 4821 } 4822 PMAP_UNLOCK(pmap); 4823 } 4824small_mappings: 4825 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 4826 pvf = pv; 4827 do { 4828 pvn = TAILQ_NEXT(pv, pv_list); 4829 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 4830 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 4831 pmap = PV_PMAP(pv); 4832 PMAP_LOCK(pmap); 4833 pde = pmap_pde(pmap, pv->pv_va); 4834 KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced:" 4835 " found a 4mpage in page %p's pv list", m)); 4836 pte = pmap_pte_quick(pmap, pv->pv_va); 4837 if ((*pte & PG_A) != 0) { 4838 atomic_clear_int((u_int *)pte, PG_A); 4839 pmap_invalidate_page(pmap, pv->pv_va); 4840 rtval++; 4841 if (rtval > 4) 4842 pvn = NULL; 4843 } 4844 PMAP_UNLOCK(pmap); 4845 } while ((pv = pvn) != NULL && pv != pvf); 4846 } 4847out: 4848 sched_unpin(); 4849 rw_wunlock(&pvh_global_lock); 4850 return (rtval); 4851} 4852 4853/* 4854 * Clear the modify bits on the specified physical page. 4855 */ 4856void 4857pmap_clear_modify(vm_page_t m) 4858{ 4859 struct md_page *pvh; 4860 pv_entry_t next_pv, pv; 4861 pmap_t pmap; 4862 pd_entry_t oldpde, *pde; 4863 pt_entry_t oldpte, *pte; 4864 vm_offset_t va; 4865 4866 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4867 ("pmap_clear_modify: page %p is not managed", m)); 4868 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); 4869 KASSERT((m->oflags & VPO_BUSY) == 0, 4870 ("pmap_clear_modify: page %p is busy", m)); 4871 4872 /* 4873 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. 
4874 * If the object containing the page is locked and the page is not 4875 * VPO_BUSY, then PGA_WRITEABLE cannot be concurrently set. 4876 */ 4877 if ((m->aflags & PGA_WRITEABLE) == 0) 4878 return; 4879 rw_wlock(&pvh_global_lock); 4880 sched_pin(); 4881 if ((m->flags & PG_FICTITIOUS) != 0) 4882 goto small_mappings; 4883 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4884 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { 4885 va = pv->pv_va; 4886 pmap = PV_PMAP(pv); 4887 PMAP_LOCK(pmap); 4888 pde = pmap_pde(pmap, va); 4889 oldpde = *pde; 4890 if ((oldpde & PG_RW) != 0) { 4891 if (pmap_demote_pde(pmap, pde, va)) { 4892 if ((oldpde & PG_W) == 0) { 4893 /* 4894 * Write protect the mapping to a 4895 * single page so that a subsequent 4896 * write access may repromote. 4897 */ 4898 va += VM_PAGE_TO_PHYS(m) - (oldpde & 4899 PG_PS_FRAME); 4900 pte = pmap_pte_quick(pmap, va); 4901 oldpte = *pte; 4902 if ((oldpte & PG_V) != 0) { 4903 /* 4904 * Regardless of whether a pte is 32 or 64 bits 4905 * in size, PG_RW and PG_M are among the least 4906 * significant 32 bits. 4907 */ 4908 while (!atomic_cmpset_int((u_int *)pte, 4909 oldpte, 4910 oldpte & ~(PG_M | PG_RW))) 4911 oldpte = *pte; 4912 vm_page_dirty(m); 4913 pmap_invalidate_page(pmap, va); 4914 } 4915 } 4916 } 4917 } 4918 PMAP_UNLOCK(pmap); 4919 } 4920small_mappings: 4921 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 4922 pmap = PV_PMAP(pv); 4923 PMAP_LOCK(pmap); 4924 pde = pmap_pde(pmap, pv->pv_va); 4925 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" 4926 " a 4mpage in page %p's pv list", m)); 4927 pte = pmap_pte_quick(pmap, pv->pv_va); 4928 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { 4929 /* 4930 * Regardless of whether a pte is 32 or 64 bits 4931 * in size, PG_M is among the least significant 4932 * 32 bits. 4933 */ 4934 atomic_clear_int((u_int *)pte, PG_M); 4935 pmap_invalidate_page(pmap, pv->pv_va); 4936 } 4937 PMAP_UNLOCK(pmap); 4938 } 4939 sched_unpin(); 4940 rw_wunlock(&pvh_global_lock); 4941} 4942 4943/* 4944 * pmap_clear_reference: 4945 * 4946 * Clear the reference bit on the specified physical page. 4947 */ 4948void 4949pmap_clear_reference(vm_page_t m) 4950{ 4951 struct md_page *pvh; 4952 pv_entry_t next_pv, pv; 4953 pmap_t pmap; 4954 pd_entry_t oldpde, *pde; 4955 pt_entry_t *pte; 4956 vm_offset_t va; 4957 4958 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 4959 ("pmap_clear_reference: page %p is not managed", m)); 4960 rw_wlock(&pvh_global_lock); 4961 sched_pin(); 4962 if ((m->flags & PG_FICTITIOUS) != 0) 4963 goto small_mappings; 4964 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); 4965 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_list, next_pv) { 4966 va = pv->pv_va; 4967 pmap = PV_PMAP(pv); 4968 PMAP_LOCK(pmap); 4969 pde = pmap_pde(pmap, va); 4970 oldpde = *pde; 4971 if ((oldpde & PG_A) != 0) { 4972 if (pmap_demote_pde(pmap, pde, va)) { 4973 /* 4974 * Remove the mapping to a single page so 4975 * that a subsequent access may repromote. 4976 * Since the underlying page table page is 4977 * fully populated, this removal never frees 4978 * a page table page. 
4979 */ 4980 va += VM_PAGE_TO_PHYS(m) - (oldpde & 4981 PG_PS_FRAME); 4982 pmap_remove_page(pmap, va, NULL); 4983 } 4984 } 4985 PMAP_UNLOCK(pmap); 4986 } 4987small_mappings: 4988 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 4989 pmap = PV_PMAP(pv); 4990 PMAP_LOCK(pmap); 4991 pde = pmap_pde(pmap, pv->pv_va); 4992 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_reference: found" 4993 " a 4mpage in page %p's pv list", m)); 4994 pte = pmap_pte_quick(pmap, pv->pv_va); 4995 if ((*pte & PG_A) != 0) { 4996 /* 4997 * Regardless of whether a pte is 32 or 64 bits 4998 * in size, PG_A is among the least significant 4999 * 32 bits. 5000 */ 5001 atomic_clear_int((u_int *)pte, PG_A); 5002 pmap_invalidate_page(pmap, pv->pv_va); 5003 } 5004 PMAP_UNLOCK(pmap); 5005 } 5006 sched_unpin(); 5007 rw_wunlock(&pvh_global_lock); 5008} 5009 5010/* 5011 * Miscellaneous support routines follow 5012 */ 5013 5014/* Adjust the cache mode for a 4KB page mapped via a PTE. */ 5015static __inline void 5016pmap_pte_attr(pt_entry_t *pte, int cache_bits) 5017{ 5018 u_int opte, npte; 5019 5020 /* 5021 * The cache mode bits are all in the low 32-bits of the 5022 * PTE, so we can just spin on updating the low 32-bits. 5023 */ 5024 do { 5025 opte = *(u_int *)pte; 5026 npte = opte & ~PG_PTE_CACHE; 5027 npte |= cache_bits; 5028 } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); 5029} 5030 5031/* Adjust the cache mode for a 2/4MB page mapped via a PDE. */ 5032static __inline void 5033pmap_pde_attr(pd_entry_t *pde, int cache_bits) 5034{ 5035 u_int opde, npde; 5036 5037 /* 5038 * The cache mode bits are all in the low 32-bits of the 5039 * PDE, so we can just spin on updating the low 32-bits. 5040 */ 5041 do { 5042 opde = *(u_int *)pde; 5043 npde = opde & ~PG_PDE_CACHE; 5044 npde |= cache_bits; 5045 } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); 5046} 5047 5048/* 5049 * Map a set of physical memory pages into the kernel virtual 5050 * address space. Return a pointer to where it is mapped. This 5051 * routine is intended to be used for mapping device memory, 5052 * NOT real memory. 5053 */ 5054void * 5055pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 5056{ 5057 vm_offset_t va, offset; 5058 vm_size_t tmpsize; 5059 5060 offset = pa & PAGE_MASK; 5061 size = roundup(offset + size, PAGE_SIZE); 5062 pa = pa & PG_FRAME; 5063 5064 if (pa < KERNLOAD && pa + size <= KERNLOAD) 5065 va = KERNBASE + pa; 5066 else 5067 va = kmem_alloc_nofault(kernel_map, size); 5068 if (!va) 5069 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 5070 5071 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) 5072 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); 5073 pmap_invalidate_range(kernel_pmap, va, va + tmpsize); 5074 pmap_invalidate_cache_range(va, va + size); 5075 return ((void *)(va + offset)); 5076} 5077 5078void * 5079pmap_mapdev(vm_paddr_t pa, vm_size_t size) 5080{ 5081 5082 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 5083} 5084 5085void * 5086pmap_mapbios(vm_paddr_t pa, vm_size_t size) 5087{ 5088 5089 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 5090} 5091 5092void 5093pmap_unmapdev(vm_offset_t va, vm_size_t size) 5094{ 5095 vm_offset_t base, offset; 5096 5097 if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) 5098 return; 5099 base = trunc_page(va); 5100 offset = va & PAGE_MASK; 5101 size = roundup(offset + size, PAGE_SIZE); 5102 kmem_free(kernel_map, base, size); 5103} 5104 5105/* 5106 * Sets the memory attribute for the specified page. 
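 *
 * The new mode is recorded in the page's md.pat_mode and governs the
 * cache bits used for mappings created from now on; for normal
 * (non-fictitious) pages, stale cached contents are flushed below,
 * either through an existing sf_buf mapping or, on CPUs without
 * self-snoop, via a transient mapping in pmap_flush_page().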
5107 */ 5108void 5109pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) 5110{ 5111 5112 m->md.pat_mode = ma; 5113 if ((m->flags & PG_FICTITIOUS) != 0) 5114 return; 5115 5116 /* 5117 * If "m" is a normal page, flush it from the cache. 5118 * See pmap_invalidate_cache_range(). 5119 * 5120 * First, try to find an existing mapping of the page by sf 5121 * buffer. sf_buf_invalidate_cache() modifies mapping and 5122 * flushes the cache. 5123 */ 5124 if (sf_buf_invalidate_cache(m)) 5125 return; 5126 5127 /* 5128 * If page is not mapped by sf buffer, but CPU does not 5129 * support self snoop, map the page transient and do 5130 * invalidation. In the worst case, whole cache is flushed by 5131 * pmap_invalidate_cache_range(). 5132 */ 5133 if ((cpu_feature & CPUID_SS) == 0) 5134 pmap_flush_page(m); 5135} 5136 5137static void 5138pmap_flush_page(vm_page_t m) 5139{ 5140 struct sysmaps *sysmaps; 5141 vm_offset_t sva, eva; 5142 5143 if ((cpu_feature & CPUID_CLFSH) != 0) { 5144 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 5145 mtx_lock(&sysmaps->lock); 5146 if (*sysmaps->CMAP2) 5147 panic("pmap_flush_page: CMAP2 busy"); 5148 sched_pin(); 5149 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | 5150 PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); 5151 invlcaddr(sysmaps->CADDR2); 5152 sva = (vm_offset_t)sysmaps->CADDR2; 5153 eva = sva + PAGE_SIZE; 5154 5155 /* 5156 * Use mfence despite the ordering implied by 5157 * mtx_{un,}lock() because clflush is not guaranteed 5158 * to be ordered by any other instruction. 5159 */ 5160 mfence(); 5161 for (; sva < eva; sva += cpu_clflush_line_size) 5162 clflush(sva); 5163 mfence(); 5164 *sysmaps->CMAP2 = 0; 5165 sched_unpin(); 5166 mtx_unlock(&sysmaps->lock); 5167 } else 5168 pmap_invalidate_cache(); 5169} 5170 5171/* 5172 * Changes the specified virtual address range's memory type to that given by 5173 * the parameter "mode". The specified virtual address range must be 5174 * completely contained within either the kernel map. 5175 * 5176 * Returns zero if the change completed successfully, and either EINVAL or 5177 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part 5178 * of the virtual address range was not mapped, and ENOMEM is returned if 5179 * there was insufficient memory available to complete the change. 5180 */ 5181int 5182pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) 5183{ 5184 vm_offset_t base, offset, tmpva; 5185 pd_entry_t *pde; 5186 pt_entry_t *pte; 5187 int cache_bits_pte, cache_bits_pde; 5188 boolean_t changed; 5189 5190 base = trunc_page(va); 5191 offset = va & PAGE_MASK; 5192 size = roundup(offset + size, PAGE_SIZE); 5193 5194 /* 5195 * Only supported on kernel virtual addresses above the recursive map. 5196 */ 5197 if (base < VM_MIN_KERNEL_ADDRESS) 5198 return (EINVAL); 5199 5200 cache_bits_pde = pmap_cache_bits(mode, 1); 5201 cache_bits_pte = pmap_cache_bits(mode, 0); 5202 changed = FALSE; 5203 5204 /* 5205 * Pages that aren't mapped aren't supported. Also break down 5206 * 2/4MB pages into 4KB pages if required. 5207 */ 5208 PMAP_LOCK(kernel_pmap); 5209 for (tmpva = base; tmpva < base + size; ) { 5210 pde = pmap_pde(kernel_pmap, tmpva); 5211 if (*pde == 0) { 5212 PMAP_UNLOCK(kernel_pmap); 5213 return (EINVAL); 5214 } 5215 if (*pde & PG_PS) { 5216 /* 5217 * If the current 2/4MB page already has 5218 * the required memory type, then we need not 5219 * demote this page. Just increment tmpva to 5220 * the next 2/4MB page frame. 
5221 */ 5222 if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { 5223 tmpva = trunc_4mpage(tmpva) + NBPDR; 5224 continue; 5225 } 5226 5227 /* 5228 * If the current offset aligns with a 2/4MB 5229 * page frame and there is at least 2/4MB left 5230 * within the range, then we need not break 5231 * down this page into 4KB pages. 5232 */ 5233 if ((tmpva & PDRMASK) == 0 && 5234 tmpva + PDRMASK < base + size) { 5235 tmpva += NBPDR; 5236 continue; 5237 } 5238 if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) { 5239 PMAP_UNLOCK(kernel_pmap); 5240 return (ENOMEM); 5241 } 5242 } 5243 pte = vtopte(tmpva); 5244 if (*pte == 0) { 5245 PMAP_UNLOCK(kernel_pmap); 5246 return (EINVAL); 5247 } 5248 tmpva += PAGE_SIZE; 5249 } 5250 PMAP_UNLOCK(kernel_pmap); 5251 5252 /* 5253 * Ok, all the pages exist, so run through them updating their 5254 * cache mode if required. 5255 */ 5256 for (tmpva = base; tmpva < base + size; ) { 5257 pde = pmap_pde(kernel_pmap, tmpva); 5258 if (*pde & PG_PS) { 5259 if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { 5260 pmap_pde_attr(pde, cache_bits_pde); 5261 changed = TRUE; 5262 } 5263 tmpva = trunc_4mpage(tmpva) + NBPDR; 5264 } else { 5265 pte = vtopte(tmpva); 5266 if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { 5267 pmap_pte_attr(pte, cache_bits_pte); 5268 changed = TRUE; 5269 } 5270 tmpva += PAGE_SIZE; 5271 } 5272 } 5273 5274 /* 5275 * Flush CPU caches to make sure any data isn't cached that 5276 * shouldn't be, etc. 5277 */ 5278 if (changed) { 5279 pmap_invalidate_range(kernel_pmap, base, tmpva); 5280 pmap_invalidate_cache_range(base, tmpva); 5281 } 5282 return (0); 5283} 5284 5285/* 5286 * perform the pmap work for mincore 5287 */ 5288int 5289pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) 5290{ 5291 pd_entry_t *pdep; 5292 pt_entry_t *ptep, pte; 5293 vm_paddr_t pa; 5294 int val; 5295 5296 PMAP_LOCK(pmap); 5297retry: 5298 pdep = pmap_pde(pmap, addr); 5299 if (*pdep != 0) { 5300 if (*pdep & PG_PS) { 5301 pte = *pdep; 5302 /* Compute the physical address of the 4KB page. */ 5303 pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & 5304 PG_FRAME; 5305 val = MINCORE_SUPER; 5306 } else { 5307 ptep = pmap_pte(pmap, addr); 5308 pte = *ptep; 5309 pmap_pte_release(ptep); 5310 pa = pte & PG_FRAME; 5311 val = 0; 5312 } 5313 } else { 5314 pte = 0; 5315 pa = 0; 5316 val = 0; 5317 } 5318 if ((pte & PG_V) != 0) { 5319 val |= MINCORE_INCORE; 5320 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) 5321 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; 5322 if ((pte & PG_A) != 0) 5323 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; 5324 } 5325 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != 5326 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && 5327 (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { 5328 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. 
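 * vm_page_pa_tryrelock() may need to drop the pmap lock in order to
 * acquire the page lock for "pa"; when it does, it returns nonzero
 * and the lookup above is retried from "retry".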
*/ 5329 if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) 5330 goto retry; 5331 } else 5332 PA_UNLOCK_COND(*locked_pa); 5333 PMAP_UNLOCK(pmap); 5334 return (val); 5335} 5336 5337void 5338pmap_activate(struct thread *td) 5339{ 5340 pmap_t pmap, oldpmap; 5341 u_int cpuid; 5342 u_int32_t cr3; 5343 5344 critical_enter(); 5345 pmap = vmspace_pmap(td->td_proc->p_vmspace); 5346 oldpmap = PCPU_GET(curpmap); 5347 cpuid = PCPU_GET(cpuid); 5348#if defined(SMP) 5349 CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); 5350 CPU_SET_ATOMIC(cpuid, &pmap->pm_active); 5351#else 5352 CPU_CLR(cpuid, &oldpmap->pm_active); 5353 CPU_SET(cpuid, &pmap->pm_active); 5354#endif 5355#ifdef PAE 5356 cr3 = vtophys(pmap->pm_pdpt); 5357#else 5358 cr3 = vtophys(pmap->pm_pdir); 5359#endif 5360 /* 5361 * pmap_activate is for the current thread on the current cpu 5362 */ 5363 td->td_pcb->pcb_cr3 = cr3; 5364 load_cr3(cr3); 5365 PCPU_SET(curpmap, pmap); 5366 critical_exit(); 5367} 5368 5369void 5370pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) 5371{ 5372} 5373 5374/* 5375 * Increase the starting virtual address of the given mapping if a 5376 * different alignment might result in more superpage mappings. 5377 */ 5378void 5379pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, 5380 vm_offset_t *addr, vm_size_t size) 5381{ 5382 vm_offset_t superpage_offset; 5383 5384 if (size < NBPDR) 5385 return; 5386 if (object != NULL && (object->flags & OBJ_COLORED) != 0) 5387 offset += ptoa(object->pg_color); 5388 superpage_offset = offset & PDRMASK; 5389 if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || 5390 (*addr & PDRMASK) == superpage_offset) 5391 return; 5392 if ((*addr & PDRMASK) < superpage_offset) 5393 *addr = (*addr & ~PDRMASK) + superpage_offset; 5394 else 5395 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; 5396} 5397 5398 5399#if defined(PMAP_DEBUG) 5400pmap_pid_dump(int pid) 5401{ 5402 pmap_t pmap; 5403 struct proc *p; 5404 int npte = 0; 5405 int index; 5406 5407 sx_slock(&allproc_lock); 5408 FOREACH_PROC_IN_SYSTEM(p) { 5409 if (p->p_pid != pid) 5410 continue; 5411 5412 if (p->p_vmspace) { 5413 int i,j; 5414 index = 0; 5415 pmap = vmspace_pmap(p->p_vmspace); 5416 for (i = 0; i < NPDEPTD; i++) { 5417 pd_entry_t *pde; 5418 pt_entry_t *pte; 5419 vm_offset_t base = i << PDRSHIFT; 5420 5421 pde = &pmap->pm_pdir[i]; 5422 if (pde && pmap_pde_v(pde)) { 5423 for (j = 0; j < NPTEPG; j++) { 5424 vm_offset_t va = base + (j << PAGE_SHIFT); 5425 if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { 5426 if (index) { 5427 index = 0; 5428 printf("\n"); 5429 } 5430 sx_sunlock(&allproc_lock); 5431 return (npte); 5432 } 5433 pte = pmap_pte(pmap, va); 5434 if (pte && pmap_pte_v(pte)) { 5435 pt_entry_t pa; 5436 vm_page_t m; 5437 pa = *pte; 5438 m = PHYS_TO_VM_PAGE(pa & PG_FRAME); 5439 printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", 5440 va, pa, m->hold_count, m->wire_count, m->flags); 5441 npte++; 5442 index++; 5443 if (index >= 2) { 5444 index = 0; 5445 printf("\n"); 5446 } else { 5447 printf(" "); 5448 } 5449 } 5450 } 5451 } 5452 } 5453 } 5454 } 5455 sx_sunlock(&allproc_lock); 5456 return (npte); 5457} 5458#endif 5459 5460#if defined(DEBUG) 5461 5462static void pads(pmap_t pm); 5463void pmap_pvdump(vm_paddr_t pa); 5464 5465/* print address space of pmap*/ 5466static void 5467pads(pmap_t pm) 5468{ 5469 int i, j; 5470 vm_paddr_t va; 5471 pt_entry_t *ptep; 5472 5473 if (pm == kernel_pmap) 5474 return; 5475 for (i = 0; i < NPDEPTD; i++) 5476 if (pm->pm_pdir[i]) 5477 for (j = 0; j < NPTEPG; j++) { 5478 va = (i << 
PDRSHIFT) + (j << PAGE_SHIFT); 5479 if (pm == kernel_pmap && va < KERNBASE) 5480 continue; 5481 if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) 5482 continue; 5483 ptep = pmap_pte(pm, va); 5484 if (pmap_pte_v(ptep)) 5485 printf("%x:%x ", va, *ptep); 5486 }; 5487 5488} 5489 5490void 5491pmap_pvdump(vm_paddr_t pa) 5492{ 5493 pv_entry_t pv; 5494 pmap_t pmap; 5495 vm_page_t m; 5496 5497 printf("pa %x", pa); 5498 m = PHYS_TO_VM_PAGE(pa); 5499 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 5500 pmap = PV_PMAP(pv); 5501 printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va); 5502 pads(pmap); 5503 } 5504 printf(" "); 5505} 5506#endif 5507