pmap.c revision 166074
1/*- 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu> 9 * All rights reserved. 10 * 11 * This code is derived from software contributed to Berkeley by 12 * the Systems Programming Group of the University of Utah Computer 13 * Science Department and William Jolitz of UUNET Technologies Inc. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 3. All advertising materials mentioning features or use of this software 24 * must display the following acknowledgement: 25 * This product includes software developed by the University of 26 * California, Berkeley and its contributors. 27 * 4. Neither the name of the University nor the names of its contributors 28 * may be used to endorse or promote products derived from this software 29 * without specific prior written permission. 30 * 31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 41 * SUCH DAMAGE. 42 * 43 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 44 */ 45/*- 46 * Copyright (c) 2003 Networks Associates Technology, Inc. 47 * All rights reserved. 48 * 49 * This software was developed for the FreeBSD Project by Jake Burkholder, 50 * Safeport Network Services, and Network Associates Laboratories, the 51 * Security Research Division of Network Associates, Inc. under 52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 53 * CHATS research program. 54 * 55 * Redistribution and use in source and binary forms, with or without 56 * modification, are permitted provided that the following conditions 57 * are met: 58 * 1. Redistributions of source code must retain the above copyright 59 * notice, this list of conditions and the following disclaimer. 60 * 2. Redistributions in binary form must reproduce the above copyright 61 * notice, this list of conditions and the following disclaimer in the 62 * documentation and/or other materials provided with the distribution. 63 * 64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 67 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 74 * SUCH DAMAGE. 75 */ 76 77#include <sys/cdefs.h> 78__FBSDID("$FreeBSD: head/sys/i386/i386/pmap.c 166074 2007-01-17 15:05:52Z delphij $"); 79 80/* 81 * Manages physical address maps. 82 * 83 * In addition to hardware address maps, this 84 * module is called upon to provide software-use-only 85 * maps which may or may not be stored in the same 86 * form as hardware maps. These pseudo-maps are 87 * used to store intermediate results from copy 88 * operations to and from address spaces. 89 * 90 * Since the information managed by this module is 91 * also stored by the logical address mapping module, 92 * this module may throw away valid virtual-to-physical 93 * mappings at almost any time. However, invalidations 94 * of virtual-to-physical mappings must be done as 95 * requested. 96 * 97 * In order to cope with hardware architectures which 98 * make virtual-to-physical map invalidates expensive, 99 * this module may delay invalidate or reduced protection 100 * operations until such time as they are actually 101 * necessary. This module is given full information as 102 * to which processors are currently using which maps, 103 * and to when physical maps must be made correct. 104 */ 105 106#include "opt_cpu.h" 107#include "opt_pmap.h" 108#include "opt_msgbuf.h" 109#include "opt_smp.h" 110#include "opt_xbox.h" 111 112#include <sys/param.h> 113#include <sys/systm.h> 114#include <sys/kernel.h> 115#include <sys/lock.h> 116#include <sys/malloc.h> 117#include <sys/mman.h> 118#include <sys/msgbuf.h> 119#include <sys/mutex.h> 120#include <sys/proc.h> 121#include <sys/sx.h> 122#include <sys/vmmeter.h> 123#include <sys/sched.h> 124#include <sys/sysctl.h> 125#ifdef SMP 126#include <sys/smp.h> 127#endif 128 129#include <vm/vm.h> 130#include <vm/vm_param.h> 131#include <vm/vm_kern.h> 132#include <vm/vm_page.h> 133#include <vm/vm_map.h> 134#include <vm/vm_object.h> 135#include <vm/vm_extern.h> 136#include <vm/vm_pageout.h> 137#include <vm/vm_pager.h> 138#include <vm/uma.h> 139 140#include <machine/cpu.h> 141#include <machine/cputypes.h> 142#include <machine/md_var.h> 143#include <machine/pcb.h> 144#include <machine/specialreg.h> 145#ifdef SMP 146#include <machine/smp.h> 147#endif 148 149#ifdef XBOX 150#include <machine/xbox.h> 151#endif 152 153#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) 154#define CPU_ENABLE_SSE 155#endif 156 157#ifndef PMAP_SHPGPERPROC 158#define PMAP_SHPGPERPROC 200 159#endif 160 161#if defined(DIAGNOSTIC) 162#define PMAP_DIAGNOSTIC 163#endif 164 165#if !defined(PMAP_DIAGNOSTIC) 166#define PMAP_INLINE __inline 167#else 168#define PMAP_INLINE 169#endif 170 171#define PV_STATS 172#ifdef PV_STATS 173#define PV_STAT(x) do { x ; } while (0) 174#else 175#define PV_STAT(x) do { } while (0) 176#endif 177 178/* 179 * Get PDEs and PTEs for user/kernel address space 180 */ 181#define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) 182#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) 183 184#define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) 185#define 
pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) 186#define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) 187#define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) 188#define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) 189 190#define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ 191 atomic_clear_int((u_int *)(pte), PG_W)) 192#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) 193 194struct pmap kernel_pmap_store; 195LIST_HEAD(pmaplist, pmap); 196static struct pmaplist allpmaps; 197static struct mtx allpmaps_lock; 198 199vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 200vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 201int pgeflag = 0; /* PG_G or-in */ 202int pseflag = 0; /* PG_PS or-in */ 203 204static int nkpt; 205vm_offset_t kernel_vm_end; 206extern u_int32_t KERNend; 207 208#ifdef PAE 209static uma_zone_t pdptzone; 210#endif 211 212/* 213 * Data for the pv entry allocation mechanism 214 */ 215static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; 216static int shpgperproc = PMAP_SHPGPERPROC; 217 218struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ 219int pv_maxchunks; /* How many chunks we have KVA for */ 220vm_offset_t pv_vafree; /* freelist stored in the PTE */ 221 222/* 223 * All those kernel PT submaps that BSD is so fond of 224 */ 225struct sysmaps { 226 struct mtx lock; 227 pt_entry_t *CMAP1; 228 pt_entry_t *CMAP2; 229 caddr_t CADDR1; 230 caddr_t CADDR2; 231}; 232static struct sysmaps sysmaps_pcpu[MAXCPU]; 233pt_entry_t *CMAP1 = 0; 234static pt_entry_t *CMAP3; 235caddr_t CADDR1 = 0, ptvmmap = 0; 236static caddr_t CADDR3; 237struct msgbuf *msgbufp = 0; 238 239/* 240 * Crashdump maps. 241 */ 242static caddr_t crashdumpmap; 243 244#ifdef SMP 245extern pt_entry_t *SMPpt; 246#endif 247static pt_entry_t *PMAP1 = 0, *PMAP2; 248static pt_entry_t *PADDR1 = 0, *PADDR2; 249#ifdef SMP 250static int PMAP1cpu; 251static int PMAP1changedcpu; 252SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, 253 &PMAP1changedcpu, 0, 254 "Number of times pmap_pte_quick changed CPU with same PMAP1"); 255#endif 256static int PMAP1changed; 257SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, 258 &PMAP1changed, 0, 259 "Number of times pmap_pte_quick changed PMAP1"); 260static int PMAP1unchanged; 261SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, 262 &PMAP1unchanged, 0, 263 "Number of times pmap_pte_quick didn't change PMAP1"); 264static struct mtx PMAP2mutex; 265 266static void free_pv_entry(pmap_t pmap, pv_entry_t pv); 267static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try); 268 269static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, 270 vm_page_t m, vm_prot_t prot, vm_page_t mpte); 271static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva); 272static void pmap_remove_page(struct pmap *pmap, vm_offset_t va); 273static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, 274 vm_offset_t va); 275static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); 276static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, 277 vm_page_t m); 278 279static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags); 280 281static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags); 282static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m); 283static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); 284static void pmap_pte_release(pt_entry_t *pte); 285static int 
pmap_unuse_pt(pmap_t, vm_offset_t); 286static vm_offset_t pmap_kmem_choose(vm_offset_t addr); 287#ifdef PAE 288static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait); 289#endif 290 291CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); 292CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); 293 294/* 295 * Move the kernel virtual free pointer to the next 296 * 4MB. This is used to help improve performance 297 * by using a large (4MB) page for much of the kernel 298 * (.text, .data, .bss) 299 */ 300static vm_offset_t 301pmap_kmem_choose(vm_offset_t addr) 302{ 303 vm_offset_t newaddr = addr; 304 305#ifndef DISABLE_PSE 306 if (cpu_feature & CPUID_PSE) 307 newaddr = (addr + PDRMASK) & ~PDRMASK; 308#endif 309 return newaddr; 310} 311 312/* 313 * Bootstrap the system enough to run with virtual memory. 314 * 315 * On the i386 this is called after mapping has already been enabled 316 * and just syncs the pmap module with what has already been done. 317 * [We can't call it easily with mapping off since the kernel is not 318 * mapped with PA == VA, hence we would have to relocate every address 319 * from the linked base (virtual) address "KERNBASE" to the actual 320 * (physical) address starting relative to 0] 321 */ 322void 323pmap_bootstrap(vm_paddr_t firstaddr, vm_paddr_t loadaddr) 324{ 325 vm_offset_t va; 326 pt_entry_t *pte, *unused; 327 struct sysmaps *sysmaps; 328 int i; 329 330 /* 331 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too 332 * large. It should instead be correctly calculated in locore.s and 333 * not based on 'first' (which is a physical address, not a virtual 334 * address, for the start of unused physical memory). The kernel 335 * page tables are NOT double mapped and thus should not be included 336 * in this calculation. 337 */ 338 virtual_avail = (vm_offset_t) KERNBASE + firstaddr; 339 virtual_avail = pmap_kmem_choose(virtual_avail); 340 341 virtual_end = VM_MAX_KERNEL_ADDRESS; 342 343 /* 344 * Initialize the kernel pmap (which is statically allocated). 345 */ 346 PMAP_LOCK_INIT(kernel_pmap); 347 kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); 348#ifdef PAE 349 kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); 350#endif 351 kernel_pmap->pm_active = -1; /* don't allow deactivation */ 352 TAILQ_INIT(&kernel_pmap->pm_pvchunk); 353 LIST_INIT(&allpmaps); 354 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); 355 mtx_lock_spin(&allpmaps_lock); 356 LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); 357 mtx_unlock_spin(&allpmaps_lock); 358 nkpt = NKPT; 359 360 /* 361 * Reserve some special page table entries/VA space for temporary 362 * mapping of pages. 363 */ 364#define SYSMAP(c, p, v, n) \ 365 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 366 367 va = virtual_avail; 368 pte = vtopte(va); 369 370 /* 371 * CMAP1/CMAP2 are used for zeroing and copying pages. 372 * CMAP3 is used for the idle process page zeroing. 373 */ 374 for (i = 0; i < MAXCPU; i++) { 375 sysmaps = &sysmaps_pcpu[i]; 376 mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF); 377 SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1) 378 SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1) 379 } 380 SYSMAP(caddr_t, CMAP1, CADDR1, 1) 381 SYSMAP(caddr_t, CMAP3, CADDR3, 1) 382 *CMAP3 = 0; 383 384 /* 385 * Crashdump maps. 386 */ 387 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) 388 389 /* 390 * ptvmmap is used for reading arbitrary physical pages via /dev/mem. 
391 */ 392 SYSMAP(caddr_t, unused, ptvmmap, 1) 393 394 /* 395 * msgbufp is used to map the system message buffer. 396 */ 397 SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE))) 398 399 /* 400 * ptemap is used for pmap_pte_quick 401 */ 402 SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1); 403 SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1); 404 405 mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); 406 407 virtual_avail = va; 408 409 *CMAP1 = 0; 410 411#ifdef XBOX 412 /* FIXME: This is gross, but needed for the XBOX. Since we are in such 413 * an early stadium, we cannot yet neatly map video memory ... :-( 414 * Better fixes are very welcome! */ 415 if (!arch_i386_is_xbox) 416#endif 417 for (i = 0; i < NKPT; i++) 418 PTD[i] = 0; 419 420 /* Initialize the PAT MSR if present. */ 421 pmap_init_pat(); 422 423 /* Turn on PG_G on kernel page(s) */ 424 pmap_set_pg(); 425} 426 427/* 428 * Setup the PAT MSR. 429 */ 430void 431pmap_init_pat(void) 432{ 433 uint64_t pat_msr; 434 435 /* Bail if this CPU doesn't implement PAT. */ 436 if (!(cpu_feature & CPUID_PAT)) 437 return; 438 439#ifdef PAT_WORKS 440 /* 441 * Leave the indices 0-3 at the default of WB, WT, UC, and UC-. 442 * Program 4 and 5 as WP and WC. 443 * Leave 6 and 7 as UC and UC-. 444 */ 445 pat_msr = rdmsr(MSR_PAT); 446 pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5)); 447 pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) | 448 PAT_VALUE(5, PAT_WRITE_COMBINING); 449#else 450 /* 451 * Due to some Intel errata, we can only safely use the lower 4 452 * PAT entries. Thus, just replace PAT Index 2 with WC instead 453 * of UC-. 454 * 455 * Intel Pentium III Processor Specification Update 456 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B 457 * or Mode C Paging) 458 * 459 * Intel Pentium IV Processor Specification Update 460 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) 461 */ 462 pat_msr = rdmsr(MSR_PAT); 463 pat_msr &= ~PAT_MASK(2); 464 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); 465#endif 466 wrmsr(MSR_PAT, pat_msr); 467} 468 469/* 470 * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. 471 */ 472void 473pmap_set_pg(void) 474{ 475 pd_entry_t pdir; 476 pt_entry_t *pte; 477 vm_offset_t va, endva; 478 int i; 479 480 if (pgeflag == 0) 481 return; 482 483 i = KERNLOAD/NBPDR; 484 endva = KERNBASE + KERNend; 485 486 if (pseflag) { 487 va = KERNBASE + KERNLOAD; 488 while (va < endva) { 489 pdir = kernel_pmap->pm_pdir[KPTDI+i]; 490 pdir |= pgeflag; 491 kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir; 492 invltlb(); /* Play it safe, invltlb() every time */ 493 i++; 494 va += NBPDR; 495 } 496 } else { 497 va = (vm_offset_t)btext; 498 while (va < endva) { 499 pte = vtopte(va); 500 if (*pte) 501 *pte |= pgeflag; 502 invltlb(); /* Play it safe, invltlb() every time */ 503 va += PAGE_SIZE; 504 } 505 } 506} 507 508/* 509 * Initialize a vm_page's machine-dependent fields. 510 */ 511void 512pmap_page_init(vm_page_t m) 513{ 514 515 TAILQ_INIT(&m->md.pv_list); 516 m->md.pv_list_count = 0; 517} 518 519#ifdef PAE 520 521static MALLOC_DEFINE(M_PMAPPDPT, "pmap", "pmap pdpt"); 522 523static void * 524pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) 525{ 526 *flags = UMA_SLAB_PRIV; 527 return (contigmalloc(PAGE_SIZE, M_PMAPPDPT, 0, 0x0ULL, 0xffffffffULL, 528 1, 0)); 529} 530#endif 531 532/* 533 * ABuse the pte nodes for unmapped kva to thread a kva freelist through. 534 * Requirements: 535 * - Must deal with pages in order to ensure that none of the PG_* bits 536 * are ever set, PG_V in particular. 
537 * - Assumes we can write to ptes without pte_store() atomic ops, even 538 * on PAE systems. This should be ok. 539 * - Assumes nothing will ever test these addresses for 0 to indicate 540 * no mapping instead of correctly checking PG_V. 541 * - Assumes a vm_offset_t will fit in a pte (true for i386). 542 * Because PG_V is never set, there can be no mappings to invalidate. 543 */ 544static vm_offset_t 545pmap_ptelist_alloc(vm_offset_t *head) 546{ 547 pt_entry_t *pte; 548 vm_offset_t va; 549 550 va = *head; 551 if (va == 0) 552 return (va); /* Out of memory */ 553 pte = vtopte(va); 554 *head = *pte; 555 if (*head & PG_V) 556 panic("pmap_ptelist_alloc: va with PG_V set!"); 557 *pte = 0; 558 return (va); 559} 560 561static void 562pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) 563{ 564 pt_entry_t *pte; 565 566 if (va & PG_V) 567 panic("pmap_ptelist_free: freeing va with PG_V set!"); 568 pte = vtopte(va); 569 *pte = *head; /* virtual! PG_V is 0 though */ 570 *head = va; 571} 572 573static void 574pmap_ptelist_init(vm_offset_t *head, void *base, int npages) 575{ 576 int i; 577 vm_offset_t va; 578 579 *head = 0; 580 for (i = npages - 1; i >= 0; i--) { 581 va = (vm_offset_t)base + i * PAGE_SIZE; 582 pmap_ptelist_free(head, va); 583 } 584} 585 586 587/* 588 * Initialize the pmap module. 589 * Called by vm_init, to initialize any structures that the pmap 590 * system needs to map virtual memory. 591 */ 592void 593pmap_init(void) 594{ 595 596 /* 597 * Initialize the address space (zone) for the pv entries. Set a 598 * high water mark so that the system can recover from excessive 599 * numbers of pv entries. 600 */ 601 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 602 pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; 603 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 604 pv_entry_max = roundup(pv_entry_max, _NPCPV); 605 pv_entry_high_water = 9 * (pv_entry_max / 10); 606 607 pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); 608 pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map, 609 PAGE_SIZE * pv_maxchunks); 610 if (pv_chunkbase == NULL) 611 panic("pmap_init: not enough kvm for pv chunks"); 612 pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); 613#ifdef PAE 614 pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, 615 NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, 616 UMA_ZONE_VM | UMA_ZONE_NOFREE); 617 uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); 618#endif 619} 620 621 622SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); 623SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, 624 "Max number of PV entries"); 625SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, 626 "Page share factor per proc"); 627 628/*************************************************** 629 * Low level helper routines..... 630 ***************************************************/ 631 632/* 633 * Determine the appropriate bits to set in a PTE or PDE for a specified 634 * caching mode. 635 */ 636static int 637pmap_cache_bits(int mode, boolean_t is_pde) 638{ 639 int pat_flag, pat_index, cache_bits; 640 641 /* The PAT bit is different for PTE's and PDE's. */ 642 pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; 643 644 /* If we don't support PAT, map extended modes to older ones. 
*/ 645 if (!(cpu_feature & CPUID_PAT)) { 646 switch (mode) { 647 case PAT_UNCACHEABLE: 648 case PAT_WRITE_THROUGH: 649 case PAT_WRITE_BACK: 650 break; 651 case PAT_UNCACHED: 652 case PAT_WRITE_COMBINING: 653 case PAT_WRITE_PROTECTED: 654 mode = PAT_UNCACHEABLE; 655 break; 656 } 657 } 658 659 /* Map the caching mode to a PAT index. */ 660 switch (mode) { 661#ifdef PAT_WORKS 662 case PAT_UNCACHEABLE: 663 pat_index = 3; 664 break; 665 case PAT_WRITE_THROUGH: 666 pat_index = 1; 667 break; 668 case PAT_WRITE_BACK: 669 pat_index = 0; 670 break; 671 case PAT_UNCACHED: 672 pat_index = 2; 673 break; 674 case PAT_WRITE_COMBINING: 675 pat_index = 5; 676 break; 677 case PAT_WRITE_PROTECTED: 678 pat_index = 4; 679 break; 680#else 681 case PAT_UNCACHED: 682 case PAT_UNCACHEABLE: 683 case PAT_WRITE_PROTECTED: 684 pat_index = 3; 685 break; 686 case PAT_WRITE_THROUGH: 687 pat_index = 1; 688 break; 689 case PAT_WRITE_BACK: 690 pat_index = 0; 691 break; 692 case PAT_WRITE_COMBINING: 693 pat_index = 2; 694 break; 695#endif 696 default: 697 panic("Unknown caching mode %d\n", mode); 698 } 699 700 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ 701 cache_bits = 0; 702 if (pat_index & 0x4) 703 cache_bits |= pat_flag; 704 if (pat_index & 0x2) 705 cache_bits |= PG_NC_PCD; 706 if (pat_index & 0x1) 707 cache_bits |= PG_NC_PWT; 708 return (cache_bits); 709} 710#ifdef SMP 711/* 712 * For SMP, these functions have to use the IPI mechanism for coherence. 713 */ 714void 715pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 716{ 717 u_int cpumask; 718 u_int other_cpus; 719 720 if (smp_started) { 721 if (!(read_eflags() & PSL_I)) 722 panic("%s: interrupts disabled", __func__); 723 mtx_lock_spin(&smp_ipi_mtx); 724 } else 725 critical_enter(); 726 /* 727 * We need to disable interrupt preemption but MUST NOT have 728 * interrupts disabled here. 729 * XXX we may need to hold schedlock to get a coherent pm_active 730 * XXX critical sections disable interrupts again 731 */ 732 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { 733 invlpg(va); 734 smp_invlpg(va); 735 } else { 736 cpumask = PCPU_GET(cpumask); 737 other_cpus = PCPU_GET(other_cpus); 738 if (pmap->pm_active & cpumask) 739 invlpg(va); 740 if (pmap->pm_active & other_cpus) 741 smp_masked_invlpg(pmap->pm_active & other_cpus, va); 742 } 743 if (smp_started) 744 mtx_unlock_spin(&smp_ipi_mtx); 745 else 746 critical_exit(); 747} 748 749void 750pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 751{ 752 u_int cpumask; 753 u_int other_cpus; 754 vm_offset_t addr; 755 756 if (smp_started) { 757 if (!(read_eflags() & PSL_I)) 758 panic("%s: interrupts disabled", __func__); 759 mtx_lock_spin(&smp_ipi_mtx); 760 } else 761 critical_enter(); 762 /* 763 * We need to disable interrupt preemption but MUST NOT have 764 * interrupts disabled here. 
765 * XXX we may need to hold schedlock to get a coherent pm_active 766 * XXX critical sections disable interrupts again 767 */ 768 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { 769 for (addr = sva; addr < eva; addr += PAGE_SIZE) 770 invlpg(addr); 771 smp_invlpg_range(sva, eva); 772 } else { 773 cpumask = PCPU_GET(cpumask); 774 other_cpus = PCPU_GET(other_cpus); 775 if (pmap->pm_active & cpumask) 776 for (addr = sva; addr < eva; addr += PAGE_SIZE) 777 invlpg(addr); 778 if (pmap->pm_active & other_cpus) 779 smp_masked_invlpg_range(pmap->pm_active & other_cpus, 780 sva, eva); 781 } 782 if (smp_started) 783 mtx_unlock_spin(&smp_ipi_mtx); 784 else 785 critical_exit(); 786} 787 788void 789pmap_invalidate_all(pmap_t pmap) 790{ 791 u_int cpumask; 792 u_int other_cpus; 793 794 if (smp_started) { 795 if (!(read_eflags() & PSL_I)) 796 panic("%s: interrupts disabled", __func__); 797 mtx_lock_spin(&smp_ipi_mtx); 798 } else 799 critical_enter(); 800 /* 801 * We need to disable interrupt preemption but MUST NOT have 802 * interrupts disabled here. 803 * XXX we may need to hold schedlock to get a coherent pm_active 804 * XXX critical sections disable interrupts again 805 */ 806 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { 807 invltlb(); 808 smp_invltlb(); 809 } else { 810 cpumask = PCPU_GET(cpumask); 811 other_cpus = PCPU_GET(other_cpus); 812 if (pmap->pm_active & cpumask) 813 invltlb(); 814 if (pmap->pm_active & other_cpus) 815 smp_masked_invltlb(pmap->pm_active & other_cpus); 816 } 817 if (smp_started) 818 mtx_unlock_spin(&smp_ipi_mtx); 819 else 820 critical_exit(); 821} 822 823void 824pmap_invalidate_cache(void) 825{ 826 827 if (smp_started) { 828 if (!(read_eflags() & PSL_I)) 829 panic("%s: interrupts disabled", __func__); 830 mtx_lock_spin(&smp_ipi_mtx); 831 } else 832 critical_enter(); 833 /* 834 * We need to disable interrupt preemption but MUST NOT have 835 * interrupts disabled here. 836 * XXX we may need to hold schedlock to get a coherent pm_active 837 * XXX critical sections disable interrupts again 838 */ 839 wbinvd(); 840 smp_cache_flush(); 841 if (smp_started) 842 mtx_unlock_spin(&smp_ipi_mtx); 843 else 844 critical_exit(); 845} 846#else /* !SMP */ 847/* 848 * Normal, non-SMP, 486+ invalidation functions. 849 * We inline these within pmap.c for speed. 850 */ 851PMAP_INLINE void 852pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 853{ 854 855 if (pmap == kernel_pmap || pmap->pm_active) 856 invlpg(va); 857} 858 859PMAP_INLINE void 860pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 861{ 862 vm_offset_t addr; 863 864 if (pmap == kernel_pmap || pmap->pm_active) 865 for (addr = sva; addr < eva; addr += PAGE_SIZE) 866 invlpg(addr); 867} 868 869PMAP_INLINE void 870pmap_invalidate_all(pmap_t pmap) 871{ 872 873 if (pmap == kernel_pmap || pmap->pm_active) 874 invltlb(); 875} 876 877PMAP_INLINE void 878pmap_invalidate_cache(void) 879{ 880 881 wbinvd(); 882} 883#endif /* !SMP */ 884 885/* 886 * Are we current address space or kernel? N.B. We return FALSE when 887 * a pmap's page table is in use because a kernel thread is borrowing 888 * it. The borrowed page table can change spontaneously, making any 889 * dependence on its continued use subject to a race condition. 
890 */ 891static __inline int 892pmap_is_current(pmap_t pmap) 893{ 894 895 return (pmap == kernel_pmap || 896 (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) && 897 (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME))); 898} 899 900/* 901 * If the given pmap is not the current or kernel pmap, the returned pte must 902 * be released by passing it to pmap_pte_release(). 903 */ 904pt_entry_t * 905pmap_pte(pmap_t pmap, vm_offset_t va) 906{ 907 pd_entry_t newpf; 908 pd_entry_t *pde; 909 910 pde = pmap_pde(pmap, va); 911 if (*pde & PG_PS) 912 return (pde); 913 if (*pde != 0) { 914 /* are we current address space or kernel? */ 915 if (pmap_is_current(pmap)) 916 return (vtopte(va)); 917 mtx_lock(&PMAP2mutex); 918 newpf = *pde & PG_FRAME; 919 if ((*PMAP2 & PG_FRAME) != newpf) { 920 *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; 921 pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); 922 } 923 return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); 924 } 925 return (0); 926} 927 928/* 929 * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte 930 * being NULL. 931 */ 932static __inline void 933pmap_pte_release(pt_entry_t *pte) 934{ 935 936 if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) 937 mtx_unlock(&PMAP2mutex); 938} 939 940static __inline void 941invlcaddr(void *caddr) 942{ 943 944 invlpg((u_int)caddr); 945} 946 947/* 948 * Super fast pmap_pte routine best used when scanning 949 * the pv lists. This eliminates many coarse-grained 950 * invltlb calls. Note that many of the pv list 951 * scans are across different pmaps. It is very wasteful 952 * to do an entire invltlb for checking a single mapping. 953 * 954 * If the given pmap is not the current pmap, vm_page_queue_mtx 955 * must be held and curthread pinned to a CPU. 956 */ 957static pt_entry_t * 958pmap_pte_quick(pmap_t pmap, vm_offset_t va) 959{ 960 pd_entry_t newpf; 961 pd_entry_t *pde; 962 963 pde = pmap_pde(pmap, va); 964 if (*pde & PG_PS) 965 return (pde); 966 if (*pde != 0) { 967 /* are we current address space or kernel? */ 968 if (pmap_is_current(pmap)) 969 return (vtopte(va)); 970 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 971 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 972 newpf = *pde & PG_FRAME; 973 if ((*PMAP1 & PG_FRAME) != newpf) { 974 *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; 975#ifdef SMP 976 PMAP1cpu = PCPU_GET(cpuid); 977#endif 978 invlcaddr(PADDR1); 979 PMAP1changed++; 980 } else 981#ifdef SMP 982 if (PMAP1cpu != PCPU_GET(cpuid)) { 983 PMAP1cpu = PCPU_GET(cpuid); 984 invlcaddr(PADDR1); 985 PMAP1changedcpu++; 986 } else 987#endif 988 PMAP1unchanged++; 989 return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); 990 } 991 return (0); 992} 993 994/* 995 * Routine: pmap_extract 996 * Function: 997 * Extract the physical page address associated 998 * with the given map/virtual_address pair. 
999 */ 1000vm_paddr_t 1001pmap_extract(pmap_t pmap, vm_offset_t va) 1002{ 1003 vm_paddr_t rtval; 1004 pt_entry_t *pte; 1005 pd_entry_t pde; 1006 1007 rtval = 0; 1008 PMAP_LOCK(pmap); 1009 pde = pmap->pm_pdir[va >> PDRSHIFT]; 1010 if (pde != 0) { 1011 if ((pde & PG_PS) != 0) { 1012 rtval = (pde & ~PDRMASK) | (va & PDRMASK); 1013 PMAP_UNLOCK(pmap); 1014 return rtval; 1015 } 1016 pte = pmap_pte(pmap, va); 1017 rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); 1018 pmap_pte_release(pte); 1019 } 1020 PMAP_UNLOCK(pmap); 1021 return (rtval); 1022} 1023 1024/* 1025 * Routine: pmap_extract_and_hold 1026 * Function: 1027 * Atomically extract and hold the physical page 1028 * with the given pmap and virtual address pair 1029 * if that mapping permits the given protection. 1030 */ 1031vm_page_t 1032pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 1033{ 1034 pd_entry_t pde; 1035 pt_entry_t pte; 1036 vm_page_t m; 1037 1038 m = NULL; 1039 vm_page_lock_queues(); 1040 PMAP_LOCK(pmap); 1041 pde = *pmap_pde(pmap, va); 1042 if (pde != 0) { 1043 if (pde & PG_PS) { 1044 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { 1045 m = PHYS_TO_VM_PAGE((pde & ~PDRMASK) | 1046 (va & PDRMASK)); 1047 vm_page_hold(m); 1048 } 1049 } else { 1050 sched_pin(); 1051 pte = *pmap_pte_quick(pmap, va); 1052 if (pte != 0 && 1053 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { 1054 m = PHYS_TO_VM_PAGE(pte & PG_FRAME); 1055 vm_page_hold(m); 1056 } 1057 sched_unpin(); 1058 } 1059 } 1060 vm_page_unlock_queues(); 1061 PMAP_UNLOCK(pmap); 1062 return (m); 1063} 1064 1065/*************************************************** 1066 * Low level mapping routines..... 1067 ***************************************************/ 1068 1069/* 1070 * Add a wired page to the kva. 1071 * Note: not SMP coherent. 1072 */ 1073PMAP_INLINE void 1074pmap_kenter(vm_offset_t va, vm_paddr_t pa) 1075{ 1076 pt_entry_t *pte; 1077 1078 pte = vtopte(va); 1079 pte_store(pte, pa | PG_RW | PG_V | pgeflag); 1080} 1081 1082PMAP_INLINE void 1083pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) 1084{ 1085 pt_entry_t *pte; 1086 1087 pte = vtopte(va); 1088 pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); 1089} 1090 1091/* 1092 * Remove a page from the kernel pagetables. 1093 * Note: not SMP coherent. 1094 */ 1095PMAP_INLINE void 1096pmap_kremove(vm_offset_t va) 1097{ 1098 pt_entry_t *pte; 1099 1100 pte = vtopte(va); 1101 pte_clear(pte); 1102} 1103 1104/* 1105 * Used to map a range of physical addresses into kernel 1106 * virtual address space. 1107 * 1108 * The value passed in '*virt' is a suggested virtual address for 1109 * the mapping. Architectures which can support a direct-mapped 1110 * physical to virtual region can return the appropriate address 1111 * within that region, leaving '*virt' unchanged. Other 1112 * architectures should map the pages starting at '*virt' and 1113 * update '*virt' with the first usable address after the mapped 1114 * region. 1115 */ 1116vm_offset_t 1117pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 1118{ 1119 vm_offset_t va, sva; 1120 1121 va = sva = *virt; 1122 while (start < end) { 1123 pmap_kenter(va, start); 1124 va += PAGE_SIZE; 1125 start += PAGE_SIZE; 1126 } 1127 pmap_invalidate_range(kernel_pmap, sva, va); 1128 *virt = va; 1129 return (sva); 1130} 1131 1132 1133/* 1134 * Add a list of wired pages to the kva 1135 * this routine is only used for temporary 1136 * kernel mappings that do not need to have 1137 * page modification or references recorded. 
1138 * Note that old mappings are simply written 1139 * over. The page *must* be wired. 1140 * Note: SMP coherent. Uses a ranged shootdown IPI. 1141 */ 1142void 1143pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) 1144{ 1145 pt_entry_t *endpte, oldpte, *pte; 1146 1147 oldpte = 0; 1148 pte = vtopte(sva); 1149 endpte = pte + count; 1150 while (pte < endpte) { 1151 oldpte |= *pte; 1152 pte_store(pte, VM_PAGE_TO_PHYS(*ma) | pgeflag | PG_RW | PG_V); 1153 pte++; 1154 ma++; 1155 } 1156 if ((oldpte & PG_V) != 0) 1157 pmap_invalidate_range(kernel_pmap, sva, sva + count * 1158 PAGE_SIZE); 1159} 1160 1161/* 1162 * This routine tears out page mappings from the 1163 * kernel -- it is meant only for temporary mappings. 1164 * Note: SMP coherent. Uses a ranged shootdown IPI. 1165 */ 1166void 1167pmap_qremove(vm_offset_t sva, int count) 1168{ 1169 vm_offset_t va; 1170 1171 va = sva; 1172 while (count-- > 0) { 1173 pmap_kremove(va); 1174 va += PAGE_SIZE; 1175 } 1176 pmap_invalidate_range(kernel_pmap, sva, va); 1177} 1178 1179/*************************************************** 1180 * Page table page management routines..... 1181 ***************************************************/ 1182 1183/* 1184 * This routine unholds page table pages, and if the hold count 1185 * drops to zero, then it decrements the wire count. 1186 */ 1187static PMAP_INLINE int 1188pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) 1189{ 1190 1191 --m->wire_count; 1192 if (m->wire_count == 0) 1193 return _pmap_unwire_pte_hold(pmap, m); 1194 else 1195 return 0; 1196} 1197 1198static int 1199_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) 1200{ 1201 vm_offset_t pteva; 1202 1203 /* 1204 * unmap the page table page 1205 */ 1206 pmap->pm_pdir[m->pindex] = 0; 1207 --pmap->pm_stats.resident_count; 1208 1209 /* 1210 * Do an invltlb to make the invalidated mapping 1211 * take effect immediately. 1212 */ 1213 pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); 1214 pmap_invalidate_page(pmap, pteva); 1215 1216 vm_page_free_zero(m); 1217 atomic_subtract_int(&cnt.v_wire_count, 1); 1218 return 1; 1219} 1220 1221/* 1222 * After removing a page table entry, this routine is used to 1223 * conditionally free the page, and manage the hold/wire counts. 1224 */ 1225static int 1226pmap_unuse_pt(pmap_t pmap, vm_offset_t va) 1227{ 1228 pd_entry_t ptepde; 1229 vm_page_t mpte; 1230 1231 if (va >= VM_MAXUSER_ADDRESS) 1232 return 0; 1233 ptepde = *pmap_pde(pmap, va); 1234 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); 1235 return pmap_unwire_pte_hold(pmap, mpte); 1236} 1237 1238void 1239pmap_pinit0(pmap_t pmap) 1240{ 1241 1242 PMAP_LOCK_INIT(pmap); 1243 pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); 1244#ifdef PAE 1245 pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); 1246#endif 1247 pmap->pm_active = 0; 1248 PCPU_SET(curpmap, pmap); 1249 TAILQ_INIT(&pmap->pm_pvchunk); 1250 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1251 mtx_lock_spin(&allpmaps_lock); 1252 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 1253 mtx_unlock_spin(&allpmaps_lock); 1254} 1255 1256/* 1257 * Initialize a preallocated and zeroed pmap structure, 1258 * such as one in a vmspace structure. 1259 */ 1260void 1261pmap_pinit(pmap_t pmap) 1262{ 1263 vm_page_t m, ptdpg[NPGPTD]; 1264 vm_paddr_t pa; 1265 static int color; 1266 int i; 1267 1268 PMAP_LOCK_INIT(pmap); 1269 1270 /* 1271 * No need to allocate page table space yet but we do need a valid 1272 * page directory table. 
1273 */ 1274 if (pmap->pm_pdir == NULL) { 1275 pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map, 1276 NBPTD); 1277#ifdef PAE 1278 pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); 1279 KASSERT(((vm_offset_t)pmap->pm_pdpt & 1280 ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, 1281 ("pmap_pinit: pdpt misaligned")); 1282 KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), 1283 ("pmap_pinit: pdpt above 4g")); 1284#endif 1285 } 1286 1287 /* 1288 * allocate the page directory page(s) 1289 */ 1290 for (i = 0; i < NPGPTD;) { 1291 m = vm_page_alloc(NULL, color++, 1292 VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 1293 VM_ALLOC_ZERO); 1294 if (m == NULL) 1295 VM_WAIT; 1296 else { 1297 ptdpg[i++] = m; 1298 } 1299 } 1300 1301 pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); 1302 1303 for (i = 0; i < NPGPTD; i++) { 1304 if ((ptdpg[i]->flags & PG_ZERO) == 0) 1305 bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE); 1306 } 1307 1308 mtx_lock_spin(&allpmaps_lock); 1309 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 1310 mtx_unlock_spin(&allpmaps_lock); 1311 /* Wire in kernel global address entries. */ 1312 /* XXX copies current process, does not fill in MPPTDI */ 1313 bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); 1314#ifdef SMP 1315 pmap->pm_pdir[MPPTDI] = PTD[MPPTDI]; 1316#endif 1317 1318 /* install self-referential address mapping entry(s) */ 1319 for (i = 0; i < NPGPTD; i++) { 1320 pa = VM_PAGE_TO_PHYS(ptdpg[i]); 1321 pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; 1322#ifdef PAE 1323 pmap->pm_pdpt[i] = pa | PG_V; 1324#endif 1325 } 1326 1327 pmap->pm_active = 0; 1328 TAILQ_INIT(&pmap->pm_pvchunk); 1329 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1330} 1331 1332/* 1333 * this routine is called if the page table page is not 1334 * mapped correctly. 1335 */ 1336static vm_page_t 1337_pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags) 1338{ 1339 vm_paddr_t ptepa; 1340 vm_page_t m; 1341 1342 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || 1343 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, 1344 ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); 1345 1346 /* 1347 * Allocate a page table page. 1348 */ 1349 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 1350 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 1351 if (flags & M_WAITOK) { 1352 PMAP_UNLOCK(pmap); 1353 vm_page_unlock_queues(); 1354 VM_WAIT; 1355 vm_page_lock_queues(); 1356 PMAP_LOCK(pmap); 1357 } 1358 1359 /* 1360 * Indicate the need to retry. While waiting, the page table 1361 * page may have been allocated. 1362 */ 1363 return (NULL); 1364 } 1365 if ((m->flags & PG_ZERO) == 0) 1366 pmap_zero_page(m); 1367 1368 /* 1369 * Map the pagetable page into the process address space, if 1370 * it isn't already there. 
1371 */ 1372 1373 pmap->pm_stats.resident_count++; 1374 1375 ptepa = VM_PAGE_TO_PHYS(m); 1376 pmap->pm_pdir[ptepindex] = 1377 (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); 1378 1379 return m; 1380} 1381 1382static vm_page_t 1383pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags) 1384{ 1385 unsigned ptepindex; 1386 pd_entry_t ptepa; 1387 vm_page_t m; 1388 1389 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || 1390 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, 1391 ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); 1392 1393 /* 1394 * Calculate pagetable page index 1395 */ 1396 ptepindex = va >> PDRSHIFT; 1397retry: 1398 /* 1399 * Get the page directory entry 1400 */ 1401 ptepa = pmap->pm_pdir[ptepindex]; 1402 1403 /* 1404 * This supports switching from a 4MB page to a 1405 * normal 4K page. 1406 */ 1407 if (ptepa & PG_PS) { 1408 pmap->pm_pdir[ptepindex] = 0; 1409 ptepa = 0; 1410 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 1411 pmap_invalidate_all(kernel_pmap); 1412 } 1413 1414 /* 1415 * If the page table page is mapped, we just increment the 1416 * hold count, and activate it. 1417 */ 1418 if (ptepa) { 1419 m = PHYS_TO_VM_PAGE(ptepa); 1420 m->wire_count++; 1421 } else { 1422 /* 1423 * Here if the pte page isn't mapped, or if it has 1424 * been deallocated. 1425 */ 1426 m = _pmap_allocpte(pmap, ptepindex, flags); 1427 if (m == NULL && (flags & M_WAITOK)) 1428 goto retry; 1429 } 1430 return (m); 1431} 1432 1433 1434/*************************************************** 1435* Pmap allocation/deallocation routines. 1436 ***************************************************/ 1437 1438#ifdef SMP 1439/* 1440 * Deal with a SMP shootdown of other users of the pmap that we are 1441 * trying to dispose of. This can be a bit hairy. 1442 */ 1443static u_int *lazymask; 1444static u_int lazyptd; 1445static volatile u_int lazywait; 1446 1447void pmap_lazyfix_action(void); 1448 1449void 1450pmap_lazyfix_action(void) 1451{ 1452 u_int mymask = PCPU_GET(cpumask); 1453 1454#ifdef COUNT_IPIS 1455 *ipi_lazypmap_counts[PCPU_GET(cpuid)]++; 1456#endif 1457 if (rcr3() == lazyptd) 1458 load_cr3(PCPU_GET(curpcb)->pcb_cr3); 1459 atomic_clear_int(lazymask, mymask); 1460 atomic_store_rel_int(&lazywait, 1); 1461} 1462 1463static void 1464pmap_lazyfix_self(u_int mymask) 1465{ 1466 1467 if (rcr3() == lazyptd) 1468 load_cr3(PCPU_GET(curpcb)->pcb_cr3); 1469 atomic_clear_int(lazymask, mymask); 1470} 1471 1472 1473static void 1474pmap_lazyfix(pmap_t pmap) 1475{ 1476 u_int mymask; 1477 u_int mask; 1478 u_int spins; 1479 1480 while ((mask = pmap->pm_active) != 0) { 1481 spins = 50000000; 1482 mask = mask & -mask; /* Find least significant set bit */ 1483 mtx_lock_spin(&smp_ipi_mtx); 1484#ifdef PAE 1485 lazyptd = vtophys(pmap->pm_pdpt); 1486#else 1487 lazyptd = vtophys(pmap->pm_pdir); 1488#endif 1489 mymask = PCPU_GET(cpumask); 1490 if (mask == mymask) { 1491 lazymask = &pmap->pm_active; 1492 pmap_lazyfix_self(mymask); 1493 } else { 1494 atomic_store_rel_int((u_int *)&lazymask, 1495 (u_int)&pmap->pm_active); 1496 atomic_store_rel_int(&lazywait, 0); 1497 ipi_selected(mask, IPI_LAZYPMAP); 1498 while (lazywait == 0) { 1499 ia32_pause(); 1500 if (--spins == 0) 1501 break; 1502 } 1503 } 1504 mtx_unlock_spin(&smp_ipi_mtx); 1505 if (spins == 0) 1506 printf("pmap_lazyfix: spun for 50000000\n"); 1507 } 1508} 1509 1510#else /* SMP */ 1511 1512/* 1513 * Cleaning up on uniprocessor is easy. 
For various reasons, we're 1514 * unlikely to have to even execute this code, including the fact 1515 * that the cleanup is deferred until the parent does a wait(2), which 1516 * means that another userland process has run. 1517 */ 1518static void 1519pmap_lazyfix(pmap_t pmap) 1520{ 1521 u_int cr3; 1522 1523 cr3 = vtophys(pmap->pm_pdir); 1524 if (cr3 == rcr3()) { 1525 load_cr3(PCPU_GET(curpcb)->pcb_cr3); 1526 pmap->pm_active &= ~(PCPU_GET(cpumask)); 1527 } 1528} 1529#endif /* SMP */ 1530 1531/* 1532 * Release any resources held by the given physical map. 1533 * Called when a pmap initialized by pmap_pinit is being released. 1534 * Should only be called if the map contains no valid mappings. 1535 */ 1536void 1537pmap_release(pmap_t pmap) 1538{ 1539 vm_page_t m, ptdpg[NPGPTD]; 1540 int i; 1541 1542 KASSERT(pmap->pm_stats.resident_count == 0, 1543 ("pmap_release: pmap resident count %ld != 0", 1544 pmap->pm_stats.resident_count)); 1545 1546 pmap_lazyfix(pmap); 1547 mtx_lock_spin(&allpmaps_lock); 1548 LIST_REMOVE(pmap, pm_list); 1549 mtx_unlock_spin(&allpmaps_lock); 1550 1551 for (i = 0; i < NPGPTD; i++) 1552 ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i]); 1553 1554 bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * 1555 sizeof(*pmap->pm_pdir)); 1556#ifdef SMP 1557 pmap->pm_pdir[MPPTDI] = 0; 1558#endif 1559 1560 pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); 1561 1562 vm_page_lock_queues(); 1563 for (i = 0; i < NPGPTD; i++) { 1564 m = ptdpg[i]; 1565#ifdef PAE 1566 KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), 1567 ("pmap_release: got wrong ptd page")); 1568#endif 1569 m->wire_count--; 1570 atomic_subtract_int(&cnt.v_wire_count, 1); 1571 vm_page_free_zero(m); 1572 } 1573 vm_page_unlock_queues(); 1574 PMAP_LOCK_DESTROY(pmap); 1575} 1576 1577static int 1578kvm_size(SYSCTL_HANDLER_ARGS) 1579{ 1580 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; 1581 1582 return sysctl_handle_long(oidp, &ksize, 0, req); 1583} 1584SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 1585 0, 0, kvm_size, "IU", "Size of KVM"); 1586 1587static int 1588kvm_free(SYSCTL_HANDLER_ARGS) 1589{ 1590 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 1591 1592 return sysctl_handle_long(oidp, &kfree, 0, req); 1593} 1594SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 1595 0, 0, kvm_free, "IU", "Amount of KVM free"); 1596 1597/* 1598 * grow the number of kernel page table entries, if needed 1599 */ 1600void 1601pmap_growkernel(vm_offset_t addr) 1602{ 1603 struct pmap *pmap; 1604 vm_paddr_t ptppaddr; 1605 vm_page_t nkpg; 1606 pd_entry_t newpdir; 1607 pt_entry_t *pde; 1608 1609 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 1610 if (kernel_vm_end == 0) { 1611 kernel_vm_end = KERNBASE; 1612 nkpt = 0; 1613 while (pdir_pde(PTD, kernel_vm_end)) { 1614 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1615 nkpt++; 1616 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 1617 kernel_vm_end = kernel_map->max_offset; 1618 break; 1619 } 1620 } 1621 } 1622 addr = roundup2(addr, PAGE_SIZE * NPTEPG); 1623 if (addr - 1 >= kernel_map->max_offset) 1624 addr = kernel_map->max_offset; 1625 while (kernel_vm_end < addr) { 1626 if (pdir_pde(PTD, kernel_vm_end)) { 1627 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1628 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 1629 kernel_vm_end = kernel_map->max_offset; 1630 break; 1631 } 1632 continue; 1633 } 1634 1635 /* 1636 * This index is bogus, but out of the way 1637 */ 
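	/*
	 * Each page table page allocated below maps NPTEPG * PAGE_SIZE
	 * bytes of kernel VA (4MB without PAE, 2MB with PAE), which is
	 * why kernel_vm_end is advanced and rounded in those units
	 * throughout this function.  The nkpt index passed to
	 * vm_page_alloc() is used only as an allocation hint, since no
	 * VM object is involved (VM_ALLOC_NOOBJ); that is what the
	 * comment above means by "bogus".
	 */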
1638 nkpg = vm_page_alloc(NULL, nkpt, 1639 VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); 1640 if (!nkpg) 1641 panic("pmap_growkernel: no memory to grow kernel"); 1642 1643 nkpt++; 1644 1645 pmap_zero_page(nkpg); 1646 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 1647 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); 1648 pdir_pde(PTD, kernel_vm_end) = newpdir; 1649 1650 mtx_lock_spin(&allpmaps_lock); 1651 LIST_FOREACH(pmap, &allpmaps, pm_list) { 1652 pde = pmap_pde(pmap, kernel_vm_end); 1653 pde_store(pde, newpdir); 1654 } 1655 mtx_unlock_spin(&allpmaps_lock); 1656 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1657 if (kernel_vm_end - 1 >= kernel_map->max_offset) { 1658 kernel_vm_end = kernel_map->max_offset; 1659 break; 1660 } 1661 } 1662} 1663 1664 1665/*************************************************** 1666 * page management routines. 1667 ***************************************************/ 1668 1669CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); 1670CTASSERT(_NPCM == 11); 1671 1672static __inline struct pv_chunk * 1673pv_to_chunk(pv_entry_t pv) 1674{ 1675 1676 return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK); 1677} 1678 1679#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) 1680 1681#define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ 1682#define PC_FREE10 0x0000fffful /* Free values for index 10 */ 1683 1684static uint32_t pc_freemask[11] = { 1685 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 1686 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 1687 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, 1688 PC_FREE0_9, PC_FREE10 1689}; 1690 1691SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, 1692 "Current number of pv entries"); 1693 1694#ifdef PV_STATS 1695static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; 1696 1697SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, 1698 "Current number of pv entry chunks"); 1699SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, 1700 "Current number of pv entry chunks allocated"); 1701SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, 1702 "Current number of pv entry chunks frees"); 1703SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, 1704 "Number of times tried to get a chunk page but failed."); 1705 1706static long pv_entry_frees, pv_entry_allocs; 1707static int pv_entry_spare; 1708 1709SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, 1710 "Current number of pv entry frees"); 1711SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, 1712 "Current number of pv entry allocs"); 1713SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, 1714 "Current number of spare pv entries"); 1715 1716static int pmap_collect_inactive, pmap_collect_active; 1717 1718SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0, 1719 "Current number times pmap_collect called on inactive queue"); 1720SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0, 1721 "Current number times pmap_collect called on active queue"); 1722#endif 1723 1724/* 1725 * We are in a serious low memory condition. Resort to 1726 * drastic measures to free some pages so we can allocate 1727 * another pv entry chunk. This is normally called to 1728 * unmap inactive pages, and if necessary, active pages. 
1729 */ 1730static void 1731pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq) 1732{ 1733 pmap_t pmap; 1734 pt_entry_t *pte, tpte; 1735 pv_entry_t next_pv, pv; 1736 vm_offset_t va; 1737 vm_page_t m; 1738 1739 sched_pin(); 1740 TAILQ_FOREACH(m, &vpq->pl, pageq) { 1741 if (m->hold_count || m->busy) 1742 continue; 1743 TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) { 1744 va = pv->pv_va; 1745 pmap = PV_PMAP(pv); 1746 /* Avoid deadlock and lock recursion. */ 1747 if (pmap > locked_pmap) 1748 PMAP_LOCK(pmap); 1749 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) 1750 continue; 1751 pmap->pm_stats.resident_count--; 1752 pte = pmap_pte_quick(pmap, va); 1753 tpte = pte_load_clear(pte); 1754 KASSERT((tpte & PG_W) == 0, 1755 ("pmap_collect: wired pte %#jx", (uintmax_t)tpte)); 1756 if (tpte & PG_A) 1757 vm_page_flag_set(m, PG_REFERENCED); 1758 if (tpte & PG_M) { 1759 KASSERT((tpte & PG_RW), 1760 ("pmap_collect: modified page not writable: va: %#x, pte: %#jx", 1761 va, (uintmax_t)tpte)); 1762 vm_page_dirty(m); 1763 } 1764 pmap_invalidate_page(pmap, va); 1765 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 1766 if (TAILQ_EMPTY(&m->md.pv_list)) 1767 vm_page_flag_clear(m, PG_WRITEABLE); 1768 m->md.pv_list_count--; 1769 pmap_unuse_pt(pmap, va); 1770 free_pv_entry(pmap, pv); 1771 if (pmap != locked_pmap) 1772 PMAP_UNLOCK(pmap); 1773 } 1774 } 1775 sched_unpin(); 1776} 1777 1778 1779/* 1780 * free the pv_entry back to the free list 1781 */ 1782static void 1783free_pv_entry(pmap_t pmap, pv_entry_t pv) 1784{ 1785 vm_page_t m; 1786 struct pv_chunk *pc; 1787 int idx, field, bit; 1788 1789 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1790 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1791 PV_STAT(pv_entry_frees++); 1792 PV_STAT(pv_entry_spare++); 1793 pv_entry_count--; 1794 pc = pv_to_chunk(pv); 1795 idx = pv - &pc->pc_pventry[0]; 1796 field = idx / 32; 1797 bit = idx % 32; 1798 pc->pc_map[field] |= 1ul << bit; 1799 /* move to head of list */ 1800 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1801 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1802 for (idx = 0; idx < _NPCM; idx++) 1803 if (pc->pc_map[idx] != pc_freemask[idx]) 1804 return; 1805 PV_STAT(pv_entry_spare -= _NPCPV); 1806 PV_STAT(pc_chunk_count--); 1807 PV_STAT(pc_chunk_frees++); 1808 /* entire chunk is free, return it */ 1809 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1810 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 1811 pmap_qremove((vm_offset_t)pc, 1); 1812 vm_page_unwire(m, 0); 1813 vm_page_free(m); 1814 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 1815} 1816 1817/* 1818 * get a new pv_entry, allocating a block from the system 1819 * when needed. 
1820 */ 1821static pv_entry_t 1822get_pv_entry(pmap_t pmap, int try) 1823{ 1824 static const struct timeval printinterval = { 60, 0 }; 1825 static struct timeval lastprint; 1826 static vm_pindex_t colour; 1827 int bit, field, page_req; 1828 pv_entry_t pv; 1829 struct pv_chunk *pc; 1830 vm_page_t m; 1831 1832 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1833 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1834 PV_STAT(pv_entry_allocs++); 1835 pv_entry_count++; 1836 if (pv_entry_count > pv_entry_high_water) 1837 pagedaemon_wakeup(); 1838 pc = TAILQ_FIRST(&pmap->pm_pvchunk); 1839 if (pc != NULL) { 1840 for (field = 0; field < _NPCM; field++) { 1841 if (pc->pc_map[field]) { 1842 bit = bsfl(pc->pc_map[field]); 1843 break; 1844 } 1845 } 1846 if (field < _NPCM) { 1847 pv = &pc->pc_pventry[field * 32 + bit]; 1848 pc->pc_map[field] &= ~(1ul << bit); 1849 /* If this was the last item, move it to tail */ 1850 for (field = 0; field < _NPCM; field++) 1851 if (pc->pc_map[field] != 0) { 1852 PV_STAT(pv_entry_spare--); 1853 return (pv); /* not full, return */ 1854 } 1855 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 1856 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); 1857 PV_STAT(pv_entry_spare--); 1858 return (pv); 1859 } 1860 } 1861 pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); 1862 page_req = try ? VM_ALLOC_NORMAL : VM_ALLOC_SYSTEM; 1863 m = vm_page_alloc(NULL, colour, page_req | 1864 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); 1865 if (m == NULL || pc == NULL) { 1866 if (try) { 1867 pv_entry_count--; 1868 PV_STAT(pc_chunk_tryfail++); 1869 if (m) { 1870 vm_page_lock_queues(); 1871 vm_page_unwire(m, 0); 1872 vm_page_free(m); 1873 vm_page_unlock_queues(); 1874 } 1875 if (pc) 1876 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 1877 return (NULL); 1878 } 1879 /* 1880 * Reclaim pv entries: At first, destroy mappings to 1881 * inactive pages. After that, if a pv chunk entry 1882 * is still needed, destroy mappings to active pages. 
1883 */ 1884 if (ratecheck(&lastprint, &printinterval)) 1885 printf("Approaching the limit on PV entries, " 1886 "consider increasing tunables " 1887 "vm.pmap.shpgperproc or " 1888 "vm.pmap.pv_entry_max\n"); 1889 PV_STAT(pmap_collect_inactive++); 1890 pmap_collect(pmap, &vm_page_queues[PQ_INACTIVE]); 1891 if (m == NULL) 1892 m = vm_page_alloc(NULL, colour, VM_ALLOC_SYSTEM | 1893 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); 1894 if (pc == NULL) 1895 pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); 1896 if (m == NULL || pc == NULL) { 1897 PV_STAT(pmap_collect_active++); 1898 pmap_collect(pmap, &vm_page_queues[PQ_ACTIVE]); 1899 if (m == NULL) 1900 m = vm_page_alloc(NULL, colour, 1901 VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | 1902 VM_ALLOC_WIRED); 1903 if (pc == NULL) 1904 pc = (struct pv_chunk *) 1905 pmap_ptelist_alloc(&pv_vafree); 1906 if (m == NULL || pc == NULL) 1907 panic("get_pv_entry: increase vm.pmap.shpgperproc"); 1908 } 1909 } 1910 PV_STAT(pc_chunk_count++); 1911 PV_STAT(pc_chunk_allocs++); 1912 colour++; 1913 pmap_qenter((vm_offset_t)pc, &m, 1); 1914 pc->pc_pmap = pmap; 1915 pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ 1916 for (field = 1; field < _NPCM; field++) 1917 pc->pc_map[field] = pc_freemask[field]; 1918 pv = &pc->pc_pventry[0]; 1919 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); 1920 PV_STAT(pv_entry_spare += _NPCPV - 1); 1921 return (pv); 1922} 1923 1924static void 1925pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 1926{ 1927 pv_entry_t pv; 1928 1929 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1930 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1931 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 1932 if (pmap == PV_PMAP(pv) && va == pv->pv_va) 1933 break; 1934 } 1935 KASSERT(pv != NULL, ("pmap_remove_entry: pv not found")); 1936 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 1937 m->md.pv_list_count--; 1938 if (TAILQ_EMPTY(&m->md.pv_list)) 1939 vm_page_flag_clear(m, PG_WRITEABLE); 1940 free_pv_entry(pmap, pv); 1941} 1942 1943/* 1944 * Create a pv entry for page at pa for 1945 * (pmap, va). 1946 */ 1947static void 1948pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 1949{ 1950 pv_entry_t pv; 1951 1952 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1953 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1954 pv = get_pv_entry(pmap, FALSE); 1955 pv->pv_va = va; 1956 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 1957 m->md.pv_list_count++; 1958} 1959 1960/* 1961 * Conditionally create a pv entry. 1962 */ 1963static boolean_t 1964pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) 1965{ 1966 pv_entry_t pv; 1967 1968 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1969 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1970 if (pv_entry_count < pv_entry_high_water && 1971 (pv = get_pv_entry(pmap, TRUE)) != NULL) { 1972 pv->pv_va = va; 1973 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 1974 m->md.pv_list_count++; 1975 return (TRUE); 1976 } else 1977 return (FALSE); 1978} 1979 1980/* 1981 * pmap_remove_pte: do the things to unmap a page in a process 1982 */ 1983static int 1984pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va) 1985{ 1986 pt_entry_t oldpte; 1987 vm_page_t m; 1988 1989 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1990 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 1991 oldpte = pte_load_clear(ptq); 1992 if (oldpte & PG_W) 1993 pmap->pm_stats.wired_count -= 1; 1994 /* 1995 * Machines that don't support invlpg, also don't support 1996 * PG_G. 
1997 */ 1998 if (oldpte & PG_G) 1999 pmap_invalidate_page(kernel_pmap, va); 2000 pmap->pm_stats.resident_count -= 1; 2001 if (oldpte & PG_MANAGED) { 2002 m = PHYS_TO_VM_PAGE(oldpte); 2003 if (oldpte & PG_M) { 2004 KASSERT((oldpte & PG_RW), 2005 ("pmap_remove_pte: modified page not writable: va: %#x, pte: %#jx", 2006 va, (uintmax_t)oldpte)); 2007 vm_page_dirty(m); 2008 } 2009 if (oldpte & PG_A) 2010 vm_page_flag_set(m, PG_REFERENCED); 2011 pmap_remove_entry(pmap, m, va); 2012 } 2013 return (pmap_unuse_pt(pmap, va)); 2014} 2015 2016/* 2017 * Remove a single page from a process address space 2018 */ 2019static void 2020pmap_remove_page(pmap_t pmap, vm_offset_t va) 2021{ 2022 pt_entry_t *pte; 2023 2024 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2025 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 2026 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2027 if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) 2028 return; 2029 pmap_remove_pte(pmap, pte, va); 2030 pmap_invalidate_page(pmap, va); 2031} 2032 2033/* 2034 * Remove the given range of addresses from the specified map. 2035 * 2036 * It is assumed that the start and end are properly 2037 * rounded to the page size. 2038 */ 2039void 2040pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 2041{ 2042 vm_offset_t pdnxt; 2043 pd_entry_t ptpaddr; 2044 pt_entry_t *pte; 2045 int anyvalid; 2046 2047 /* 2048 * Perform an unsynchronized read. This is, however, safe. 2049 */ 2050 if (pmap->pm_stats.resident_count == 0) 2051 return; 2052 2053 anyvalid = 0; 2054 2055 vm_page_lock_queues(); 2056 sched_pin(); 2057 PMAP_LOCK(pmap); 2058 2059 /* 2060 * special handling of removing one page. a very 2061 * common operation and easy to short circuit some 2062 * code. 2063 */ 2064 if ((sva + PAGE_SIZE == eva) && 2065 ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { 2066 pmap_remove_page(pmap, sva); 2067 goto out; 2068 } 2069 2070 for (; sva < eva; sva = pdnxt) { 2071 unsigned pdirindex; 2072 2073 /* 2074 * Calculate index for next page table. 2075 */ 2076 pdnxt = (sva + NBPDR) & ~PDRMASK; 2077 if (pmap->pm_stats.resident_count == 0) 2078 break; 2079 2080 pdirindex = sva >> PDRSHIFT; 2081 ptpaddr = pmap->pm_pdir[pdirindex]; 2082 2083 /* 2084 * Weed out invalid mappings. Note: we assume that the page 2085 * directory table is always allocated, and in kernel virtual. 2086 */ 2087 if (ptpaddr == 0) 2088 continue; 2089 2090 /* 2091 * Check for large page. 2092 */ 2093 if ((ptpaddr & PG_PS) != 0) { 2094 pmap->pm_pdir[pdirindex] = 0; 2095 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 2096 anyvalid = 1; 2097 continue; 2098 } 2099 2100 /* 2101 * Limit our scan to either the end of the va represented 2102 * by the current page table page, or to the end of the 2103 * range being removed. 2104 */ 2105 if (pdnxt > eva) 2106 pdnxt = eva; 2107 2108 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 2109 sva += PAGE_SIZE) { 2110 if (*pte == 0) 2111 continue; 2112 2113 /* 2114 * The TLB entry for a PG_G mapping is invalidated 2115 * by pmap_remove_pte(). 2116 */ 2117 if ((*pte & PG_G) == 0) 2118 anyvalid = 1; 2119 if (pmap_remove_pte(pmap, pte, sva)) 2120 break; 2121 } 2122 } 2123out: 2124 sched_unpin(); 2125 vm_page_unlock_queues(); 2126 if (anyvalid) 2127 pmap_invalidate_all(pmap); 2128 PMAP_UNLOCK(pmap); 2129} 2130 2131/* 2132 * Routine: pmap_remove_all 2133 * Function: 2134 * Removes this physical page from 2135 * all physical maps in which it resides. 2136 * Reflects back modify bits to the pager. 
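 *	The page queues lock must be held on entry; each pmap that
 *	still maps the page is locked and unlocked in turn while its
 *	mapping is destroyed.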
2137 * 2138 * Notes: 2139 * Original versions of this routine were very 2140 * inefficient because they iteratively called 2141 * pmap_remove (slow...) 2142 */ 2143 2144void 2145pmap_remove_all(vm_page_t m) 2146{ 2147 pv_entry_t pv; 2148 pmap_t pmap; 2149 pt_entry_t *pte, tpte; 2150 2151#if defined(PMAP_DIAGNOSTIC) 2152 /* 2153 * XXX This makes pmap_remove_all() illegal for non-managed pages! 2154 */ 2155 if (m->flags & PG_FICTITIOUS) { 2156 panic("pmap_remove_all: illegal for unmanaged page, va: 0x%x", 2157 VM_PAGE_TO_PHYS(m)); 2158 } 2159#endif 2160 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2161 sched_pin(); 2162 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2163 pmap = PV_PMAP(pv); 2164 PMAP_LOCK(pmap); 2165 pmap->pm_stats.resident_count--; 2166 pte = pmap_pte_quick(pmap, pv->pv_va); 2167 tpte = pte_load_clear(pte); 2168 if (tpte & PG_W) 2169 pmap->pm_stats.wired_count--; 2170 if (tpte & PG_A) 2171 vm_page_flag_set(m, PG_REFERENCED); 2172 2173 /* 2174 * Update the vm_page_t clean and reference bits. 2175 */ 2176 if (tpte & PG_M) { 2177 KASSERT((tpte & PG_RW), 2178 ("pmap_remove_all: modified page not writable: va: %#x, pte: %#jx", 2179 pv->pv_va, (uintmax_t)tpte)); 2180 vm_page_dirty(m); 2181 } 2182 pmap_invalidate_page(pmap, pv->pv_va); 2183 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2184 m->md.pv_list_count--; 2185 pmap_unuse_pt(pmap, pv->pv_va); 2186 free_pv_entry(pmap, pv); 2187 PMAP_UNLOCK(pmap); 2188 } 2189 vm_page_flag_clear(m, PG_WRITEABLE); 2190 sched_unpin(); 2191} 2192 2193/* 2194 * Set the physical protection on the 2195 * specified range of this map as requested. 2196 */ 2197void 2198pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 2199{ 2200 vm_offset_t pdnxt; 2201 pd_entry_t ptpaddr; 2202 pt_entry_t *pte; 2203 int anychanged; 2204 2205 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 2206 pmap_remove(pmap, sva, eva); 2207 return; 2208 } 2209 2210 if (prot & VM_PROT_WRITE) 2211 return; 2212 2213 anychanged = 0; 2214 2215 vm_page_lock_queues(); 2216 sched_pin(); 2217 PMAP_LOCK(pmap); 2218 for (; sva < eva; sva = pdnxt) { 2219 unsigned obits, pbits, pdirindex; 2220 2221 pdnxt = (sva + NBPDR) & ~PDRMASK; 2222 2223 pdirindex = sva >> PDRSHIFT; 2224 ptpaddr = pmap->pm_pdir[pdirindex]; 2225 2226 /* 2227 * Weed out invalid mappings. Note: we assume that the page 2228 * directory table is always allocated, and in kernel virtual. 2229 */ 2230 if (ptpaddr == 0) 2231 continue; 2232 2233 /* 2234 * Check for large page. 2235 */ 2236 if ((ptpaddr & PG_PS) != 0) { 2237 pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW); 2238 anychanged = 1; 2239 continue; 2240 } 2241 2242 if (pdnxt > eva) 2243 pdnxt = eva; 2244 2245 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, 2246 sva += PAGE_SIZE) { 2247 vm_page_t m; 2248 2249retry: 2250 /* 2251 * Regardless of whether a pte is 32 or 64 bits in 2252 * size, PG_RW, PG_A, and PG_M are among the least 2253 * significant 32 bits. 
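 * For that reason the update below is performed with a 32-bit
 * atomic_cmpset_int() on the low word of the pte; if the compare
 * fails, the loop retries from the reload above.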
2254 */ 2255 obits = pbits = *(u_int *)pte; 2256 if (pbits & PG_MANAGED) { 2257 m = NULL; 2258 if (pbits & PG_A) { 2259 m = PHYS_TO_VM_PAGE(*pte); 2260 vm_page_flag_set(m, PG_REFERENCED); 2261 pbits &= ~PG_A; 2262 } 2263 if ((pbits & PG_M) != 0) { 2264 if (m == NULL) 2265 m = PHYS_TO_VM_PAGE(*pte); 2266 vm_page_dirty(m); 2267 } 2268 } 2269 2270 pbits &= ~(PG_RW | PG_M); 2271 2272 if (pbits != obits) { 2273 if (!atomic_cmpset_int((u_int *)pte, obits, 2274 pbits)) 2275 goto retry; 2276 if (obits & PG_G) 2277 pmap_invalidate_page(pmap, sva); 2278 else 2279 anychanged = 1; 2280 } 2281 } 2282 } 2283 sched_unpin(); 2284 vm_page_unlock_queues(); 2285 if (anychanged) 2286 pmap_invalidate_all(pmap); 2287 PMAP_UNLOCK(pmap); 2288} 2289 2290/* 2291 * Insert the given physical page (p) at 2292 * the specified virtual address (v) in the 2293 * target physical map with the protection requested. 2294 * 2295 * If specified, the page will be wired down, meaning 2296 * that the related pte can not be reclaimed. 2297 * 2298 * NB: This is the only routine which MAY NOT lazy-evaluate 2299 * or lose information. That is, this routine must actually 2300 * insert this page into the given map NOW. 2301 */ 2302void 2303pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 2304 boolean_t wired) 2305{ 2306 vm_paddr_t pa; 2307 pd_entry_t *pde; 2308 pt_entry_t *pte; 2309 vm_paddr_t opa; 2310 pt_entry_t origpte, newpte; 2311 vm_page_t mpte, om; 2312 boolean_t invlva; 2313 2314 va &= PG_FRAME; 2315#ifdef PMAP_DIAGNOSTIC 2316 if (va > VM_MAX_KERNEL_ADDRESS) 2317 panic("pmap_enter: toobig"); 2318 if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) 2319 panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); 2320#endif 2321 2322 mpte = NULL; 2323 2324 vm_page_lock_queues(); 2325 PMAP_LOCK(pmap); 2326 sched_pin(); 2327 2328 /* 2329 * In the case that a page table page is not 2330 * resident, we are creating it here. 2331 */ 2332 if (va < VM_MAXUSER_ADDRESS) { 2333 mpte = pmap_allocpte(pmap, va, M_WAITOK); 2334 } 2335#if 0 && defined(PMAP_DIAGNOSTIC) 2336 else { 2337 pd_entry_t *pdeaddr = pmap_pde(pmap, va); 2338 origpte = *pdeaddr; 2339 if ((origpte & PG_V) == 0) { 2340 panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n", 2341 pmap->pm_pdir[PTDPTDI], origpte, va); 2342 } 2343 } 2344#endif 2345 2346 pde = pmap_pde(pmap, va); 2347 if ((*pde & PG_PS) != 0) 2348 panic("pmap_enter: attempted pmap_enter on 4MB page"); 2349 pte = pmap_pte_quick(pmap, va); 2350 2351 /* 2352 * Page Directory table entry not valid, we need a new PT page 2353 */ 2354 if (pte == NULL) { 2355 panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n", 2356 (uintmax_t)pmap->pm_pdir[PTDPTDI], va); 2357 } 2358 2359 pa = VM_PAGE_TO_PHYS(m); 2360 om = NULL; 2361 origpte = *pte; 2362 opa = origpte & PG_FRAME; 2363 2364 /* 2365 * Mapping has not changed, must be protection or wiring change. 2366 */ 2367 if (origpte && (opa == pa)) { 2368 /* 2369 * Wiring change, just update stats. We don't worry about 2370 * wiring PT pages as they remain resident as long as there 2371 * are valid mappings in them. Hence, if a user page is wired, 2372 * the PT page will be also. 
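 * (The extra page table page reference taken by pmap_allocpte() above
 * is dropped again just below, since the existing mapping already
 * accounts for one.)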
2373 */ 2374 if (wired && ((origpte & PG_W) == 0)) 2375 pmap->pm_stats.wired_count++; 2376 else if (!wired && (origpte & PG_W)) 2377 pmap->pm_stats.wired_count--; 2378 2379 /* 2380 * Remove extra pte reference 2381 */ 2382 if (mpte) 2383 mpte->wire_count--; 2384 2385 /* 2386 * We might be turning off write access to the page, 2387 * so we go ahead and sense modify status. 2388 */ 2389 if (origpte & PG_MANAGED) { 2390 om = m; 2391 pa |= PG_MANAGED; 2392 } 2393 goto validate; 2394 } 2395 /* 2396 * Mapping has changed, invalidate old range and fall through to 2397 * handle validating new mapping. 2398 */ 2399 if (opa) { 2400 if (origpte & PG_W) 2401 pmap->pm_stats.wired_count--; 2402 if (origpte & PG_MANAGED) { 2403 om = PHYS_TO_VM_PAGE(opa); 2404 pmap_remove_entry(pmap, om, va); 2405 } 2406 if (mpte != NULL) { 2407 mpte->wire_count--; 2408 KASSERT(mpte->wire_count > 0, 2409 ("pmap_enter: missing reference to page table page," 2410 " va: 0x%x", va)); 2411 } 2412 } else 2413 pmap->pm_stats.resident_count++; 2414 2415 /* 2416 * Enter on the PV list if part of our managed memory. 2417 */ 2418 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { 2419 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, 2420 ("pmap_enter: managed mapping within the clean submap")); 2421 pmap_insert_entry(pmap, va, m); 2422 pa |= PG_MANAGED; 2423 } 2424 2425 /* 2426 * Increment counters 2427 */ 2428 if (wired) 2429 pmap->pm_stats.wired_count++; 2430 2431validate: 2432 /* 2433 * Now validate mapping with desired protection/wiring. 2434 */ 2435 newpte = (pt_entry_t)(pa | PG_V); 2436 if ((prot & VM_PROT_WRITE) != 0) { 2437 newpte |= PG_RW; 2438 vm_page_flag_set(m, PG_WRITEABLE); 2439 } 2440 if (wired) 2441 newpte |= PG_W; 2442 if (va < VM_MAXUSER_ADDRESS) 2443 newpte |= PG_U; 2444 if (pmap == kernel_pmap) 2445 newpte |= pgeflag; 2446 2447 /* 2448 * if the mapping or permission bits are different, we need 2449 * to update the pte. 2450 */ 2451 if ((origpte & ~(PG_M|PG_A)) != newpte) { 2452 if (origpte & PG_V) { 2453 invlva = FALSE; 2454 origpte = pte_load_store(pte, newpte | PG_A); 2455 if (origpte & PG_A) { 2456 if (origpte & PG_MANAGED) 2457 vm_page_flag_set(om, PG_REFERENCED); 2458 if (opa != VM_PAGE_TO_PHYS(m)) 2459 invlva = TRUE; 2460 } 2461 if (origpte & PG_M) { 2462 KASSERT((origpte & PG_RW), 2463 ("pmap_enter: modified page not writable: va: %#x, pte: %#jx", 2464 va, (uintmax_t)origpte)); 2465 if ((origpte & PG_MANAGED) != 0) 2466 vm_page_dirty(om); 2467 if ((prot & VM_PROT_WRITE) == 0) 2468 invlva = TRUE; 2469 } 2470 if (invlva) 2471 pmap_invalidate_page(pmap, va); 2472 } else 2473 pte_store(pte, newpte | PG_A); 2474 } 2475 sched_unpin(); 2476 vm_page_unlock_queues(); 2477 PMAP_UNLOCK(pmap); 2478} 2479 2480/* 2481 * Maps a sequence of resident pages belonging to the same object. 2482 * The sequence begins with the given page m_start. This page is 2483 * mapped at the given virtual address start. Each subsequent page is 2484 * mapped at a virtual address that is offset from start by the same 2485 * amount as the page is offset from m_start within the object. The 2486 * last page in the sequence is the page with the largest offset from 2487 * m_start that can be mapped at a virtual address less than the given 2488 * virtual address end. Not every virtual page between start and end 2489 * is mapped; only those for which a resident page exists with the 2490 * corresponding offset from m_start are mapped. 
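 * The caller must hold the lock on m_start's object, as asserted
 * below; the page table page is carried from one iteration to the
 * next through the mpte argument of pmap_enter_quick_locked().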
2491 */ 2492void 2493pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 2494 vm_page_t m_start, vm_prot_t prot) 2495{ 2496 vm_page_t m, mpte; 2497 vm_pindex_t diff, psize; 2498 2499 VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED); 2500 psize = atop(end - start); 2501 mpte = NULL; 2502 m = m_start; 2503 PMAP_LOCK(pmap); 2504 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { 2505 mpte = pmap_enter_quick_locked(pmap, start + ptoa(diff), m, 2506 prot, mpte); 2507 m = TAILQ_NEXT(m, listq); 2508 } 2509 PMAP_UNLOCK(pmap); 2510} 2511 2512/* 2513 * this code makes some *MAJOR* assumptions: 2514 * 1. Current pmap & pmap exists. 2515 * 2. Not wired. 2516 * 3. Read access. 2517 * 4. No page table pages. 2518 * but is *MUCH* faster than pmap_enter... 2519 */ 2520 2521void 2522pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) 2523{ 2524 2525 PMAP_LOCK(pmap); 2526 (void) pmap_enter_quick_locked(pmap, va, m, prot, NULL); 2527 PMAP_UNLOCK(pmap); 2528} 2529 2530static vm_page_t 2531pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, 2532 vm_prot_t prot, vm_page_t mpte) 2533{ 2534 pt_entry_t *pte; 2535 vm_paddr_t pa; 2536 2537 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || 2538 (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0, 2539 ("pmap_enter_quick_locked: managed mapping within the clean submap")); 2540 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2541 PMAP_LOCK_ASSERT(pmap, MA_OWNED); 2542 2543 /* 2544 * In the case that a page table page is not 2545 * resident, we are creating it here. 2546 */ 2547 if (va < VM_MAXUSER_ADDRESS) { 2548 unsigned ptepindex; 2549 pd_entry_t ptepa; 2550 2551 /* 2552 * Calculate pagetable page index 2553 */ 2554 ptepindex = va >> PDRSHIFT; 2555 if (mpte && (mpte->pindex == ptepindex)) { 2556 mpte->wire_count++; 2557 } else { 2558 /* 2559 * Get the page directory entry 2560 */ 2561 ptepa = pmap->pm_pdir[ptepindex]; 2562 2563 /* 2564 * If the page table page is mapped, we just increment 2565 * the hold count, and activate it. 2566 */ 2567 if (ptepa) { 2568 if (ptepa & PG_PS) 2569 panic("pmap_enter_quick: unexpected mapping into 4MB page"); 2570 mpte = PHYS_TO_VM_PAGE(ptepa); 2571 mpte->wire_count++; 2572 } else { 2573 mpte = _pmap_allocpte(pmap, ptepindex, 2574 M_NOWAIT); 2575 if (mpte == NULL) 2576 return (mpte); 2577 } 2578 } 2579 } else { 2580 mpte = NULL; 2581 } 2582 2583 /* 2584 * This call to vtopte makes the assumption that we are 2585 * entering the page into the current pmap. In order to support 2586 * quick entry into any pmap, one would likely use pmap_pte_quick. 2587 * But that isn't as quick as vtopte. 2588 */ 2589 pte = vtopte(va); 2590 if (*pte) { 2591 if (mpte != NULL) { 2592 pmap_unwire_pte_hold(pmap, mpte); 2593 mpte = NULL; 2594 } 2595 return (mpte); 2596 } 2597 2598 /* 2599 * Enter on the PV list if part of our managed memory. 2600 */ 2601 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 && 2602 !pmap_try_insert_pv_entry(pmap, va, m)) { 2603 if (mpte != NULL) { 2604 pmap_unwire_pte_hold(pmap, mpte); 2605 mpte = NULL; 2606 } 2607 return (mpte); 2608 } 2609 2610 /* 2611 * Increment counters 2612 */ 2613 pmap->pm_stats.resident_count++; 2614 2615 pa = VM_PAGE_TO_PHYS(m); 2616 2617 /* 2618 * Now validate mapping with RO protection 2619 */ 2620 if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) 2621 pte_store(pte, pa | PG_V | PG_U); 2622 else 2623 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); 2624 return mpte; 2625} 2626 2627/* 2628 * Make a temporary mapping for a physical address. 
This is only intended 2629 * to be used for panic dumps. 2630 */ 2631void * 2632pmap_kenter_temporary(vm_paddr_t pa, int i) 2633{ 2634 vm_offset_t va; 2635 2636 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 2637 pmap_kenter(va, pa); 2638 invlpg(va); 2639 return ((void *)crashdumpmap); 2640} 2641 2642/* 2643 * This code maps large physical mmap regions into the 2644 * processor address space. Note that some shortcuts 2645 * are taken, but the code works. 2646 */ 2647void 2648pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, 2649 vm_object_t object, vm_pindex_t pindex, 2650 vm_size_t size) 2651{ 2652 vm_page_t p; 2653 2654 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 2655 KASSERT(object->type == OBJT_DEVICE, 2656 ("pmap_object_init_pt: non-device object")); 2657 if (pseflag && 2658 ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) { 2659 int i; 2660 vm_page_t m[1]; 2661 unsigned int ptepindex; 2662 int npdes; 2663 pd_entry_t ptepa; 2664 2665 PMAP_LOCK(pmap); 2666 if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)]) 2667 goto out; 2668 PMAP_UNLOCK(pmap); 2669retry: 2670 p = vm_page_lookup(object, pindex); 2671 if (p != NULL) { 2672 if (vm_page_sleep_if_busy(p, FALSE, "init4p")) 2673 goto retry; 2674 } else { 2675 p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); 2676 if (p == NULL) 2677 return; 2678 m[0] = p; 2679 2680 if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { 2681 vm_page_lock_queues(); 2682 vm_page_free(p); 2683 vm_page_unlock_queues(); 2684 return; 2685 } 2686 2687 p = vm_page_lookup(object, pindex); 2688 vm_page_lock_queues(); 2689 vm_page_wakeup(p); 2690 vm_page_unlock_queues(); 2691 } 2692 2693 ptepa = VM_PAGE_TO_PHYS(p); 2694 if (ptepa & (NBPDR - 1)) 2695 return; 2696 2697 p->valid = VM_PAGE_BITS_ALL; 2698 2699 PMAP_LOCK(pmap); 2700 pmap->pm_stats.resident_count += size >> PAGE_SHIFT; 2701 npdes = size >> PDRSHIFT; 2702 for(i = 0; i < npdes; i++) { 2703 pde_store(&pmap->pm_pdir[ptepindex], 2704 ptepa | PG_U | PG_RW | PG_V | PG_PS); 2705 ptepa += NBPDR; 2706 ptepindex += 1; 2707 } 2708 pmap_invalidate_all(pmap); 2709out: 2710 PMAP_UNLOCK(pmap); 2711 } 2712} 2713 2714/* 2715 * Routine: pmap_change_wiring 2716 * Function: Change the wiring attribute for a map/virtual-address 2717 * pair. 2718 * In/out conditions: 2719 * The mapping must already exist in the pmap. 2720 */ 2721void 2722pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) 2723{ 2724 pt_entry_t *pte; 2725 2726 PMAP_LOCK(pmap); 2727 pte = pmap_pte(pmap, va); 2728 2729 if (wired && !pmap_pte_w(pte)) 2730 pmap->pm_stats.wired_count++; 2731 else if (!wired && pmap_pte_w(pte)) 2732 pmap->pm_stats.wired_count--; 2733 2734 /* 2735 * Wiring is not a hardware characteristic so there is no need to 2736 * invalidate TLB. 2737 */ 2738 pmap_pte_set_w(pte, wired); 2739 pmap_pte_release(pte); 2740 PMAP_UNLOCK(pmap); 2741} 2742 2743 2744 2745/* 2746 * Copy the range specified by src_addr/len 2747 * from the source map to the range dst_addr/len 2748 * in the destination map. 2749 * 2750 * This routine is only advisory and need not do anything. 
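 * In practice it is used on the fork path; a caller might invoke it
 * roughly as
 *	pmap_copy(dst_map->pmap, src_map->pmap, entry->start,
 *	    entry->end - entry->start, entry->start);
 * where "entry" is the map entry being duplicated.  The implementation
 * below quietly declines unless the source and destination addresses
 * match and the source pmap is the current one.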
2751 */ 2752 2753void 2754pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 2755 vm_offset_t src_addr) 2756{ 2757 vm_offset_t addr; 2758 vm_offset_t end_addr = src_addr + len; 2759 vm_offset_t pdnxt; 2760 2761 if (dst_addr != src_addr) 2762 return; 2763 2764 if (!pmap_is_current(src_pmap)) 2765 return; 2766 2767 vm_page_lock_queues(); 2768 if (dst_pmap < src_pmap) { 2769 PMAP_LOCK(dst_pmap); 2770 PMAP_LOCK(src_pmap); 2771 } else { 2772 PMAP_LOCK(src_pmap); 2773 PMAP_LOCK(dst_pmap); 2774 } 2775 sched_pin(); 2776 for (addr = src_addr; addr < end_addr; addr = pdnxt) { 2777 pt_entry_t *src_pte, *dst_pte; 2778 vm_page_t dstmpte, srcmpte; 2779 pd_entry_t srcptepaddr; 2780 unsigned ptepindex; 2781 2782 if (addr >= UPT_MIN_ADDRESS) 2783 panic("pmap_copy: invalid to pmap_copy page tables"); 2784 2785 pdnxt = (addr + NBPDR) & ~PDRMASK; 2786 ptepindex = addr >> PDRSHIFT; 2787 2788 srcptepaddr = src_pmap->pm_pdir[ptepindex]; 2789 if (srcptepaddr == 0) 2790 continue; 2791 2792 if (srcptepaddr & PG_PS) { 2793 if (dst_pmap->pm_pdir[ptepindex] == 0) { 2794 dst_pmap->pm_pdir[ptepindex] = srcptepaddr & 2795 ~PG_W; 2796 dst_pmap->pm_stats.resident_count += 2797 NBPDR / PAGE_SIZE; 2798 } 2799 continue; 2800 } 2801 2802 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 2803 if (srcmpte->wire_count == 0) 2804 panic("pmap_copy: source page table page is unused"); 2805 2806 if (pdnxt > end_addr) 2807 pdnxt = end_addr; 2808 2809 src_pte = vtopte(addr); 2810 while (addr < pdnxt) { 2811 pt_entry_t ptetemp; 2812 ptetemp = *src_pte; 2813 /* 2814 * we only virtual copy managed pages 2815 */ 2816 if ((ptetemp & PG_MANAGED) != 0) { 2817 dstmpte = pmap_allocpte(dst_pmap, addr, 2818 M_NOWAIT); 2819 if (dstmpte == NULL) 2820 break; 2821 dst_pte = pmap_pte_quick(dst_pmap, addr); 2822 if (*dst_pte == 0 && 2823 pmap_try_insert_pv_entry(dst_pmap, addr, 2824 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { 2825 /* 2826 * Clear the wired, modified, and 2827 * accessed (referenced) bits 2828 * during the copy. 2829 */ 2830 *dst_pte = ptetemp & ~(PG_W | PG_M | 2831 PG_A); 2832 dst_pmap->pm_stats.resident_count++; 2833 } else 2834 pmap_unwire_pte_hold(dst_pmap, dstmpte); 2835 if (dstmpte->wire_count >= srcmpte->wire_count) 2836 break; 2837 } 2838 addr += PAGE_SIZE; 2839 src_pte++; 2840 } 2841 } 2842 sched_unpin(); 2843 vm_page_unlock_queues(); 2844 PMAP_UNLOCK(src_pmap); 2845 PMAP_UNLOCK(dst_pmap); 2846} 2847 2848static __inline void 2849pagezero(void *page) 2850{ 2851#if defined(I686_CPU) 2852 if (cpu_class == CPUCLASS_686) { 2853#if defined(CPU_ENABLE_SSE) 2854 if (cpu_feature & CPUID_SSE2) 2855 sse2_pagezero(page); 2856 else 2857#endif 2858 i686_pagezero(page); 2859 } else 2860#endif 2861 bzero(page, PAGE_SIZE); 2862} 2863 2864/* 2865 * pmap_zero_page zeros the specified hardware page by mapping 2866 * the page into KVM and using bzero to clear its contents. 2867 */ 2868void 2869pmap_zero_page(vm_page_t m) 2870{ 2871 struct sysmaps *sysmaps; 2872 2873 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 2874 mtx_lock(&sysmaps->lock); 2875 if (*sysmaps->CMAP2) 2876 panic("pmap_zero_page: CMAP2 busy"); 2877 sched_pin(); 2878 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; 2879 invlcaddr(sysmaps->CADDR2); 2880 pagezero(sysmaps->CADDR2); 2881 *sysmaps->CMAP2 = 0; 2882 sched_unpin(); 2883 mtx_unlock(&sysmaps->lock); 2884} 2885 2886/* 2887 * pmap_zero_page_area zeros the specified hardware page by mapping 2888 * the page into KVM and using bzero to clear its contents. 
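 * Unlike pmap_zero_page(), only the sub-range [off, off + size) of the
 * page is cleared.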
2889 * 2890 * off and size may not cover an area beyond a single hardware page. 2891 */ 2892void 2893pmap_zero_page_area(vm_page_t m, int off, int size) 2894{ 2895 struct sysmaps *sysmaps; 2896 2897 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 2898 mtx_lock(&sysmaps->lock); 2899 if (*sysmaps->CMAP2) 2900 panic("pmap_zero_page: CMAP2 busy"); 2901 sched_pin(); 2902 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; 2903 invlcaddr(sysmaps->CADDR2); 2904 if (off == 0 && size == PAGE_SIZE) 2905 pagezero(sysmaps->CADDR2); 2906 else 2907 bzero((char *)sysmaps->CADDR2 + off, size); 2908 *sysmaps->CMAP2 = 0; 2909 sched_unpin(); 2910 mtx_unlock(&sysmaps->lock); 2911} 2912 2913/* 2914 * pmap_zero_page_idle zeros the specified hardware page by mapping 2915 * the page into KVM and using bzero to clear its contents. This 2916 * is intended to be called from the vm_pagezero process only and 2917 * outside of Giant. 2918 */ 2919void 2920pmap_zero_page_idle(vm_page_t m) 2921{ 2922 2923 if (*CMAP3) 2924 panic("pmap_zero_page: CMAP3 busy"); 2925 sched_pin(); 2926 *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; 2927 invlcaddr(CADDR3); 2928 pagezero(CADDR3); 2929 *CMAP3 = 0; 2930 sched_unpin(); 2931} 2932 2933/* 2934 * pmap_copy_page copies the specified (machine independent) 2935 * page by mapping the page into virtual memory and using 2936 * bcopy to copy the page, one machine dependent page at a 2937 * time. 2938 */ 2939void 2940pmap_copy_page(vm_page_t src, vm_page_t dst) 2941{ 2942 struct sysmaps *sysmaps; 2943 2944 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; 2945 mtx_lock(&sysmaps->lock); 2946 if (*sysmaps->CMAP1) 2947 panic("pmap_copy_page: CMAP1 busy"); 2948 if (*sysmaps->CMAP2) 2949 panic("pmap_copy_page: CMAP2 busy"); 2950 sched_pin(); 2951 invlpg((u_int)sysmaps->CADDR1); 2952 invlpg((u_int)sysmaps->CADDR2); 2953 *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A; 2954 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M; 2955 bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE); 2956 *sysmaps->CMAP1 = 0; 2957 *sysmaps->CMAP2 = 0; 2958 sched_unpin(); 2959 mtx_unlock(&sysmaps->lock); 2960} 2961 2962/* 2963 * Returns true if the pmap's pv is one of the first 2964 * 16 pvs linked to from this page. This count may 2965 * be changed upwards or downwards in the future; it 2966 * is only necessary that true be returned for a small 2967 * subset of pmaps for proper page aging. 2968 */ 2969boolean_t 2970pmap_page_exists_quick(pmap_t pmap, vm_page_t m) 2971{ 2972 pv_entry_t pv; 2973 int loops = 0; 2974 2975 if (m->flags & PG_FICTITIOUS) 2976 return FALSE; 2977 2978 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2979 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 2980 if (PV_PMAP(pv) == pmap) { 2981 return TRUE; 2982 } 2983 loops++; 2984 if (loops >= 16) 2985 break; 2986 } 2987 return (FALSE); 2988} 2989 2990/* 2991 * Remove all pages from specified address space 2992 * this aids process exit speeds. Also, this code 2993 * is special cased for current process only, but 2994 * can have the more generic (and slightly slower) 2995 * mode enabled. This is much faster than pmap_remove 2996 * in the case of running down an entire address space. 
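 * (The loop below walks the pmap's pv chunks instead of its page
 * tables; each in-use pv entry is located from a chunk's free bitmaps,
 * roughly bit = bsfl(inuse), idx = field * 32 + bit.  Wired mappings
 * are skipped and keep their chunk alive.)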
2997 */ 2998void 2999pmap_remove_pages(pmap_t pmap) 3000{ 3001 pt_entry_t *pte, tpte; 3002 vm_page_t m; 3003 pv_entry_t pv; 3004 struct pv_chunk *pc, *npc; 3005 int field, idx; 3006 int32_t bit; 3007 uint32_t inuse, bitmask; 3008 int allfree; 3009 3010 if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { 3011 printf("warning: pmap_remove_pages called with non-current pmap\n"); 3012 return; 3013 } 3014 vm_page_lock_queues(); 3015 PMAP_LOCK(pmap); 3016 sched_pin(); 3017 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { 3018 allfree = 1; 3019 for (field = 0; field < _NPCM; field++) { 3020 inuse = (~(pc->pc_map[field])) & pc_freemask[field]; 3021 while (inuse != 0) { 3022 bit = bsfl(inuse); 3023 bitmask = 1UL << bit; 3024 idx = field * 32 + bit; 3025 pv = &pc->pc_pventry[idx]; 3026 inuse &= ~bitmask; 3027 3028 pte = vtopte(pv->pv_va); 3029 tpte = *pte; 3030 3031 if (tpte == 0) { 3032 printf( 3033 "TPTE at %p IS ZERO @ VA %08x\n", 3034 pte, pv->pv_va); 3035 panic("bad pte"); 3036 } 3037 3038/* 3039 * We cannot remove wired pages from a process' mapping at this time 3040 */ 3041 if (tpte & PG_W) { 3042 allfree = 0; 3043 continue; 3044 } 3045 3046 m = PHYS_TO_VM_PAGE(tpte); 3047 KASSERT(m->phys_addr == (tpte & PG_FRAME), 3048 ("vm_page_t %p phys_addr mismatch %016jx %016jx", 3049 m, (uintmax_t)m->phys_addr, 3050 (uintmax_t)tpte)); 3051 3052 KASSERT(m < &vm_page_array[vm_page_array_size], 3053 ("pmap_remove_pages: bad tpte %#jx", 3054 (uintmax_t)tpte)); 3055 3056 pmap->pm_stats.resident_count--; 3057 3058 pte_clear(pte); 3059 3060 /* 3061 * Update the vm_page_t clean/reference bits. 3062 */ 3063 if (tpte & PG_M) 3064 vm_page_dirty(m); 3065 3066 /* Mark free */ 3067 PV_STAT(pv_entry_frees++); 3068 PV_STAT(pv_entry_spare++); 3069 pv_entry_count--; 3070 pc->pc_map[field] |= bitmask; 3071 m->md.pv_list_count--; 3072 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 3073 if (TAILQ_EMPTY(&m->md.pv_list)) 3074 vm_page_flag_clear(m, PG_WRITEABLE); 3075 3076 pmap_unuse_pt(pmap, pv->pv_va); 3077 } 3078 } 3079 if (allfree) { 3080 PV_STAT(pv_entry_spare -= _NPCPV); 3081 PV_STAT(pc_chunk_count--); 3082 PV_STAT(pc_chunk_frees++); 3083 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); 3084 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); 3085 pmap_qremove((vm_offset_t)pc, 1); 3086 vm_page_unwire(m, 0); 3087 vm_page_free(m); 3088 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); 3089 } 3090 } 3091 sched_unpin(); 3092 vm_page_unlock_queues(); 3093 pmap_invalidate_all(pmap); 3094 PMAP_UNLOCK(pmap); 3095} 3096 3097/* 3098 * pmap_is_modified: 3099 * 3100 * Return whether or not the specified physical page was modified 3101 * in any physical maps. 3102 */ 3103boolean_t 3104pmap_is_modified(vm_page_t m) 3105{ 3106 pv_entry_t pv; 3107 pt_entry_t *pte; 3108 pmap_t pmap; 3109 boolean_t rv; 3110 3111 rv = FALSE; 3112 if (m->flags & PG_FICTITIOUS) 3113 return (rv); 3114 3115 sched_pin(); 3116 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3117 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3118 pmap = PV_PMAP(pv); 3119 PMAP_LOCK(pmap); 3120 pte = pmap_pte_quick(pmap, pv->pv_va); 3121 rv = (*pte & PG_M) != 0; 3122 PMAP_UNLOCK(pmap); 3123 if (rv) 3124 break; 3125 } 3126 sched_unpin(); 3127 return (rv); 3128} 3129 3130/* 3131 * pmap_is_prefaultable: 3132 * 3133 * Return whether or not the specified virtual address is elgible 3134 * for prefault. 
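 * An address is considered eligible when its page directory entry is
 * valid but the pte itself is still empty, as computed below.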
3135 */ 3136boolean_t 3137pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) 3138{ 3139 pt_entry_t *pte; 3140 boolean_t rv; 3141 3142 rv = FALSE; 3143 PMAP_LOCK(pmap); 3144 if (*pmap_pde(pmap, addr)) { 3145 pte = vtopte(addr); 3146 rv = *pte == 0; 3147 } 3148 PMAP_UNLOCK(pmap); 3149 return (rv); 3150} 3151 3152/* 3153 * Clear the write and modified bits in each of the given page's mappings. 3154 */ 3155void 3156pmap_remove_write(vm_page_t m) 3157{ 3158 pv_entry_t pv; 3159 pmap_t pmap; 3160 pt_entry_t oldpte, *pte; 3161 3162 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3163 if ((m->flags & PG_FICTITIOUS) != 0 || 3164 (m->flags & PG_WRITEABLE) == 0) 3165 return; 3166 sched_pin(); 3167 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3168 pmap = PV_PMAP(pv); 3169 PMAP_LOCK(pmap); 3170 pte = pmap_pte_quick(pmap, pv->pv_va); 3171retry: 3172 oldpte = *pte; 3173 if ((oldpte & PG_RW) != 0) { 3174 /* 3175 * Regardless of whether a pte is 32 or 64 bits 3176 * in size, PG_RW and PG_M are among the least 3177 * significant 32 bits. 3178 */ 3179 if (!atomic_cmpset_int((u_int *)pte, oldpte, 3180 oldpte & ~(PG_RW | PG_M))) 3181 goto retry; 3182 if ((oldpte & PG_M) != 0) 3183 vm_page_dirty(m); 3184 pmap_invalidate_page(pmap, pv->pv_va); 3185 } 3186 PMAP_UNLOCK(pmap); 3187 } 3188 vm_page_flag_clear(m, PG_WRITEABLE); 3189 sched_unpin(); 3190} 3191 3192/* 3193 * pmap_ts_referenced: 3194 * 3195 * Return a count of reference bits for a page, clearing those bits. 3196 * It is not necessary for every reference bit to be cleared, but it 3197 * is necessary that 0 only be returned when there are truly no 3198 * reference bits set. 3199 * 3200 * XXX: The exact number of bits to check and clear is a matter that 3201 * should be tested and standardized at some point in the future for 3202 * optimal aging of shared pages. 3203 */ 3204int 3205pmap_ts_referenced(vm_page_t m) 3206{ 3207 pv_entry_t pv, pvf, pvn; 3208 pmap_t pmap; 3209 pt_entry_t *pte; 3210 int rtval = 0; 3211 3212 if (m->flags & PG_FICTITIOUS) 3213 return (rtval); 3214 sched_pin(); 3215 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3216 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 3217 pvf = pv; 3218 do { 3219 pvn = TAILQ_NEXT(pv, pv_list); 3220 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 3221 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 3222 pmap = PV_PMAP(pv); 3223 PMAP_LOCK(pmap); 3224 pte = pmap_pte_quick(pmap, pv->pv_va); 3225 if ((*pte & PG_A) != 0) { 3226 atomic_clear_int((u_int *)pte, PG_A); 3227 pmap_invalidate_page(pmap, pv->pv_va); 3228 rtval++; 3229 if (rtval > 4) 3230 pvn = NULL; 3231 } 3232 PMAP_UNLOCK(pmap); 3233 } while ((pv = pvn) != NULL && pv != pvf); 3234 } 3235 sched_unpin(); 3236 return (rtval); 3237} 3238 3239/* 3240 * Clear the modify bits on the specified physical page. 3241 */ 3242void 3243pmap_clear_modify(vm_page_t m) 3244{ 3245 pv_entry_t pv; 3246 pmap_t pmap; 3247 pt_entry_t *pte; 3248 3249 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3250 if ((m->flags & PG_FICTITIOUS) != 0) 3251 return; 3252 sched_pin(); 3253 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3254 pmap = PV_PMAP(pv); 3255 PMAP_LOCK(pmap); 3256 pte = pmap_pte_quick(pmap, pv->pv_va); 3257 if ((*pte & PG_M) != 0) { 3258 /* 3259 * Regardless of whether a pte is 32 or 64 bits 3260 * in size, PG_M is among the least significant 3261 * 32 bits. 
3262 */ 3263 atomic_clear_int((u_int *)pte, PG_M); 3264 pmap_invalidate_page(pmap, pv->pv_va); 3265 } 3266 PMAP_UNLOCK(pmap); 3267 } 3268 sched_unpin(); 3269} 3270 3271/* 3272 * pmap_clear_reference: 3273 * 3274 * Clear the reference bit on the specified physical page. 3275 */ 3276void 3277pmap_clear_reference(vm_page_t m) 3278{ 3279 pv_entry_t pv; 3280 pmap_t pmap; 3281 pt_entry_t *pte; 3282 3283 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 3284 if ((m->flags & PG_FICTITIOUS) != 0) 3285 return; 3286 sched_pin(); 3287 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3288 pmap = PV_PMAP(pv); 3289 PMAP_LOCK(pmap); 3290 pte = pmap_pte_quick(pmap, pv->pv_va); 3291 if ((*pte & PG_A) != 0) { 3292 /* 3293 * Regardless of whether a pte is 32 or 64 bits 3294 * in size, PG_A is among the least significant 3295 * 32 bits. 3296 */ 3297 atomic_clear_int((u_int *)pte, PG_A); 3298 pmap_invalidate_page(pmap, pv->pv_va); 3299 } 3300 PMAP_UNLOCK(pmap); 3301 } 3302 sched_unpin(); 3303} 3304 3305/* 3306 * Miscellaneous support routines follow 3307 */ 3308 3309/* 3310 * Map a set of physical memory pages into the kernel virtual 3311 * address space. Return a pointer to where it is mapped. This 3312 * routine is intended to be used for mapping device memory, 3313 * NOT real memory. 3314 */ 3315void * 3316pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) 3317{ 3318 vm_offset_t va, tmpva, offset; 3319 3320 offset = pa & PAGE_MASK; 3321 size = roundup(offset + size, PAGE_SIZE); 3322 pa = pa & PG_FRAME; 3323 3324 if (pa < KERNLOAD && pa + size <= KERNLOAD) 3325 va = KERNBASE + pa; 3326 else 3327 va = kmem_alloc_nofault(kernel_map, size); 3328 if (!va) 3329 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 3330 3331 for (tmpva = va; size > 0; ) { 3332 pmap_kenter_attr(tmpva, pa, mode); 3333 size -= PAGE_SIZE; 3334 tmpva += PAGE_SIZE; 3335 pa += PAGE_SIZE; 3336 } 3337 pmap_invalidate_range(kernel_pmap, va, tmpva); 3338 pmap_invalidate_cache(); 3339 return ((void *)(va + offset)); 3340} 3341 3342void * 3343pmap_mapdev(vm_paddr_t pa, vm_size_t size) 3344{ 3345 3346 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); 3347} 3348 3349void * 3350pmap_mapbios(vm_paddr_t pa, vm_size_t size) 3351{ 3352 3353 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); 3354} 3355 3356void 3357pmap_unmapdev(vm_offset_t va, vm_size_t size) 3358{ 3359 vm_offset_t base, offset, tmpva; 3360 3361 if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) 3362 return; 3363 base = va & PG_FRAME; 3364 offset = va & PAGE_MASK; 3365 size = roundup(offset + size, PAGE_SIZE); 3366 for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) 3367 pmap_kremove(tmpva); 3368 pmap_invalidate_range(kernel_pmap, va, tmpva); 3369 kmem_free(kernel_map, base, size); 3370} 3371 3372int 3373pmap_change_attr(va, size, mode) 3374 vm_offset_t va; 3375 vm_size_t size; 3376 int mode; 3377{ 3378 vm_offset_t base, offset, tmpva; 3379 pt_entry_t *pte; 3380 u_int opte, npte; 3381 pd_entry_t *pde; 3382 3383 base = va & PG_FRAME; 3384 offset = va & PAGE_MASK; 3385 size = roundup(offset + size, PAGE_SIZE); 3386 3387 /* Only supported on kernel virtual addresses. */ 3388 if (base <= VM_MAXUSER_ADDRESS) 3389 return (EINVAL); 3390 3391 /* 4MB pages and pages that aren't mapped aren't supported. 
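	 * The pre-scan below returns EINVAL before any pte has been
	 * modified, so a rejected request leaves the existing attributes
	 * untouched.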
*/ 3392 for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) { 3393 pde = pmap_pde(kernel_pmap, tmpva); 3394 if (*pde & PG_PS) 3395 return (EINVAL); 3396 if (*pde == 0) 3397 return (EINVAL); 3398 pte = vtopte(va); 3399 if (*pte == 0) 3400 return (EINVAL); 3401 } 3402 3403 /* 3404 * Ok, all the pages exist and are 4k, so run through them updating 3405 * their cache mode. 3406 */ 3407 for (tmpva = base; size > 0; ) { 3408 pte = vtopte(tmpva); 3409 3410 /* 3411 * The cache mode bits are all in the low 32-bits of the 3412 * PTE, so we can just spin on updating the low 32-bits. 3413 */ 3414 do { 3415 opte = *(u_int *)pte; 3416 npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT); 3417 npte |= pmap_cache_bits(mode, 0); 3418 } while (npte != opte && 3419 !atomic_cmpset_int((u_int *)pte, opte, npte)); 3420 tmpva += PAGE_SIZE; 3421 size -= PAGE_SIZE; 3422 } 3423 3424 /* 3425 * Flush CPU caches to make sure any data isn't cached that shouldn't 3426 * be, etc. 3427 */ 3428 pmap_invalidate_range(kernel_pmap, base, tmpva); 3429 pmap_invalidate_cache(); 3430 return (0); 3431} 3432 3433/* 3434 * perform the pmap work for mincore 3435 */ 3436int 3437pmap_mincore(pmap_t pmap, vm_offset_t addr) 3438{ 3439 pt_entry_t *ptep, pte; 3440 vm_page_t m; 3441 int val = 0; 3442 3443 PMAP_LOCK(pmap); 3444 ptep = pmap_pte(pmap, addr); 3445 pte = (ptep != NULL) ? *ptep : 0; 3446 pmap_pte_release(ptep); 3447 PMAP_UNLOCK(pmap); 3448 3449 if (pte != 0) { 3450 vm_paddr_t pa; 3451 3452 val = MINCORE_INCORE; 3453 if ((pte & PG_MANAGED) == 0) 3454 return val; 3455 3456 pa = pte & PG_FRAME; 3457 3458 m = PHYS_TO_VM_PAGE(pa); 3459 3460 /* 3461 * Modified by us 3462 */ 3463 if (pte & PG_M) 3464 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; 3465 else { 3466 /* 3467 * Modified by someone else 3468 */ 3469 vm_page_lock_queues(); 3470 if (m->dirty || pmap_is_modified(m)) 3471 val |= MINCORE_MODIFIED_OTHER; 3472 vm_page_unlock_queues(); 3473 } 3474 /* 3475 * Referenced by us 3476 */ 3477 if (pte & PG_A) 3478 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; 3479 else { 3480 /* 3481 * Referenced by someone else 3482 */ 3483 vm_page_lock_queues(); 3484 if ((m->flags & PG_REFERENCED) || 3485 pmap_ts_referenced(m)) { 3486 val |= MINCORE_REFERENCED_OTHER; 3487 vm_page_flag_set(m, PG_REFERENCED); 3488 } 3489 vm_page_unlock_queues(); 3490 } 3491 } 3492 return val; 3493} 3494 3495void 3496pmap_activate(struct thread *td) 3497{ 3498 pmap_t pmap, oldpmap; 3499 u_int32_t cr3; 3500 3501 critical_enter(); 3502 pmap = vmspace_pmap(td->td_proc->p_vmspace); 3503 oldpmap = PCPU_GET(curpmap); 3504#if defined(SMP) 3505 atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask)); 3506 atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); 3507#else 3508 oldpmap->pm_active &= ~1; 3509 pmap->pm_active |= 1; 3510#endif 3511#ifdef PAE 3512 cr3 = vtophys(pmap->pm_pdpt); 3513#else 3514 cr3 = vtophys(pmap->pm_pdir); 3515#endif 3516 /* 3517 * pmap_activate is for the current thread on the current cpu 3518 */ 3519 td->td_pcb->pcb_cr3 = cr3; 3520 load_cr3(cr3); 3521 PCPU_SET(curpmap, pmap); 3522 critical_exit(); 3523} 3524 3525vm_offset_t 3526pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) 3527{ 3528 3529 if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { 3530 return addr; 3531 } 3532 3533 addr = (addr + PDRMASK) & ~PDRMASK; 3534 return addr; 3535} 3536 3537 3538#if defined(PMAP_DEBUG) 3539pmap_pid_dump(int pid) 3540{ 3541 pmap_t pmap; 3542 struct proc *p; 3543 int npte = 0; 3544 int index; 3545 3546 
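	/* Find the process with the given pid and dump each of its valid user mappings. */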
sx_slock(&allproc_lock); 3547 FOREACH_PROC_IN_SYSTEM(p) { 3548 if (p->p_pid != pid) 3549 continue; 3550 3551 if (p->p_vmspace) { 3552 int i,j; 3553 index = 0; 3554 pmap = vmspace_pmap(p->p_vmspace); 3555 for (i = 0; i < NPDEPTD; i++) { 3556 pd_entry_t *pde; 3557 pt_entry_t *pte; 3558 vm_offset_t base = i << PDRSHIFT; 3559 3560 pde = &pmap->pm_pdir[i]; 3561 if (pde && pmap_pde_v(pde)) { 3562 for (j = 0; j < NPTEPG; j++) { 3563 vm_offset_t va = base + (j << PAGE_SHIFT); 3564 if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { 3565 if (index) { 3566 index = 0; 3567 printf("\n"); 3568 } 3569 sx_sunlock(&allproc_lock); 3570 return npte; 3571 } 3572 pte = pmap_pte(pmap, va); 3573 if (pte && pmap_pte_v(pte)) { 3574 pt_entry_t pa; 3575 vm_page_t m; 3576 pa = *pte; 3577 m = PHYS_TO_VM_PAGE(pa); 3578 printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", 3579 va, pa, m->hold_count, m->wire_count, m->flags); 3580 npte++; 3581 index++; 3582 if (index >= 2) { 3583 index = 0; 3584 printf("\n"); 3585 } else { 3586 printf(" "); 3587 } 3588 } 3589 } 3590 } 3591 } 3592 } 3593 } 3594 sx_sunlock(&allproc_lock); 3595 return npte; 3596} 3597#endif 3598 3599#if defined(DEBUG) 3600 3601static void pads(pmap_t pm); 3602void pmap_pvdump(vm_offset_t pa); 3603 3604/* print address space of pmap*/ 3605static void 3606pads(pmap_t pm) 3607{ 3608 int i, j; 3609 vm_paddr_t va; 3610 pt_entry_t *ptep; 3611 3612 if (pm == kernel_pmap) 3613 return; 3614 for (i = 0; i < NPDEPTD; i++) 3615 if (pm->pm_pdir[i]) 3616 for (j = 0; j < NPTEPG; j++) { 3617 va = (i << PDRSHIFT) + (j << PAGE_SHIFT); 3618 if (pm == kernel_pmap && va < KERNBASE) 3619 continue; 3620 if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) 3621 continue; 3622 ptep = pmap_pte(pm, va); 3623 if (pmap_pte_v(ptep)) 3624 printf("%x:%x ", va, *ptep); 3625 }; 3626 3627} 3628 3629void 3630pmap_pvdump(vm_paddr_t pa) 3631{ 3632 pv_entry_t pv; 3633 pmap_t pmap; 3634 vm_page_t m; 3635 3636 printf("pa %x", pa); 3637 m = PHYS_TO_VM_PAGE(pa); 3638 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 3639 pmap = PV_PMAP(pv); 3640 printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va); 3641 pads(pmap); 3642 } 3643 printf(" "); 3644} 3645#endif 3646