pmap.c revision 130386
1/*- 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * the Systems Programming Group of the University of Utah Computer 11 * Science Department and William Jolitz of UUNET Technologies Inc. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. All advertising materials mentioning features or use of this software 22 * must display the following acknowledgement: 23 * This product includes software developed by the University of 24 * California, Berkeley and its contributors. 25 * 4. Neither the name of the University nor the names of its contributors 26 * may be used to endorse or promote products derived from this software 27 * without specific prior written permission. 28 * 29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 32 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 39 * SUCH DAMAGE. 40 * 41 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 42 */ 43/*- 44 * Copyright (c) 2003 Networks Associates Technology, Inc. 45 * All rights reserved. 46 * 47 * This software was developed for the FreeBSD Project by Jake Burkholder, 48 * Safeport Network Services, and Network Associates Laboratories, the 49 * Security Research Division of Network Associates, Inc. under 50 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA 51 * CHATS research program. 52 * 53 * Redistribution and use in source and binary forms, with or without 54 * modification, are permitted provided that the following conditions 55 * are met: 56 * 1. Redistributions of source code must retain the above copyright 57 * notice, this list of conditions and the following disclaimer. 58 * 2. Redistributions in binary form must reproduce the above copyright 59 * notice, this list of conditions and the following disclaimer in the 60 * documentation and/or other materials provided with the distribution. 61 * 62 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 63 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 64 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 65 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 66 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 67 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 68 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 69 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 70 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 71 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 72 * SUCH DAMAGE. 73 */ 74 75#include <sys/cdefs.h> 76__FBSDID("$FreeBSD: head/sys/i386/i386/pmap.c 130386 2004-06-12 20:01:48Z alc $"); 77 78/* 79 * Manages physical address maps. 80 * 81 * In addition to hardware address maps, this 82 * module is called upon to provide software-use-only 83 * maps which may or may not be stored in the same 84 * form as hardware maps. These pseudo-maps are 85 * used to store intermediate results from copy 86 * operations to and from address spaces. 87 * 88 * Since the information managed by this module is 89 * also stored by the logical address mapping module, 90 * this module may throw away valid virtual-to-physical 91 * mappings at almost any time. However, invalidations 92 * of virtual-to-physical mappings must be done as 93 * requested. 94 * 95 * In order to cope with hardware architectures which 96 * make virtual-to-physical map invalidates expensive, 97 * this module may delay invalidate or reduced protection 98 * operations until such time as they are actually 99 * necessary. This module is given full information as 100 * to which processors are currently using which maps, 101 * and to when physical maps must be made correct. 102 */ 103 104#include "opt_cpu.h" 105#include "opt_pmap.h" 106#include "opt_msgbuf.h" 107#include "opt_kstack_pages.h" 108 109#include <sys/param.h> 110#include <sys/systm.h> 111#include <sys/kernel.h> 112#include <sys/lock.h> 113#include <sys/mman.h> 114#include <sys/msgbuf.h> 115#include <sys/mutex.h> 116#include <sys/proc.h> 117#include <sys/sx.h> 118#include <sys/user.h> 119#include <sys/vmmeter.h> 120#include <sys/sched.h> 121#include <sys/sysctl.h> 122#ifdef SMP 123#include <sys/smp.h> 124#endif 125 126#include <vm/vm.h> 127#include <vm/vm_param.h> 128#include <vm/vm_kern.h> 129#include <vm/vm_page.h> 130#include <vm/vm_map.h> 131#include <vm/vm_object.h> 132#include <vm/vm_extern.h> 133#include <vm/vm_pageout.h> 134#include <vm/vm_pager.h> 135#include <vm/uma.h> 136 137#include <machine/cpu.h> 138#include <machine/cputypes.h> 139#include <machine/md_var.h> 140#include <machine/specialreg.h> 141#ifdef SMP 142#include <machine/smp.h> 143#endif 144 145#if !defined(CPU_ENABLE_SSE) && defined(I686_CPU) 146#define CPU_ENABLE_SSE 147#endif 148#if defined(CPU_DISABLE_SSE) 149#undef CPU_ENABLE_SSE 150#endif 151 152#define PMAP_KEEP_PDIRS 153#ifndef PMAP_SHPGPERPROC 154#define PMAP_SHPGPERPROC 200 155#endif 156 157#if defined(DIAGNOSTIC) 158#define PMAP_DIAGNOSTIC 159#endif 160 161#define MINPV 2048 162 163#if !defined(PMAP_DIAGNOSTIC) 164#define PMAP_INLINE __inline 165#else 166#define PMAP_INLINE 167#endif 168 169/* 170 * Get PDEs and PTEs for user/kernel address space 171 */ 172#define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) 173#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) 174 175#define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) 176#define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) 177#define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) 178#define pmap_pte_u(pte) ((*(int *)pte & 
PG_A) != 0) 179#define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) 180 181#define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ 182 atomic_clear_int((u_int *)(pte), PG_W)) 183#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) 184 185struct pmap kernel_pmap_store; 186LIST_HEAD(pmaplist, pmap); 187static struct pmaplist allpmaps; 188static struct mtx allpmaps_lock; 189#ifdef SMP 190static struct mtx lazypmap_lock; 191#endif 192 193vm_paddr_t avail_end; /* PA of last available physical page */ 194vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ 195vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ 196static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */ 197int pgeflag = 0; /* PG_G or-in */ 198int pseflag = 0; /* PG_PS or-in */ 199 200static int nkpt; 201vm_offset_t kernel_vm_end; 202extern u_int32_t KERNend; 203 204#ifdef PAE 205static uma_zone_t pdptzone; 206#endif 207 208/* 209 * Data for the pv entry allocation mechanism 210 */ 211static uma_zone_t pvzone; 212static struct vm_object pvzone_obj; 213static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; 214int pmap_pagedaemon_waken; 215 216/* 217 * All those kernel PT submaps that BSD is so fond of 218 */ 219pt_entry_t *CMAP1 = 0; 220static pt_entry_t *CMAP2, *CMAP3, *ptmmap; 221caddr_t CADDR1 = 0, ptvmmap = 0; 222static caddr_t CADDR2, CADDR3; 223static struct mtx CMAPCADDR12_lock; 224static pt_entry_t *msgbufmap; 225struct msgbuf *msgbufp = 0; 226 227/* 228 * Crashdump maps. 229 */ 230static pt_entry_t *pt_crashdumpmap; 231static caddr_t crashdumpmap; 232 233#ifdef SMP 234extern pt_entry_t *SMPpt; 235#endif 236static pt_entry_t *PMAP1 = 0, *PMAP2; 237static pt_entry_t *PADDR1 = 0, *PADDR2; 238#ifdef SMP 239static int PMAP1cpu; 240static int PMAP1changedcpu; 241SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, 242 &PMAP1changedcpu, 0, 243 "Number of times pmap_pte_quick changed CPU with same PMAP1"); 244#endif 245static int PMAP1changed; 246SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, 247 &PMAP1changed, 0, 248 "Number of times pmap_pte_quick changed PMAP1"); 249static int PMAP1unchanged; 250SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, 251 &PMAP1unchanged, 0, 252 "Number of times pmap_pte_quick didn't change PMAP1"); 253 254static PMAP_INLINE void free_pv_entry(pv_entry_t pv); 255static pv_entry_t get_pv_entry(void); 256static void pmap_clear_ptes(vm_page_t m, int bit) 257 __always_inline; 258 259static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva); 260static void pmap_remove_page(struct pmap *pmap, vm_offset_t va); 261static int pmap_remove_entry(struct pmap *pmap, vm_page_t m, 262 vm_offset_t va); 263static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, 264 vm_page_t mpte, vm_page_t m); 265 266static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va); 267 268static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex); 269static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); 270static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t); 271static vm_offset_t pmap_kmem_choose(vm_offset_t addr); 272#ifdef PAE 273static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait); 274#endif 275 276CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); 277CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); 278 279/* 280 * Move the kernel virtual free pointer to the next 281 * 4MB. 
This is used to help improve performance 282 * by using a large (4MB) page for much of the kernel 283 * (.text, .data, .bss) 284 */ 285static vm_offset_t 286pmap_kmem_choose(vm_offset_t addr) 287{ 288 vm_offset_t newaddr = addr; 289 290#ifndef DISABLE_PSE 291 if (cpu_feature & CPUID_PSE) 292 newaddr = (addr + PDRMASK) & ~PDRMASK; 293#endif 294 return newaddr; 295} 296 297/* 298 * Bootstrap the system enough to run with virtual memory. 299 * 300 * On the i386 this is called after mapping has already been enabled 301 * and just syncs the pmap module with what has already been done. 302 * [We can't call it easily with mapping off since the kernel is not 303 * mapped with PA == VA, hence we would have to relocate every address 304 * from the linked base (virtual) address "KERNBASE" to the actual 305 * (physical) address starting relative to 0] 306 */ 307void 308pmap_bootstrap(firstaddr, loadaddr) 309 vm_paddr_t firstaddr; 310 vm_paddr_t loadaddr; 311{ 312 vm_offset_t va; 313 pt_entry_t *pte; 314 int i; 315 316 /* 317 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too 318 * large. It should instead be correctly calculated in locore.s and 319 * not based on 'first' (which is a physical address, not a virtual 320 * address, for the start of unused physical memory). The kernel 321 * page tables are NOT double mapped and thus should not be included 322 * in this calculation. 323 */ 324 virtual_avail = (vm_offset_t) KERNBASE + firstaddr; 325 virtual_avail = pmap_kmem_choose(virtual_avail); 326 327 virtual_end = VM_MAX_KERNEL_ADDRESS; 328 329 /* 330 * Initialize the kernel pmap (which is statically allocated). 331 */ 332 kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); 333#ifdef PAE 334 kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); 335#endif 336 kernel_pmap->pm_active = -1; /* don't allow deactivation */ 337 TAILQ_INIT(&kernel_pmap->pm_pvlist); 338 LIST_INIT(&allpmaps); 339#ifdef SMP 340 mtx_init(&lazypmap_lock, "lazypmap", NULL, MTX_SPIN); 341#endif 342 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); 343 mtx_lock_spin(&allpmaps_lock); 344 LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); 345 mtx_unlock_spin(&allpmaps_lock); 346 nkpt = NKPT; 347 348 /* 349 * Reserve some special page table entries/VA space for temporary 350 * mapping of pages. 351 */ 352#define SYSMAP(c, p, v, n) \ 353 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); 354 355 va = virtual_avail; 356 pte = vtopte(va); 357 358 /* 359 * CMAP1/CMAP2 are used for zeroing and copying pages. 360 * CMAP3 is used for the idle process page zeroing. 361 */ 362 SYSMAP(caddr_t, CMAP1, CADDR1, 1) 363 SYSMAP(caddr_t, CMAP2, CADDR2, 1) 364 SYSMAP(caddr_t, CMAP3, CADDR3, 1) 365 *CMAP3 = 0; 366 367 mtx_init(&CMAPCADDR12_lock, "CMAPCADDR12", NULL, MTX_DEF); 368 369 /* 370 * Crashdump maps. 371 */ 372 SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS); 373 374 /* 375 * ptvmmap is used for reading arbitrary physical pages via /dev/mem. 376 * XXX ptmmap is not used. 377 */ 378 SYSMAP(caddr_t, ptmmap, ptvmmap, 1) 379 380 /* 381 * msgbufp is used to map the system message buffer. 382 * XXX msgbufmap is not used. 
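 *
 * (For reference, each SYSMAP() use in this block expands along the
 * lines of
 *	SYSMAP(caddr_t, CMAP1, CADDR1, 1)  ==>
 *		CADDR1 = (caddr_t)va; va += (1) * PAGE_SIZE;
 *		CMAP1 = pte; pte += (1);
 * i.e. it reserves n pages of KVA and records the address of the first
 * corresponding PTE slot so the mapping can later be changed in place.)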
383 */ 384 SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 385 atop(round_page(MSGBUF_SIZE))) 386 387 /* 388 * ptemap is used for pmap_pte_quick 389 */ 390 SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1); 391 SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1); 392 393 virtual_avail = va; 394 395 *CMAP1 = *CMAP2 = 0; 396 for (i = 0; i < NKPT; i++) 397 PTD[i] = 0; 398 399 /* Turn on PG_G on kernel page(s) */ 400 pmap_set_pg(); 401} 402 403/* 404 * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. 405 */ 406void 407pmap_set_pg(void) 408{ 409 pd_entry_t pdir; 410 pt_entry_t *pte; 411 vm_offset_t va, endva; 412 int i; 413 414 if (pgeflag == 0) 415 return; 416 417 i = KERNLOAD/NBPDR; 418 endva = KERNBASE + KERNend; 419 420 if (pseflag) { 421 va = KERNBASE + KERNLOAD; 422 while (va < endva) { 423 pdir = kernel_pmap->pm_pdir[KPTDI+i]; 424 pdir |= pgeflag; 425 kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir; 426 invltlb(); /* Play it safe, invltlb() every time */ 427 i++; 428 va += NBPDR; 429 } 430 } else { 431 va = (vm_offset_t)btext; 432 while (va < endva) { 433 pte = vtopte(va); 434 if (*pte) 435 *pte |= pgeflag; 436 invltlb(); /* Play it safe, invltlb() every time */ 437 va += PAGE_SIZE; 438 } 439 } 440} 441 442#ifdef PAE 443static void * 444pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) 445{ 446 *flags = UMA_SLAB_PRIV; 447 return (contigmalloc(PAGE_SIZE, NULL, 0, 0x0ULL, 0xffffffffULL, 1, 0)); 448} 449#endif 450 451/* 452 * Initialize the pmap module. 453 * Called by vm_init, to initialize any structures that the pmap 454 * system needs to map virtual memory. 455 * pmap_init has been enhanced to support in a fairly consistant 456 * way, discontiguous physical memory. 457 */ 458void 459pmap_init(void) 460{ 461 int i; 462 463 /* 464 * Allocate memory for random pmap data structures. Includes the 465 * pv_head_table. 466 */ 467 468 for(i = 0; i < vm_page_array_size; i++) { 469 vm_page_t m; 470 471 m = &vm_page_array[i]; 472 TAILQ_INIT(&m->md.pv_list); 473 m->md.pv_list_count = 0; 474 } 475 476 /* 477 * init the pv free list 478 */ 479 pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL, 480 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); 481 uma_prealloc(pvzone, MINPV); 482 483#ifdef PAE 484 pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, 485 NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, 486 UMA_ZONE_VM | UMA_ZONE_NOFREE); 487 uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); 488#endif 489 490 /* 491 * Now it is safe to enable pv_table recording. 492 */ 493 pmap_initialized = TRUE; 494} 495 496/* 497 * Initialize the address space (zone) for the pv_entries. Set a 498 * high water mark so that the system can recover from excessive 499 * numbers of pv entries. 500 */ 501void 502pmap_init2() 503{ 504 int shpgperproc = PMAP_SHPGPERPROC; 505 506 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); 507 pv_entry_max = shpgperproc * maxproc + vm_page_array_size; 508 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); 509 pv_entry_high_water = 9 * (pv_entry_max / 10); 510 uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max); 511} 512 513 514/*************************************************** 515 * Low level helper routines..... 516 ***************************************************/ 517 518#if defined(PMAP_DIAGNOSTIC) 519 520/* 521 * This code checks for non-writeable/modified pages. 522 * This should be an invalid condition. 
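 * (The hardware sets PG_M only when a write goes through a mapping
 * that has PG_RW set, so a PTE with PG_M on and PG_RW off indicates a
 * software bug or memory corruption.)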
523 */ 524static int 525pmap_nw_modified(pt_entry_t ptea) 526{ 527 int pte; 528 529 pte = (int) ptea; 530 531 if ((pte & (PG_M|PG_RW)) == PG_M) 532 return 1; 533 else 534 return 0; 535} 536#endif 537 538 539/* 540 * this routine defines the region(s) of memory that should 541 * not be tested for the modified bit. 542 */ 543static PMAP_INLINE int 544pmap_track_modified(vm_offset_t va) 545{ 546 if ((va < kmi.clean_sva) || (va >= kmi.clean_eva)) 547 return 1; 548 else 549 return 0; 550} 551 552#ifdef I386_CPU 553/* 554 * i386 only has "invalidate everything" and no SMP to worry about. 555 */ 556PMAP_INLINE void 557pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 558{ 559 560 if (pmap == kernel_pmap || pmap->pm_active) 561 invltlb(); 562} 563 564PMAP_INLINE void 565pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 566{ 567 568 if (pmap == kernel_pmap || pmap->pm_active) 569 invltlb(); 570} 571 572PMAP_INLINE void 573pmap_invalidate_all(pmap_t pmap) 574{ 575 576 if (pmap == kernel_pmap || pmap->pm_active) 577 invltlb(); 578} 579#else /* !I386_CPU */ 580#ifdef SMP 581/* 582 * For SMP, these functions have to use the IPI mechanism for coherence. 583 */ 584void 585pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 586{ 587 u_int cpumask; 588 u_int other_cpus; 589 590 if (smp_started) { 591 if (!(read_eflags() & PSL_I)) 592 panic("%s: interrupts disabled", __func__); 593 mtx_lock_spin(&smp_tlb_mtx); 594 } else 595 critical_enter(); 596 /* 597 * We need to disable interrupt preemption but MUST NOT have 598 * interrupts disabled here. 599 * XXX we may need to hold schedlock to get a coherent pm_active 600 * XXX critical sections disable interrupts again 601 */ 602 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { 603 invlpg(va); 604 smp_invlpg(va); 605 } else { 606 cpumask = PCPU_GET(cpumask); 607 other_cpus = PCPU_GET(other_cpus); 608 if (pmap->pm_active & cpumask) 609 invlpg(va); 610 if (pmap->pm_active & other_cpus) 611 smp_masked_invlpg(pmap->pm_active & other_cpus, va); 612 } 613 if (smp_started) 614 mtx_unlock_spin(&smp_tlb_mtx); 615 else 616 critical_exit(); 617} 618 619void 620pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 621{ 622 u_int cpumask; 623 u_int other_cpus; 624 vm_offset_t addr; 625 626 if (smp_started) { 627 if (!(read_eflags() & PSL_I)) 628 panic("%s: interrupts disabled", __func__); 629 mtx_lock_spin(&smp_tlb_mtx); 630 } else 631 critical_enter(); 632 /* 633 * We need to disable interrupt preemption but MUST NOT have 634 * interrupts disabled here. 
635 * XXX we may need to hold schedlock to get a coherent pm_active 636 * XXX critical sections disable interrupts again 637 */ 638 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { 639 for (addr = sva; addr < eva; addr += PAGE_SIZE) 640 invlpg(addr); 641 smp_invlpg_range(sva, eva); 642 } else { 643 cpumask = PCPU_GET(cpumask); 644 other_cpus = PCPU_GET(other_cpus); 645 if (pmap->pm_active & cpumask) 646 for (addr = sva; addr < eva; addr += PAGE_SIZE) 647 invlpg(addr); 648 if (pmap->pm_active & other_cpus) 649 smp_masked_invlpg_range(pmap->pm_active & other_cpus, 650 sva, eva); 651 } 652 if (smp_started) 653 mtx_unlock_spin(&smp_tlb_mtx); 654 else 655 critical_exit(); 656} 657 658void 659pmap_invalidate_all(pmap_t pmap) 660{ 661 u_int cpumask; 662 u_int other_cpus; 663 664 if (smp_started) { 665 if (!(read_eflags() & PSL_I)) 666 panic("%s: interrupts disabled", __func__); 667 mtx_lock_spin(&smp_tlb_mtx); 668 } else 669 critical_enter(); 670 /* 671 * We need to disable interrupt preemption but MUST NOT have 672 * interrupts disabled here. 673 * XXX we may need to hold schedlock to get a coherent pm_active 674 * XXX critical sections disable interrupts again 675 */ 676 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { 677 invltlb(); 678 smp_invltlb(); 679 } else { 680 cpumask = PCPU_GET(cpumask); 681 other_cpus = PCPU_GET(other_cpus); 682 if (pmap->pm_active & cpumask) 683 invltlb(); 684 if (pmap->pm_active & other_cpus) 685 smp_masked_invltlb(pmap->pm_active & other_cpus); 686 } 687 if (smp_started) 688 mtx_unlock_spin(&smp_tlb_mtx); 689 else 690 critical_exit(); 691} 692#else /* !SMP */ 693/* 694 * Normal, non-SMP, 486+ invalidation functions. 695 * We inline these within pmap.c for speed. 696 */ 697PMAP_INLINE void 698pmap_invalidate_page(pmap_t pmap, vm_offset_t va) 699{ 700 701 if (pmap == kernel_pmap || pmap->pm_active) 702 invlpg(va); 703} 704 705PMAP_INLINE void 706pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 707{ 708 vm_offset_t addr; 709 710 if (pmap == kernel_pmap || pmap->pm_active) 711 for (addr = sva; addr < eva; addr += PAGE_SIZE) 712 invlpg(addr); 713} 714 715PMAP_INLINE void 716pmap_invalidate_all(pmap_t pmap) 717{ 718 719 if (pmap == kernel_pmap || pmap->pm_active) 720 invltlb(); 721} 722#endif /* !SMP */ 723#endif /* !I386_CPU */ 724 725/* 726 * Are we current address space or kernel? N.B. We return FALSE when 727 * a pmap's page table is in use because a kernel thread is borrowing 728 * it. The borrowed page table can change spontaneously, making any 729 * dependence on its continued use subject to a race condition. 730 */ 731static __inline int 732pmap_is_current(pmap_t pmap) 733{ 734 735 return (pmap == kernel_pmap || 736 (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) && 737 (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME))); 738} 739 740/* 741 * If the given pmap is not the current pmap, Giant must be held. 742 */ 743pt_entry_t * 744pmap_pte(pmap_t pmap, vm_offset_t va) 745{ 746 pd_entry_t newpf; 747 pd_entry_t *pde; 748 749 pde = pmap_pde(pmap, va); 750 if (*pde & PG_PS) 751 return (pde); 752 if (*pde != 0) { 753 /* are we current address space or kernel? 
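		 * If so, the recursive page table mapping lets vtopte()
		 * find the PTE directly.  Otherwise the foreign page table
		 * page is borrowed through the PMAP2/PADDR2 window reserved
		 * in pmap_bootstrap(); Giant serializes use of that single
		 * window.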
*/ 754 if (pmap_is_current(pmap)) 755 return (vtopte(va)); 756 GIANT_REQUIRED; 757 newpf = *pde & PG_FRAME; 758 if ((*PMAP2 & PG_FRAME) != newpf) { 759 *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; 760 pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); 761 } 762 return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); 763 } 764 return (0); 765} 766 767static __inline void 768invlcaddr(void *caddr) 769{ 770#ifdef I386_CPU 771 invltlb(); 772#else 773 invlpg((u_int)caddr); 774#endif 775} 776 777/* 778 * Super fast pmap_pte routine best used when scanning 779 * the pv lists. This eliminates many coarse-grained 780 * invltlb calls. Note that many of the pv list 781 * scans are across different pmaps. It is very wasteful 782 * to do an entire invltlb for checking a single mapping. 783 * 784 * If the given pmap is not the current pmap, vm_page_queue_mtx 785 * must be held and curthread pinned to a CPU. 786 */ 787static pt_entry_t * 788pmap_pte_quick(pmap_t pmap, vm_offset_t va) 789{ 790 pd_entry_t newpf; 791 pd_entry_t *pde; 792 793 pde = pmap_pde(pmap, va); 794 if (*pde & PG_PS) 795 return (pde); 796 if (*pde != 0) { 797 /* are we current address space or kernel? */ 798 if (pmap_is_current(pmap)) 799 return (vtopte(va)); 800 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 801 KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); 802 newpf = *pde & PG_FRAME; 803 if ((*PMAP1 & PG_FRAME) != newpf) { 804 *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; 805#ifdef SMP 806 PMAP1cpu = PCPU_GET(cpuid); 807#endif 808 invlcaddr(PADDR1); 809 PMAP1changed++; 810 } else 811#ifdef SMP 812 if (PMAP1cpu != PCPU_GET(cpuid)) { 813 PMAP1cpu = PCPU_GET(cpuid); 814 invlcaddr(PADDR1); 815 PMAP1changedcpu++; 816 } else 817#endif 818 PMAP1unchanged++; 819 return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); 820 } 821 return (0); 822} 823 824/* 825 * Routine: pmap_extract 826 * Function: 827 * Extract the physical page address associated 828 * with the given map/virtual_address pair. 829 */ 830vm_paddr_t 831pmap_extract(pmap, va) 832 register pmap_t pmap; 833 vm_offset_t va; 834{ 835 vm_paddr_t rtval; 836 pt_entry_t *pte; 837 pd_entry_t pde; 838 839 if (pmap == 0) 840 return 0; 841 pde = pmap->pm_pdir[va >> PDRSHIFT]; 842 if (pde != 0) { 843 if ((pde & PG_PS) != 0) { 844 rtval = (pde & ~PDRMASK) | (va & PDRMASK); 845 return rtval; 846 } 847 pte = pmap_pte(pmap, va); 848 rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK)); 849 return rtval; 850 } 851 return 0; 852 853} 854 855/* 856 * Routine: pmap_extract_and_hold 857 * Function: 858 * Atomically extract and hold the physical page 859 * with the given pmap and virtual address pair 860 * if that mapping permits the given protection. 861 */ 862vm_page_t 863pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) 864{ 865 vm_paddr_t pa; 866 vm_page_t m; 867 868 m = NULL; 869 mtx_lock(&Giant); 870 if ((pa = pmap_extract(pmap, va)) != 0) { 871 m = PHYS_TO_VM_PAGE(pa); 872 vm_page_lock_queues(); 873 vm_page_hold(m); 874 vm_page_unlock_queues(); 875 } 876 mtx_unlock(&Giant); 877 return (m); 878} 879 880/*************************************************** 881 * Low level mapping routines..... 882 ***************************************************/ 883 884/* 885 * Add a wired page to the kva. 886 * Note: not SMP coherent. 887 */ 888PMAP_INLINE void 889pmap_kenter(vm_offset_t va, vm_paddr_t pa) 890{ 891 pt_entry_t *pte; 892 893 pte = vtopte(va); 894 pte_store(pte, pa | PG_RW | PG_V | pgeflag); 895} 896 897/* 898 * Remove a page from the kernel pagetables. 
899 * Note: not SMP coherent. 900 */ 901PMAP_INLINE void 902pmap_kremove(vm_offset_t va) 903{ 904 pt_entry_t *pte; 905 906 pte = vtopte(va); 907 pte_clear(pte); 908} 909 910/* 911 * Used to map a range of physical addresses into kernel 912 * virtual address space. 913 * 914 * The value passed in '*virt' is a suggested virtual address for 915 * the mapping. Architectures which can support a direct-mapped 916 * physical to virtual region can return the appropriate address 917 * within that region, leaving '*virt' unchanged. Other 918 * architectures should map the pages starting at '*virt' and 919 * update '*virt' with the first usable address after the mapped 920 * region. 921 */ 922vm_offset_t 923pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) 924{ 925 vm_offset_t va, sva; 926 927 va = sva = *virt; 928 while (start < end) { 929 pmap_kenter(va, start); 930 va += PAGE_SIZE; 931 start += PAGE_SIZE; 932 } 933 pmap_invalidate_range(kernel_pmap, sva, va); 934 *virt = va; 935 return (sva); 936} 937 938 939/* 940 * Add a list of wired pages to the kva 941 * this routine is only used for temporary 942 * kernel mappings that do not need to have 943 * page modification or references recorded. 944 * Note that old mappings are simply written 945 * over. The page *must* be wired. 946 * Note: SMP coherent. Uses a ranged shootdown IPI. 947 */ 948void 949pmap_qenter(vm_offset_t sva, vm_page_t *m, int count) 950{ 951 vm_offset_t va; 952 953 va = sva; 954 while (count-- > 0) { 955 pmap_kenter(va, VM_PAGE_TO_PHYS(*m)); 956 va += PAGE_SIZE; 957 m++; 958 } 959 pmap_invalidate_range(kernel_pmap, sva, va); 960} 961 962/* 963 * This routine tears out page mappings from the 964 * kernel -- it is meant only for temporary mappings. 965 * Note: SMP coherent. Uses a ranged shootdown IPI. 966 */ 967void 968pmap_qremove(vm_offset_t sva, int count) 969{ 970 vm_offset_t va; 971 972 va = sva; 973 while (count-- > 0) { 974 pmap_kremove(va); 975 va += PAGE_SIZE; 976 } 977 pmap_invalidate_range(kernel_pmap, sva, va); 978} 979 980/*************************************************** 981 * Page table page management routines..... 982 ***************************************************/ 983 984/* 985 * This routine unholds page table pages, and if the hold count 986 * drops to zero, then it decrements the wire count. 987 */ 988static int 989_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) 990{ 991 992 while (vm_page_sleep_if_busy(m, FALSE, "pmuwpt")) 993 vm_page_lock_queues(); 994 995 if (m->hold_count == 0) { 996 vm_offset_t pteva; 997 /* 998 * unmap the page table page 999 */ 1000 pmap->pm_pdir[m->pindex] = 0; 1001 --pmap->pm_stats.resident_count; 1002 /* 1003 * We never unwire a kernel page table page, making a 1004 * check for the kernel_pmap unnecessary. 1005 */ 1006 if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)) { 1007 /* 1008 * Do an invltlb to make the invalidated mapping 1009 * take effect immediately. 1010 */ 1011 pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); 1012 pmap_invalidate_page(pmap, pteva); 1013 } 1014 1015 /* 1016 * If the page is finally unwired, simply free it. 
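		 * (For a page table page, hold_count tracks how many valid
		 * mappings still live in it, while the wire count taken at
		 * allocation time merely keeps the page off the paging
		 * queues; the page is only released once both drop to zero.)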
1017 */ 1018 --m->wire_count; 1019 if (m->wire_count == 0) { 1020 vm_page_busy(m); 1021 vm_page_free_zero(m); 1022 atomic_subtract_int(&cnt.v_wire_count, 1); 1023 } 1024 return 1; 1025 } 1026 return 0; 1027} 1028 1029static PMAP_INLINE int 1030pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) 1031{ 1032 vm_page_unhold(m); 1033 if (m->hold_count == 0) 1034 return _pmap_unwire_pte_hold(pmap, m); 1035 else 1036 return 0; 1037} 1038 1039/* 1040 * After removing a page table entry, this routine is used to 1041 * conditionally free the page, and manage the hold/wire counts. 1042 */ 1043static int 1044pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte) 1045{ 1046 1047 if (va >= VM_MAXUSER_ADDRESS) 1048 return 0; 1049 1050 return pmap_unwire_pte_hold(pmap, mpte); 1051} 1052 1053void 1054pmap_pinit0(pmap) 1055 struct pmap *pmap; 1056{ 1057 1058 pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); 1059#ifdef PAE 1060 pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); 1061#endif 1062 pmap->pm_active = 0; 1063 PCPU_SET(curpmap, pmap); 1064 TAILQ_INIT(&pmap->pm_pvlist); 1065 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1066 mtx_lock_spin(&allpmaps_lock); 1067 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 1068 mtx_unlock_spin(&allpmaps_lock); 1069} 1070 1071/* 1072 * Initialize a preallocated and zeroed pmap structure, 1073 * such as one in a vmspace structure. 1074 */ 1075void 1076pmap_pinit(pmap) 1077 register struct pmap *pmap; 1078{ 1079 vm_page_t m, ptdpg[NPGPTD]; 1080 vm_paddr_t pa; 1081 static int color; 1082 int i; 1083 1084 /* 1085 * No need to allocate page table space yet but we do need a valid 1086 * page directory table. 1087 */ 1088 if (pmap->pm_pdir == NULL) { 1089 pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map, 1090 NBPTD); 1091#ifdef PAE 1092 pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); 1093 KASSERT(((vm_offset_t)pmap->pm_pdpt & 1094 ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, 1095 ("pmap_pinit: pdpt misaligned")); 1096 KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), 1097 ("pmap_pinit: pdpt above 4g")); 1098#endif 1099 } 1100 1101 /* 1102 * allocate the page directory page(s) 1103 */ 1104 for (i = 0; i < NPGPTD;) { 1105 m = vm_page_alloc(NULL, color++, 1106 VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | 1107 VM_ALLOC_ZERO); 1108 if (m == NULL) 1109 VM_WAIT; 1110 else { 1111 ptdpg[i++] = m; 1112 } 1113 } 1114 1115 pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); 1116 1117 for (i = 0; i < NPGPTD; i++) { 1118 if ((ptdpg[i]->flags & PG_ZERO) == 0) 1119 bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE); 1120 } 1121 1122 mtx_lock_spin(&allpmaps_lock); 1123 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); 1124 mtx_unlock_spin(&allpmaps_lock); 1125 /* Wire in kernel global address entries. */ 1126 /* XXX copies current process, does not fill in MPPTDI */ 1127 bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); 1128#ifdef SMP 1129 pmap->pm_pdir[MPPTDI] = PTD[MPPTDI]; 1130#endif 1131 1132 /* install self-referential address mapping entry(s) */ 1133 for (i = 0; i < NPGPTD; i++) { 1134 pa = VM_PAGE_TO_PHYS(ptdpg[i]); 1135 pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; 1136#ifdef PAE 1137 pmap->pm_pdpt[i] = pa | PG_V; 1138#endif 1139 } 1140 1141 pmap->pm_active = 0; 1142 TAILQ_INIT(&pmap->pm_pvlist); 1143 bzero(&pmap->pm_stats, sizeof pmap->pm_stats); 1144} 1145 1146/* 1147 * this routine is called if the page table page is not 1148 * mapped correctly. 
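 * Returning NULL after VM_WAIT tells the caller to re-examine the page
 * directory entry: the needed page table page may have been installed
 * by another thread while we slept, in which case pmap_allocpte() simply
 * picks it up on the retry.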
1149 */ 1150static vm_page_t 1151_pmap_allocpte(pmap, ptepindex) 1152 pmap_t pmap; 1153 unsigned ptepindex; 1154{ 1155 vm_paddr_t ptepa; 1156 vm_page_t m; 1157 1158 /* 1159 * Allocate a page table page. 1160 */ 1161 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | 1162 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { 1163 VM_WAIT; 1164 /* 1165 * Indicate the need to retry. While waiting, the page table 1166 * page may have been allocated. 1167 */ 1168 return (NULL); 1169 } 1170 if ((m->flags & PG_ZERO) == 0) 1171 pmap_zero_page(m); 1172 1173 KASSERT(m->queue == PQ_NONE, 1174 ("_pmap_allocpte: %p->queue != PQ_NONE", m)); 1175 1176 /* 1177 * Increment the hold count for the page table page 1178 * (denoting a new mapping.) 1179 */ 1180 m->hold_count++; 1181 1182 /* 1183 * Map the pagetable page into the process address space, if 1184 * it isn't already there. 1185 */ 1186 1187 pmap->pm_stats.resident_count++; 1188 1189 ptepa = VM_PAGE_TO_PHYS(m); 1190 pmap->pm_pdir[ptepindex] = 1191 (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); 1192 1193 vm_page_lock_queues(); 1194 vm_page_wakeup(m); 1195 vm_page_unlock_queues(); 1196 1197 return m; 1198} 1199 1200static vm_page_t 1201pmap_allocpte(pmap_t pmap, vm_offset_t va) 1202{ 1203 unsigned ptepindex; 1204 pd_entry_t ptepa; 1205 vm_page_t m; 1206 1207 /* 1208 * Calculate pagetable page index 1209 */ 1210 ptepindex = va >> PDRSHIFT; 1211retry: 1212 /* 1213 * Get the page directory entry 1214 */ 1215 ptepa = pmap->pm_pdir[ptepindex]; 1216 1217 /* 1218 * This supports switching from a 4MB page to a 1219 * normal 4K page. 1220 */ 1221 if (ptepa & PG_PS) { 1222 pmap->pm_pdir[ptepindex] = 0; 1223 ptepa = 0; 1224 pmap_invalidate_all(kernel_pmap); 1225 } 1226 1227 /* 1228 * If the page table page is mapped, we just increment the 1229 * hold count, and activate it. 1230 */ 1231 if (ptepa) { 1232 m = PHYS_TO_VM_PAGE(ptepa); 1233 m->hold_count++; 1234 } else { 1235 /* 1236 * Here if the pte page isn't mapped, or if it has 1237 * been deallocated. 1238 */ 1239 m = _pmap_allocpte(pmap, ptepindex); 1240 if (m == NULL) 1241 goto retry; 1242 } 1243 return (m); 1244} 1245 1246 1247/*************************************************** 1248* Pmap allocation/deallocation routines. 1249 ***************************************************/ 1250 1251#ifdef SMP 1252/* 1253 * Deal with a SMP shootdown of other users of the pmap that we are 1254 * trying to dispose of. This can be a bit hairy. 
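 * A CPU that has switched to a kernel thread may still be running on
 * this pmap's page directory, since such "lazy" switches leave %cr3
 * alone.  Before the directory pages can be freed, each such CPU is
 * forced, via IPI_LAZYPMAP, to reload %cr3 from its own pcb_cr3 and to
 * clear its bit in pm_active.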
1255 */ 1256static u_int *lazymask; 1257static u_int lazyptd; 1258static volatile u_int lazywait; 1259 1260void pmap_lazyfix_action(void); 1261 1262void 1263pmap_lazyfix_action(void) 1264{ 1265 u_int mymask = PCPU_GET(cpumask); 1266 1267 if (rcr3() == lazyptd) 1268 load_cr3(PCPU_GET(curpcb)->pcb_cr3); 1269 atomic_clear_int(lazymask, mymask); 1270 atomic_store_rel_int(&lazywait, 1); 1271} 1272 1273static void 1274pmap_lazyfix_self(u_int mymask) 1275{ 1276 1277 if (rcr3() == lazyptd) 1278 load_cr3(PCPU_GET(curpcb)->pcb_cr3); 1279 atomic_clear_int(lazymask, mymask); 1280} 1281 1282 1283static void 1284pmap_lazyfix(pmap_t pmap) 1285{ 1286 u_int mymask = PCPU_GET(cpumask); 1287 u_int mask; 1288 register u_int spins; 1289 1290 while ((mask = pmap->pm_active) != 0) { 1291 spins = 50000000; 1292 mask = mask & -mask; /* Find least significant set bit */ 1293 mtx_lock_spin(&lazypmap_lock); 1294#ifdef PAE 1295 lazyptd = vtophys(pmap->pm_pdpt); 1296#else 1297 lazyptd = vtophys(pmap->pm_pdir); 1298#endif 1299 if (mask == mymask) { 1300 lazymask = &pmap->pm_active; 1301 pmap_lazyfix_self(mymask); 1302 } else { 1303 atomic_store_rel_int((u_int *)&lazymask, 1304 (u_int)&pmap->pm_active); 1305 atomic_store_rel_int(&lazywait, 0); 1306 ipi_selected(mask, IPI_LAZYPMAP); 1307 while (lazywait == 0) { 1308 ia32_pause(); 1309 if (--spins == 0) 1310 break; 1311 } 1312 } 1313 mtx_unlock_spin(&lazypmap_lock); 1314 if (spins == 0) 1315 printf("pmap_lazyfix: spun for 50000000\n"); 1316 } 1317} 1318 1319#else /* SMP */ 1320 1321/* 1322 * Cleaning up on uniprocessor is easy. For various reasons, we're 1323 * unlikely to have to even execute this code, including the fact 1324 * that the cleanup is deferred until the parent does a wait(2), which 1325 * means that another userland process has run. 1326 */ 1327static void 1328pmap_lazyfix(pmap_t pmap) 1329{ 1330 u_int cr3; 1331 1332 cr3 = vtophys(pmap->pm_pdir); 1333 if (cr3 == rcr3()) { 1334 load_cr3(PCPU_GET(curpcb)->pcb_cr3); 1335 pmap->pm_active &= ~(PCPU_GET(cpumask)); 1336 } 1337} 1338#endif /* SMP */ 1339 1340/* 1341 * Release any resources held by the given physical map. 1342 * Called when a pmap initialized by pmap_pinit is being released. 1343 * Should only be called if the map contains no valid mappings. 
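 * By now only the kernel PDEs and the recursive self-mapping remain in
 * the page directory; they are zeroed below, and the NPGPTD directory
 * pages wired by pmap_pinit() are then unwired and freed.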
1344 */ 1345void 1346pmap_release(pmap_t pmap) 1347{ 1348 vm_page_t m, ptdpg[NPGPTD]; 1349 int i; 1350 1351 KASSERT(pmap->pm_stats.resident_count == 0, 1352 ("pmap_release: pmap resident count %ld != 0", 1353 pmap->pm_stats.resident_count)); 1354 1355 pmap_lazyfix(pmap); 1356 mtx_lock_spin(&allpmaps_lock); 1357 LIST_REMOVE(pmap, pm_list); 1358 mtx_unlock_spin(&allpmaps_lock); 1359 1360 for (i = 0; i < NPGPTD; i++) 1361 ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i]); 1362 1363 bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * 1364 sizeof(*pmap->pm_pdir)); 1365#ifdef SMP 1366 pmap->pm_pdir[MPPTDI] = 0; 1367#endif 1368 1369 pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); 1370 1371 vm_page_lock_queues(); 1372 for (i = 0; i < NPGPTD; i++) { 1373 m = ptdpg[i]; 1374#ifdef PAE 1375 KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), 1376 ("pmap_release: got wrong ptd page")); 1377#endif 1378 m->wire_count--; 1379 atomic_subtract_int(&cnt.v_wire_count, 1); 1380 vm_page_free_zero(m); 1381 } 1382 vm_page_unlock_queues(); 1383} 1384 1385static int 1386kvm_size(SYSCTL_HANDLER_ARGS) 1387{ 1388 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; 1389 1390 return sysctl_handle_long(oidp, &ksize, 0, req); 1391} 1392SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 1393 0, 0, kvm_size, "IU", "Size of KVM"); 1394 1395static int 1396kvm_free(SYSCTL_HANDLER_ARGS) 1397{ 1398 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; 1399 1400 return sysctl_handle_long(oidp, &kfree, 0, req); 1401} 1402SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 1403 0, 0, kvm_free, "IU", "Amount of KVM free"); 1404 1405/* 1406 * grow the number of kernel page table entries, if needed 1407 */ 1408void 1409pmap_growkernel(vm_offset_t addr) 1410{ 1411 struct pmap *pmap; 1412 int s; 1413 vm_paddr_t ptppaddr; 1414 vm_page_t nkpg; 1415 pd_entry_t newpdir; 1416 pt_entry_t *pde; 1417 1418 s = splhigh(); 1419 mtx_assert(&kernel_map->system_mtx, MA_OWNED); 1420 if (kernel_vm_end == 0) { 1421 kernel_vm_end = KERNBASE; 1422 nkpt = 0; 1423 while (pdir_pde(PTD, kernel_vm_end)) { 1424 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1425 nkpt++; 1426 } 1427 } 1428 addr = roundup2(addr, PAGE_SIZE * NPTEPG); 1429 while (kernel_vm_end < addr) { 1430 if (pdir_pde(PTD, kernel_vm_end)) { 1431 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1432 continue; 1433 } 1434 1435 /* 1436 * This index is bogus, but out of the way 1437 */ 1438 nkpg = vm_page_alloc(NULL, nkpt, 1439 VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); 1440 if (!nkpg) 1441 panic("pmap_growkernel: no memory to grow kernel"); 1442 1443 nkpt++; 1444 1445 pmap_zero_page(nkpg); 1446 ptppaddr = VM_PAGE_TO_PHYS(nkpg); 1447 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); 1448 pdir_pde(PTD, kernel_vm_end) = newpdir; 1449 1450 mtx_lock_spin(&allpmaps_lock); 1451 LIST_FOREACH(pmap, &allpmaps, pm_list) { 1452 pde = pmap_pde(pmap, kernel_vm_end); 1453 pde_store(pde, newpdir); 1454 } 1455 mtx_unlock_spin(&allpmaps_lock); 1456 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); 1457 } 1458 splx(s); 1459} 1460 1461 1462/*************************************************** 1463 * page management routines. 
1464 ***************************************************/ 1465 1466/* 1467 * free the pv_entry back to the free list 1468 */ 1469static PMAP_INLINE void 1470free_pv_entry(pv_entry_t pv) 1471{ 1472 pv_entry_count--; 1473 uma_zfree(pvzone, pv); 1474} 1475 1476/* 1477 * get a new pv_entry, allocating a block from the system 1478 * when needed. 1479 * the memory allocation is performed bypassing the malloc code 1480 * because of the possibility of allocations at interrupt time. 1481 */ 1482static pv_entry_t 1483get_pv_entry(void) 1484{ 1485 pv_entry_count++; 1486 if (pv_entry_high_water && 1487 (pv_entry_count > pv_entry_high_water) && 1488 (pmap_pagedaemon_waken == 0)) { 1489 pmap_pagedaemon_waken = 1; 1490 wakeup (&vm_pages_needed); 1491 } 1492 return uma_zalloc(pvzone, M_NOWAIT); 1493} 1494 1495/* 1496 * If it is the first entry on the list, it is actually 1497 * in the header and we must copy the following entry up 1498 * to the header. Otherwise we must search the list for 1499 * the entry. In either case we free the now unused entry. 1500 */ 1501 1502static int 1503pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) 1504{ 1505 pv_entry_t pv; 1506 int rtval; 1507 int s; 1508 1509 s = splvm(); 1510 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1511 if (m->md.pv_list_count < pmap->pm_stats.resident_count) { 1512 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { 1513 if (pmap == pv->pv_pmap && va == pv->pv_va) 1514 break; 1515 } 1516 } else { 1517 TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { 1518 if (va == pv->pv_va) 1519 break; 1520 } 1521 } 1522 1523 rtval = 0; 1524 if (pv) { 1525 rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); 1526 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 1527 m->md.pv_list_count--; 1528 if (TAILQ_FIRST(&m->md.pv_list) == NULL) 1529 vm_page_flag_clear(m, PG_WRITEABLE); 1530 1531 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); 1532 free_pv_entry(pv); 1533 } 1534 1535 splx(s); 1536 return rtval; 1537} 1538 1539/* 1540 * Create a pv entry for page at pa for 1541 * (pmap, va). 1542 */ 1543static void 1544pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m) 1545{ 1546 1547 int s; 1548 pv_entry_t pv; 1549 1550 s = splvm(); 1551 pv = get_pv_entry(); 1552 pv->pv_va = va; 1553 pv->pv_pmap = pmap; 1554 pv->pv_ptem = mpte; 1555 1556 vm_page_lock_queues(); 1557 TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); 1558 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 1559 m->md.pv_list_count++; 1560 1561 vm_page_unlock_queues(); 1562 splx(s); 1563} 1564 1565/* 1566 * pmap_remove_pte: do the things to unmap a page in a process 1567 */ 1568static int 1569pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va) 1570{ 1571 pt_entry_t oldpte; 1572 vm_page_t m, mpte; 1573 1574 oldpte = pte_load_clear(ptq); 1575 if (oldpte & PG_W) 1576 pmap->pm_stats.wired_count -= 1; 1577 /* 1578 * Machines that don't support invlpg, also don't support 1579 * PG_G. 
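	 * Conversely, PG_G is only ever set on CPUs that do have invlpg,
	 * so the single-page flush below is always available when it is
	 * needed.  The flush cannot be deferred: a global mapping would
	 * survive the %cr3 reload that batched invalidation relies on.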
1580 */ 1581 if (oldpte & PG_G) 1582 pmap_invalidate_page(kernel_pmap, va); 1583 pmap->pm_stats.resident_count -= 1; 1584 if (oldpte & PG_MANAGED) { 1585 m = PHYS_TO_VM_PAGE(oldpte); 1586 if (oldpte & PG_M) { 1587#if defined(PMAP_DIAGNOSTIC) 1588 if (pmap_nw_modified((pt_entry_t) oldpte)) { 1589 printf( 1590 "pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n", 1591 va, oldpte); 1592 } 1593#endif 1594 if (pmap_track_modified(va)) 1595 vm_page_dirty(m); 1596 } 1597 if (oldpte & PG_A) 1598 vm_page_flag_set(m, PG_REFERENCED); 1599 return pmap_remove_entry(pmap, m, va); 1600 } else { 1601 mpte = PHYS_TO_VM_PAGE(*pmap_pde(pmap, va)); 1602 return pmap_unuse_pt(pmap, va, mpte); 1603 } 1604} 1605 1606/* 1607 * Remove a single page from a process address space 1608 */ 1609static void 1610pmap_remove_page(pmap_t pmap, vm_offset_t va) 1611{ 1612 pt_entry_t *pte; 1613 1614 if ((pte = pmap_pte(pmap, va)) == NULL || *pte == 0) 1615 return; 1616 pmap_remove_pte(pmap, pte, va); 1617 pmap_invalidate_page(pmap, va); 1618} 1619 1620/* 1621 * Remove the given range of addresses from the specified map. 1622 * 1623 * It is assumed that the start and end are properly 1624 * rounded to the page size. 1625 */ 1626void 1627pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) 1628{ 1629 vm_offset_t pdnxt; 1630 pd_entry_t ptpaddr; 1631 pt_entry_t *pte; 1632 int anyvalid; 1633 1634 if (pmap == NULL) 1635 return; 1636 1637 if (pmap->pm_stats.resident_count == 0) 1638 return; 1639 1640 /* 1641 * special handling of removing one page. a very 1642 * common operation and easy to short circuit some 1643 * code. 1644 */ 1645 if ((sva + PAGE_SIZE == eva) && 1646 ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { 1647 pmap_remove_page(pmap, sva); 1648 return; 1649 } 1650 1651 anyvalid = 0; 1652 1653 for (; sva < eva; sva = pdnxt) { 1654 unsigned pdirindex; 1655 1656 /* 1657 * Calculate index for next page table. 1658 */ 1659 pdnxt = (sva + NBPDR) & ~PDRMASK; 1660 if (pmap->pm_stats.resident_count == 0) 1661 break; 1662 1663 pdirindex = sva >> PDRSHIFT; 1664 ptpaddr = pmap->pm_pdir[pdirindex]; 1665 1666 /* 1667 * Weed out invalid mappings. Note: we assume that the page 1668 * directory table is always allocated, and in kernel virtual. 1669 */ 1670 if (ptpaddr == 0) 1671 continue; 1672 1673 /* 1674 * Check for large page. 1675 */ 1676 if ((ptpaddr & PG_PS) != 0) { 1677 pmap->pm_pdir[pdirindex] = 0; 1678 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 1679 anyvalid = 1; 1680 continue; 1681 } 1682 1683 /* 1684 * Limit our scan to either the end of the va represented 1685 * by the current page table page, or to the end of the 1686 * range being removed. 1687 */ 1688 if (pdnxt > eva) 1689 pdnxt = eva; 1690 1691 for (; sva != pdnxt; sva += PAGE_SIZE) { 1692 if ((pte = pmap_pte(pmap, sva)) == NULL || 1693 *pte == 0) 1694 continue; 1695 anyvalid = 1; 1696 if (pmap_remove_pte(pmap, pte, sva)) 1697 break; 1698 } 1699 } 1700 1701 if (anyvalid) 1702 pmap_invalidate_all(pmap); 1703} 1704 1705/* 1706 * Routine: pmap_remove_all 1707 * Function: 1708 * Removes this physical page from 1709 * all physical maps in which it resides. 1710 * Reflects back modify bits to the pager. 1711 * 1712 * Notes: 1713 * Original versions of this routine were very 1714 * inefficient because they iteratively called 1715 * pmap_remove (slow...) 
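 *	The present version instead walks the page's pv list directly,
 *	clearing each PTE with pte_load_clear() while the page queues
 *	lock is held and curthread is pinned, so that the PMAP1/PADDR1
 *	window used by pmap_pte_quick() stays valid on the local CPU.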
1716 */ 1717 1718void 1719pmap_remove_all(vm_page_t m) 1720{ 1721 register pv_entry_t pv; 1722 pt_entry_t *pte, tpte; 1723 int s; 1724 1725#if defined(PMAP_DIAGNOSTIC) 1726 /* 1727 * XXX This makes pmap_remove_all() illegal for non-managed pages! 1728 */ 1729 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { 1730 panic("pmap_remove_all: illegal for unmanaged page, va: 0x%x", 1731 VM_PAGE_TO_PHYS(m)); 1732 } 1733#endif 1734 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 1735 s = splvm(); 1736 sched_pin(); 1737 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 1738 pv->pv_pmap->pm_stats.resident_count--; 1739 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); 1740 tpte = pte_load_clear(pte); 1741 if (tpte & PG_W) 1742 pv->pv_pmap->pm_stats.wired_count--; 1743 if (tpte & PG_A) 1744 vm_page_flag_set(m, PG_REFERENCED); 1745 1746 /* 1747 * Update the vm_page_t clean and reference bits. 1748 */ 1749 if (tpte & PG_M) { 1750#if defined(PMAP_DIAGNOSTIC) 1751 if (pmap_nw_modified((pt_entry_t) tpte)) { 1752 printf( 1753 "pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n", 1754 pv->pv_va, tpte); 1755 } 1756#endif 1757 if (pmap_track_modified(pv->pv_va)) 1758 vm_page_dirty(m); 1759 } 1760 pmap_invalidate_page(pv->pv_pmap, pv->pv_va); 1761 TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); 1762 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 1763 m->md.pv_list_count--; 1764 pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); 1765 free_pv_entry(pv); 1766 } 1767 vm_page_flag_clear(m, PG_WRITEABLE); 1768 sched_unpin(); 1769 splx(s); 1770} 1771 1772/* 1773 * Set the physical protection on the 1774 * specified range of this map as requested. 1775 */ 1776void 1777pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) 1778{ 1779 vm_offset_t pdnxt; 1780 pd_entry_t ptpaddr; 1781 int anychanged; 1782 1783 if (pmap == NULL) 1784 return; 1785 1786 if ((prot & VM_PROT_READ) == VM_PROT_NONE) { 1787 pmap_remove(pmap, sva, eva); 1788 return; 1789 } 1790 1791 if (prot & VM_PROT_WRITE) 1792 return; 1793 1794 anychanged = 0; 1795 1796 for (; sva < eva; sva = pdnxt) { 1797 unsigned pdirindex; 1798 1799 pdnxt = (sva + NBPDR) & ~PDRMASK; 1800 1801 pdirindex = sva >> PDRSHIFT; 1802 ptpaddr = pmap->pm_pdir[pdirindex]; 1803 1804 /* 1805 * Weed out invalid mappings. Note: we assume that the page 1806 * directory table is always allocated, and in kernel virtual. 1807 */ 1808 if (ptpaddr == 0) 1809 continue; 1810 1811 /* 1812 * Check for large page. 
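		 * A 4MB mapping is write-protected by clearing PG_RW (and
		 * PG_M) directly in the page directory entry; there are no
		 * 4K PTEs underneath it to adjust.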
1813 */ 1814 if ((ptpaddr & PG_PS) != 0) { 1815 pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW); 1816 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; 1817 anychanged = 1; 1818 continue; 1819 } 1820 1821 if (pdnxt > eva) 1822 pdnxt = eva; 1823 1824 for (; sva != pdnxt; sva += PAGE_SIZE) { 1825 pt_entry_t pbits; 1826 pt_entry_t *pte; 1827 vm_page_t m; 1828 1829 if ((pte = pmap_pte(pmap, sva)) == NULL) 1830 continue; 1831 pbits = *pte; 1832 if (pbits & PG_MANAGED) { 1833 m = NULL; 1834 if (pbits & PG_A) { 1835 m = PHYS_TO_VM_PAGE(pbits); 1836 vm_page_flag_set(m, PG_REFERENCED); 1837 pbits &= ~PG_A; 1838 } 1839 if ((pbits & PG_M) != 0 && 1840 pmap_track_modified(sva)) { 1841 if (m == NULL) 1842 m = PHYS_TO_VM_PAGE(pbits); 1843 vm_page_dirty(m); 1844 pbits &= ~PG_M; 1845 } 1846 } 1847 1848 pbits &= ~PG_RW; 1849 1850 if (pbits != *pte) { 1851 pte_store(pte, pbits); 1852 anychanged = 1; 1853 } 1854 } 1855 } 1856 if (anychanged) 1857 pmap_invalidate_all(pmap); 1858} 1859 1860/* 1861 * Insert the given physical page (p) at 1862 * the specified virtual address (v) in the 1863 * target physical map with the protection requested. 1864 * 1865 * If specified, the page will be wired down, meaning 1866 * that the related pte can not be reclaimed. 1867 * 1868 * NB: This is the only routine which MAY NOT lazy-evaluate 1869 * or lose information. That is, this routine must actually 1870 * insert this page into the given map NOW. 1871 */ 1872void 1873pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, 1874 boolean_t wired) 1875{ 1876 vm_paddr_t pa; 1877 register pt_entry_t *pte; 1878 vm_paddr_t opa; 1879 pt_entry_t origpte, newpte; 1880 vm_page_t mpte; 1881 1882 if (pmap == NULL) 1883 return; 1884 1885 va &= PG_FRAME; 1886#ifdef PMAP_DIAGNOSTIC 1887 if (va > VM_MAX_KERNEL_ADDRESS) 1888 panic("pmap_enter: toobig"); 1889 if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) 1890 panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); 1891#endif 1892 1893 mpte = NULL; 1894 /* 1895 * In the case that a page table page is not 1896 * resident, we are creating it here. 1897 */ 1898 if (va < VM_MAXUSER_ADDRESS) { 1899 mpte = pmap_allocpte(pmap, va); 1900 } 1901#if 0 && defined(PMAP_DIAGNOSTIC) 1902 else { 1903 pd_entry_t *pdeaddr = pmap_pde(pmap, va); 1904 origpte = *pdeaddr; 1905 if ((origpte & PG_V) == 0) { 1906 panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n", 1907 pmap->pm_pdir[PTDPTDI], origpte, va); 1908 } 1909 } 1910#endif 1911 1912 pte = pmap_pte(pmap, va); 1913 1914 /* 1915 * Page Directory table entry not valid, we need a new PT page 1916 */ 1917 if (pte == NULL) { 1918 panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n", 1919 (uintmax_t)pmap->pm_pdir[PTDPTDI], va); 1920 } 1921 1922 pa = VM_PAGE_TO_PHYS(m) & PG_FRAME; 1923 origpte = *pte; 1924 opa = origpte & PG_FRAME; 1925 1926 if (origpte & PG_PS) { 1927 /* 1928 * Yes, I know this will truncate upper address bits for PAE, 1929 * but I'm actually more interested in the lower bits 1930 */ 1931 printf("pmap_enter: va %p, pte %p, origpte %p\n", 1932 (void *)va, (void *)pte, (void *)(uintptr_t)origpte); 1933 panic("pmap_enter: attempted pmap_enter on 4MB page"); 1934 } 1935 1936 /* 1937 * Mapping has not changed, must be protection or wiring change. 1938 */ 1939 if (origpte && (opa == pa)) { 1940 /* 1941 * Wiring change, just update stats. We don't worry about 1942 * wiring PT pages as they remain resident as long as there 1943 * are valid mappings in them. 
Hence, if a user page is wired, 1944 * the PT page will be also. 1945 */ 1946 if (wired && ((origpte & PG_W) == 0)) 1947 pmap->pm_stats.wired_count++; 1948 else if (!wired && (origpte & PG_W)) 1949 pmap->pm_stats.wired_count--; 1950 1951#if defined(PMAP_DIAGNOSTIC) 1952 if (pmap_nw_modified((pt_entry_t) origpte)) { 1953 printf( 1954 "pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n", 1955 va, origpte); 1956 } 1957#endif 1958 1959 /* 1960 * Remove extra pte reference 1961 */ 1962 if (mpte) 1963 mpte->hold_count--; 1964 1965 /* 1966 * We might be turning off write access to the page, 1967 * so we go ahead and sense modify status. 1968 */ 1969 if (origpte & PG_MANAGED) { 1970 if ((origpte & PG_M) && pmap_track_modified(va)) { 1971 vm_page_t om; 1972 om = PHYS_TO_VM_PAGE(opa); 1973 vm_page_dirty(om); 1974 } 1975 pa |= PG_MANAGED; 1976 } 1977 goto validate; 1978 } 1979 /* 1980 * Mapping has changed, invalidate old range and fall through to 1981 * handle validating new mapping. 1982 */ 1983 if (opa) { 1984 int err; 1985 vm_page_lock_queues(); 1986 err = pmap_remove_pte(pmap, pte, va); 1987 vm_page_unlock_queues(); 1988 if (err) 1989 panic("pmap_enter: pte vanished, va: 0x%x", va); 1990 } 1991 1992 /* 1993 * Enter on the PV list if part of our managed memory. Note that we 1994 * raise IPL while manipulating pv_table since pmap_enter can be 1995 * called at interrupt time. 1996 */ 1997 if (pmap_initialized && 1998 (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { 1999 pmap_insert_entry(pmap, va, mpte, m); 2000 pa |= PG_MANAGED; 2001 } 2002 2003 /* 2004 * Increment counters 2005 */ 2006 pmap->pm_stats.resident_count++; 2007 if (wired) 2008 pmap->pm_stats.wired_count++; 2009 2010validate: 2011 /* 2012 * Now validate mapping with desired protection/wiring. 2013 */ 2014 newpte = (pt_entry_t)(pa | PG_V); 2015 if ((prot & VM_PROT_WRITE) != 0) 2016 newpte |= PG_RW; 2017 if (wired) 2018 newpte |= PG_W; 2019 if (va < VM_MAXUSER_ADDRESS) 2020 newpte |= PG_U; 2021 if (pmap == kernel_pmap) 2022 newpte |= pgeflag; 2023 2024 /* 2025 * if the mapping or permission bits are different, we need 2026 * to update the pte. 2027 */ 2028 if ((origpte & ~(PG_M|PG_A)) != newpte) { 2029 pte_store(pte, newpte | PG_A); 2030 /*if (origpte)*/ { 2031 pmap_invalidate_page(pmap, va); 2032 } 2033 } 2034} 2035 2036/* 2037 * this code makes some *MAJOR* assumptions: 2038 * 1. Current pmap & pmap exists. 2039 * 2. Not wired. 2040 * 3. Read access. 2041 * 4. No page table pages. 2042 * 5. Tlbflush is deferred to calling procedure. 2043 * 6. Page IS managed. 2044 * but is *MUCH* faster than pmap_enter... 2045 */ 2046 2047vm_page_t 2048pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte) 2049{ 2050 pt_entry_t *pte; 2051 vm_paddr_t pa; 2052 2053 /* 2054 * In the case that a page table page is not 2055 * resident, we are creating it here. 2056 */ 2057 if (va < VM_MAXUSER_ADDRESS) { 2058 unsigned ptepindex; 2059 pd_entry_t ptepa; 2060 2061 /* 2062 * Calculate pagetable page index 2063 */ 2064 ptepindex = va >> PDRSHIFT; 2065 if (mpte && (mpte->pindex == ptepindex)) { 2066 mpte->hold_count++; 2067 } else { 2068retry: 2069 /* 2070 * Get the page directory entry 2071 */ 2072 ptepa = pmap->pm_pdir[ptepindex]; 2073 2074 /* 2075 * If the page table page is mapped, we just increment 2076 * the hold count, and activate it. 
2077 */ 2078 if (ptepa) { 2079 if (ptepa & PG_PS) 2080 panic("pmap_enter_quick: unexpected mapping into 4MB page"); 2081 mpte = PHYS_TO_VM_PAGE(ptepa); 2082 mpte->hold_count++; 2083 } else { 2084 mpte = _pmap_allocpte(pmap, ptepindex); 2085 if (mpte == NULL) 2086 goto retry; 2087 } 2088 } 2089 } else { 2090 mpte = NULL; 2091 } 2092 2093 /* 2094 * This call to vtopte makes the assumption that we are 2095 * entering the page into the current pmap. In order to support 2096 * quick entry into any pmap, one would likely use pmap_pte_quick. 2097 * But that isn't as quick as vtopte. 2098 */ 2099 pte = vtopte(va); 2100 if (*pte) { 2101 if (mpte != NULL) { 2102 vm_page_lock_queues(); 2103 pmap_unwire_pte_hold(pmap, mpte); 2104 vm_page_unlock_queues(); 2105 } 2106 return 0; 2107 } 2108 2109 /* 2110 * Enter on the PV list if part of our managed memory. Note that we 2111 * raise IPL while manipulating pv_table since pmap_enter can be 2112 * called at interrupt time. 2113 */ 2114 if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) 2115 pmap_insert_entry(pmap, va, mpte, m); 2116 2117 /* 2118 * Increment counters 2119 */ 2120 pmap->pm_stats.resident_count++; 2121 2122 pa = VM_PAGE_TO_PHYS(m); 2123 2124 /* 2125 * Now validate mapping with RO protection 2126 */ 2127 if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) 2128 pte_store(pte, pa | PG_V | PG_U); 2129 else 2130 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); 2131 2132 return mpte; 2133} 2134 2135/* 2136 * Make a temporary mapping for a physical address. This is only intended 2137 * to be used for panic dumps. 2138 */ 2139void * 2140pmap_kenter_temporary(vm_paddr_t pa, int i) 2141{ 2142 vm_offset_t va; 2143 2144 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); 2145 pmap_kenter(va, pa); 2146#ifndef I386_CPU 2147 invlpg(va); 2148#else 2149 invltlb(); 2150#endif 2151 return ((void *)crashdumpmap); 2152} 2153 2154/* 2155 * This code maps large physical mmap regions into the 2156 * processor address space. Note that some shortcuts 2157 * are taken, but the code works. 
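 * Only OBJT_DEVICE objects (framebuffers and the like) are handled, and
 * only when PSE is available and the virtual address, size, and backing
 * physical address are all 4MB-aligned; the region is then mapped with
 * PG_PS page directory entries instead of individual 4K PTEs.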
2158 */ 2159void 2160pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, 2161 vm_object_t object, vm_pindex_t pindex, 2162 vm_size_t size) 2163{ 2164 vm_page_t p; 2165 2166 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); 2167 KASSERT(object->type == OBJT_DEVICE, 2168 ("pmap_object_init_pt: non-device object")); 2169 if (pseflag && 2170 ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) { 2171 int i; 2172 vm_page_t m[1]; 2173 unsigned int ptepindex; 2174 int npdes; 2175 pd_entry_t ptepa; 2176 2177 if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)]) 2178 return; 2179retry: 2180 p = vm_page_lookup(object, pindex); 2181 if (p != NULL) { 2182 vm_page_lock_queues(); 2183 if (vm_page_sleep_if_busy(p, FALSE, "init4p")) 2184 goto retry; 2185 } else { 2186 p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); 2187 if (p == NULL) 2188 return; 2189 m[0] = p; 2190 2191 if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { 2192 vm_page_lock_queues(); 2193 vm_page_free(p); 2194 vm_page_unlock_queues(); 2195 return; 2196 } 2197 2198 p = vm_page_lookup(object, pindex); 2199 vm_page_lock_queues(); 2200 vm_page_wakeup(p); 2201 } 2202 vm_page_unlock_queues(); 2203 2204 ptepa = VM_PAGE_TO_PHYS(p); 2205 if (ptepa & (NBPDR - 1)) 2206 return; 2207 2208 p->valid = VM_PAGE_BITS_ALL; 2209 2210 pmap->pm_stats.resident_count += size >> PAGE_SHIFT; 2211 npdes = size >> PDRSHIFT; 2212 for(i = 0; i < npdes; i++) { 2213 pde_store(&pmap->pm_pdir[ptepindex], 2214 ptepa | PG_U | PG_RW | PG_V | PG_PS); 2215 ptepa += NBPDR; 2216 ptepindex += 1; 2217 } 2218 pmap_invalidate_all(pmap); 2219 } 2220} 2221 2222/* 2223 * Routine: pmap_change_wiring 2224 * Function: Change the wiring attribute for a map/virtual-address 2225 * pair. 2226 * In/out conditions: 2227 * The mapping must already exist in the pmap. 2228 */ 2229void 2230pmap_change_wiring(pmap, va, wired) 2231 register pmap_t pmap; 2232 vm_offset_t va; 2233 boolean_t wired; 2234{ 2235 register pt_entry_t *pte; 2236 2237 if (pmap == NULL) 2238 return; 2239 2240 pte = pmap_pte(pmap, va); 2241 2242 if (wired && !pmap_pte_w(pte)) 2243 pmap->pm_stats.wired_count++; 2244 else if (!wired && pmap_pte_w(pte)) 2245 pmap->pm_stats.wired_count--; 2246 2247 /* 2248 * Wiring is not a hardware characteristic so there is no need to 2249 * invalidate TLB. 2250 */ 2251 pmap_pte_set_w(pte, wired); 2252} 2253 2254 2255 2256/* 2257 * Copy the range specified by src_addr/len 2258 * from the source map to the range dst_addr/len 2259 * in the destination map. 2260 * 2261 * This routine is only advisory and need not do anything. 2262 */ 2263 2264void 2265pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, 2266 vm_offset_t src_addr) 2267{ 2268 vm_offset_t addr; 2269 vm_offset_t end_addr = src_addr + len; 2270 vm_offset_t pdnxt; 2271 vm_page_t m; 2272 2273 if (dst_addr != src_addr) 2274 return; 2275 2276 if (!pmap_is_current(src_pmap)) 2277 return; 2278 2279 for (addr = src_addr; addr < end_addr; addr = pdnxt) { 2280 pt_entry_t *src_pte, *dst_pte; 2281 vm_page_t dstmpte, srcmpte; 2282 pd_entry_t srcptepaddr; 2283 unsigned ptepindex; 2284 2285 if (addr >= UPT_MIN_ADDRESS) 2286 panic("pmap_copy: invalid to pmap_copy page tables\n"); 2287 2288 /* 2289 * Don't let optional prefaulting of pages make us go 2290 * way below the low water mark of free pages or way 2291 * above high water mark of used pv entries. 
2292 */ 2293 if (cnt.v_free_count < cnt.v_free_reserved || 2294 pv_entry_count > pv_entry_high_water) 2295 break; 2296 2297 pdnxt = (addr + NBPDR) & ~PDRMASK; 2298 ptepindex = addr >> PDRSHIFT; 2299 2300 srcptepaddr = src_pmap->pm_pdir[ptepindex]; 2301 if (srcptepaddr == 0) 2302 continue; 2303 2304 if (srcptepaddr & PG_PS) { 2305 if (dst_pmap->pm_pdir[ptepindex] == 0) { 2306 dst_pmap->pm_pdir[ptepindex] = srcptepaddr; 2307 dst_pmap->pm_stats.resident_count += 2308 NBPDR / PAGE_SIZE; 2309 } 2310 continue; 2311 } 2312 2313 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); 2314 if (srcmpte->hold_count == 0 || (srcmpte->flags & PG_BUSY)) 2315 continue; 2316 2317 if (pdnxt > end_addr) 2318 pdnxt = end_addr; 2319 2320 src_pte = vtopte(addr); 2321 while (addr < pdnxt) { 2322 pt_entry_t ptetemp; 2323 ptetemp = *src_pte; 2324 /* 2325 * we only virtual copy managed pages 2326 */ 2327 if ((ptetemp & PG_MANAGED) != 0) { 2328 /* 2329 * We have to check after allocpte for the 2330 * pte still being around... allocpte can 2331 * block. 2332 */ 2333 dstmpte = pmap_allocpte(dst_pmap, addr); 2334 dst_pte = pmap_pte(dst_pmap, addr); 2335 if ((*dst_pte == 0) && (ptetemp = *src_pte)) { 2336 /* 2337 * Clear the modified and 2338 * accessed (referenced) bits 2339 * during the copy. 2340 */ 2341 m = PHYS_TO_VM_PAGE(ptetemp); 2342 *dst_pte = ptetemp & ~(PG_M | PG_A); 2343 dst_pmap->pm_stats.resident_count++; 2344 pmap_insert_entry(dst_pmap, addr, 2345 dstmpte, m); 2346 } else { 2347 vm_page_lock_queues(); 2348 pmap_unwire_pte_hold(dst_pmap, dstmpte); 2349 vm_page_unlock_queues(); 2350 } 2351 if (dstmpte->hold_count >= srcmpte->hold_count) 2352 break; 2353 } 2354 addr += PAGE_SIZE; 2355 src_pte++; 2356 } 2357 } 2358} 2359 2360static __inline void 2361pagezero(void *page) 2362{ 2363#if defined(I686_CPU) 2364 if (cpu_class == CPUCLASS_686) { 2365#if defined(CPU_ENABLE_SSE) 2366 if (cpu_feature & CPUID_SSE2) 2367 sse2_pagezero(page); 2368 else 2369#endif 2370 i686_pagezero(page); 2371 } else 2372#endif 2373 bzero(page, PAGE_SIZE); 2374} 2375 2376/* 2377 * pmap_zero_page zeros the specified hardware page by mapping 2378 * the page into KVM and using bzero to clear its contents. 2379 */ 2380void 2381pmap_zero_page(vm_page_t m) 2382{ 2383 2384 mtx_lock(&CMAPCADDR12_lock); 2385 if (*CMAP2) 2386 panic("pmap_zero_page: CMAP2 busy"); 2387 sched_pin(); 2388 *CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; 2389 invlcaddr(CADDR2); 2390 pagezero(CADDR2); 2391 *CMAP2 = 0; 2392 sched_unpin(); 2393 mtx_unlock(&CMAPCADDR12_lock); 2394} 2395 2396/* 2397 * pmap_zero_page_area zeros the specified hardware page by mapping 2398 * the page into KVM and using bzero to clear its contents. 2399 * 2400 * off and size may not cover an area beyond a single hardware page. 2401 */ 2402void 2403pmap_zero_page_area(vm_page_t m, int off, int size) 2404{ 2405 2406 mtx_lock(&CMAPCADDR12_lock); 2407 if (*CMAP2) 2408 panic("pmap_zero_page: CMAP2 busy"); 2409 sched_pin(); 2410 *CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; 2411 invlcaddr(CADDR2); 2412 if (off == 0 && size == PAGE_SIZE) 2413 pagezero(CADDR2); 2414 else 2415 bzero((char *)CADDR2 + off, size); 2416 *CMAP2 = 0; 2417 sched_unpin(); 2418 mtx_unlock(&CMAPCADDR12_lock); 2419} 2420 2421/* 2422 * pmap_zero_page_idle zeros the specified hardware page by mapping 2423 * the page into KVM and using bzero to clear its contents. This 2424 * is intended to be called from the vm_pagezero process only and 2425 * outside of Giant. 
2426 */
2427void
2428pmap_zero_page_idle(vm_page_t m)
2429{
2430
2431	if (*CMAP3)
2432		panic("pmap_zero_page_idle: CMAP3 busy");
2433	sched_pin();
2434	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2435	invlcaddr(CADDR3);
2436	pagezero(CADDR3);
2437	*CMAP3 = 0;
2438	sched_unpin();
2439}
2440
2441/*
2442 * pmap_copy_page copies the specified (machine independent)
2443 * page by mapping the page into virtual memory and using
2444 * bcopy to copy the page, one machine dependent page at a
2445 * time.
2446 */
2447void
2448pmap_copy_page(vm_page_t src, vm_page_t dst)
2449{
2450
2451	mtx_lock(&CMAPCADDR12_lock);
2452	if (*CMAP1)
2453		panic("pmap_copy_page: CMAP1 busy");
2454	if (*CMAP2)
2455		panic("pmap_copy_page: CMAP2 busy");
2456	sched_pin();
2457#ifdef I386_CPU
2458	invltlb();
2459#else
2460	invlpg((u_int)CADDR1);
2461	invlpg((u_int)CADDR2);
2462#endif
2463	*CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A;
2464	*CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M;
2465	bcopy(CADDR1, CADDR2, PAGE_SIZE);
2466	*CMAP1 = 0;
2467	*CMAP2 = 0;
2468	sched_unpin();
2469	mtx_unlock(&CMAPCADDR12_lock);
2470}
2471
2472/*
2473 * Returns true if the pmap's pv is one of the first
2474 * 16 pvs linked to from this page.  This count may
2475 * be changed upwards or downwards in the future; it
2476 * is only necessary that true be returned for a small
2477 * subset of pmaps for proper page aging.
2478 */
2479boolean_t
2480pmap_page_exists_quick(pmap, m)
2481	pmap_t pmap;
2482	vm_page_t m;
2483{
2484	pv_entry_t pv;
2485	int loops = 0;
2486	int s;
2487
2488	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2489		return FALSE;
2490
2491	s = splvm();
2492	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2493	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2494		if (pv->pv_pmap == pmap) {
2495			splx(s);
2496			return TRUE;
2497		}
2498		loops++;
2499		if (loops >= 16)
2500			break;
2501	}
2502	splx(s);
2503	return (FALSE);
2504}
2505
2506#define PMAP_REMOVE_PAGES_CURPROC_ONLY
2507/*
2508 * Remove all pages from the specified address space;
2509 * this aids process exit speed.  Also, this code is
2510 * special cased for the current process only, but
2511 * can have the more generic (and slightly slower)
2512 * mode enabled.  This is much faster than pmap_remove
2513 * in the case of running down an entire address space.
2514 */
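/*
 * Illustrative note (not from the original source): the expected consumer
 * of pmap_remove_pages() is the process-teardown path, which runs down the
 * whole user portion of an address space in one call rather than walking
 * it with pmap_remove().  A sketch of such a caller, with hypothetical
 * bounds, looks like:
 *
 *	pmap_remove_pages(vmspace_pmap(p->p_vmspace),
 *	    VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
 *
 * Only the calling pattern is meant to be taken from this sketch; the
 * exact call site and range are assumptions.
 */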
2515void
2516pmap_remove_pages(pmap, sva, eva)
2517	pmap_t pmap;
2518	vm_offset_t sva, eva;
2519{
2520	pt_entry_t *pte, tpte;
2521	vm_page_t m;
2522	pv_entry_t pv, npv;
2523	int s;
2524
2525#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2526	if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) {
2527		printf("warning: pmap_remove_pages called with non-current pmap\n");
2528		return;
2529	}
2530#endif
2531	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2532	s = splvm();
2533	sched_pin();
2534	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
2535
2536		if (pv->pv_va >= eva || pv->pv_va < sva) {
2537			npv = TAILQ_NEXT(pv, pv_plist);
2538			continue;
2539		}
2540
2541#ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2542		pte = vtopte(pv->pv_va);
2543#else
2544		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2545#endif
2546		tpte = *pte;
2547
2548		if (tpte == 0) {
2549			printf("TPTE at %p IS ZERO @ VA %08x\n",
2550			    pte, pv->pv_va);
2551			panic("bad pte");
2552		}
2553
2554/*
2555 * We cannot remove wired pages from a process' mapping at this time
2556 */
2557		if (tpte & PG_W) {
2558			npv = TAILQ_NEXT(pv, pv_plist);
2559			continue;
2560		}
2561
2562		m = PHYS_TO_VM_PAGE(tpte);
2563		KASSERT(m->phys_addr == (tpte & PG_FRAME),
2564		    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
2565		    m, (uintmax_t)m->phys_addr, (uintmax_t)tpte));
2566
2567		KASSERT(m < &vm_page_array[vm_page_array_size],
2568		    ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte));
2569
2570		pv->pv_pmap->pm_stats.resident_count--;
2571
2572		pte_clear(pte);
2573
2574		/*
2575		 * Update the vm_page_t clean and reference bits.
2576		 */
2577		if (tpte & PG_M) {
2578			vm_page_dirty(m);
2579		}
2580
2581		npv = TAILQ_NEXT(pv, pv_plist);
2582		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
2583
2584		m->md.pv_list_count--;
2585		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2586		if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
2587			vm_page_flag_clear(m, PG_WRITEABLE);
2588		}
2589
2590		pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem);
2591		free_pv_entry(pv);
2592	}
2593	sched_unpin();
2594	splx(s);
2595	pmap_invalidate_all(pmap);
2596}
2597
2598/*
2599 * pmap_is_modified:
2600 *
2601 *	Return whether or not the specified physical page was modified
2602 *	in any physical maps.
2603 */
2604boolean_t
2605pmap_is_modified(vm_page_t m)
2606{
2607	pv_entry_t pv;
2608	pt_entry_t *pte;
2609	int s;
2610
2611	if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
2612		return FALSE;
2613
2614	s = splvm();
2615	sched_pin();
2616	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2617	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2618		/*
2619		 * Skip mappings whose modified state we do not
2620		 * track (see pmap_track_modified()); their PTEs
2621		 * are treated as never modified here.
2622		 */
2623		if (!pmap_track_modified(pv->pv_va))
2624			continue;
2625#if defined(PMAP_DIAGNOSTIC)
2626		if (!pv->pv_pmap) {
2627			printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va);
2628			continue;
2629		}
2630#endif
2631		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2632		if (*pte & PG_M) {
2633			sched_unpin();
2634			splx(s);
2635			return TRUE;
2636		}
2637	}
2638	sched_unpin();
2639	splx(s);
2640	return (FALSE);
2641}
2642
2643/*
2644 * pmap_is_prefaultable:
2645 *
2646 *	Return whether or not the specified virtual address is eligible
2647 *	for prefault.
2648 */
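/*
 * Illustrative sketch (assumptions flagged): a prefaulting caller is
 * expected to probe with pmap_is_prefaultable() before spending effort on
 * a speculative mapping, and then install it with pmap_enter_quick(),
 * roughly:
 *
 *	if (pmap_is_prefaultable(pmap, addr)) {
 *		... look up a resident, valid page "m" ...
 *		mpte = pmap_enter_quick(pmap, addr, m, mpte);
 *	}
 *
 * The surrounding fault-time logic is not quoted here; only the intended
 * use of the return value is shown.
 */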
2649boolean_t
2650pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
2651{
2652	pt_entry_t *pte;
2653
2654	if ((*pmap_pde(pmap, addr)) == 0)
2655		return (FALSE);
2656	pte = vtopte(addr);
2657	if (*pte)
2658		return (FALSE);
2659	return (TRUE);
2660}
2661
2662/*
2663 * Clear the given bit in each of the given page's ptes.
2664 */
2665static __inline void
2666pmap_clear_ptes(vm_page_t m, int bit)
2667{
2668	register pv_entry_t pv;
2669	pt_entry_t pbits, *pte;
2670	int s;
2671
2672	if (!pmap_initialized || (m->flags & PG_FICTITIOUS) ||
2673	    (bit == PG_RW && (m->flags & PG_WRITEABLE) == 0))
2674		return;
2675
2676	s = splvm();
2677	sched_pin();
2678	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2679	/*
2680	 * Loop over all current mappings, setting/clearing as appropriate.
2681	 * If we are setting pages read-only, do we need to clear the VAC?
2682	 */
2683	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2684		/*
2685		 * Don't write-protect pager mappings.
2686		 */
2687		if (bit == PG_RW) {
2688			if (!pmap_track_modified(pv->pv_va))
2689				continue;
2690		}
2691
2692#if defined(PMAP_DIAGNOSTIC)
2693		if (!pv->pv_pmap) {
2694			printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va);
2695			continue;
2696		}
2697#endif
2698
2699		pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2700		pbits = *pte;
2701		if (pbits & bit) {
2702			if (bit == PG_RW) {
2703				if (pbits & PG_M) {
2704					vm_page_dirty(m);
2705				}
2706				pte_store(pte, pbits & ~(PG_M|PG_RW));
2707			} else {
2708				pte_store(pte, pbits & ~bit);
2709			}
2710			pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2711		}
2712	}
2713	if (bit == PG_RW)
2714		vm_page_flag_clear(m, PG_WRITEABLE);
2715	sched_unpin();
2716	splx(s);
2717}
2718
2719/*
2720 * pmap_page_protect:
2721 *
2722 *	Lower the permission for all mappings to a given page.
2723 */
2724void
2725pmap_page_protect(vm_page_t m, vm_prot_t prot)
2726{
2727	if ((prot & VM_PROT_WRITE) == 0) {
2728		if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
2729			pmap_clear_ptes(m, PG_RW);
2730		} else {
2731			pmap_remove_all(m);
2732		}
2733	}
2734}
2735
2736/*
2737 * pmap_ts_referenced:
2738 *
2739 *	Return a count of reference bits for a page, clearing those bits.
2740 *	It is not necessary for every reference bit to be cleared, but it
2741 *	is necessary that 0 only be returned when there are truly no
2742 *	reference bits set.
2743 *
2744 *	XXX: The exact number of bits to check and clear is a matter that
2745 *	should be tested and standardized at some point in the future for
2746 *	optimal aging of shared pages.
2747 */ 2748int 2749pmap_ts_referenced(vm_page_t m) 2750{ 2751 register pv_entry_t pv, pvf, pvn; 2752 pt_entry_t *pte; 2753 pt_entry_t v; 2754 int s; 2755 int rtval = 0; 2756 2757 if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) 2758 return (rtval); 2759 2760 s = splvm(); 2761 sched_pin(); 2762 mtx_assert(&vm_page_queue_mtx, MA_OWNED); 2763 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { 2764 2765 pvf = pv; 2766 2767 do { 2768 pvn = TAILQ_NEXT(pv, pv_list); 2769 2770 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); 2771 2772 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); 2773 2774 if (!pmap_track_modified(pv->pv_va)) 2775 continue; 2776 2777 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); 2778 2779 if (pte && ((v = pte_load(pte)) & PG_A) != 0) { 2780 pte_store(pte, v & ~PG_A); 2781 pmap_invalidate_page(pv->pv_pmap, pv->pv_va); 2782 2783 rtval++; 2784 if (rtval > 4) { 2785 break; 2786 } 2787 } 2788 } while ((pv = pvn) != NULL && pv != pvf); 2789 } 2790 sched_unpin(); 2791 splx(s); 2792 2793 return (rtval); 2794} 2795 2796/* 2797 * Clear the modify bits on the specified physical page. 2798 */ 2799void 2800pmap_clear_modify(vm_page_t m) 2801{ 2802 pmap_clear_ptes(m, PG_M); 2803} 2804 2805/* 2806 * pmap_clear_reference: 2807 * 2808 * Clear the reference bit on the specified physical page. 2809 */ 2810void 2811pmap_clear_reference(vm_page_t m) 2812{ 2813 pmap_clear_ptes(m, PG_A); 2814} 2815 2816/* 2817 * Miscellaneous support routines follow 2818 */ 2819 2820/* 2821 * Map a set of physical memory pages into the kernel virtual 2822 * address space. Return a pointer to where it is mapped. This 2823 * routine is intended to be used for mapping device memory, 2824 * NOT real memory. 2825 */ 2826void * 2827pmap_mapdev(pa, size) 2828 vm_paddr_t pa; 2829 vm_size_t size; 2830{ 2831 vm_offset_t va, tmpva, offset; 2832 2833 offset = pa & PAGE_MASK; 2834 size = roundup(offset + size, PAGE_SIZE); 2835 pa = pa & PG_FRAME; 2836 2837 if (pa < KERNLOAD && pa + size <= KERNLOAD) 2838 va = KERNBASE + pa; 2839 else 2840 va = kmem_alloc_nofault(kernel_map, size); 2841 if (!va) 2842 panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); 2843 2844 for (tmpva = va; size > 0; ) { 2845 pmap_kenter(tmpva, pa); 2846 size -= PAGE_SIZE; 2847 tmpva += PAGE_SIZE; 2848 pa += PAGE_SIZE; 2849 } 2850 pmap_invalidate_range(kernel_pmap, va, tmpva); 2851 return ((void *)(va + offset)); 2852} 2853 2854void 2855pmap_unmapdev(va, size) 2856 vm_offset_t va; 2857 vm_size_t size; 2858{ 2859 vm_offset_t base, offset, tmpva; 2860 2861 if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) 2862 return; 2863 base = va & PG_FRAME; 2864 offset = va & PAGE_MASK; 2865 size = roundup(offset + size, PAGE_SIZE); 2866 for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) 2867 pmap_kremove(tmpva); 2868 pmap_invalidate_range(kernel_pmap, va, tmpva); 2869 kmem_free(kernel_map, base, size); 2870} 2871 2872/* 2873 * perform the pmap work for mincore 2874 */ 2875int 2876pmap_mincore(pmap, addr) 2877 pmap_t pmap; 2878 vm_offset_t addr; 2879{ 2880 pt_entry_t *ptep, pte; 2881 vm_page_t m; 2882 int val = 0; 2883 2884 ptep = pmap_pte(pmap, addr); 2885 if (ptep == 0) { 2886 return 0; 2887 } 2888 2889 if ((pte = *ptep) != 0) { 2890 vm_paddr_t pa; 2891 2892 val = MINCORE_INCORE; 2893 if ((pte & PG_MANAGED) == 0) 2894 return val; 2895 2896 pa = pte & PG_FRAME; 2897 2898 m = PHYS_TO_VM_PAGE(pa); 2899 2900 /* 2901 * Modified by us 2902 */ 2903 if (pte & PG_M) 2904 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; 2905 else { 2906 /* 2907 * Modified by 
someone else 2908 */ 2909 vm_page_lock_queues(); 2910 if (m->dirty || pmap_is_modified(m)) 2911 val |= MINCORE_MODIFIED_OTHER; 2912 vm_page_unlock_queues(); 2913 } 2914 /* 2915 * Referenced by us 2916 */ 2917 if (pte & PG_A) 2918 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; 2919 else { 2920 /* 2921 * Referenced by someone else 2922 */ 2923 vm_page_lock_queues(); 2924 if ((m->flags & PG_REFERENCED) || 2925 pmap_ts_referenced(m)) { 2926 val |= MINCORE_REFERENCED_OTHER; 2927 vm_page_flag_set(m, PG_REFERENCED); 2928 } 2929 vm_page_unlock_queues(); 2930 } 2931 } 2932 return val; 2933} 2934 2935void 2936pmap_activate(struct thread *td) 2937{ 2938 struct proc *p = td->td_proc; 2939 pmap_t pmap, oldpmap; 2940 u_int32_t cr3; 2941 2942 critical_enter(); 2943 pmap = vmspace_pmap(td->td_proc->p_vmspace); 2944 oldpmap = PCPU_GET(curpmap); 2945#if defined(SMP) 2946 atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask)); 2947 atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); 2948#else 2949 oldpmap->pm_active &= ~1; 2950 pmap->pm_active |= 1; 2951#endif 2952#ifdef PAE 2953 cr3 = vtophys(pmap->pm_pdpt); 2954#else 2955 cr3 = vtophys(pmap->pm_pdir); 2956#endif 2957 /* XXXKSE this is wrong. 2958 * pmap_activate is for the current thread on the current cpu 2959 */ 2960 if (p->p_flag & P_SA) { 2961 /* Make sure all other cr3 entries are updated. */ 2962 /* what if they are running? XXXKSE (maybe abort them) */ 2963 FOREACH_THREAD_IN_PROC(p, td) { 2964 td->td_pcb->pcb_cr3 = cr3; 2965 } 2966 } else { 2967 td->td_pcb->pcb_cr3 = cr3; 2968 } 2969 load_cr3(cr3); 2970 PCPU_SET(curpmap, pmap); 2971 critical_exit(); 2972} 2973 2974vm_offset_t 2975pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) 2976{ 2977 2978 if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { 2979 return addr; 2980 } 2981 2982 addr = (addr + PDRMASK) & ~PDRMASK; 2983 return addr; 2984} 2985 2986 2987#if defined(PMAP_DEBUG) 2988pmap_pid_dump(int pid) 2989{ 2990 pmap_t pmap; 2991 struct proc *p; 2992 int npte = 0; 2993 int index; 2994 2995 sx_slock(&allproc_lock); 2996 LIST_FOREACH(p, &allproc, p_list) { 2997 if (p->p_pid != pid) 2998 continue; 2999 3000 if (p->p_vmspace) { 3001 int i,j; 3002 index = 0; 3003 pmap = vmspace_pmap(p->p_vmspace); 3004 for (i = 0; i < NPDEPTD; i++) { 3005 pd_entry_t *pde; 3006 pt_entry_t *pte; 3007 vm_offset_t base = i << PDRSHIFT; 3008 3009 pde = &pmap->pm_pdir[i]; 3010 if (pde && pmap_pde_v(pde)) { 3011 for (j = 0; j < NPTEPG; j++) { 3012 vm_offset_t va = base + (j << PAGE_SHIFT); 3013 if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { 3014 if (index) { 3015 index = 0; 3016 printf("\n"); 3017 } 3018 sx_sunlock(&allproc_lock); 3019 return npte; 3020 } 3021 pte = pmap_pte(pmap, va); 3022 if (pte && pmap_pte_v(pte)) { 3023 pt_entry_t pa; 3024 vm_page_t m; 3025 pa = *pte; 3026 m = PHYS_TO_VM_PAGE(pa); 3027 printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", 3028 va, pa, m->hold_count, m->wire_count, m->flags); 3029 npte++; 3030 index++; 3031 if (index >= 2) { 3032 index = 0; 3033 printf("\n"); 3034 } else { 3035 printf(" "); 3036 } 3037 } 3038 } 3039 } 3040 } 3041 } 3042 } 3043 sx_sunlock(&allproc_lock); 3044 return npte; 3045} 3046#endif 3047 3048#if defined(DEBUG) 3049 3050static void pads(pmap_t pm); 3051void pmap_pvdump(vm_offset_t pa); 3052 3053/* print address space of pmap*/ 3054static void 3055pads(pm) 3056 pmap_t pm; 3057{ 3058 int i, j; 3059 vm_paddr_t va; 3060 pt_entry_t *ptep; 3061 3062 if (pm == kernel_pmap) 3063 return; 3064 for (i = 0; i < NPDEPTD; i++) 3065 if 
(pm->pm_pdir[i])
3066			for (j = 0; j < NPTEPG; j++) {
3067				va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
3068				if (pm == kernel_pmap && va < KERNBASE)
3069					continue;
3070				if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
3071					continue;
3072				ptep = pmap_pte(pm, va);
3073				if (pmap_pte_v(ptep))
3074					printf("%x:%x ", va, *ptep);
3075			}
3076
3077}
3078
3079void
3080pmap_pvdump(pa)
3081	vm_paddr_t pa;
3082{
3083	pv_entry_t pv;
3084	vm_page_t m;
3085
3086	printf("pa %x", pa);
3087	m = PHYS_TO_VM_PAGE(pa);
3088	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3089		printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va);
3090		pads(pv->pv_pmap);
3091	}
3092	printf(" ");
3093}
3094#endif
3095
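/*
 * Illustrative only: with DEBUG defined, the helpers above can be invoked
 * by hand (for instance from a temporary hook or a debugger session) to
 * dump the pv entries and mapped ranges for a page of interest, e.g.:
 *
 *	pmap_pvdump(VM_PAGE_TO_PHYS(m));
 *
 * pads() is reached through pmap_pvdump() and prints each mapping's valid
 * PTEs; the call shown is a hypothetical usage, not an existing call site.
 */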